In [1]:
import torch
torch.cuda.is_available()

True

In [2]:
import pandas as pd

data = pd.read_csv("date.csv")
print(data["formal"].value_counts())

formal
1400-01-01    6
1407-05-03    6
1407-04-21    6
1407-04-22    6
1407-04-23    6
             ..
1403-09-03    6
1403-09-04    6
1403-09-05    6
1403-09-06    6
1410-12-29    6
Name: count, Length: 4015, dtype: int64


In [3]:
train_portion = int(0.8 * (len(data)))
test_portion = int(0.1 * (len(data)))
val_portion = len(data) - train_portion - test_portion

train_data = data[:train_portion]
test_data = data[train_portion: train_portion + test_portion]
val_data = data[train_portion+test_portion :]


In [4]:
print("Training set length:", len(train_data))
print("Validation set length:", len(val_data))
print("Test set length:", len(test_data))

Training set length: 19272
Validation set length: 2409
Test set length: 2409


In [5]:
train_data[:5]

Unnamed: 0,informal,formal
0,اول فروردین هزار و چهار صد و,1400-01-01
1,روز 1 فروردین هزار و چهار صد و,1400-01-01
2,1 فروردین هزار و چهار صد و,1400-01-01
3,2 فروردین هزار و چهار صد و,1400-01-02
4,روز 2 فروردین هزار و چهار صد و,1400-01-02


In [5]:
import torch
import torch.nn
from torch.utils.data import Dataset
import random

class DateData(Dataset):
    def __init__(self,data,tokenizer):
        self.tokenizer = tokenizer
        self.mask_prob = 0.2
        self.pad_token_id = 6
        self.mask_token_id = 27
        self.data = data
        self.encoded_data = []
        self.informal = []
        self.formal = []
        for _,row in data.iterrows():
            self.informal.append(row["informal"])
            self.formal.append(row["formal"])
        
        

    def __getitem__(self, idx):
            input_ids = torch.tensor(self.tokenizer.encode(
                 self.informal[idx] +" "+"[MASK][MASK][MASK][MASK]-[MASK][MASK]-[MASK][MASK]"))
            
            labels = torch.tensor(self.tokenizer.encode(self.informal[idx] +" "+ self.formal[idx]))
            

                    
            return input_ids, labels
    
    def __len__(self):
        return len(self.data)
    
    def _longest_encoded_length(self):
        max_length = 0
        for encoded_text in self.encoded_data:
            encoded_length = len(encoded_text)
            if encoded_length > max_length:
                max_length = encoded_length
        return max_length


In [6]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("./persianTokenizer")
pecial_tokens_dict = {'additional_special_tokens': ["[<year>]","[<month>]","[<day>]"]}
tokenizer.add_special_tokens(pecial_tokens_dict)

  from .autonotebook import tqdm as notebook_tqdm


3

In [7]:
train_dataset = DateData(train_data,tokenizer)
test_dataset = DateData(test_data,tokenizer)

In [8]:
class Transformer(torch.nn.Module):
    def __init__(self, input_dim, model_dim, num_heads, num_layers, output_dim, dropout=0.1,device="cuda"):
        super(Transformer, self).__init__()

        self.embedding = torch.nn.Embedding(output_dim, model_dim)
        self.positional_encoding = torch.nn.Embedding(input_dim, model_dim)
        self.en = torch.nn.TransformerEncoder(
            torch.nn.TransformerEncoderLayer(d_model=model_dim,nhead=num_heads,dim_feedforward= model_dim * 2,
                                         activation = torch.nn.functional.gelu,
                                         batch_first=True,bias=False,dropout=0.1,device=device),
                                           num_layers=num_layers,enable_nested_tensor=False)
        self.fc_train = torch.nn.Linear(model_dim, output_dim)
        self.fc_year = torch.nn.Linear(model_dim, 10)
        self.fc_day = torch.nn.Linear(model_dim, 31)
        self.fc_month = torch.nn.Linear(model_dim, 12)

    def forward(self, x):
        batch_size, seq_len = x.shape
        tok_embed = self.embedding(x)
        pos_embed = self.positional_encoding(torch.arange(seq_len, device=x.device))
        x = tok_embed + pos_embed
        x = self.en(x)
        
        return self.fc_train(x)
    def forward_year(self, x):
        batch_size, seq_len = x.shape
        tok_embed = self.embedding(x)
        pos_embed = self.positional_encoding(torch.arange(seq_len, device=x.device))
        x = tok_embed + pos_embed
        x = self.en(x)
        
        return self.fc_year(x[:,0,:])
    
    def forward_month(self, x):
        batch_size, seq_len = x.shape
        tok_embed = self.embedding(x)
        pos_embed = self.positional_encoding(torch.arange(seq_len, device=x.device))
        x = tok_embed + pos_embed
        x = self.en(x)
        
        return self.fc_month(x[:,1,:])
    
    def forward_day(self, x):
        batch_size, seq_len = x.shape
        tok_embed = self.embedding(x)
        pos_embed = self.positional_encoding(torch.arange(seq_len, device=x.device))
        x = tok_embed + pos_embed
        x = self.en(x)
        
        return self.fc_day(x[:,2,:])
    
    

In [9]:
def custom_collate_fn(
    batch,
    pad_token_id=6,
    ignore_index=-100,
    allowed_max_length=None,
    device="cpu"
):
    
    batch_max_length = max(item[0].shape[-1] + 1 for item in batch)
    

    
    inputs_lst, targets_lst = [], []

    for item in batch:
        
        new_item = item[0].numpy().tolist()
       
        new_item += [pad_token_id]
       
        padded = (
            new_item + [pad_token_id] *
            (batch_max_length - len(new_item))
        )
        new_item1 = item[1].numpy().tolist()
       
        new_item1 += [pad_token_id]
        
        padded1 = (
            new_item1 + [pad_token_id] *
            (batch_max_length - len(new_item1))
        )
        inputs = torch.tensor(padded)  
        targets = torch.tensor(padded1)  

        mask = targets == pad_token_id
        indices = torch.nonzero(mask).squeeze()
        if indices.numel() > 1:
            targets[indices[1:]] = ignore_index

      
        if allowed_max_length is not None:
            inputs = inputs[:allowed_max_length]
            targets = targets[:allowed_max_length]

        inputs_lst.append(inputs)
        targets_lst.append(targets)

   
    inputs_tensor = torch.stack(inputs_lst).to(device)
    targets_tensor = torch.stack(targets_lst).to(device)

    return inputs_tensor, targets_tensor

In [10]:
from functools import partial

customized_collate_fn = partial(
    custom_collate_fn,
    device="cpu",
    allowed_max_length=1024
)

In [51]:
from torch.utils.data import DataLoader
batch_size = 2048
train_loader = DataLoader(train_dataset,batch_size=batch_size,shuffle=True,drop_last=True,collate_fn=customized_collate_fn)
test_loader = DataLoader(test_dataset,batch_size=batch_size,shuffle=False,drop_last=True,collate_fn=customized_collate_fn)

In [12]:
for inputs,targets in train_loader:
    print(inputs.shape)
    print(targets.shape)
    

torch.Size([512, 25])
torch.Size([512, 25])
torch.Size([512, 25])
torch.Size([512, 25])
torch.Size([512, 25])
torch.Size([512, 25])
torch.Size([512, 25])
torch.Size([512, 25])
torch.Size([512, 25])
torch.Size([512, 25])
torch.Size([512, 25])
torch.Size([512, 25])
torch.Size([512, 25])
torch.Size([512, 25])
torch.Size([512, 25])
torch.Size([512, 25])
torch.Size([512, 25])
torch.Size([512, 25])
torch.Size([512, 25])
torch.Size([512, 25])
torch.Size([512, 25])
torch.Size([512, 25])
torch.Size([512, 25])
torch.Size([512, 25])
torch.Size([512, 25])
torch.Size([512, 25])
torch.Size([512, 25])
torch.Size([512, 25])
torch.Size([512, 25])
torch.Size([512, 25])
torch.Size([512, 25])
torch.Size([512, 25])
torch.Size([512, 25])
torch.Size([512, 25])
torch.Size([512, 25])
torch.Size([512, 25])
torch.Size([512, 25])
torch.Size([512, 25])
torch.Size([512, 25])
torch.Size([512, 25])
torch.Size([512, 25])
torch.Size([512, 25])
torch.Size([512, 25])
torch.Size([512, 25])
torch.Size([512, 25])
torch.Size

In [13]:
DATE_CONFIG = {
    "vocab_size" : 25003,
    "context_length" : 32,
    "emb_dim" : 256,
    "n_heads" : 4,
    "n_layers" : 4,
    "drop_rate" : 0.1
}

In [14]:
model = Transformer(
    input_dim=DATE_CONFIG["context_length"],
    model_dim=DATE_CONFIG["emb_dim"],
    num_heads=DATE_CONFIG["n_heads"],
    num_layers=DATE_CONFIG["n_layers"],
    output_dim= DATE_CONFIG["vocab_size"],
)
model.to("cuda")

Transformer(
  (embedding): Embedding(25003, 256)
  (positional_encoding): Embedding(32, 256)
  (en): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=False)
        )
        (linear1): Linear(in_features=256, out_features=512, bias=False)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=512, out_features=256, bias=False)
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (fc_train): Linear(in_features=256, out_features=25003, bias=True)
  (fc_year): Linear(in_features=256, out_features=10, bias=True)
  (fc_day): Linear(in_features=256, out_features=31, bias=True)
  (fc

In [25]:
model.load_state_dict(torch.load("bertV12.pth"))
model.to("cuda")

  model.load_state_dict(torch.load("bertV2.pth"))


Transformer(
  (embedding): Embedding(25003, 256)
  (positional_encoding): Embedding(32, 256)
  (en): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=False)
        )
        (linear1): Linear(in_features=256, out_features=512, bias=False)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=512, out_features=256, bias=False)
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (fc_train): Linear(in_features=256, out_features=25003, bias=True)
  (fc_year): Linear(in_features=256, out_features=10, bias=True)
  (fc_day): Linear(in_features=256, out_features=31, bias=True)
  (fc

In [37]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params:,}")

Total number of parameters: 14,947,552


In [15]:
def calc_loss_batch(input_batch,target_batch,model,device):
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    logits = model(input_batch)
    loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())
    return loss

In [16]:

def calc_loss_loader(data_loader, model, device, num_batches=None):
    total_loss = 0.
    if len(data_loader) == 0:
        return float("nan")
    elif num_batches is None:
        num_batches = len(data_loader)
    else:
        # Reduce the number of batches to match the total number of batches in the data loader
        # if num_batches exceeds the number of batches in the data loader
        num_batches = min(num_batches, len(data_loader))
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            total_loss += loss.item()
        else:
            break
    return total_loss / num_batches

In [17]:
model.to("cuda")
with torch.no_grad():
    train_loss = calc_loss_loader(train_loader,model,device="cuda")

print("Train loss =", train_loss)

  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)


Train loss = 10.321034199482686


In [99]:
model.to("cuda")
with torch.no_grad():
    test_loss = calc_loss_loader(test_loader,model,device="cuda")

print("Test loss =", test_loss)

Test loss = 0.010966735891997814


In [19]:
def calculate_loss_test(model,test_loader):
    model.to("cuda")
    model.eval()
    with torch.no_grad():
        test_loss = calc_loss_loader(test_loader,model,device="cuda")

    print("Test loss =", test_loss)
    model.train()

In [53]:
from torch.optim.lr_scheduler import CosineAnnealingLR
# lr=0.000005
epochs = 1
optimizer = torch.optim.AdamW(model.parameters(), lr=0.00005,weight_decay=0.1)
scheduler = CosineAnnealingLR(optimizer, T_max=len(train_loader)*epochs)
# optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
model.train()
for epoch in range(epochs):
    total_loss = 0.
    for inputs,targets in train_loader:
        optimizer.zero_grad()
        loss = calc_loss_batch(inputs,targets,model,"cuda")
        loss.backward()
        total_loss += loss.item()
        optimizer.step()
        scheduler.step()
    print(f"Epoch {epoch} , total epoch loss {total_loss / len(train_loader)}")
    calculate_loss_test(model,test_loader)
    

Epoch 0 , total epoch loss 0.005016966878126065
Test loss = 0.010966735891997814


In [21]:
def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text)
    encoded_tensor = torch.tensor(encoded).unsqueeze(0)  # add batch dimension
    return encoded_tensor


def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0)  # remove batch dimension
    return tokenizer.decode(flat.tolist())

In [87]:
model.eval()
masked_prompt , prompt = train_dataset[1]
test_masked_prompt , test_prompt = test_dataset[219]
masked_prompt = masked_prompt.unsqueeze(0)
test_masked_prompt = test_masked_prompt.unsqueeze(0)

with torch.no_grad():
    masked_prompt = masked_prompt.to("cuda")
    test_masked_prompt = test_masked_prompt.to("cuda")
    logits = model(masked_prompt)
    logits_test = model(test_masked_prompt)
    # logits = logits[:,-1,:]
    logits = logits.flatten(0, 1)
    logits_test = logits_test.flatten(0, 1)
    probs = torch.argmax(logits,dim=-1,keepdim=True)
    probs_test = torch.argmax(logits_test,dim=-1,keepdim=True)
    token_ids = probs.squeeze(1)
    token_ids_test = probs_test.squeeze(1)
    

print("Masked prompt ",token_ids_to_text(masked_prompt,tokenizer))
print("Model OutPut ",token_ids_to_text(token_ids,tokenizer))
print("Real output",token_ids_to_text(prompt,tokenizer))
print("Test Masked prompt ",token_ids_to_text(test_masked_prompt,tokenizer))
print("Model OutPut ",token_ids_to_text(token_ids_test,tokenizer))
print("Real output",token_ids_to_text(test_prompt,tokenizer))

Masked prompt  [CLS] روز 1 فروردین هزار و چهار صد و [MASK][MASK][MASK][MASK]-[MASK][MASK]-[MASK][MASK][SEP]
Model OutPut  [CLS] روز 1 فروردین هزار و چهار صد و 1400-11-01[SEP]
Real output [CLS] روز 1 فروردین هزار و چهار صد و 1400-01-01[SEP]
Test Masked prompt  [CLS] روز 17 دی 1406 [MASK][MASK][MASK][MASK]-[MASK][MASK]-[MASK][MASK][SEP]
Model OutPut  [CLS] روز 17 دی 1406 1406-10-17[SEP]
Real output [CLS] روز 17 دی 1406 1406-10-17[SEP]


In [107]:
def predict_masked(model,tokenizer,input,deivce):
    model.eval()
    inputs_masked = input + " " + "[MASK][MASK][MASK][MASK]-[MASK][MASK]-[MASK][MASK]"
    input_ids = tokenizer.encode(inputs_masked)
    input_ids = torch.tensor(input_ids).to(deivce)
    with torch.no_grad():
        logits = model(input_ids.unsqueeze(0))
        logits = logits.flatten(0, 1)
        probs = torch.argmax(logits,dim=-1,keepdim=True)
        token_ids = probs.squeeze(1)
        answer_ids = token_ids[-11:-1]
    return token_ids_to_text(answer_ids,tokenizer)

In [111]:
predict_masked(model,tokenizer,"۱۲ اردیبهشت 1402","cuda")

'1402-02-10'

In [57]:
torch.save(model.state_dict(),"bertV11.pth")

## Calculate Accuracy

In [98]:
model.eval()
T = 0
F = 0
for masked_prompt,prompt in test_dataset:
    masked_prompt = masked_prompt.unsqueeze(0)
    with torch.no_grad():
        masked_prompt, prompt = masked_prompt.to("cuda"), prompt.to("cuda")
        logits = model(masked_prompt)
        logits = logits.flatten(0, 1)
        probs = torch.argmax(logits,dim=-1,keepdim=True)
        token_ids = probs.squeeze(1)
        if torch.equal(prompt,token_ids):
            T += 1
        else:
            F += 1
        
print("Accuracy = ", T / (T + F))


Accuracy =  0.6687422166874222


In [101]:
perplexity = torch.exp(torch.tensor(test_loss))
print(f"Perplexity on Test data = {perplexity}")

Perplexity on Test data = 1.0110270977020264
