In [4]:
import nbformat
from IPython.core.interactiveshell import InteractiveShell
def import_notebook(nb_path):
    with open(nb_path, 'r') as f:
        nb = nbformat.read(f, as_version=4)
    shell = InteractiveShell.instance()
    for cell in nb.cells:
        if cell.cell_type == 'code':
            shell.run_cell(cell.source)

In [5]:
import_notebook('model1.ipynb')

Test passed!


In [6]:
import torch
import torch.nn as nn
from pathlib import Path
import warnings
import os
from tqdm import tqdm

In [7]:
import torchtext.datasets as datasets
from torch.utils.data import Dataset,DataLoader,random_split
from torch.optim.lr_scheduler import LambdaLR

In [8]:
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

In [9]:
def casual_mask(size):
    mask = torch.triu(torch.ones((1, size, size)), diagonal=1).type(torch.int)
    return mask == 0
class BilingualDataset(Dataset):
    def __init__(self,ds,input_token,output_token,input_lang,output_lang,seq_lenght):
        super().__init__()
        self.seq_len=seq_lenght
        self.raw_data=ds
        self.input_token_list=input_token
        self.input_lang=input_lang
        self.output_token_list=output_token
        self.output_lang=output_lang
        self.sos_id=torch.tensor([output_token.token_to_id("[SOS]")],dtype=torch.int64)
        self.eos_id=torch.tensor([output_token.token_to_id("[EOS]")],dtype=torch.int64)
        self.pad_id=torch.tensor([output_token.token_to_id("[PAD]")],dtype=torch.int64)
    def __len__(self):
        return len(self.raw_data)
    def __getitem__(self,id):
            
        input_output=self.raw_data[id]

            #dividing the text into input and output
        input_text=input_output['translation'][self.input_lang]
        output_text=input_output['translation'][self.output_lang]
            ### converting text into list to token
        input_sen_token=self.input_token_list.encode(input_text).ids
        output_sen_token=self.output_token_list.encode(output_text).ids
            ### adding padding[PAD] , endofdentense[EOS],startofsentense[SOS] to the sentnse to
            ### to make it equal to seqence length
        num_encod_padding=self.seq_len-len(input_sen_token)-2 #2==[SOS]&[EOS]
        num_decod_padding=self.seq_len-len(output_sen_token)-1 #1==[SOS]
        num_label_padding=self.seq_len-len(output_sen_token)-1 ##1==[EOS]
        if  num_encod_padding< 0 or num_decod_padding < 0:
            raise ValueError("Sentence is too long")
            ### forming the whole sentence of encoding and decoding input
        encoding_input=torch.cat([self.sos_id,torch.tensor(input_sen_token,dtype=torch.int64),
                                     self.eos_id,
                                     torch.tensor([self.pad_id]*num_encod_padding,dtype=torch.int64)
                                     ],dim=0)
        decoding_input=torch.cat([self.sos_id,torch.tensor(output_sen_token,dtype=torch.int64),
                                     
                                     torch.tensor([self.pad_id]*num_decod_padding,dtype=torch.int64)
                                     ],dim=0)
        label_input=torch.cat([torch.tensor(output_sen_token,dtype=torch.int64),
                                     self.eos_id,
                                     torch.tensor([self.pad_id]*num_label_padding,dtype=torch.int64)
                                     ],dim=0)
        assert encoding_input.size(0)==self.seq_len
        assert decoding_input.size(0)==self.seq_len
        assert label_input.size(0)==self.seq_len
        return {
                "encoding_input":encoding_input,
                "decoding_input":decoding_input,
                "encodig_mask":(encoding_input!=self.pad_id).unsqueeze(0).unsqueeze(0).int(),     # (1, 1, seq_len)
                "decoder_mask": (decoding_input != self.pad_id).unsqueeze(0).int() & casual_mask(decoding_input.size(0)),
                "label": label_input,
                "encoding_text":input_text,
                "decoding_text":output_text
            
            }


In [10]:
def get_config():
    return {
        "batch_size": 8,
        "num_epochs": 3,
        "lr": 10**-4,
        "seq_len": 350,
        "d_model": 512,
        "datasource": 'findnitai',
        "input_lang": "en",
        "output_lang": "hi_ng",
        "model_folder": "weights",
        "model_basename": "tmodel_",
        "preload": "latest",
        "tokenizer_file": "tokenizer_{0}.json",
        "experiment_name": "runs/tmodel"
    }


In [11]:
# a = [
#     {"translation": {"en": "Hello world", "fr": "Bonjour le monde"}},
#     {"translation": {"en": "How are you?", "fr": "Comment ça va?"}},
#     {"translation": {"en": "I am fine.", "fr": "Je vais bien."}},
#     {"translation": {"en": "Thank you.", "fr": "Merci."}},
#     {"translation": {"en": "Goodbye!", "fr": "Au revoir!"}}
# ]


In [12]:
def get_all_sentences(ds,lang):
    for item in ds:
        yield item['translation'][lang]

In [13]:
def get_or_build_tokenizer(ds,lang):
    tokenizer_path=Path(f'token{lang}.json')
    if not Path.exists(tokenizer_path):
        tokenizer=Tokenizer(WordLevel(unk_token="[UNK]"))
        tokenizer.pre_tokenizer = Whitespace()
        trainer = WordLevelTrainer(special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"], min_frequency=2)
        tokenizer.train_from_iterator(get_all_sentences(ds,lang),trainer=trainer)
        tokenizer.save(str(tokenizer_path))
    else:
         tokenizer=Tokenizer.from_file(str(tokenizer_path))
    return tokenizer

In [22]:
def get_ds(config):
    #data_set_path form hugging face
    raw_data =load_dataset(f"findnitai/english-to-hinglish",split='train')
    ## Tokenization of data set
    token_input=get_or_build_tokenizer(raw_data,config['input_lang'])
    token_ouput=get_or_build_tokenizer(raw_data,config['output_lang'])
    ## split the data set 
    print(f"raw-----{len(raw_data)}")
    num_train_data=int(0.001*(len(raw_data)))
    print(f"train_data-----{(num_train_data)}")
    num_test_data=int(len(raw_data)-num_train_data)
    train_raw_data,test_raw_data=random_split(raw_data,[num_train_data,num_test_data])
    print(f"train_raw_data{len(train_raw_data)}")
    ### converting each text sentense into token with equal size of seqence lenght each
    train_ds=BilingualDataset(ds=train_raw_data,input_token=token_input,output_token=token_ouput,input_lang=config['input_lang'],output_lang=config['output_lang'],seq_lenght=config['seq_len'])
    test_ds=BilingualDataset(ds=test_raw_data,input_token=token_input,output_token=token_ouput,input_lang=config['input_lang'],output_lang=config['output_lang'],seq_lenght=config['seq_len'])
    ### dividing data set into batch
    print(f"train_ds{len(train_ds)}")
    train_ds_batch= DataLoader(train_ds,batch_size=config['batch_size'],shuffle=True)
    test_ds_batch= DataLoader(test_ds,batch_size=1,shuffle=True)
    print(f"train_ds_batch--{len(train_ds_batch)}")
    return train_ds_batch,test_ds_batch,token_input,token_ouput
    

In [23]:
###input_vocab_size: int, output_vocab_size: int, 
    #                   input_seq_len: int, output_seq_len: int, d_model: int=512,
    # N: int=6, h: int=8, dropout: float=0.1, d_ff: int=2048
def get_model(config,vocab_input_len,vocab_output_len):
    model=build_transformer(input_vocab_size=vocab_input_len,output_vocab_size=vocab_output_len,
                           input_seq_len=config['seq_len'],output_seq_len=config['seq_len']
                           
                           )
    return model

In [24]:
def train_model(config):
    device = "cuda" if torch.cuda.is_available() else "mps" if torch.has_mps or torch.backends.mps.is_available() else "cpu"
    device = torch.device(device)
    train_data,test_data,token_input,token_output=get_ds(config)
    print(len(train_data))
    model=get_model(config,token_input.get_vocab_size(),token_output.get_vocab_size())
    optimizer=torch.optim.Adam(model.parameters(),lr=0.01,eps=1e-9)
    loss_fn=nn.CrossEntropyLoss(ignore_index=token_input.token_to_id('[PAD]'),label_smoothing=0.1).to(device)
    for epoch in range(1,config['num_epochs']):
        model.train()
        batch_iterator = tqdm(train_data, desc=f"Processing Epoch {epoch:02d}")
        for batch in batch_iterator:
            encoder_input=batch['encoding_input'].to(device)
            decoder_input = batch['decoding_input'].to(device)
            encoder_mask = batch['encodig_mask'].to(device) # (B, 1, 1, seq_len)
            decoder_mask = batch['decoder_mask'].to(device)




            encoder_output=model.encode(encoder_input,encoder_mask)
            decoder_output=model.decode(encoder_output, decoder_input,encoder_mask, decoder_mask)
            proj_output=model.project(decoder_output)
            
            label=batch['label'].to(device)
            loss=loss_fn(proj_output.view(-1,token_output.get_vocab_size()),label.view(-1))
            batch_iterator.set_postfix({"loss": f"{loss.item():6.3f}"})
            loss.backward()
            optimizer.step()
            optimizer.zero_grad(set_to_none=True)

            

In [25]:
if __name__ == '__main__':
    warnings.filterwarnings("ignore")
    config = get_config()
    train_model(config)

raw-----189102
train_data-----189
train_raw_data189
train_ds189
train_ds_batch--24
24


Processing Epoch 01: 100%|█████████████████████████████████████████████████| 24/24 [03:36<00:00,  9.01s/it, loss=8.467]
Processing Epoch 02: 100%|█████████████████████████████████████████████████| 24/24 [03:19<00:00,  8.33s/it, loss=7.513]
