<h1>Data Ingestion</h1>
<p> I will be using the opus books dataset. More specifically the english to french portion of the dataset. </p>

In [59]:
from datasets import load_dataset
### Using the Opus Books Dataset from Huggingface
def data_ingestion():
    ds = load_dataset(path="Helsinki-NLP/opus_books", name="en-fr")
    train_test_data=ds['train'].train_test_split(test_size=0.2,seed=42)
    test_data=train_test_data['test']
    train_val_split=train_test_data['train'].train_test_split(test_size=0.2,seed=42)
    train_data=train_val_split['train']
    validation_data=train_val_split['test']
    return train_data,validation_data,test_data

In [60]:
train_data,validation_data,test_data=data_ingestion()

<h5> languagewise sentence generator function </h5>

In [61]:
def get_all_sentences(ds,lang):
    for pair in ds:
        # print(pair)
        yield pair['translation'][lang]

### Build tokenizer

In [62]:
from pathlib import Path
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace
def build_tokenizer(config,ds,lang):
    tokenizer_path=Path(config['tokenizer_file'].format(lang))
    
    if not Path.exists(tokenizer_path):
        tokenizer=Tokenizer(WordLevel(unk_token='[UNK]'))
        tokenizer.pre_tokenizer=Whitespace()
        trainer=WordLevelTrainer(special_tokens=["[UNK]","[PAD]","[SOS]","[EOS]"],min_frequency=1)
        tokenizer.train_from_iterator(get_all_sentences(ds,lang),trainer=trainer)
        tokenizer.save(str(tokenizer_path))
    else:
        tokenizer=Tokenizer.from_file(str(tokenizer_path))
    
    return tokenizer

In [63]:
## The tokenizer will be trained on the train set of the data only.
##There will be 2 separate tokenizer, one will be for english and other will be for french.
tokenizer_en=build_tokenizer({'tokenizer_file':'tokenizer_en.json'},train_data,'en')
tokenizer_fr=build_tokenizer({'tokenizer_file':'tokenizer_fr.json'},train_data,'fr')

<p> max seq len will be needed during positional embedding layer creation. Incase a input comes bigger than mex seq len during inference, we have to truncate the sequence to max seq len. </p>

In [64]:
# A custom function to get the max seq len that is possible in the dataset.
def get_max_seq_len(train_data,test_data,validation_data):
    max_len=0
    for data in train_data:
        max_len=max(max_len,len(tokenizer_en.encode(data['translation']['en']).ids))
        max_len=max(max_len,len(tokenizer_fr.encode(data['translation']['fr']).ids))

    for data in test_data:
        max_len=max(max_len,len(tokenizer_en.encode(data['translation']['en']).ids))
        max_len=max(max_len,len(tokenizer_fr.encode(data['translation']['fr']).ids))

    for data in validation_data:
        max_len=max(max_len,len(tokenizer_en.encode(data['translation']['en']).ids))
        max_len=max(max_len,len(tokenizer_fr.encode(data['translation']['fr']).ids))
    return max_len    

### Causal Mask Generator Utility Function

In [65]:
import torch
def causal_mask_generator(size):
    mask=torch.triu(torch.ones(1,size,size),diagonal=1).type(torch.int)
    return mask==0

### Creating the dataset class

In [66]:
import torch
from torch.utils.data import Dataset
class opusDataset_En_to_Fr(Dataset):
    def __init__(self,data,tokenizer_en,tokenizer_fr):
        super().__init__()
        self.raw_data=data
        self.tokenizer_en=tokenizer_en
        self.tokenizer_fr=tokenizer_fr

        self.sos_token=torch.tensor([self.tokenizer_en.token_to_id("[SOS]")],dtype=torch.int64)  ### start of sequence token
        self.eos_token=torch.tensor([self.tokenizer_en.token_to_id("[EOS]")],dtype=torch.int64)  ### End of sequence token


    def __len__(self):
        return len(self.raw_data)
    
    def __getitem__(self, index):
        data_en=self.raw_data[index]['translation']['en']
        data_fr=self.raw_data[index]['translation']['fr']
        encoded_data_en=torch.tensor(self.tokenizer_en.encode(data_en).ids,dtype=torch.int64)
        encoded_data_fr=torch.tensor(self.tokenizer_fr.encode(data_fr).ids,dtype=torch.int64)

        final_encoded_en=torch.cat([
            encoded_data_en,
            self.eos_token,
        ]
        )
        final_encoded_fr=torch.cat([
            self.sos_token,
            encoded_data_fr,
        ])

        target_encoded_fr=torch.cat([
            encoded_data_fr,
            self.eos_token,
        ])

        return {
            'encoder_input':final_encoded_en,
            'decoder_input':final_encoded_fr,
            'target_output':target_encoded_fr,
            'src_sentence':data_en,
            'tgt_sentence':data_fr,
        }

In [67]:
train_dataset=opusDataset_En_to_Fr(train_data,tokenizer_en,tokenizer_fr)
test_dataset=opusDataset_En_to_Fr(test_data,tokenizer_en,tokenizer_fr)
validation_dataset=opusDataset_En_to_Fr(validation_data,tokenizer_en,tokenizer_fr)

### Creating the Dataloader class
<p> At first I will be creating a custom collate function for the loader, to pad the sequence of a batch to even length. </p>

In [68]:
def custom_collate(input_list):

    max_length_in_en_batch=0
    max_length_in_fr_batch=0
    max_length_in_tfr_batch=0

    for data in input_list:
        max_length_in_en_batch=max(max_length_in_en_batch,len(data['encoder_input']))
        max_length_in_fr_batch=max(max_length_in_fr_batch,len(data['decoder_input']))
        max_length_in_tfr_batch=max(max_length_in_tfr_batch,len(data['target_output']))

    encoder_inputs=[]
    decoder_inputs=[]
    target_outputs=[]
    encoder_masks=[]
    decoder_masks=[]
    src_sentences=[]
    tgt_sentences=[]

    pad_en=torch.tensor([tokenizer_en.token_to_id('[PAD]')])
    pad_fr=torch.tensor([tokenizer_fr.token_to_id('[PAD]')])
    

    for data in input_list:
        
        encoder_input= torch.cat(
            [
                data['encoder_input'],
                torch.full((max_length_in_en_batch-len(data['encoder_input']),),pad_en.item())
            ]
        )

        decoder_input= torch.cat(
            [
                data['decoder_input'],
                torch.full((max_length_in_fr_batch-len(data['decoder_input']),),pad_fr.item())
            ]
        )

        target_output = torch.cat(
            [
                data['target_output'],
                torch.full((max_length_in_fr_batch-len(data['target_output']),),pad_fr.item())
            ]
        )
        encoder_mask=(encoder_input!=pad_en.item()).unsqueeze(0).unsqueeze(0).int()
        decoder_mask=((decoder_input!=pad_fr.item()).unsqueeze(0).unsqueeze(0).int()) & causal_mask_generator(len(decoder_input))



        encoder_inputs.append(encoder_input)
        decoder_inputs.append(decoder_input)
        target_outputs.append(target_output)
        encoder_masks.append(encoder_mask)
        decoder_masks.append(decoder_mask)
        src_sentences.append(data['src_sentence'])
        tgt_sentences.append(data['tgt_sentence'])

    return{

        'encoder_input':torch.stack(encoder_inputs),
        'decoder_input':torch.stack(decoder_inputs),
        'target_output':torch.stack(target_outputs),
        'encoder_mask':torch.stack(encoder_masks),
        'decoder_mask':torch.stack(decoder_masks),
        'src_sentence': src_sentences,
        'tgt_sentence': tgt_sentences
    }

In [69]:
from torch.utils.data import DataLoader
train_loader=DataLoader(train_dataset,batch_size=2,shuffle=True,collate_fn=custom_collate)
test_loader=DataLoader(test_dataset,batch_size=2,shuffle=False,collate_fn=custom_collate)
val_loader=DataLoader(validation_dataset,batch_size=2,shuffle=False,collate_fn=custom_collate)

In [70]:
for batch in train_loader:
    print(batch['encoder_input'].shape)
    print(batch['decoder_input'].shape)
    print(batch['target_output'].shape)
    print(batch['encoder_mask'].shape)
    print(batch['decoder_mask'].shape)
    break

torch.Size([2, 20])
torch.Size([2, 23])
torch.Size([2, 23])
torch.Size([2, 1, 1, 20])
torch.Size([2, 1, 23, 23])


### Modeling Architecture

In [71]:
import torch
import torch.nn as nn
import math

In [72]:
class inputEmbeddingLayer(nn.Module):
    def __init__(self,vocab_size,emb_dim):
        super().__init__()
        self.vocab_size=vocab_size
        self.emb_dim=emb_dim
        self.embedding_layer=nn.Embedding(self.vocab_size,self.emb_dim,dtype=torch.float32)
    def forward(self,x):
        embeddings=self.embedding_layer(x)
        return embeddings*math.sqrt(self.emb_dim)

In [73]:
class positionalEncodingLayer(nn.Module):
    def __init__(self,max_seq_len,emb_dim,dropout):
        super().__init__()
        self.max_seq_len=max_seq_len
        self.emb_dim=emb_dim
        self.dropout=nn.Dropout(dropout)
        static_positional_info=torch.zeros((self.max_seq_len,self.emb_dim),dtype=torch.float32)
        positions=torch.arange(0,max_seq_len,dtype=torch.float32).reshape(max_seq_len,-1)
        indices_for_denominator=torch.arange(0,emb_dim,2,dtype=torch.float32) ### 2i
        denominators=torch.exp((-2*indices_for_denominator*math.log(1e4))/emb_dim)
        static_positional_info[:,0::2]=torch.sin(positions*denominators)
        static_positional_info[:,1::2]=torch.cos(positions*denominators)
        self.register_buffer('static_positional_info',static_positional_info)
    def forward(self,x):
        position_encoded_embedding=x+self.static_positional_info[:x.shape[1],:]
        dropped_embeddings=self.dropout(position_encoded_embedding)
        return dropped_embeddings

In [74]:
class multiHeadAttentionBlock(nn.Module):
    def __init__(self,emb_dim,n_heads,dropout):
        super().__init__()
        assert emb_dim%n_heads==0 ### checking if multi head splitting is possible.
        self.w_q=nn.Linear(emb_dim,emb_dim,dtype=torch.float32)
        self.w_k=nn.Linear(emb_dim,emb_dim,dtype=torch.float32)
        self.w_v=nn.Linear(emb_dim,emb_dim,dtype=torch.float32)
        self.w_o=nn.Linear(emb_dim,emb_dim,dtype=torch.float32) ### multi-head-projection-layer
        self.emb_dim=emb_dim
        self.dropout=nn.Dropout(dropout)
        self.single_head_dim=self.emb_dim//n_heads
        self.n_heads=n_heads

    @staticmethod
    def contextual_embedding(m_q,m_k,m_v,per_head_emb_dim,mask=None,dropout=None):
        ### return contexual embedding and attention scores
        attention_scores=(m_q@m_k.transpose(-1,-2))/math.sqrt(per_head_emb_dim)
        ##batch,head,seq_f,dim @ batch,head,dim,seqe==batch,head,seq_f,seq_e
        if mask is not None:
            attention_scores.masked_fill_(mask==0,value=float('-inf'))
        normalized_attention_scores=torch.softmax(attention_scores,dim=-1)

        if dropout is not None:
            dropped_normalized_attention_scores=dropout(normalized_attention_scores)
        ### batch,head,seq_f,seq_e @ batch,head,seqe,dim=batch,head,seqf,dim
        contexual_embeddings=dropped_normalized_attention_scores@m_v
        return dropped_normalized_attention_scores,contexual_embeddings
    
    def forward(self,q,k,v,mask=None):
        query=self.w_q(q) ### batch, seqeunce, dim
        key=self.w_k(k)  ### batch, seq,dim
        value=self.w_v(v) ##batch,seq, dim

        multihead_query=query.view(query.shape[0],query.shape[1],self.n_heads,self.single_head_dim).transpose(-3,-2) ##batch,seq,head,dim -> batch,head,seq,dim
        multihead_key=key.view(key.shape[0],key.shape[1],self.n_heads,self.single_head_dim).transpose(-3,-2) ##batch,seq,head,dim -> batch,head,seq,dim
        multihead_value=value.view(value.shape[0],value.shape[1],self.n_heads,self.single_head_dim).transpose(-3,-2) ## batch, seq,head,dim
        _,contextual_embeddings=multiHeadAttentionBlock.contextual_embedding(multihead_query,multihead_key,multihead_value,self.single_head_dim,mask,self.dropout) # return batch,head,seq,dim
        final_contextual_embeddings=contextual_embeddings.transpose(-3,-2).contiguous().view(query.shape[0],query.shape[1],self.n_heads*self.single_head_dim) ### 
        multihead_final_contextual_embedding_proj=self.w_o(final_contextual_embeddings)
        dropped_multihead_final_contextual_embedding_proj=self.dropout(multihead_final_contextual_embedding_proj)
        return dropped_multihead_final_contextual_embedding_proj

In [75]:
class layerNormalizationBlock(nn.Module):
    def __init__(self,emb_dim,eps=1e-5):
        super().__init__()
        self.scale=nn.Parameter(torch.ones(emb_dim,dtype=torch.float32))
        self.shift=nn.Parameter(torch.zeros(emb_dim,dtype=torch.float32))
        self.eps=eps

    def forward(self,x):
        mean=x.mean(dim=-1,keepdim=True)
        standard_deviation=x.std(dim=-1,keepdim=True,unbiased=False)
        normalized_x=(x-mean)/(standard_deviation+self.eps)
        scale_n_shift=self.scale*normalized_x+self.shift
        return scale_n_shift

In [76]:
class skipConnection(nn.Module):
    def __init__(self,dropout):
        super().__init__()
        self.dropout=nn.Dropout(dropout)

    def forward(self,x,sublayer):
        output=x+sublayer(x)
        dropped_output=self.dropout(output)
        return dropped_output

In [77]:
class feed_forward_block(nn.Module):
    ### Expansion Contraction layer.....
    def __init__(self,emb_dim,expand_dim,dropout):
        super().__init__()
        self.emb_dim=emb_dim
        self.expand_dim=expand_dim
        self.dropout=dropout
        self.network=nn.Sequential(
            nn.Linear(emb_dim,expand_dim,dtype=torch.float32),
            nn.ReLU(),
            nn.Dropout(self.dropout),
            nn.Linear(expand_dim,emb_dim,dtype=torch.float32),
        )
    def forward(self,x):
        output=self.network(x)
        return output

In [78]:
class encoderBlock(nn.Module):
    def __init__(self,emb_dim,n_heads,mha_dropout,expand_dim,ff_dropout,sk_dropout):
        super().__init__()
        self.emb_dim=emb_dim
        self.n_heads=n_heads
        self.ff_dropout=ff_dropout
        self.expand_dim=expand_dim
        self.mha_dropout=mha_dropout
        self.sk_dropout=sk_dropout
        self.mha_block=multiHeadAttentionBlock(self.emb_dim,self.n_heads,self.mha_dropout)
        self.feed_forward_block=feed_forward_block(self.emb_dim,self.expand_dim,self.ff_dropout)
        self.skip_connections=nn.ModuleList([skipConnection(self.sk_dropout) for _ in range(2)])
        self.layerNormalizationBlocks=nn.ModuleList([layerNormalizationBlock(self.emb_dim) for _ in range(2)])
    def forward(self,x,encoder_mask=None):
        output1=self.skip_connections[0](x,lambda x: self.mha_block(x,x,x,encoder_mask))
        layer_normalized_output1=self.layerNormalizationBlocks[0](output1)
        output2=self.skip_connections[1](layer_normalized_output1,self.feed_forward_block)
        layer_normalized_output2=self.layerNormalizationBlocks[1](output2)
        return layer_normalized_output2

In [79]:
class encoder(nn.Module):
    def __init__(self,no_of_enc_blk,emb_dim,n_heads,mha_dropout,expand_dim,ff_dropout,sk_dropout):
        super().__init__()
        self.enc_blks=nn.ModuleList([encoderBlock(emb_dim,n_heads,mha_dropout,expand_dim,ff_dropout,sk_dropout) for _ in range(no_of_enc_blk)])
    def forward(self,x,encoder_mask=None):
        for blk in self.enc_blks:
            x=blk(x,encoder_mask)
        return x

In [80]:
class decoderBlock(nn.Module):
    def __init__(self,emb_dim,n_heads,mha_dropout,expand_dim,ff_dropout,sk_dropout):
        super().__init__()
        self.emb_dim=emb_dim
        self.n_heads=n_heads
        self.ff_dropout=ff_dropout
        self.expand_dim=expand_dim
        self.mha_dropout=mha_dropout
        self.sk_dropout=sk_dropout
        self.mha_block1=multiHeadAttentionBlock(self.emb_dim,self.n_heads,self.mha_dropout) ### casual attention block
        self.mha_block2=multiHeadAttentionBlock(self.emb_dim,self.n_heads,self.mha_dropout) #### cross attention block
        self.feed_forward_block=feed_forward_block(self.emb_dim,self.expand_dim,self.ff_dropout)
        self.skip_connections=nn.ModuleList([skipConnection(self.sk_dropout) for _ in range(3)])
        self.layerNormalizationBlocks=nn.ModuleList([layerNormalizationBlock(self.emb_dim) for _ in range(3)])
    def forward(self,x,enc_out,decoder_mask=None,encoder_mask=None):
        output1=self.skip_connections[0](x,lambda x: self.mha_block1(x,x,x,decoder_mask)) ### causal attention
        layer_normalized_output1=self.layerNormalizationBlocks[0](output1)
        output2=self.skip_connections[1](layer_normalized_output1,lambda x: self.mha_block2(x,enc_out,enc_out,encoder_mask))### Query from decoder, k and v from encoder.
        layer_normalized_output2=self.layerNormalizationBlocks[1](output2)
        output3=self.skip_connections[2](layer_normalized_output2,self.feed_forward_block)
        layer_normalized_output3=self.layerNormalizationBlocks[2](output3)
        return layer_normalized_output3

In [81]:
class decoder(nn.Module):
    def __init__(self,no_of_dec_blk,emb_dim,n_heads,mha_dropout,expand_dim,ff_dropout,sk_dropout):
        super().__init__()
        self.dec_blks=nn.ModuleList([decoderBlock(emb_dim,n_heads,mha_dropout,expand_dim,ff_dropout,sk_dropout) for _ in range(no_of_dec_blk)])
    def forward(self,x,encoder_output,decoder_mask=None,encoder_mask=None):
        for blk in self.dec_blks:
            x=blk(x,encoder_output,decoder_mask,encoder_mask)
        return x

In [82]:
class finalProjectionLayer(nn.Module):
    def __init__(self,emb_dim,vocab_size):
        super().__init__()
        self.linear=nn.Linear(emb_dim,vocab_size,dtype=torch.float32)
    
    def forward(self,x):
        output=self.linear(x)
        return output

In [83]:
class transformers(nn.Module):
    def __init__(self,model_config,tokenizer_config):
        super().__init__()
        self.encoder_emb_layer=inputEmbeddingLayer(tokenizer_config['en_vocab_size'],model_config['enc_cfg']['emb_dim'])
        self.enc_positional_emb_layer=positionalEncodingLayer(model_config['enc_max_seq_len'],model_config['enc_cfg']['emb_dim'],model_config['enc_cfg']['pos_emb_dropout'])
        self.encoder=encoder(
            no_of_enc_blk=model_config['enc_cfg']['no_of_enc_blk'],
            emb_dim=model_config['enc_cfg']['emb_dim'],
            n_heads=model_config['enc_cfg']['n_heads'],
            mha_dropout=model_config['enc_cfg']['mha_dropout'],
            expand_dim=model_config['enc_cfg']['expand_dim'],
            ff_dropout=model_config['enc_cfg']['ff_dropout'],
            sk_dropout=model_config['enc_cfg']['sk_dropout']
        )
        
        self.decoder_emb_layer=inputEmbeddingLayer(tokenizer_config['fr_vocab_size'],model_config['dec_cfg']['emb_dim'])
        self.dec_positional_emb_layer=positionalEncodingLayer(model_config['dec_max_seq_len'],model_config['dec_cfg']['emb_dim'],model_config['dec_cfg']['pos_emb_dropout'])
        self.decoder=decoder(
            no_of_dec_blk=model_config['dec_cfg']['no_of_dec_blk'],
            emb_dim=model_config['dec_cfg']['emb_dim'],
            n_heads=model_config['dec_cfg']['n_heads'],
            mha_dropout=model_config['dec_cfg']['mha_dropout'],
            expand_dim=model_config['dec_cfg']['expand_dim'],
            ff_dropout=model_config['dec_cfg']['ff_dropout'],
            sk_dropout=model_config['dec_cfg']['sk_dropout']
        )
        self.decoder_final_projection=finalProjectionLayer(model_config['dec_cfg']['emb_dim'],tokenizer_config['fr_vocab_size'])

    def encode(self,encoder_input,encoder_mask=None):
        encoder_input_embedding=self.encoder_emb_layer(encoder_input)
        positional_encoded_input_embedding=self.enc_positional_emb_layer(encoder_input_embedding)
        encoder_contexual_embedding=self.encoder(positional_encoded_input_embedding,encoder_mask)
        return encoder_contexual_embedding

    def decode(self,decoder_input,encoder_output,decoder_mask,encoder_mask):
        decoder_input_embedding=self.decoder_emb_layer(decoder_input)
        positional_encoded_input_embedding=self.dec_positional_emb_layer(decoder_input_embedding) ### of decoder
        decoder_contexual_embedding=self.decoder(positional_encoded_input_embedding,encoder_output,decoder_mask,encoder_mask)
        final_output=self.decoder_final_projection(decoder_contexual_embedding)
        return final_output
    
    ###forward will be used during training.
    def forward(self,decoder_input,encoder_input,decoder_mask,encoder_mask):
        encoder_output=self.encode(encoder_input,encoder_mask)
        decoder_output=self.decode(decoder_input,encoder_output,decoder_mask,encoder_mask)
        return decoder_output

### creating configs for building model

In [84]:
# Model configuration dictionary
model_config = {
    "enc_max_seq_len": get_max_seq_len(train_data,validation_data,test_data)+2,   # Max source sequence length  Extra +2 for start and end token
    "dec_max_seq_len": get_max_seq_len(train_data,validation_data,test_data)+2,   # Max target sequence length
    "enc_cfg": {
        "emb_dim": 256,
        "no_of_enc_blk": 3,
        "n_heads": 4,
        "pos_emb_dropout":0.1,
        "mha_dropout": 0.1,
        "expand_dim": 1024,
        "ff_dropout": 0.1,
        "sk_dropout": 0.1
    },
    "dec_cfg": {
        "emb_dim": 256, ### Embedding dimention
        "no_of_dec_blk": 3, ## No of decoder block
        "pos_emb_dropout":0.1, # dropout rate after applying  positional embedding.
        "n_heads": 4, ### no of head
        "mha_dropout": 0.1, ### multihead attentions dropout in attention scores
        "expand_dim": 1024, ### fee forward intermediate expand dim
        "ff_dropout": 0.1, ### feed forward layer dropout
        "sk_dropout": 0.1 ##skip connections dropout
    }
}

# Tokenizer configuration dictionary
tokenizer_config = {
    # "vocab_size": 30000,  # Vocabulary size of source & target tokenizer
    'en_vocab_size':tokenizer_en.get_vocab_size(),
    'fr_vocab_size':tokenizer_fr.get_vocab_size()
}

### building model and weight initialization

In [85]:
Transformer=transformers(model_config,tokenizer_config)

In [86]:
for p in Transformer.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

In [87]:
# training_config={
#     'epochs':3,
#     'optimizer_lr':1e-4,
#     'optimizer_eps':1e-8,
#     'label_smoothing':0.1,
#     'batch_size':2,
# }

### Model details

In [88]:
import torch

def model_summary(model):
    print("="*80)
    print(f"{'Model Summary':^80}")
    print("="*80)

    total_params = 0
    trainable_params = 0

    print(f"{'Layer Name':40s} {'Param #':>12s} {'Trainable':>12s} {'Shape':>20s}")
    print("-"*80)

    for name, param in model.named_parameters():
        num_params = param.numel()
        total_params += num_params
        if param.requires_grad:
            trainable_params += num_params
        shape = tuple(param.shape)
        print(f"{name:40s} {num_params:12,d} {str(param.requires_grad):>12s} {str(shape):>20s}")

    print("-"*80)
    print(f"Total Parameters: {total_params:,}")
    print(f"Trainable Parameters: {trainable_params:,}")
    print(f"Non-trainable Parameters: {total_params - trainable_params:,}")

    total_size_bytes = sum(p.numel() * p.element_size() for p in model.parameters())
    print(f"Model Size (approx): {total_size_bytes / (1024**2):.2f} MB")
    print("="*80)

# Example usage
# model = Transformer(...)
model_summary(Transformer)

                                 Model Summary                                  
Layer Name                                    Param #    Trainable                Shape
--------------------------------------------------------------------------------
encoder_emb_layer.embedding_layer.weight    7,680,000         True         (30000, 256)
encoder.enc_blks.0.mha_block.w_q.weight        65,536         True           (256, 256)
encoder.enc_blks.0.mha_block.w_q.bias             256         True               (256,)
encoder.enc_blks.0.mha_block.w_k.weight        65,536         True           (256, 256)
encoder.enc_blks.0.mha_block.w_k.bias             256         True               (256,)
encoder.enc_blks.0.mha_block.w_v.weight        65,536         True           (256, 256)
encoder.enc_blks.0.mha_block.w_v.bias             256         True               (256,)
encoder.enc_blks.0.mha_block.w_o.weight        65,536         True           (256, 256)
encoder.enc_blks.0.mha_block.w_o.bias         

### Training Pipeline

In [89]:
@torch.no_grad()
def get_prediction_accuracy(predictions,targets):
    prediction_tokens=predictions.argmax(dim=-1)
    mask=targets!=tokenizer_fr.token_to_id('[PAD]')
    correct=(prediction_tokens==targets) & mask
    acc=correct.sum().float()/mask.sum().float()
    return acc

In [94]:
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
loss_fn=nn.CrossEntropyLoss(ignore_index=tokenizer_fr.token_to_id('[PAD]'),label_smoothing=0.1)
optimizer=optim.Adam(Transformer.parameters(),lr=1e-5,eps=1e-8)
epochs=3
global_step=0
for epoch in range(epochs):

    ### Train cycle for each epcoh, loss calculation. Train loss.
    Transformer.train()
    train_loss=0.0
    train_acc=0.0
    train_loader_tqdm=tqdm(train_loader,desc="Training Epoch",leave=False)
    for i,batch in enumerate(train_loader_tqdm):
        encoder_input=batch['encoder_input']
        decoder_input=batch['decoder_input']
        target_output=batch['target_output']
        encoder_mask=batch['encoder_mask']
        decoder_mask=batch['decoder_mask']
        decoder_output=Transformer(decoder_input,encoder_input,decoder_mask,encoder_mask)
        
        acc=get_prediction_accuracy(decoder_output,target_output)
        train_acc+=acc
        loss=loss_fn(decoder_output.reshape(-1,decoder_output.shape[-1]),target_output.reshape(-1))
        train_loss+=loss.item()
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        train_loader_tqdm.set_postfix({f"{i+1}th Train batch loss: ":loss.item(),f"{i+1}th Train batch acc: ":acc.item()})
        
        ###adding below step to quick check entire loop.
        # global_step+=1
        # if global_step>=1:
        #     break    

    train_loss/=len(train_loader)
    train_acc/=len(train_loader)
    ### for val loss
    validation_loss=0.0
    validation_acc=0.0
    with torch.no_grad():
        Transformer.eval()
        validation_loader_tqdm=tqdm(val_loader,desc="Validation Epoch",leave=False)
        for i,batch in enumerate(validation_loader_tqdm):
            encoder_input=batch['encoder_input']
            decoder_input=batch['decoder_input']
            target_output=batch['target_output']
            encoder_mask=batch['encoder_mask']
            decoder_mask=batch['decoder_mask']
            decoder_output=Transformer(decoder_input,encoder_input,decoder_mask,encoder_mask)
             
            acc=get_prediction_accuracy(decoder_output,target_output)
            validation_acc+=acc

            loss=loss_fn(decoder_output.reshape(-1,decoder_output.shape[-1]),target_output.reshape(-1))
            validation_loss+=loss.item()
            validation_loader_tqdm.set_postfix({f"{i+1}th validation batch loss: ":loss.item(),f"{i+1}th validation batch acc: ":acc.item()})
            
            # if global_step>=1:
            #     break
    validation_loss/=len(val_loader)
    validation_acc/=len(val_loader)
    print(f"Epoch: {epoch+1} | Train loss: {train_loss} | Train acc: {train_acc} | Validation loss: {validation_loss} | Validation acc: {validation_acc}")

Training Epoch:   0%|          | 0/40667 [00:00<?, ?it/s]

                                                                                                                                       

KeyboardInterrupt: 

In [95]:
def encoder_preprocess(text):
    encoded_english_sentence=tokenizer_en.encode(text).ids
    # print(tokenizer_en.decode(encoded_english_sentence))
    processed_text=torch.cat(
        [
            torch.tensor(encoded_english_sentence,dtype=torch.int64),
            torch.tensor([tokenizer_en.token_to_id('[EOS]')],dtype=torch.int64),
        ]
    )
    return processed_text

### inference pipeline for 1 sequence

In [119]:
### For single input
with torch.inference_mode():
    max_seq_len=432
    english_sentence="I am who I am"
    processed_encoded_english_sentence=encoder_preprocess(english_sentence).unsqueeze(0)
    # print(processed_encoded_english_sentence.shape) ### batch, seq
    encoder_mask=(processed_encoded_english_sentence!=tokenizer_en.token_to_id('[PAD]')).unsqueeze(0).unsqueeze(0).int()
    # print(encoder_mask.shape) #batch,head,seq,dim
    encoder_output=Transformer.encode(processed_encoded_english_sentence,encoder_mask)
    # # print(encoder_output.shape)
    decoder_input=torch.tensor([tokenizer_fr.encode("[SOS]").ids])  #seq
    # print(decoder_input.shape)
    i=0
    while True:

        if decoder_input.shape[1]>=max_seq_len:
            break

        decoder_mask= ((decoder_input!=torch.tensor(tokenizer_fr.token_to_id("[PAD]"))).unsqueeze(0).unsqueeze(0)) & casual_mask_generator(decoder_input.shape[1])

        # print(decoder_input.shape,decoder_mask.shape)
        decoder_output=Transformer.decode(decoder_input,encoder_output,decoder_mask,encoder_mask)
        ##batch,seq,vocab
        # print(decoder_output.shape)
        predicted_word=torch.argmax(decoder_output[:,-1,:],dim=-1)

        decoder_input=torch.cat(
            [
                decoder_input.squeeze(0),
                torch.tensor([predicted_word])
            ]
        )

        # print(decoder_input)
        decoder_input=decoder_input.unsqueeze(0)

        if predicted_word==tokenizer_fr.token_to_id("[EOS]") or predicted_word==tokenizer_fr.token_to_id("[PAD]"):
            break

        print(i)
        i+=1
        if i==10:
            break

    predicted_sentence=tokenizer_fr.decode(decoder_input.squeeze(0).tolist(),skip_special_tokens=False)
    print(decoder_input)
    print(predicted_sentence)

0
1
2
3
4
5
6
7
8
9
tensor([[2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]])
[SOS] , , , , , , , , , ,
