<h1>Data Ingestion</h1>

In [33]:
from datasets import load_dataset
### Using the Opus Books Dataset from Huggingface
def data_ingestion():
    """Load the en-fr Opus Books corpus and cut it into three splits.

    The dataset's ``train`` split is divided 80/20 into a working part and
    a test part; the working part is divided 80/20 again into train and
    validation.  Both cuts use ``seed=42`` so the splits are repeatable.

    Returns:
        tuple: ``(train_data, validation_data, test_data)``.
    """
    corpus = load_dataset(path="Helsinki-NLP/opus_books", name="en-fr")["train"]

    # First cut: hold out 20% of all pairs as the test set.
    outer = corpus.train_test_split(test_size=0.2, seed=42)
    # Second cut: hold out 20% of what is left as the validation set.
    inner = outer["train"].train_test_split(test_size=0.2, seed=42)

    return inner["train"], inner["test"], outer["test"]

In [34]:
# Materialise the three splits once; every later cell reuses these names.
(train_data, validation_data, test_data) = data_ingestion()

<b>Creating a data generator function; it will be needed when building the tokenizer.</b>

In [35]:
def get_all_sentences(ds,lang):
    """Lazily yield the ``lang`` side of every translation pair in ``ds``.

    Args:
        ds: Iterable of records shaped like ``{'translation': {lang: text}}``.
        lang: Key to read inside each record's ``'translation'`` mapping.

    Yields:
        The value stored at ``record['translation'][lang]``, in ``ds`` order.
    """
    yield from (record['translation'][lang] for record in ds)

<h5>Building Tokenizer</h5>

In [36]:
from pathlib import Path
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace
def build_tokenizer(config,ds,lang):
    """Return a word-level tokenizer for ``lang``, training it only if needed.

    The file path is ``config['tokenizer_file']`` formatted with ``lang``.
    If that file already exists it is loaded as-is and ``ds`` is not read;
    otherwise a new tokenizer is trained on ``ds`` and saved to that path.

    Args:
        config: Mapping holding the ``'tokenizer_file'`` path template.
        ds: Iterable of translation records used for training.
        lang: Language key, used for the path and to select the sentences.

    Returns:
        The loaded or freshly trained ``Tokenizer``.
    """
    tokenizer_path = Path(config['tokenizer_file'].format(lang))

    # Fast path: a previously saved tokenizer is reused without retraining.
    if tokenizer_path.exists():
        return Tokenizer.from_file(str(tokenizer_path))

    # Word-level model: unknown words map to [UNK], text is split on whitespace.
    tokenizer = Tokenizer(WordLevel(unk_token='[UNK]'))
    tokenizer.pre_tokenizer = Whitespace()
    trainer = WordLevelTrainer(
        special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"],
        min_frequency=1,
    )
    tokenizer.train_from_iterator(get_all_sentences(ds, lang), trainer=trainer)
    tokenizer.save(str(tokenizer_path))
    return tokenizer

In [37]:
## The tokenizers are built from the training split only.
## (If a saved tokenizer file already exists, build_tokenizer loads it instead of retraining.)
## Two separate tokenizers are used: one for English and one for French.
tokenizer_en = build_tokenizer(config={'tokenizer_file': 'tokenizer_en.json'}, ds=train_data, lang='en')
tokenizer_fr = build_tokenizer(config={'tokenizer_file': 'tokenizer_fr.json'}, ds=train_data, lang='fr')

### Creating a utility function to compute the maximum sequence length

In [38]:
# Utility: length (in tokens) of the longest sentence found in any split.
def get_max_seq_len(train_data,test_data,validation_data):
    """Return the largest token count over every sentence of every split.

    Each record's English text is encoded with the module-level
    ``tokenizer_en`` and its French text with ``tokenizer_fr``; the result
    is the maximum number of ids seen on either side.  Special tokens added
    later by the dataset class are not counted here.

    Args:
        train_data, test_data, validation_data: Iterables of records shaped
            like ``{'translation': {'en': str, 'fr': str}}``.

    Returns:
        int: The maximum encoded length, or 0 if all splits are empty.
    """
    longest = 0
    for split in (train_data, test_data, validation_data):
        for record in split:
            pair = record['translation']
            en_len = len(tokenizer_en.encode(pair['en']).ids)
            fr_len = len(tokenizer_fr.encode(pair['fr']).ids)
            longest = max(longest, en_len, fr_len)
    return longest

### Causal Mask Generator Utility Function

In [39]:
import torch
def casual_mask_generator(size):
    """Build a ``(1, size, size)`` boolean causal (look-back-only) mask.

    Entry ``[0, i, j]`` is ``True`` when ``j <= i`` (position ``i`` may look
    at position ``j``) and ``False`` for every position after ``i``.
    """
    # Ones strictly above the diagonal mark the "future" positions ...
    future_positions = torch.ones(1, size, size).triu(diagonal=1).to(torch.int)
    # ... and the mask is True everywhere that is NOT the future.
    return future_positions.eq(0)

### Creating the Dataset Class

In [40]:
import torch
from torch.utils.data import Dataset
class opusDataset_En_to_Fr(Dataset):
    """Map-style dataset turning raw en/fr pairs into fixed-length tensors.

    Every item is padded to ``max_seq_len + 2`` positions and laid out as:

    * ``encoder_input``: ``[SOS] en-tokens [EOS] [PAD]...``
    * ``decoder_input``: ``[SOS] fr-tokens [PAD]...``
    * ``target_output``: ``fr-tokens [EOS] [PAD]...``

    ``target_output`` is ``decoder_input`` shifted left by one position, so
    position ``i`` of the target is the token that follows position ``i`` of
    the decoder input.

    Args:
        data: Indexable collection of ``{'translation': {'en': str, 'fr': str}}``.
        max_seq_len: Maximum number of ordinary tokens per sentence
            (special tokens excluded).
        tokenizer_en / tokenizer_fr: Objects exposing ``token_to_id(str)`` and
            ``encode(str).ids``.
    """
    def __init__(self,data,max_seq_len,tokenizer_en,tokenizer_fr):
        super().__init__()
        self.raw_data=data
        self.tokenizer_en=tokenizer_en
        self.tokenizer_fr=tokenizer_fr

        ### The goal is a max length that fits every sequence in the data.
        self.max_seq_len=max_seq_len
        # NOTE(review): the special-token ids are looked up in the English
        # tokenizer only and reused for French; this assumes both tokenizers
        # assign the same ids to [SOS]/[EOS]/[PAD] -- confirm.
        self.sos_token=torch.tensor([self.tokenizer_en.token_to_id("[SOS]")],dtype=torch.int64)
        self.eos_token=torch.tensor([self.tokenizer_en.token_to_id("[EOS]")],dtype=torch.int64)
        self.pad_token=torch.tensor([self.tokenizer_en.token_to_id("[PAD]")],dtype=torch.int64)

    def __len__(self):
        return len(self.raw_data)

    def _padding(self,count):
        """Return ``count`` PAD ids as an int64 tensor (stays int64 when count is 0)."""
        return torch.full((count,),self.pad_token.item(),dtype=torch.int64)

    def __getitem__(self, index):
        """Build the tensors for the pair at ``index``.

        Returns:
            dict with ``encoder_input``, ``decoder_input``, ``target_output``
            (each of length ``max_seq_len + 2``), ``encoder_mask`` of shape
            ``(1, 1, L)``, ``decodar_mask`` of shape ``(1, L, L)`` and the
            two raw sentences.

        Raises:
            ValueError: If either sentence has more tokens than fit in the
                fixed length.
        """
        pair=self.raw_data[index]['translation']  # fetch the row once
        data_en=pair['en']
        data_fr=pair['fr']
        encoded_data_en=torch.tensor(self.tokenizer_en.encode(data_en).ids,dtype=torch.int64)
        encoded_data_fr=torch.tensor(self.tokenizer_fr.encode(data_fr).ids,dtype=torch.int64)
        expected_seq_len=self.max_seq_len+2 ### +2 for sos and eos token

        # Encoder side carries SOS and EOS; decoder input carries only SOS and
        # the target carries only EOS, hence the different pad counts.
        encoder_pad_count=expected_seq_len-len(encoded_data_en)-2
        decoder_pad_count=expected_seq_len-len(encoded_data_fr)-1
        if encoder_pad_count<0 or decoder_pad_count<0:
            raise ValueError(
                f"Sentence pair at index {index} is longer than max_seq_len={self.max_seq_len}"
            )

        final_encoded_en=torch.cat([
            self.sos_token,
            encoded_data_en,
            self.eos_token,
            self._padding(encoder_pad_count),
        ])
        final_encoded_fr=torch.cat([
            self.sos_token,
            encoded_data_fr,
            self._padding(decoder_pad_count),
        ])
        # Next-token labels: no leading SOS, so the target is the decoder
        # input shifted by one position.
        target_encoded_fr=torch.cat([
            encoded_data_fr,
            self.eos_token,
            self._padding(decoder_pad_count),
        ])

        # The decoder mask is built from what the decoder actually sees
        # (its input), combined with the causal look-back restriction.
        decoder_padding_mask=(final_encoded_fr!=self.pad_token).unsqueeze(0).unsqueeze(0).int()

        # The misspelled keys 'decodar_mask' and 'src_sentance' are kept
        # unchanged because code outside this class reads them by these names.
        return {
            'encoder_input':final_encoded_en,
            'decoder_input':final_encoded_fr,
            'target_output':target_encoded_fr,
            'encoder_mask': (final_encoded_en!=self.pad_token).unsqueeze(0).unsqueeze(0).int(),
            'decodar_mask': decoder_padding_mask & casual_mask_generator(len(final_encoded_fr)),
            'src_sentance':data_en,
            'tgt_sentence':data_fr,
        }

In [41]:
# get_max_seq_len is declared as (train_data, test_data, validation_data); the splits
# are passed by keyword so each one lands on the parameter it is named after.
max_seq_len = get_max_seq_len(
    train_data=train_data,
    test_data=test_data,
    validation_data=validation_data,
)

# All three datasets share the same max length and the same pair of tokenizers.
train_dataset = opusDataset_En_to_Fr(train_data, max_seq_len, tokenizer_en, tokenizer_fr)
test_dataset = opusDataset_En_to_Fr(test_data, max_seq_len, tokenizer_en, tokenizer_fr)
validation_dataset = opusDataset_En_to_Fr(validation_data, max_seq_len, tokenizer_en, tokenizer_fr)

In [42]:
from torch.utils.data import DataLoader
# One loader per split, two examples per batch; only the training loader shuffles.
train_loader = DataLoader(dataset=train_dataset, batch_size=2, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=2, shuffle=False)
val_loader = DataLoader(dataset=validation_dataset, batch_size=2, shuffle=False)

### Coding the Transformer architecture

In [43]:
import torch
import torch.nn as nn
import math

In [44]:
class inputEmbeddingLayer(nn.Module):
    """Token-id to vector lookup whose output is scaled by ``sqrt(emb_dim)``.

    The embedding table has ``vocab_size`` rows of ``emb_dim`` float16 values.
    """
    def __init__(self,vocab_size,emb_dim):
        super().__init__()
        self.vocab_size, self.emb_dim = vocab_size, emb_dim
        self.embedding_layer = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=emb_dim,
            dtype=torch.float16,
        )

    def forward(self,x):
        """Look up the ids in ``x`` and multiply the vectors by ``sqrt(emb_dim)``."""
        scale = math.sqrt(self.emb_dim)
        return self.embedding_layer(x) * scale

In [45]:
class positionalEncodingLayer(nn.Module):
    """Adds fixed sinusoidal position information to embeddings, then dropout.

    The table follows ``PE(pos, 2i) = sin(pos / 10000^(2i/emb_dim))`` and
    ``PE(pos, 2i+1) = cos(pos / 10000^(2i/emb_dim))``.  It is computed in
    float32 for accuracy and stored as a float16 buffer named
    ``static_positional_info``.

    Args:
        max_seq_len: Number of positions the table covers.
        emb_dim: Width of each embedding vector.
        dropout: Dropout probability applied after the addition.
    """
    def __init__(self,max_seq_len,emb_dim,dropout):
        super().__init__()
        self.max_seq_len=max_seq_len
        self.emb_dim=emb_dim
        self.dropout=nn.Dropout(dropout)

        positions=torch.arange(0,max_seq_len,dtype=torch.float32).reshape(max_seq_len,-1)
        # These values already ARE 2i (0, 2, 4, ...), so they go into the
        # exponent as-is -- multiplying by 2 again would give 10000^(-4i/d).
        indices_for_denominator=torch.arange(0,emb_dim,2,dtype=torch.float32) ### 2i
        denominators=torch.exp((-indices_for_denominator*math.log(1e4))/emb_dim)
        angles=positions*denominators

        static_positional_info=torch.zeros((self.max_seq_len,self.emb_dim),dtype=torch.float32)
        static_positional_info[:,0::2]=torch.sin(angles)
        # For an odd emb_dim there is one fewer odd column than even columns.
        static_positional_info[:,1::2]=torch.cos(angles)[:,:emb_dim//2]
        # Stored in float16, the dtype the rest of the model's layers use.
        self.register_buffer('static_positional_info',static_positional_info.to(torch.float16))

    def forward(self,x):
        """Add the first ``x.shape[1]`` rows of the table to ``x`` and apply dropout."""
        position_encoded_embedding=x+self.static_positional_info[:x.shape[1],:]
        return self.dropout(position_encoded_embedding)

In [46]:
class multiHeadAttentionBlock(nn.Module):
    """Multi-head scaled dot-product attention with an output projection.

    Args:
        emb_dim: Model width; must be divisible by ``n_heads``.
        n_heads: Number of attention heads.
        dropout: Dropout probability, applied to the attention weights and
            again to the projected output.
    """
    def __init__(self,emb_dim,n_heads,dropout):
        super().__init__()
        assert emb_dim%n_heads==0 ### checking if multi head splitting is possible.
        self.w_q=nn.Linear(emb_dim,emb_dim,dtype=torch.float16)
        self.w_k=nn.Linear(emb_dim,emb_dim,dtype=torch.float16)
        self.w_v=nn.Linear(emb_dim,emb_dim,dtype=torch.float16)
        self.w_o=nn.Linear(emb_dim,emb_dim,dtype=torch.float16) ### multi-head-projection-layer
        self.emb_dim=emb_dim
        self.dropout=nn.Dropout(dropout)
        self.single_head_dim=self.emb_dim//n_heads
        self.n_heads=n_heads

    @staticmethod
    def contextual_embedding(m_q,m_k,m_v,per_head_emb_dim,mask=None,dropout=None):
        """Scaled dot-product attention over already-split heads.

        Args:
            m_q, m_k, m_v: Tensors of shape (batch, head, seq, dim).
            per_head_emb_dim: Per-head width used for the 1/sqrt(d) scaling.
            mask: Optional tensor broadcastable to (batch, head, seq_q, seq_k);
                positions where it equals 0 are excluded from attention.
            dropout: Optional callable applied to the attention weights.

        Returns:
            tuple: ``(attention_weights, contextual_embeddings)``.
        """
        ##batch,head,seq,dim @ batch,head,dim,seq==batch,head,seq,seq
        attention_scores=m_q@m_k.transpose(2,3)/math.sqrt(per_head_emb_dim)
        if mask is not None:
            attention_scores=attention_scores.masked_fill(mask==0,float('-inf'))
        attention_weights=torch.softmax(attention_scores,dim=-1)

        # Dropout is optional: without it the plain softmax weights are used.
        if dropout is not None:
            attention_weights=dropout(attention_weights)
        ### batch,head,seq,seq @ batch,head,seq,dim=batch,head,seq,dim
        contexual_embeddings=attention_weights@m_v
        return attention_weights,contexual_embeddings

    def _split_heads(self,projected):
        """Reshape (batch, seq, emb_dim) into (batch, head, seq, single_head_dim)."""
        return projected.view(projected.shape[0],projected.shape[1],self.n_heads,self.single_head_dim).transpose(1,2)

    def forward(self,q,k,v,mask=None):
        """Attend from ``q`` over ``k``/``v``.

        ``q`` may have a different sequence length than ``k``/``v`` (as in
        cross-attention); the output has the batch size and length of ``q``.
        """
        query=self.w_q(q) ### batch, sequence, dim
        key=self.w_k(k)
        value=self.w_v(v)

        _,contextual_embeddings=multiHeadAttentionBlock.contextual_embedding(
            self._split_heads(query),
            self._split_heads(key),
            self._split_heads(value),
            self.single_head_dim,
            mask,
            self.dropout,
        )
        # Merge the heads back; the result has one row per QUERY position,
        # so the reshape uses the query's shape, not the value's.
        final_contextual_embeddings=contextual_embeddings.transpose(1,2).contiguous().view(
            query.shape[0],query.shape[1],self.n_heads*self.single_head_dim
        )
        multihead_final_contextual_embedding_proj=self.w_o(final_contextual_embeddings)
        return self.dropout(multihead_final_contextual_embedding_proj)

In [47]:
class layerNormalizationBlock(nn.Module):
    """Normalises the last dimension, then applies a learned scale and shift.

    ``scale`` starts at ones and ``shift`` at zeros (both float16, length
    ``emb_dim``).  ``eps`` is added to the standard deviation -- not to the
    variance -- before dividing.
    """
    def __init__(self,emb_dim,eps=1e-5):
        super().__init__()
        self.scale = nn.Parameter(torch.ones(emb_dim, dtype=torch.float16))
        self.shift = nn.Parameter(torch.zeros(emb_dim, dtype=torch.float16))
        self.eps = eps

    def forward(self,x):
        """Return ``scale * (x - mean) / (std + eps) + shift`` over the last axis."""
        centered = x - x.mean(dim=-1, keepdim=True)
        # Population (biased) standard deviation of the last axis.
        spread = x.std(dim=-1, keepdim=True, unbiased=False)
        return self.scale * (centered / (spread + self.eps)) + self.shift

In [48]:
class skipConnection(nn.Module):
    """Residual wrapper: ``x + dropout(sublayer(x))``.

    Dropout is applied to the sublayer output only, before it is added to
    the input, so the residual (identity) path is never zeroed out.

    Args:
        dropout: Dropout probability for the sublayer output.
    """
    def __init__(self,dropout):
        super().__init__()
        self.dropout=nn.Dropout(dropout)

    def forward(self,x,sublayer):
        """Apply ``sublayer`` (a callable taking ``x``) and add the result to ``x``."""
        return x+self.dropout(sublayer(x))

In [49]:
class feed_forward_block(nn.Module):
    """Position-wise expand/contract network.

    Layout of ``self.network``: Linear(emb_dim -> expand_dim), ReLU, Dropout,
    Linear(expand_dim -> emb_dim).  Both linear layers are float16.
    """
    def __init__(self,emb_dim,expand_dim,dropout):
        super().__init__()
        self.emb_dim, self.expand_dim, self.dropout = emb_dim, expand_dim, dropout

        expand = nn.Linear(emb_dim, expand_dim, dtype=torch.float16)
        contract = nn.Linear(expand_dim, emb_dim, dtype=torch.float16)
        self.network = nn.Sequential(expand, nn.ReLU(), nn.Dropout(dropout), contract)

    def forward(self,x):
        """Run ``x`` through the expand -> ReLU -> dropout -> contract stack."""
        return self.network(x)

In [50]:
class encoderBlock(nn.Module):
    """One encoder layer: self-attention, then feed-forward.

    Each of the two sublayers is wrapped in its own skip connection and
    followed by its own layer normalisation.
    """
    def __init__(self,emb_dim,n_heads,mha_dropout,expand_dim,ff_dropout,sk_dropout):
        super().__init__()
        # Hyper-parameters are kept on the instance for inspection.
        self.emb_dim, self.n_heads, self.expand_dim = emb_dim, n_heads, expand_dim
        self.mha_dropout, self.ff_dropout, self.sk_dropout = mha_dropout, ff_dropout, sk_dropout

        self.mha_block = multiHeadAttentionBlock(emb_dim, n_heads, mha_dropout)
        self.feed_forward_block = feed_forward_block(emb_dim, expand_dim, ff_dropout)
        # Index 0 belongs to the attention sublayer, index 1 to the feed-forward one.
        self.skip_connections = nn.ModuleList([skipConnection(sk_dropout) for _ in range(2)])
        self.layerNormalizationBlocks = nn.ModuleList([layerNormalizationBlock(emb_dim) for _ in range(2)])

    def forward(self,x,encoder_mask=None):
        """Return the block output for ``x``; ``encoder_mask`` is passed to the attention."""
        attention_skip, ffn_skip = self.skip_connections
        attention_norm, ffn_norm = self.layerNormalizationBlocks

        def self_attention(hidden):
            # Query, key and value all come from the same tensor.
            return self.mha_block(hidden, hidden, hidden, encoder_mask)

        attended = attention_norm(attention_skip(x, self_attention))
        return ffn_norm(ffn_skip(attended, self.feed_forward_block))

In [51]:
class encoder(nn.Module):
    """A stack of ``no_of_enc_blk`` identically configured ``encoderBlock`` layers."""
    def __init__(self,no_of_enc_blk,emb_dim,n_heads,mha_dropout,expand_dim,ff_dropout,sk_dropout):
        super().__init__()
        layers = []
        for _ in range(no_of_enc_blk):
            layers.append(encoderBlock(emb_dim, n_heads, mha_dropout, expand_dim, ff_dropout, sk_dropout))
        self.enc_blks = nn.ModuleList(layers)

    def forward(self,x,encoder_mask=None):
        """Feed ``x`` through every block in order, passing the same mask to each."""
        hidden = x
        for layer in self.enc_blks:
            hidden = layer(hidden, encoder_mask)
        return hidden

In [52]:
class decoderBlock(nn.Module):
    """One decoder layer: masked self-attention, cross-attention, feed-forward.

    Each of the three sublayers is wrapped in its own skip connection and
    followed by its own layer normalisation.
    """
    def __init__(self,emb_dim,n_heads,mha_dropout,expand_dim,ff_dropout,sk_dropout):
        super().__init__()
        # Hyper-parameters are kept on the instance for inspection.
        self.emb_dim, self.n_heads, self.expand_dim = emb_dim, n_heads, expand_dim
        self.mha_dropout, self.ff_dropout, self.sk_dropout = mha_dropout, ff_dropout, sk_dropout

        self.mha_block1 = multiHeadAttentionBlock(emb_dim, n_heads, mha_dropout)  # causal self-attention
        self.mha_block2 = multiHeadAttentionBlock(emb_dim, n_heads, mha_dropout)  # cross-attention
        self.feed_forward_block = feed_forward_block(emb_dim, expand_dim, ff_dropout)
        # Index 0: self-attention, index 1: cross-attention, index 2: feed-forward.
        self.skip_connections = nn.ModuleList([skipConnection(sk_dropout) for _ in range(3)])
        self.layerNormalizationBlocks = nn.ModuleList([layerNormalizationBlock(emb_dim) for _ in range(3)])

    def forward(self,x,enc_out,decoder_mask=None,encoder_mask=None):
        """Return the block output for decoder state ``x`` and encoder output ``enc_out``."""
        self_skip, cross_skip, ffn_skip = self.skip_connections
        self_norm, cross_norm, ffn_norm = self.layerNormalizationBlocks

        def causal_self_attention(hidden):
            return self.mha_block1(hidden, hidden, hidden, decoder_mask)

        def cross_attention(hidden):
            # Query comes from the decoder; key and value come from the encoder.
            return self.mha_block2(hidden, enc_out, enc_out, encoder_mask)

        hidden = self_norm(self_skip(x, causal_self_attention))
        hidden = cross_norm(cross_skip(hidden, cross_attention))
        return ffn_norm(ffn_skip(hidden, self.feed_forward_block))

In [53]:
class decoder(nn.Module):
    """A stack of ``no_of_dec_blk`` identically configured ``decoderBlock`` layers."""
    def __init__(self,no_of_dec_blk,emb_dim,n_heads,mha_dropout,expand_dim,ff_dropout,sk_dropout):
        super().__init__()
        layers = []
        for _ in range(no_of_dec_blk):
            layers.append(decoderBlock(emb_dim, n_heads, mha_dropout, expand_dim, ff_dropout, sk_dropout))
        self.dec_blks = nn.ModuleList(layers)

    def forward(self,x,encoder_output,decoder_mask=None,encoder_mask=None):
        """Feed ``x`` through every block; each block sees the same encoder output and masks."""
        hidden = x
        for layer in self.dec_blks:
            hidden = layer(hidden, encoder_output, decoder_mask, encoder_mask)
        return hidden

In [54]:
class finalProjectionLayer(nn.Module):
    """Float16 linear map from ``emb_dim`` features to ``vocab_size`` scores.

    No softmax is applied here; the raw linear output is returned.
    """
    def __init__(self,emb_dim,vocab_size):
        super().__init__()
        self.linear = nn.Linear(in_features=emb_dim, out_features=vocab_size, dtype=torch.float16)

    def forward(self,x):
        """Project the last dimension of ``x`` from ``emb_dim`` to ``vocab_size``."""
        return self.linear(x)

In [55]:
class transformers(nn.Module):
    """Encoder-decoder Transformer assembled from the blocks defined above.

    Args:
        model_config: Mapping with ``'enc_max_seq_len'``, ``'dec_max_seq_len'``
            and two sub-mappings ``'enc_cfg'`` / ``'dec_cfg'`` holding
            ``emb_dim``, ``n_heads``, the block count and the dropout rates.
        tokenizer_config: Mapping with ``'en_vocab_size'`` (encoder side) and
            ``'fr_vocab_size'`` (decoder side and output projection).
    """
    def __init__(self,model_config,tokenizer_config):
        super().__init__()
        enc_cfg = model_config['enc_cfg']
        dec_cfg = model_config['dec_cfg']
        source_vocab = tokenizer_config['en_vocab_size']
        target_vocab = tokenizer_config['fr_vocab_size']

        # ---- encoder side ----
        self.encoder_emb_layer = inputEmbeddingLayer(source_vocab, enc_cfg['emb_dim'])
        self.enc_positional_emb_layer = positionalEncodingLayer(
            model_config['enc_max_seq_len'], enc_cfg['emb_dim'], enc_cfg['pos_emb_dropout']
        )
        self.encoder = encoder(
            no_of_enc_blk=enc_cfg['no_of_enc_blk'],
            emb_dim=enc_cfg['emb_dim'],
            n_heads=enc_cfg['n_heads'],
            mha_dropout=enc_cfg['mha_dropout'],
            expand_dim=enc_cfg['expand_dim'],
            ff_dropout=enc_cfg['ff_dropout'],
            sk_dropout=enc_cfg['sk_dropout'],
        )

        # ---- decoder side ----
        self.decoder_emb_layer = inputEmbeddingLayer(target_vocab, dec_cfg['emb_dim'])
        self.dec_positional_emb_layer = positionalEncodingLayer(
            model_config['dec_max_seq_len'], dec_cfg['emb_dim'], dec_cfg['pos_emb_dropout']
        )
        self.decoder = decoder(
            no_of_dec_blk=dec_cfg['no_of_dec_blk'],
            emb_dim=dec_cfg['emb_dim'],
            n_heads=dec_cfg['n_heads'],
            mha_dropout=dec_cfg['mha_dropout'],
            expand_dim=dec_cfg['expand_dim'],
            ff_dropout=dec_cfg['ff_dropout'],
            sk_dropout=dec_cfg['sk_dropout'],
        )
        self.decoder_final_projection = finalProjectionLayer(dec_cfg['emb_dim'], target_vocab)

    def encode(self,encoder_input,encoder_mask=None):
        """Embed the source ids, add positions and run the encoder stack."""
        embedded = self.encoder_emb_layer(encoder_input)
        positioned = self.enc_positional_emb_layer(embedded)
        return self.encoder(positioned, encoder_mask)

    def decode(self,decoder_input,encoder_output,decoder_mask,encoder_mask):
        """Embed the target ids, run the decoder stack and project to vocabulary scores."""
        embedded = self.decoder_emb_layer(decoder_input)
        positioned = self.dec_positional_emb_layer(embedded)
        hidden = self.decoder(positioned, encoder_output, decoder_mask, encoder_mask)
        return self.decoder_final_projection(hidden)

    ### forward is the full encode -> decode pass used during training.
    def forward(self,decoder_input,encoder_input,decoder_mask,encoder_mask):
        """Full pass.  Note the argument order: decoder input comes first."""
        memory = self.encode(encoder_input, encoder_mask)
        return self.decode(decoder_input, memory, decoder_mask, encoder_mask)

In [56]:
# Model configuration: sequence limits plus one hyper-parameter group per stack.
model_config = dict(
    # +2 leaves room for the start and end tokens added by the dataset class.
    enc_max_seq_len=max_seq_len + 2,  # max source sequence length
    dec_max_seq_len=max_seq_len + 2,  # max target sequence length
    enc_cfg=dict(
        emb_dim=512,          # embedding dimension
        no_of_enc_blk=6,      # number of encoder blocks
        n_heads=8,            # number of attention heads
        pos_emb_dropout=0.1,  # dropout after adding the positional encoding
        mha_dropout=0.1,      # dropout inside multi-head attention
        expand_dim=2048,      # feed-forward intermediate width
        ff_dropout=0.1,       # dropout inside the feed-forward block
        sk_dropout=0.1,       # dropout in the skip connections
    ),
    dec_cfg=dict(
        emb_dim=512,          # embedding dimension
        no_of_dec_blk=6,      # number of decoder blocks
        pos_emb_dropout=0.1,  # dropout after adding the positional encoding
        n_heads=8,            # number of attention heads
        mha_dropout=0.1,      # dropout inside multi-head attention
        expand_dim=2048,      # feed-forward intermediate width
        ff_dropout=0.1,       # dropout inside the feed-forward block
        sk_dropout=0.1,       # dropout in the skip connections
    ),
)

# Tokenizer configuration: vocabulary sizes are read from the built tokenizers
# rather than hard-coded.
tokenizer_config = dict(
    en_vocab_size=tokenizer_en.get_vocab_size(),
    fr_vocab_size=tokenizer_fr.get_vocab_size(),
)

In [57]:
# Build the full encoder-decoder model from the two configuration mappings.
Transformer = transformers(model_config=model_config, tokenizer_config=tokenizer_config)

In [58]:
# Bare expression as the cell's last line: the notebook displays the module's repr.
Transformer

transformers(
  (encoder_emb_layer): inputEmbeddingLayer(
    (embedding_layer): Embedding(30000, 512)
  )
  (enc_positional_emb_layer): positionalEncodingLayer(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): encoder(
    (enc_blks): ModuleList(
      (0-5): 6 x encoderBlock(
        (mha_block): multiHeadAttentionBlock(
          (w_q): Linear(in_features=512, out_features=512, bias=True)
          (w_k): Linear(in_features=512, out_features=512, bias=True)
          (w_v): Linear(in_features=512, out_features=512, bias=True)
          (w_o): Linear(in_features=512, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (feed_forward_block): feed_forward_block(
          (network): Sequential(
            (0): Linear(in_features=512, out_features=2048, bias=True)
            (1): ReLU()
            (2): Dropout(p=0.1, inplace=False)
            (3): Linear(in_features=2048, out_features=512, bias=True)
          )
        )
   

In [60]:
# Smoke test: push only the first training batch through the model and print
# the shape of what comes out.
for batch in train_loader:
    encoder_input, decoder_input, target_output = (
        batch[key] for key in ('encoder_input', 'decoder_input', 'target_output')
    )
    encoder_mask = batch['encoder_mask']
    decoder_mask = batch['decodar_mask']  # key is spelled this way by the dataset class

    decoder_output = Transformer(
        decoder_input=decoder_input,
        encoder_input=encoder_input,
        decoder_mask=decoder_mask,
        encoder_mask=encoder_mask,
    )
    print(decoder_output.shape)

    break  # one batch is enough for a shape check

torch.Size([2, 484, 30000])
