In [826]:
from datasets import load_dataset

### DATA PIPELINE

In [827]:
def data_ingestion():
    ds = load_dataset(path="Helsinki-NLP/opus_books", name="en-fr")
    train_test_data=ds['train'].train_test_split(test_size=0.2,seed=42)
    test_data=train_test_data['test']
    train_val_split=train_test_data['train'].train_test_split(test_size=0.2,seed=42)
    train_data=train_val_split['train']
    validation_data=train_val_split['test']
    return train_data,validation_data,test_data

In [828]:
train_data,validation_data,test_data=data_ingestion()

In [830]:
train_data
validation_data
test_data

Dataset({
    features: ['id', 'translation'],
    num_rows: 25417
})

In [831]:
train_data[0]

{'id': '61261',
 'translation': {'en': 'One morning, on awaking, she saw on her window two vases filled with flowers.',
  'fr': 'Un matin, elle vit, en s’éveillant, sur sa fenêtre, deux vases pleins de fleurs.'}}

In [832]:
train_data[0]

{'id': '61261',
 'translation': {'en': 'One morning, on awaking, she saw on her window two vases filled with flowers.',
  'fr': 'Un matin, elle vit, en s’éveillant, sur sa fenêtre, deux vases pleins de fleurs.'}}

In [833]:
#pathlib
from pathlib import Path
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

In [834]:
def get_all_sentences(ds,lang):
    for pair in ds:
        # print(pair)
        yield pair['translation'][lang]

In [835]:
get_all_sentences(train_data,'en')

<generator object get_all_sentences at 0x000002572E168E10>

In [836]:
def build_tokenizer(config,ds,lang):
    tokenizer_path=Path(config['tokenizer_file'].format(lang))
    
    if not Path.exists(tokenizer_path):
        tokenizer=Tokenizer(WordLevel(unk_token='[UNK]'))
        tokenizer.pre_tokenizer=Whitespace()
        trainer=WordLevelTrainer(special_tokens=["[UNK]","[PAD]","[SOS]","[EOS]"],min_frequency=1)
        tokenizer.train_from_iterator(get_all_sentences(ds,lang),trainer=trainer)
        tokenizer.save(str(tokenizer_path))
    else:
        tokenizer=Tokenizer.from_file(str(tokenizer_path))
    
    return tokenizer


In [837]:
tokenizer_en=build_tokenizer({'tokenizer_file':'tokenizer_en.json'},train_data,'en')
tokenizer_fr=build_tokenizer({'tokenizer_file':'tokenizer_fr.json'},train_data,'fr')

In [None]:
def get_max_seq_len(train_data,test_data,validation_data):
    max_len=0
    for data in train_data:
        max_len=max(max_len,len(tokenizer.encode(data['translation']['en']).ids))
        max_len=max(max_len,len(tokenizer.encode(data['translation']['fr']).ids))

    for data in test_data:
        max_len=max(max_len,len(tokenizer.encode(data['translation']['en']).ids))
        max_len=max(max_len,len(tokenizer.encode(data['translation']['fr']).ids))

    for data in validation_data:
        max_len=max(max_len,len(tokenizer.encode(data['translation']['en']).ids))
        max_len=max(max_len,len(tokenizer.encode(data['translation']['fr']).ids))

    return max_len    

In [839]:
x=torch.tensor([2,3,5,0,0,0])
x==0

tensor([False, False, False,  True,  True,  True])

In [840]:
a=tokenizer.encode("Hello mahi!")

In [841]:
type(a.ids)

list

In [842]:
max_seq_len=get_max_seq_len(train_data,test_data,validation_data)
print("Max_SEQ_LEN",max_seq_len)

Max_SEQ_LEN 482


In [843]:
import torch

torch.cat(
    [
        torch.tensor([1]),
        torch.tensor([2,3,4]),
        torch.tensor([4]*3)
    ]
)

tensor([1, 2, 3, 4, 4, 4, 4])

In [844]:
from torch.utils.data import Dataset,DataLoader

In [845]:
[4]*4

[4, 4, 4, 4]

In [846]:
class opusDataset_En_to_Fr(Dataset):
    def __init__(self,data,max_seq_len,tokenizer_en,tokenizer_fr):
        super().__init__()
        self.raw_data=data
        self.tokenizer_en=tokenizer_en
        self.tokenizer_fr=tokenizer_fr


        ### Goal Shoould be to set a max length that fits all the sequence..
        self.max_seq_len=max_seq_len
        self.sos_token=torch.tensor([self.tokenizer_en.token_to_id("[SOS]")],dtype=torch.int64)
        self.eos_token=torch.tensor([self.tokenizer_en.token_to_id("[EOS]")],dtype=torch.int64)
        self.pad_token=torch.tensor([self.tokenizer_en.token_to_id("[PAD]")],dtype=torch.int64)


    def __len__(self):
        return len(self.raw_data)
    
    def __getitem__(self, index):
        data_en=self.raw_data[index]['translation']['en']
        data_fr=self.raw_data[index]['translation']['fr']
        encoded_data_en=torch.tensor(self.tokenizer_en.encode(data_en).ids,dtype=torch.int64)
        encoded_data_fr=torch.tensor(self.tokenizer_fr.encode(data_fr).ids,dtype=torch.int64)
        expected_seq_len=self.max_seq_len+2

        final_encoded_en=torch.cat([
            self.sos_token,
            encoded_data_en,
            self.eos_token,
            torch.tensor([self.pad_token]*(expected_seq_len-len(encoded_data_en)-2)),

        ]
        )
        final_encoded_fr=torch.cat([
            self.sos_token,
            encoded_data_fr,
            torch.tensor([self.pad_token]*(expected_seq_len-len(encoded_data_fr)-1))
        ])


        target_encoded_fr=torch.cat([
            self.sos_token,
            encoded_data_fr,
            self.eos_token,
            torch.tensor([self.pad_token]*(expected_seq_len-len(encoded_data_fr)-2)),
        ])



        return {
            'encoder_input':final_encoded_en,
            'decoder_input':final_encoded_fr,
            'target_output':target_encoded_fr,
            'encoder_mask': (final_encoded_en!=self.pad_token).unsqueeze(0).unsqueeze(0).int(),
            'decodar_mask': (target_encoded_fr!=self.pad_token).unsqueeze(0).unsqueeze(0).int() & casual_mask_generator(len(target_encoded_fr)),
            'src_sentance':data_en,
            'tgt_sentence':data_fr,
        }

In [847]:
a=torch.tensor([[2,1,5,0,0,0]],dtype=int)
a

tensor([[2, 1, 5, 0, 0, 0]])

In [848]:
(a!=0).unsqueeze(0).unsqueeze(0)

tensor([[[[ True,  True,  True, False, False, False]]]])

In [849]:
train_dataset=opusDataset_En_to_Fr(train_data,max_seq_len,tokenizer_en,tokenizer_fr)

In [850]:
test_dataset=opusDataset_En_to_Fr(test_data,max_seq_len,tokenizer_en,tokenizer_fr)

In [851]:
validation_dataset=opusDataset_En_to_Fr(validation_data,max_seq_len,tokenizer_en,tokenizer_fr)

In [852]:
train_loader=DataLoader(train_dataset,batch_size=2,shuffle=True)
test_loader=DataLoader(test_dataset,batch_size=2,shuffle=False)
val_loader=DataLoader(validation_dataset,batch_size=2,shuffle=False)

In [853]:
i=0
for batch in train_loader:
    if i==1:
        break
    else:
        i+=1
        print(batch['encoder_input'])

tensor([[   2,   11, 3307,   23,    6,    3,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,  

In [854]:
import torch
import torch.nn as nn
import math

In [855]:
class inputEmbeddingLayer(nn.Module):
    def __init__(self,vocab_size,emb_dim):
        super().__init__()
        self.vocab_size=vocab_size
        self.emb_dim=emb_dim
        self.embedding_layer=nn.Embedding(self.vocab_size,self.emb_dim,dtype=torch.float16)
    def forward(self,x):
        embeddings=self.embedding_layer(x)
        return embeddings*math.sqrt(self.emb_dim)

In [856]:
embedding_layer=inputEmbeddingLayer(10,6)

In [857]:
test_input=torch.tensor([[1,2,3],[4,5,6]])
input_embeddings=embedding_layer(test_input)
input_embeddings

tensor([[[-0.4971,  3.7383,  3.4277, -3.5527,  2.1113,  1.4219],
         [ 1.3105, -1.2568, -1.1025,  2.2109, -2.0371, -2.2910],
         [ 0.0385,  1.5801,  4.1992, -3.0312, -1.6807,  0.2301]],

        [[-0.3394,  2.6738, -1.1416,  1.9004,  2.1152, -4.2539],
         [ 3.0801, -0.2170, -2.1250, -1.8135,  2.0781, -1.1641],
         [ 3.0898,  1.8047, -2.8652,  5.3164,  2.3828, -1.9258]]],
       dtype=torch.float16, grad_fn=<MulBackward0>)

In [858]:
class positionalEncodingLayer(nn.Module):
    def __init__(self,max_seq_len,emb_dim,dropout):
        super().__init__()
        self.max_seq_len=max_seq_len
        self.emb_dim=emb_dim
        self.dropout=nn.Dropout(dropout)
        static_positional_info=torch.zeros((self.max_seq_len,self.emb_dim),dtype=torch.float16)
        positions=torch.arange(0,max_seq_len,dtype=torch.float16).reshape(max_seq_len,-1)
        indices_for_denominator=torch.arange(0,emb_dim,2,dtype=torch.float16) ### 2i
        denominators=torch.exp((-2*indices_for_denominator*math.log(1e4))/emb_dim)
        static_positional_info[:,0::2]=torch.sin(positions*denominators)
        static_positional_info[:,1::2]=torch.cos(positions*denominators)
        self.register_buffer('static_positional_info',static_positional_info)
    def forward(self,x):
        position_encoded_embedding=x+self.static_positional_info[:x.shape[1],:]
        dropped_embeddings=self.dropout(position_encoded_embedding)
        return dropped_embeddings


In [859]:
positional_encoding_layer=positionalEncodingLayer(3,6,0.3)

In [860]:
position_encoded_embedding=positional_encoding_layer(input_embeddings)
position_encoded_embedding

tensor([[[-0.7100,  6.7695,  4.8984, -0.0000,  0.0000,  3.4609],
         [ 0.0000, -1.0234, -0.0000,  4.5859, -2.9102, -0.0000],
         [ 1.3545,  1.6631,  0.0000, -0.0000, -2.4004,  1.7578]],

        [[-0.4849,  0.0000, -0.0000,  4.1445,  0.0000, -0.0000],
         [ 5.6016,  0.4622, -3.0332, -1.1621,  0.0000, -0.0000],
         [ 5.7148,  1.9844, -0.0000,  0.0000,  3.4043, -1.3223]]],
       dtype=torch.float16, grad_fn=<MulBackward0>)

In [861]:
class multiHeadAttentionBlock(nn.Module):
    def __init__(self,emb_dim,n_heads,dropout):
        super().__init__()
        assert emb_dim%n_heads==0 ### checking if multi head splitting is possible.
        self.w_q=nn.Linear(emb_dim,emb_dim,dtype=torch.float16)
        self.w_k=nn.Linear(emb_dim,emb_dim,dtype=torch.float16)
        self.w_v=nn.Linear(emb_dim,emb_dim,dtype=torch.float16)
        self.w_o=nn.Linear(emb_dim,emb_dim,dtype=torch.float16) ### multi-head-projection-layer
        self.emb_dim=emb_dim
        self.dropout=nn.Dropout(dropout)
        self.single_head_dim=self.emb_dim//n_heads
        self.n_heads=n_heads

    @staticmethod
    def contextual_embedding(m_q,m_k,m_v,per_head_emb_dim,mask):
        ### return contexual embedding and attention scores
        attention_scores=m_q@m_k.transpose(2,3)/math.sqrt(per_head_emb_dim)
        ##batch,head,seq,dim @ batch,head,dim,seq==batch,head,seq,seq
        if mask is not None:
            attention_scores.masked_fill_(mask==0,value=float('-inf'))
        normalized_attention_scores=torch.softmax(attention_scores,dim=-1)
        ### batch,head,seq,seq @ batch,head,seq,dim=batch,head,seq,dim
        contexual_embeddings=normalized_attention_scores@m_v
        return normalized_attention_scores,contexual_embeddings
    
    def forward(self,q,k,v,mask):
        query=self.w_q(q) ### batch, seqeunce, dim
        key=self.w_k(k)
        value=self.w_v(v)

        multihead_query=query.view(query.shape[0],query.shape[1],self.n_heads,self.single_head_dim).transpose(1,2)
        multihead_key=key.view(key.shape[0],key.shape[1],self.n_heads,self.single_head_dim).transpose(1,2)
        multihead_value=value.view(value.shape[0],value.shape[1],self.n_heads,self.single_head_dim).transpose(1,2)
        _,contextual_embeddings=multiHeadAttentionBlock.contextual_embedding(multihead_query,multihead_key,multihead_value,self.single_head_dim,mask)
        final_contextual_embeddings=contextual_embeddings.transpose(1,2).contiguous().view(value.shape[0],value.shape[1],self.n_heads*self.single_head_dim)
        multihead_final_contextual_embedding_proj=self.w_o(final_contextual_embeddings)
        dropped_multihead_final_contextual_embedding_proj=self.dropout(multihead_final_contextual_embedding_proj)
        return dropped_multihead_final_contextual_embedding_proj

In [862]:
Mlab= multiHeadAttentionBlock(6,2,0.3)

In [863]:
a=lambda x: Mlab(x,x,x,None)
mha_out=a(position_encoded_embedding)
mha_out

tensor([[[-1.0244,  0.0000, -0.0044,  0.0000,  0.0000,  0.2698],
         [-1.9434,  0.7710, -0.7910,  0.0000,  0.0000,  0.0000],
         [-1.4121,  0.3752,  0.5352,  1.5469,  0.0000,  0.2651]],

        [[-0.0000, -0.0000,  1.0186,  0.7993,  1.9961,  0.0000],
         [-1.7383, -0.7485,  1.2539,  0.0000,  2.2695,  0.9678],
         [-0.0000, -0.6299,  1.0225,  0.9409,  0.0000,  0.9648]]],
       dtype=torch.float16, grad_fn=<MulBackward0>)

In [868]:
mha_out.shape

torch.Size([2, 3, 6])

In [869]:
class layerNormalizationBlock(nn.Module):
    def __init__(self,emb_dim,eps=1e-5):
        super().__init__()
        self.scale=nn.Parameter(torch.ones(emb_dim,dtype=torch.float16))
        self.shift=nn.Parameter(torch.zeros(emb_dim,dtype=torch.float16))
        self.eps=eps

    def forward(self,x):
        mean=x.mean(dim=-1,keepdim=True)
        standard_deviation=x.std(dim=-1,keepdim=True,unbiased=False)
        normalized_x=(x-mean)/(standard_deviation+self.eps)
        scale_n_shift=self.scale*normalized_x+self.shift
        return scale_n_shift

In [870]:
lnb=layerNormalizationBlock(6)

In [871]:
layer_normalized_out=lnb(mha_out)
layer_normalized_out

tensor([[[-2.1719,  0.3057,  0.2954,  0.3057,  0.3057,  0.9580],
         [-1.8965,  1.2881, -0.5444,  0.3840,  0.3840,  0.3840],
         [-1.8633,  0.1792,  0.3621,  1.5166, -0.2494,  0.0534]],

        [[-0.8657, -0.8657,  0.5215,  0.2228,  1.8525, -0.8657],
         [-1.5615, -0.8154,  0.6934, -0.2517,  1.4580,  0.4775],
         [-0.6084, -1.6094,  1.0166,  0.8857, -0.6084,  0.9248]]],
       dtype=torch.float16, grad_fn=<AddBackward0>)

In [872]:
class skipConnection(nn.Module):
    def __init__(self,dropout):
        super().__init__()
        self.dropout=nn.Dropout(dropout)

    def forward(self,x,sublayer):
        output=x+sublayer(x)
        dropped_output=self.dropout(output)
        return dropped_output

In [873]:
skipConnectionLayer=skipConnection(0.3)

In [874]:
skip_connections_output=skipConnectionLayer(position_encoded_embedding,a)
skip_connections_output

tensor([[[-2.4785, 10.6016,  6.9922,  0.3186,  0.2542,  0.0000],
         [ 0.0000, -0.3606, -1.1299,  7.6523, -0.0000,  1.9033],
         [-0.0000,  2.9141,  0.0000,  0.0000, -3.4297,  0.0000]],

        [[-0.0000, -0.0000,  0.0000,  0.0000,  2.8516,  0.0000],
         [ 0.0000, -0.0000, -2.5430, -0.1968,  0.0000,  1.3828],
         [ 0.0000,  1.9355,  1.4609,  0.0000,  7.9023, -1.8887]]],
       dtype=torch.float16, grad_fn=<MulBackward0>)

In [875]:
class feed_forward_block(nn.Module):
    ### Expansion Contraction layer.....
    def __init__(self,emb_dim,expand_dim,dropout):
        super().__init__()
        self.emb_dim=emb_dim
        self.expand_dim=expand_dim
        self.dropout=dropout
        self.network=nn.Sequential(
            nn.Linear(emb_dim,expand_dim,dtype=torch.float16),
            nn.ReLU(),
            nn.Dropout(self.dropout),
            nn.Linear(expand_dim,emb_dim,dtype=torch.float16),
        )
    def forward(self,x):
        output=self.network(x)
        return output

In [876]:
ffb=feed_forward_block(6,12,0.2)

In [877]:
ffb_output=ffb(skip_connections_output)
ffb_output

tensor([[[ 1.9121, -1.2197, -0.3979, -2.3574, -0.8320,  0.1094],
         [ 0.0396,  0.0464, -1.2646,  0.8452,  1.5117,  0.1829],
         [ 0.7300, -0.2017,  0.3567, -0.1979,  0.2361, -0.5698]],

        [[-0.0369,  0.0201, -0.1624, -0.1514,  0.2144,  0.1747],
         [ 0.4355,  0.5513, -0.8926, -0.3479,  0.4580,  0.4985],
         [ 0.4045,  0.6255, -0.1118, -0.5366, -0.0368,  0.4282]]],
       dtype=torch.float16, grad_fn=<ViewBackward0>)

In [878]:
class encoderBlock(nn.Module):
    def __init__(self,emb_dim,n_heads,mha_dropout,expand_dim,ff_dropout,sk_dropout):
        super().__init__()
        self.emb_dim=emb_dim
        self.n_heads=n_heads
        self.ff_dropout=ff_dropout
        self.expand_dim=expand_dim
        self.mha_dropout=mha_dropout
        self.sk_dropout=sk_dropout
        self.mha_block=multiHeadAttentionBlock(self.emb_dim,self.n_heads,self.mha_dropout)
        self.feed_forward_block=feed_forward_block(self.emb_dim,self.expand_dim,self.ff_dropout)
        self.skip_connections=nn.ModuleList([skipConnection(self.sk_dropout) for _ in range(2)])
        self.layerNormalizationBlocks=nn.ModuleList([layerNormalizationBlock(self.emb_dim) for _ in range(2)])
    def forward(self,x,mask=None):
        output1=self.skip_connections[0](x,lambda x: self.mha_block(x,x,x,mask))
        layer_normalized_output1=self.layerNormalizationBlocks[0](output1)
        output2=self.skip_connections[1](layer_normalized_output1,self.feed_forward_block)
        layer_normalized_output2=self.layerNormalizationBlocks[1](output2)
        return layer_normalized_output2

In [879]:
enc_blk=encoderBlock(6,2,0.3,12,0.3,0.3)
enc_blk

encoderBlock(
  (mha_block): multiHeadAttentionBlock(
    (w_q): Linear(in_features=6, out_features=6, bias=True)
    (w_k): Linear(in_features=6, out_features=6, bias=True)
    (w_v): Linear(in_features=6, out_features=6, bias=True)
    (w_o): Linear(in_features=6, out_features=6, bias=True)
    (dropout): Dropout(p=0.3, inplace=False)
  )
  (feed_forward_block): feed_forward_block(
    (network): Sequential(
      (0): Linear(in_features=6, out_features=12, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.3, inplace=False)
      (3): Linear(in_features=12, out_features=6, bias=True)
    )
  )
  (skip_connections): ModuleList(
    (0-1): 2 x skipConnection(
      (dropout): Dropout(p=0.3, inplace=False)
    )
  )
  (layerNormalizationBlocks): ModuleList(
    (0-1): 2 x layerNormalizationBlock()
  )
)

In [880]:
enc_blk(ffb_output)

tensor([[[ 2.2090, -0.3611, -0.3611, -0.3611, -0.7798, -0.3452],
         [-0.6875, -0.2578, -0.8335,  0.3860,  2.0469, -0.6528],
         [-2.1445,  0.9966,  0.3716,  0.3716,  0.0321,  0.3716]],

        [[-0.0740,  0.0399, -1.1543, -0.7588,  2.0195, -0.0740],
         [ 0.2122,  0.1542, -2.1738,  0.8398,  0.5986,  0.3696],
         [ 0.4985, -0.2952, -1.8164,  0.1155, -0.0337,  1.5322]]],
       dtype=torch.float16, grad_fn=<AddBackward0>)

In [881]:
class encoder(nn.Module):
    def __init__(self,no_of_enc_blk,emb_dim,n_heads,mha_dropout,expand_dim,ff_dropout,sk_dropout):
        super().__init__()
        self.enc_blks=nn.ModuleList([encoderBlock(emb_dim,n_heads,mha_dropout,expand_dim,ff_dropout,sk_dropout) for _ in range(no_of_enc_blk)])
    def forward(self,x,mask=None):
        for blk in self.enc_blks:
            x=blk(x,mask)
        return x

In [882]:
enc=encoder(12,6,2,0.3,12,0.3,0.3)
enc

encoder(
  (enc_blks): ModuleList(
    (0-11): 12 x encoderBlock(
      (mha_block): multiHeadAttentionBlock(
        (w_q): Linear(in_features=6, out_features=6, bias=True)
        (w_k): Linear(in_features=6, out_features=6, bias=True)
        (w_v): Linear(in_features=6, out_features=6, bias=True)
        (w_o): Linear(in_features=6, out_features=6, bias=True)
        (dropout): Dropout(p=0.3, inplace=False)
      )
      (feed_forward_block): feed_forward_block(
        (network): Sequential(
          (0): Linear(in_features=6, out_features=12, bias=True)
          (1): ReLU()
          (2): Dropout(p=0.3, inplace=False)
          (3): Linear(in_features=12, out_features=6, bias=True)
        )
      )
      (skip_connections): ModuleList(
        (0-1): 2 x skipConnection(
          (dropout): Dropout(p=0.3, inplace=False)
        )
      )
      (layerNormalizationBlocks): ModuleList(
        (0-1): 2 x layerNormalizationBlock()
      )
    )
  )
)

In [883]:
enc_output=enc(ffb_output)
enc_output

tensor([[[-0.9395,  0.2659,  1.1143,  1.4326, -0.9336, -0.9395],
         [-0.3936, -0.1971,  0.0848,  2.0176, -1.2998, -0.2124],
         [-0.4470, -0.4470,  2.2344, -0.4470, -0.4470, -0.4470]],

        [[-0.1855,  0.4629, -2.1152,  0.6787,  0.3123,  0.8477],
         [-0.6445, -0.2800, -0.4331,  2.2227, -0.4331, -0.4331],
         [-0.2444, -0.2078, -0.4307, -0.7979, -0.5132,  2.1934]]],
       dtype=torch.float16, grad_fn=<AddBackward0>)

In [884]:
class decoderBlock(nn.Module):
    def __init__(self,emb_dim,n_heads,mha_dropout,expand_dim,ff_dropout,sk_dropout):
        super().__init__()
        self.emb_dim=emb_dim
        self.n_heads=n_heads
        self.ff_dropout=ff_dropout
        self.expand_dim=expand_dim
        self.mha_dropout=mha_dropout
        self.sk_dropout=sk_dropout
        self.mha_block1=multiHeadAttentionBlock(self.emb_dim,self.n_heads,self.mha_dropout) ### casual attention block
        self.mha_block2=multiHeadAttentionBlock(self.emb_dim,self.n_heads,self.mha_dropout) #### cross attention block
        self.feed_forward_block=feed_forward_block(self.emb_dim,self.expand_dim,self.ff_dropout)
        self.skip_connections=nn.ModuleList([skipConnection(self.sk_dropout) for _ in range(3)])
        self.layerNormalizationBlocks=nn.ModuleList([layerNormalizationBlock(self.emb_dim) for _ in range(3)])
    def forward(self,x,enc_out,mask1=None,mask2=None):
        output1=self.skip_connections[0](x,lambda x: self.mha_block1(x,x,x,mask1))
        layer_normalized_output1=self.layerNormalizationBlocks[0](output1)
        output2=self.skip_connections[1](layer_normalized_output1,lambda x: self.mha_block2(x,enc_out,enc_out,mask2))
        layer_normalized_output2=self.layerNormalizationBlocks[1](output2)
        output3=self.skip_connections[2](layer_normalized_output2,self.feed_forward_block)
        layer_normalized_output3=self.layerNormalizationBlocks[2](output3)
        return layer_normalized_output3

In [885]:
dec_blk=decoderBlock(6,2,0.3,12,0.3,0.3)
dec_blk

decoderBlock(
  (mha_block1): multiHeadAttentionBlock(
    (w_q): Linear(in_features=6, out_features=6, bias=True)
    (w_k): Linear(in_features=6, out_features=6, bias=True)
    (w_v): Linear(in_features=6, out_features=6, bias=True)
    (w_o): Linear(in_features=6, out_features=6, bias=True)
    (dropout): Dropout(p=0.3, inplace=False)
  )
  (mha_block2): multiHeadAttentionBlock(
    (w_q): Linear(in_features=6, out_features=6, bias=True)
    (w_k): Linear(in_features=6, out_features=6, bias=True)
    (w_v): Linear(in_features=6, out_features=6, bias=True)
    (w_o): Linear(in_features=6, out_features=6, bias=True)
    (dropout): Dropout(p=0.3, inplace=False)
  )
  (feed_forward_block): feed_forward_block(
    (network): Sequential(
      (0): Linear(in_features=6, out_features=12, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.3, inplace=False)
      (3): Linear(in_features=12, out_features=6, bias=True)
    )
  )
  (skip_connections): ModuleList(
    (0-2): 3 x skipConnection(

In [887]:
dec_blk_out=dec_blk(ffb_output,ffb_output)
dec_blk_out

tensor([[[ 1.9668, -0.3589, -0.0656, -1.4111, -0.0656, -0.0656],
         [-0.4863, -1.2588,  1.2754, -0.4863,  1.4434, -0.4863],
         [-0.2219, -0.2219,  0.0309,  2.0176, -0.2803, -1.3242]],

        [[-0.7681, -0.8442, -0.1487, -0.1487,  2.1445, -0.2349],
         [ 0.8345, -1.0762, -1.5723,  0.2776,  0.3105,  1.2256],
         [ 0.5034,  0.5034,  0.5796,  0.1215, -2.2109,  0.5034]]],
       dtype=torch.float16, grad_fn=<AddBackward0>)

In [889]:
class decoder(nn.Module):
    def __init__(self,no_of_dec_blk,emb_dim,n_heads,mha_dropout,expand_dim,ff_dropout,sk_dropout):
        super().__init__()
        self.dec_blks=nn.ModuleList([decoderBlock(emb_dim,n_heads,mha_dropout,expand_dim,ff_dropout,sk_dropout) for _ in range(no_of_dec_blk)])
    def forward(self,x,encoder_output,decoder_mask,encoder_mask):
        for blk in self.dec_blks:
            x=blk(x,encoder_output,decoder_mask=None,encoder_mask=None)
        return x

In [890]:
dec=decoder(12,6,2,0.3,12,0.3,0.3)
dec

decoder(
  (dec_blks): ModuleList(
    (0-11): 12 x decoderBlock(
      (mha_block1): multiHeadAttentionBlock(
        (w_q): Linear(in_features=6, out_features=6, bias=True)
        (w_k): Linear(in_features=6, out_features=6, bias=True)
        (w_v): Linear(in_features=6, out_features=6, bias=True)
        (w_o): Linear(in_features=6, out_features=6, bias=True)
        (dropout): Dropout(p=0.3, inplace=False)
      )
      (mha_block2): multiHeadAttentionBlock(
        (w_q): Linear(in_features=6, out_features=6, bias=True)
        (w_k): Linear(in_features=6, out_features=6, bias=True)
        (w_v): Linear(in_features=6, out_features=6, bias=True)
        (w_o): Linear(in_features=6, out_features=6, bias=True)
        (dropout): Dropout(p=0.3, inplace=False)
      )
      (feed_forward_block): feed_forward_block(
        (network): Sequential(
          (0): Linear(in_features=6, out_features=12, bias=True)
          (1): ReLU()
          (2): Dropout(p=0.3, inplace=False)
       

In [892]:
class finalProjectionLayer(nn.Module):
    def __init__(self,emb_dim,vocab_size):
        super().__init__()
        self.linear=nn.Linear(emb_dim,vocab_size,dtype=torch.float16)
    
    def forward(self,x):
        output=self.linear(x)
        return output
###batch,seq,vocab

In [893]:
class transformers(nn.Module):
    def __init__(self,model_config,tokenizer_config):
        super().__init__()
        self.encoder_emb_layer=inputEmbeddingLayer(tokenizer_config['vocab_size'],model_config['enc_cfg']['emb_dim'])
        self.enc_positional_emb_layer=positionalEncodingLayer(model_config['enc_max_seq_len'],model_config['enc_cfg']['emb_dim'],model_config['enc_cfg']['pos_emb_dropout'])
        self.encoder=encoder(
            no_of_enc_blk=model_config['enc_cfg']['no_of_enc_blk'],
            emb_dim=model_config['enc_cfg']['emb_dim'],
            n_heads=model_config['enc_cfg']['n_heads'],
            mha_dropout=model_config['enc_cfg']['mha_dropout'],
            expand_dim=model_config['enc_cfg']['expand_dim'],
            ff_dropout=model_config['enc_cfg']['ff_dropout'],
            sk_dropout=model_config['enc_cfg']['sk_dropout']
        )
        
        self.decoder_emb_layer=inputEmbeddingLayer(tokenizer_config['vocab_size'],model_config['dec_cfg']['emb_dim'])
        self.dec_positional_emb_layer=positionalEncodingLayer(model_config['dec_max_seq_len'],model_config['dec_cfg']['emb_dim'],model_config['dec_cfg']['pos_emb_dropout'])
        self.decoder=decoder(
            no_of_dec_blk=model_config['dec_cfg']['no_of_dec_blk'],
            emb_dim=model_config['dec_cfg']['emb_dim'],
            n_heads=model_config['dec_cfg']['n_heads'],
            mha_dropout=model_config['dec_cfg']['mha_dropout'],
            expand_dim=model_config['dec_cfg']['expand_dim'],
            ff_dropout=model_config['dec_cfg']['ff_dropout'],
            sk_dropout=model_config['dec_cfg']['sk_dropout']
        )
        self.decoder_final_projection=finalProjectionLayer(model_config['dec_cfg']['emb_dim'],tokenizer_config['vocab_size'])

    def encode(self,x,mask=None):
        encoder_input_embedding=self.encoder_emb_layer(x)
        positional_encoded_input_embedding=self.enc_positional_emb_layer(encoder_input_embedding)
        encoder_contexual_embedding=self.encoder(positional_encoded_input_embedding,mask)
        return encoder_contexual_embedding

    def decode(self,x,encoder_output,mask1=None,mask2=None):
        decoder_input_embedding=self.decoder_emb_layer(x)
        positional_encoded_input_embedding=self.dec_positional_emb_layer(decoder_input_embedding)
        decoder_contexual_embedding=self.decoder(positional_encoded_input_embedding,encoder_output,mask1,mask2)
        final_output=self.decoder_final_projection(decoder_contexual_embedding)
        return final_output
    
    ###forward will be used during training.
    def forward(self,encoder_input,decoder_input,src_mask,tgt_mask):
        encoder_output=self.encode(encoder_input,src_mask)
        decoder_output=self.decode(decoder_input,encoder_output,src_mask,tgt_mask)
        return decoder_output

In [894]:
# Model configuration dictionary
model_config = {
    "enc_max_seq_len": 484,   # Max source sequence length
    "dec_max_seq_len": 484,   # Max target sequence length
    "enc_cfg": {
        "emb_dim": 512,
        "no_of_enc_blk": 6,
        "n_heads": 8,
        "pos_emb_dropout":0.1,
        "mha_dropout": 0.1,
        "expand_dim": 2048,
        "ff_dropout": 0.1,
        "sk_dropout": 0.1
    },
    "dec_cfg": {
        "emb_dim": 512,
        "no_of_dec_blk": 6,
        "pos_emb_dropout":0.1,
        "n_heads": 8,
        "mha_dropout": 0.1,
        "expand_dim": 2048,
        "ff_dropout": 0.1,
        "sk_dropout": 0.1
    }
}

# Tokenizer configuration dictionary
tokenizer_config = {
    "vocab_size": 30000,  # Vocabulary size of source & target tokenizer

}

In [895]:
import json

# Save
with open("model_config.json", "w") as f:
    json.dump(model_config, f, indent=4)

with open("tokenizer_config.json", "w") as f:
    json.dump(tokenizer_config, f, indent=4)

# Load
with open("model_config.json", "r") as f:
    model_config = json.load(f)

with open("tokenizer_config.json", "r") as f:
    tokenizer_config = json.load(f)


In [897]:
Transformer=transformers(model_config,tokenizer_config)
Transformer

transformers(
  (encoder_emb_layer): inputEmbeddingLayer(
    (embedding_layer): Embedding(30000, 512)
  )
  (enc_positional_emb_layer): positionalEncodingLayer(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): encoder(
    (enc_blks): ModuleList(
      (0-5): 6 x encoderBlock(
        (mha_block): multiHeadAttentionBlock(
          (w_q): Linear(in_features=512, out_features=512, bias=True)
          (w_k): Linear(in_features=512, out_features=512, bias=True)
          (w_v): Linear(in_features=512, out_features=512, bias=True)
          (w_o): Linear(in_features=512, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (feed_forward_block): feed_forward_block(
          (network): Sequential(
            (0): Linear(in_features=512, out_features=2048, bias=True)
            (1): ReLU()
            (2): Dropout(p=0.1, inplace=False)
            (3): Linear(in_features=2048, out_features=512, bias=True)
          )
        )
   

In [898]:
import torch
def casual_mask_generator(size):
    mask=torch.triu(torch.ones(1,size,size),diagonal=1).type(torch.int)
    return mask==0

### Inference Strategy

In [899]:
@torch.no_grad
def gready_decode(model,src,src_mask,tokenizer_src,tokenizer_tgt,max_len,device):
    sos_idx=tokenizer_tgt.token_to_id('[SOS]')
    eos_idx=tokenizer_tgt.token_to_id('[EOS]')

    encoder_output=model.encode(src,src_mask)
    decoder_input=torch.empty(1,1).fill_(sos_idx).type_as(src).to(device)

    while True:
        if decoder_input.size(1)==max_len:
            break

        decoder_mask=casual_mask_generator(decoder_input.size(1)).type_as(src_mask).to(device)

        out=model.decode(decoder_input,encoder_output,src_mask,decoder_mask)

        prob=model.project(out[:,-1])

        _,next_word=torch.max(prob,dim=1)

        decoder_input=torch.cat((decoder_input,[next_word]),dim=1).type_as(src).to(device)

        if next_word==eos_idx:
            break

    return decoder_input.squeeze(0)

In [900]:
import tqdm

### Train

In [901]:
def train():
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device {device}")

    optimizer=torch.optim.Adam(Transformer.parameters(), lr=1e-4, eps = 1e-9)
    loss_fn = nn.CrossEntropyLoss(ignore_index = tokenizer_en.token_to_id('[PAD]'), label_smoothing = 0.1).to(device)
    initial_epoch = 0
    global_step = 0

    epoch=3

    for i in range(epoch):
        for batch in train_loader:
            encoder_input=batch['encoder_input'].to(device)
            decoder_input=batch['decoder_input'].to(device)
            target_output=batch['target_output'].to(device)
            encoder_mask=batch['encoder_mask'].to(device)
            decoder_mask=batch['decodar_mask'].to(device)
            src_sentence=batch['src_sentance']
            tgt_sentence=batch['tgt_sentence']

            encoder_output=Transformer.encode(encoder_input,encoder_mask)
            decoder_output=Transformer.decode(decoder_input,encoder_output,decoder_mask,encoder_mask)
            proj_output=Transformer.decoder_final_projection(decoder_output)
            loss = loss_fn(proj_output.view(-1, tokenizer_fr.get_vocab_size()), label.view(-1))
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            global_step+=1
            print('ko')

            



    

In [902]:
train()

Using device cpu


TypeError: decoderBlock.forward() got an unexpected keyword argument 'decoder_mask'