In [96]:
import torch
import torch.nn as nn
import math

In [97]:
class inputEmbeddingLayer(nn.Module):
    def __init__(self,vocab_size,emb_dim):
        super().__init__()
        self.vocab_size=vocab_size
        self.emb_dim=emb_dim
        self.embedding_layer=nn.Embedding(self.vocab_size,self.emb_dim,dtype=torch.float16)
    def forward(self,x):
        embeddings=self.embedding_layer(x)
        return embeddings*math.sqrt(self.emb_dim)

In [98]:
embedding_layer=inputEmbeddingLayer(10,6)

In [99]:
test_input=torch.tensor([[1,2,3],[4,5,6]])
input_embeddings=embedding_layer(test_input)
input_embeddings

tensor([[[-3.7285, -0.6577,  1.6963, -0.4160, -0.3901,  0.7314],
         [ 3.3672,  3.5195, -8.0234, -2.4414, -0.8682, -0.6167],
         [-4.2539, -2.8965, -4.1680,  1.3740,  0.8804,  0.2313]],

        [[-0.3079, -0.8960, -5.2422, -1.0273, -2.6914,  1.0000],
         [-0.6484,  1.1846, -2.8105, -1.2402, -0.4946,  1.7822],
         [ 5.4766,  2.6523,  7.1680,  1.9941,  0.6685,  0.2140]]],
       dtype=torch.float16, grad_fn=<MulBackward0>)

In [100]:
class positionalEncodingLayer(nn.Module):
    def __init__(self,max_seq_len,emb_dim,dropout):
        super().__init__()
        self.max_seq_len=max_seq_len
        self.emb_dim=emb_dim
        self.dropout=nn.Dropout(dropout)
        static_positional_info=torch.zeros((self.max_seq_len,self.emb_dim),dtype=torch.float16)
        positions=torch.arange(0,max_seq_len,dtype=torch.float16).reshape(max_seq_len,-1)
        indices_for_denominator=torch.arange(0,emb_dim,2,dtype=torch.float16) ### 2i
        denominators=torch.exp((-2*indices_for_denominator*math.log(1e4))/emb_dim)
        static_positional_info[:,0::2]=torch.sin(positions*denominators)
        static_positional_info[:,1::2]=torch.cos(positions*denominators)
        self.register_buffer('static_positional_info',static_positional_info)
    def forward(self,x):
        position_encoded_embedding=x+self.static_positional_info[:x.shape[1],:]
        dropped_embeddings=self.dropout(position_encoded_embedding)
        return dropped_embeddings


In [101]:
positional_encoding_layer=positionalEncodingLayer(3,6,0.3)

In [102]:
position_encoded_embedding=positional_encoding_layer(input_embeddings)
position_encoded_embedding

tensor([[[ -5.3281,   0.4890,   0.0000,   0.8345,  -0.0000,   0.0000],
         [  6.0117,   5.7969, -11.4609,  -0.0000,  -0.0000,   0.0000],
         [ -0.0000,  -0.0000,  -5.9492,   0.0000,   0.0000,   1.7598]],

        [[ -0.0000,   0.0000,  -7.4883,  -0.0391,  -3.8457,   2.8574],
         [  0.0000,   2.4648,  -0.0000,  -0.0000,  -0.7065,   3.9727],
         [  9.1250,   3.1953,   0.0000,   4.2773,   0.0000,   1.7344]]],
       dtype=torch.float16, grad_fn=<MulBackward0>)

In [103]:
class multiHeadAttentionBlock(nn.Module):
    def __init__(self,emb_dim,n_heads,dropout):
        super().__init__()
        assert emb_dim%n_heads==0 ### checking if multi head splitting is possible.
        self.w_q=nn.Linear(emb_dim,emb_dim,dtype=torch.float16)
        self.w_k=nn.Linear(emb_dim,emb_dim,dtype=torch.float16)
        self.w_v=nn.Linear(emb_dim,emb_dim,dtype=torch.float16)
        self.w_o=nn.Linear(emb_dim,emb_dim,dtype=torch.float16) ### multi-head-projection-layer
        self.emb_dim=emb_dim
        self.dropout=nn.Dropout(dropout)
        self.single_head_dim=self.emb_dim//n_heads
        self.n_heads=n_heads

    @staticmethod
    def contextual_embedding(m_q,m_k,m_v,per_head_emb_dim,mask):
        ### return contexual embedding and attention scores
        attention_scores=m_q@m_k.transpose(2,3)/math.sqrt(per_head_emb_dim)
        ##batch,head,seq,dim @ batch,head,dim,seq==batch,head,seq,seq
        if mask is not None:
            attention_scores.masked_fill_(mask,value=float('-inf'))
        normalized_attention_scores=torch.softmax(attention_scores,dim=-1)
        ### batch,head,seq,seq @ batch,head,seq,dim=batch,head,seq,dim
        contexual_embeddings=normalized_attention_scores@m_v
        return normalized_attention_scores,contexual_embeddings
    
    def forward(self,q,k,v,mask):
        query=self.w_q(q) ### batch, seqeunce, dim
        key=self.w_k(k)
        value=self.w_v(v)

        multihead_query=query.view(query.shape[0],query.shape[1],self.n_heads,self.single_head_dim).transpose(1,2)
        multihead_key=key.view(key.shape[0],key.shape[1],self.n_heads,self.single_head_dim).transpose(1,2)
        multihead_value=value.view(value.shape[0],value.shape[1],self.n_heads,self.single_head_dim).transpose(1,2)
        _,contextual_embeddings=multiHeadAttentionBlock.contextual_embedding(multihead_query,multihead_key,multihead_value,self.single_head_dim,mask)
        final_contextual_embeddings=contextual_embeddings.transpose(1,2).contiguous().view(value.shape[0],value.shape[1],self.n_heads*self.single_head_dim)
        multihead_final_contextual_embedding_proj=self.w_o(final_contextual_embeddings)
        dropped_multihead_final_contextual_embedding_proj=self.dropout(multihead_final_contextual_embedding_proj)
        return dropped_multihead_final_contextual_embedding_proj

In [104]:
Mlab= multiHeadAttentionBlock(6,2,0.3)

In [105]:
a=lambda x: Mlab(x,x,x,None)
mha_out=a(position_encoded_embedding)
mha_out

tensor([[[-2.4980e+00, -1.6797e-01,  7.0264e-01,  3.2734e+00, -3.0156e+00,
           1.5723e+00],
         [ 3.6279e-01,  6.8555e-01,  1.4868e-01,  0.0000e+00, -1.5420e+00,
           1.8887e+00],
         [ 2.2095e-01,  7.1387e-01, -1.4087e-01,  6.7383e-02, -9.2285e-01,
           1.3086e+00]],

        [[ 5.4395e-01,  4.7998e-01,  1.5625e+00,  0.0000e+00, -0.0000e+00,
           2.2129e+00],
         [-1.4048e-03,  4.1260e-01,  1.1982e+00,  1.6797e+00, -3.3379e+00,
           2.0293e+00],
         [ 7.5391e-01,  3.4570e-01,  1.6953e+00,  2.3438e-01, -2.6816e+00,
           2.1367e+00]]], dtype=torch.float16, grad_fn=<MulBackward0>)

In [106]:
class layerNormalizationBlock(nn.Module):
    def __init__(self,emb_dim,eps=1e-5):
        super().__init__()
        self.scale=nn.Parameter(torch.ones(emb_dim,dtype=torch.float16))
        self.shift=nn.Parameter(torch.zeros(emb_dim,dtype=torch.float16))
        self.eps=eps

    def forward(self,x):
        mean=x.mean(dim=-1,keepdim=True)
        standard_deviation=x.std(dim=-1,keepdim=True,unbiased=False)
        normalized_x=(x-mean)/(standard_deviation+self.eps)
        scale_n_shift=self.scale*normalized_x+self.shift
        return scale_n_shift

In [107]:
lnb=layerNormalizationBlock(6)

In [108]:
layer_normalized_out=lnb(mha_out)
layer_normalized_out

tensor([[[-1.1260, -0.0663,  0.3296,  1.4980, -1.3613,  0.7251],
         [ 0.1039,  0.4221, -0.1071, -0.2537, -1.7725,  1.6074],
         [ 0.0188,  0.7290, -0.5024, -0.2025, -1.6299,  1.5859]],

        [[-0.3123, -0.3904,  0.9307, -0.9761, -0.9761,  1.7246],
         [-0.1862,  0.0463,  0.4873,  0.7578, -2.0586,  0.9541],
         [ 0.2198, -0.0442,  0.8286, -0.1162, -2.0020,  1.1143]]],
       dtype=torch.float16, grad_fn=<AddBackward0>)

In [109]:
class skipConnection(nn.Module):
    def __init__(self,dropout):
        super().__init__()
        self.dropout=nn.Dropout(dropout)

    def forward(self,x,sublayer):
        output=x+sublayer(x)
        dropped_output=self.dropout(output)
        return dropped_output

In [110]:
skipConnectionLayer=skipConnection(0.3)

In [111]:
skip_connections_output=skipConnectionLayer(position_encoded_embedding,a)
skip_connections_output

tensor([[[-0.0000,  0.4587,  0.0000,  5.8711, -4.3086,  0.0000],
         [ 9.1094,  9.2656, -0.0000,  0.0000, -0.0000,  2.6992],
         [ 0.3157,  1.0195, -8.7031,  0.0963, -0.0000,  4.3828]],

        [[ 0.0000,  0.0000, -0.0000, -0.0558, -0.0000,  7.2422],
         [ 0.0000,  3.5215,  0.0000,  0.0000, -5.7773,  8.5703],
         [ 0.0000,  4.5664,  0.0000,  6.4453, -0.0000,  5.5312]]],
       dtype=torch.float16, grad_fn=<MulBackward0>)

In [112]:
class feed_forward_block(nn.Module):
    ### Expansion Contraction layer.....
    def __init__(self,emb_dim,expand_dim,dropout):
        super().__init__()
        self.emb_dim=emb_dim
        self.expand_dim=expand_dim
        self.dropout=dropout
        self.network=nn.Sequential(
            nn.Linear(emb_dim,expand_dim,dtype=torch.float16),
            nn.ReLU(),
            nn.Dropout(self.dropout),
            nn.Linear(expand_dim,emb_dim,dtype=torch.float16),
        )
    def forward(self,x):
        output=self.network(x)
        return output

In [113]:
ffb=feed_forward_block(6,12,0.2)

In [114]:
ffb_output=ffb(skip_connections_output)
ffb_output

tensor([[[-0.1298,  0.8462,  1.4238, -0.2001, -1.5342, -2.2734],
         [-0.6792, -1.0068,  1.0400,  0.0239, -1.4814, -1.7031],
         [ 0.6167,  0.8081, -0.4392, -0.6909,  0.4221,  0.7646]],

        [[-0.1832,  0.4194,  0.3806, -0.4565,  0.8589,  0.1782],
         [ 0.3315,  1.0537, -0.2401, -0.8760,  0.7412,  0.7852],
         [-0.5298, -0.3499,  0.5244,  0.3955, -1.9307, -1.4697]]],
       dtype=torch.float16, grad_fn=<ViewBackward0>)

In [115]:
class encoderBlock(nn.Module):
    def __init__(self,emb_dim,n_heads,mha_dropout,expand_dim,ff_dropout,sk_dropout):
        super().__init__()
        self.emb_dim=emb_dim
        self.n_heads=n_heads
        self.ff_dropout=ff_dropout
        self.expand_dim=expand_dim
        self.mha_dropout=mha_dropout
        self.sk_dropout=sk_dropout
        self.mha_block=multiHeadAttentionBlock(self.emb_dim,self.n_heads,self.mha_dropout)
        self.feed_forward_block=feed_forward_block(self.emb_dim,self.expand_dim,self.ff_dropout)
        self.skip_connections=nn.ModuleList([skipConnection(self.sk_dropout) for _ in range(2)])
        self.layerNormalizationBlocks=nn.ModuleList([layerNormalizationBlock(self.emb_dim) for _ in range(2)])
    def forward(self,x,mask=None):
        output1=self.skip_connections[0](x,lambda x: self.mha_block(x,x,x,mask))
        layer_normalized_output1=self.layerNormalizationBlocks[0](output1)
        output2=self.skip_connections[1](layer_normalized_output1,self.feed_forward_block)
        layer_normalized_output2=self.layerNormalizationBlocks[1](output2)
        return layer_normalized_output2

In [116]:
enc_blk=encoderBlock(6,2,0.3,12,0.3,0.3)
enc_blk

encoderBlock(
  (mha_block): multiHeadAttentionBlock(
    (w_q): Linear(in_features=6, out_features=6, bias=True)
    (w_k): Linear(in_features=6, out_features=6, bias=True)
    (w_v): Linear(in_features=6, out_features=6, bias=True)
    (w_o): Linear(in_features=6, out_features=6, bias=True)
    (dropout): Dropout(p=0.3, inplace=False)
  )
  (feed_forward_block): feed_forward_block(
    (network): Sequential(
      (0): Linear(in_features=6, out_features=12, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.3, inplace=False)
      (3): Linear(in_features=12, out_features=6, bias=True)
    )
  )
  (skip_connections): ModuleList(
    (0-1): 2 x skipConnection(
      (dropout): Dropout(p=0.3, inplace=False)
    )
  )
  (layerNormalizationBlocks): ModuleList(
    (0-1): 2 x layerNormalizationBlock()
  )
)

In [117]:
enc_blk(ffb_output)

tensor([[[ 0.3645,  0.2683,  1.3809,  0.5864, -1.5908, -1.0088],
         [-0.1202, -0.6440,  0.4849,  1.6025,  0.3086, -1.6338],
         [ 0.7104,  0.3320, -1.2617, -1.4355,  0.3979,  1.2559]],

        [[-0.9526,  0.1586,  0.7988, -1.2871, -0.3452,  1.6279],
         [ 0.1030,  0.1030, -0.6357, -1.4971,  0.1030,  1.8242],
         [ 0.0309, -1.2832,  0.5620,  1.7129, -1.0537,  0.0309]]],
       dtype=torch.float16, grad_fn=<AddBackward0>)

In [118]:
class encoder(nn.Module):
    def __init__(self,no_of_enc_blk,emb_dim,n_heads,mha_dropout,expand_dim,ff_dropout,sk_dropout):
        super().__init__()
        self.enc_blks=nn.ModuleList([encoderBlock(emb_dim,n_heads,mha_dropout,expand_dim,ff_dropout,sk_dropout) for _ in range(no_of_enc_blk)])
    def forward(self,x,mask=None):
        for blk in self.enc_blks:
            x=blk(x,mask)
        return x

In [119]:
enc=encoder(12,6,2,0.3,12,0.3,0.3)
enc

encoder(
  (enc_blks): ModuleList(
    (0-11): 12 x encoderBlock(
      (mha_block): multiHeadAttentionBlock(
        (w_q): Linear(in_features=6, out_features=6, bias=True)
        (w_k): Linear(in_features=6, out_features=6, bias=True)
        (w_v): Linear(in_features=6, out_features=6, bias=True)
        (w_o): Linear(in_features=6, out_features=6, bias=True)
        (dropout): Dropout(p=0.3, inplace=False)
      )
      (feed_forward_block): feed_forward_block(
        (network): Sequential(
          (0): Linear(in_features=6, out_features=12, bias=True)
          (1): ReLU()
          (2): Dropout(p=0.3, inplace=False)
          (3): Linear(in_features=12, out_features=6, bias=True)
        )
      )
      (skip_connections): ModuleList(
        (0-1): 2 x skipConnection(
          (dropout): Dropout(p=0.3, inplace=False)
        )
      )
      (layerNormalizationBlocks): ModuleList(
        (0-1): 2 x layerNormalizationBlock()
      )
    )
  )
)

In [120]:
enc_output=enc(ffb_output)
enc_output

tensor([[[-0.3179, -1.1045, -0.2343, -0.2343,  2.1250, -0.2343],
         [ 0.0470, -0.1120,  0.0470, -1.9609,  0.6914,  1.2881],
         [-1.9170,  0.2368,  0.4348, -0.0729,  1.4346, -0.1168]],

        [[-0.2869,  1.9961,  0.0468, -0.3589, -0.0558, -1.3408],
         [-0.9761, -0.6025,  0.8018,  1.7236,  0.0816, -1.0303],
         [ 1.6953, -1.1719, -0.7769, -0.7769,  0.4128,  0.6157]]],
       dtype=torch.float16, grad_fn=<AddBackward0>)

In [125]:
class decoderBlock(nn.Module):
    def __init__(self,emb_dim,n_heads,mha_dropout,expand_dim,ff_dropout,sk_dropout):
        super().__init__()
        self.emb_dim=emb_dim
        self.n_heads=n_heads
        self.ff_dropout=ff_dropout
        self.expand_dim=expand_dim
        self.mha_dropout=mha_dropout
        self.sk_dropout=sk_dropout
        self.mha_block1=multiHeadAttentionBlock(self.emb_dim,self.n_heads,self.mha_dropout) ### casual attention block
        self.mha_block2=multiHeadAttentionBlock(self.emb_dim,self.n_heads,self.mha_dropout) #### cross attention block
        self.feed_forward_block=feed_forward_block(self.emb_dim,self.expand_dim,self.ff_dropout)
        self.skip_connections=nn.ModuleList([skipConnection(self.sk_dropout) for _ in range(3)])
        self.layerNormalizationBlocks=nn.ModuleList([layerNormalizationBlock(self.emb_dim) for _ in range(3)])
    def forward(self,x,enc_out,mask1=None,mask2=None):
        output1=self.skip_connections[0](x,lambda x: self.mha_block1(x,x,x,mask1))
        layer_normalized_output1=self.layerNormalizationBlocks[0](output1)
        output2=self.skip_connections[1](layer_normalized_output1,lambda x: self.mha_block2(x,enc_out,enc_out,mask2))
        layer_normalized_output2=self.layerNormalizationBlocks[1](output2)
        output3=self.skip_connections[2](layer_normalized_output2,self.feed_forward_block)
        layer_normalized_output3=self.layerNormalizationBlocks[2](output3)
        return layer_normalized_output3

In [126]:
dec_blk=decoderBlock(6,2,0.3,12,0.3,0.3)
dec_blk

decoderBlock(
  (mha_block1): multiHeadAttentionBlock(
    (w_q): Linear(in_features=6, out_features=6, bias=True)
    (w_k): Linear(in_features=6, out_features=6, bias=True)
    (w_v): Linear(in_features=6, out_features=6, bias=True)
    (w_o): Linear(in_features=6, out_features=6, bias=True)
    (dropout): Dropout(p=0.3, inplace=False)
  )
  (mha_block2): multiHeadAttentionBlock(
    (w_q): Linear(in_features=6, out_features=6, bias=True)
    (w_k): Linear(in_features=6, out_features=6, bias=True)
    (w_v): Linear(in_features=6, out_features=6, bias=True)
    (w_o): Linear(in_features=6, out_features=6, bias=True)
    (dropout): Dropout(p=0.3, inplace=False)
  )
  (feed_forward_block): feed_forward_block(
    (network): Sequential(
      (0): Linear(in_features=6, out_features=12, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.3, inplace=False)
      (3): Linear(in_features=12, out_features=6, bias=True)
    )
  )
  (skip_connections): ModuleList(
    (0-2): 3 x skipConnection(

In [128]:
dec_blk_out=dec_blk(ffb_output,ffb_output)
dec_blk_out

tensor([[[-0.7739,  1.6338,  0.2025,  0.3301,  0.2025, -1.5947],
         [-0.8677,  0.5093,  0.3181,  0.3696,  1.3672, -1.6973],
         [ 0.0753,  1.7529, -0.3643, -1.5625,  0.4619, -0.3633]],

        [[-0.2925,  0.1620, -1.7334,  0.1620,  0.0135,  1.6895],
         [-0.8613,  0.8999,  0.0722, -1.5146, -0.0594,  1.4648],
         [-0.4255, -0.5322, -0.4255, -0.4255,  2.2344, -0.4255]]],
       dtype=torch.float16, grad_fn=<AddBackward0>)

In [129]:
class decoder(nn.Module):
    def __init__(self,no_of_enc_blk,emb_dim,n_heads,mha_dropout,expand_dim,ff_dropout,sk_dropout):
        super().__init__()
        self.dec_blks=nn.ModuleList([decoderBlock(emb_dim,n_heads,mha_dropout,expand_dim,ff_dropout,sk_dropout) for _ in range(no_of_enc_blk)])
    def forward(self,x,mask=None):
        for blk in self.dec_blks:
            x=blk(x,mask)
        return x

In [130]:
dec=decoder(12,6,2,0.3,12,0.3,0.3)
dec

decoder(
  (dec_blks): ModuleList(
    (0-11): 12 x decoderBlock(
      (mha_block1): multiHeadAttentionBlock(
        (w_q): Linear(in_features=6, out_features=6, bias=True)
        (w_k): Linear(in_features=6, out_features=6, bias=True)
        (w_v): Linear(in_features=6, out_features=6, bias=True)
        (w_o): Linear(in_features=6, out_features=6, bias=True)
        (dropout): Dropout(p=0.3, inplace=False)
      )
      (mha_block2): multiHeadAttentionBlock(
        (w_q): Linear(in_features=6, out_features=6, bias=True)
        (w_k): Linear(in_features=6, out_features=6, bias=True)
        (w_v): Linear(in_features=6, out_features=6, bias=True)
        (w_o): Linear(in_features=6, out_features=6, bias=True)
        (dropout): Dropout(p=0.3, inplace=False)
      )
      (feed_forward_block): feed_forward_block(
        (network): Sequential(
          (0): Linear(in_features=6, out_features=12, bias=True)
          (1): ReLU()
          (2): Dropout(p=0.3, inplace=False)
       

In [131]:
dec_output=dec(ffb_output,ffb_output)
dec_output

tensor([[[ 0.6748, -0.4148,  0.6748,  0.4541,  0.6748, -2.0625],
         [ 0.6167,  0.9536,  0.7568, -1.9814,  0.1019, -0.4482],
         [ 1.6670,  0.7524, -0.4072, -1.5361, -0.3074, -0.1692]],

        [[ 0.3938,  0.9111, -1.3838,  1.0459,  0.3938, -1.3623],
         [ 0.9048,  0.3977, -1.8027,  0.4558, -0.8594,  0.9048],
         [ 0.2045, -0.8579, -1.1289,  0.6162,  1.7852, -0.6191]]],
       dtype=torch.float16, grad_fn=<AddBackward0>)

In [132]:
class finalProjectionLayer(nn.Module):
    def __init__(self,emb_dim,vocab_size):
        super().__init__()
        self.linear=nn.Linear(emb_dim,vocab_size)
    
    def forward(self,x):
        output=self.linear(x)
        return output

In [None]:
class transformers(nn.Module):
    def __init__(self):
        self.encoder=
        