In [27]:
# ---------------------------------
# Importing libraries
# ---------------------------------
import torch 
import torch.nn as nn 
import math 

In [30]:
# ---------------------------------
# Input Embeddings
# ---------------------------------

class InputEnbeddings(nn.Module):
    """Token-embedding layer whose output is scaled by sqrt(d_model).

    Args:
        d_model: Size of each embedding vector.
        vocab_size: Number of distinct token ids the lookup table holds.

    Calling the module with an integer tensor of token ids of shape
    (batch, seq_len) returns a float tensor of shape (batch, seq_len, d_model).
    """

    def __init__(self, d_model:int, vocab_size:int) -> None:
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embeddings = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        """Look up the embeddings for `x` and scale them by sqrt(d_model)."""
        # (batch, seq_len) --> (batch, seq_len, d_model)
        # Multiply by sqrt(d_model) to scale the embeddings according to the paper
        return self.embeddings(x) * math.sqrt(self.d_model)

    # The method was originally misspelled `forwad`, which left nn.Module
    # without a `forward` implementation. Keep the old name as an alias so
    # any existing direct calls continue to work.
    forwad = forward


# Correctly spelled alias; the original (misspelled) class name stays valid.
InputEmbeddings = InputEnbeddings

In [31]:
# Toy lookup table: 10 possible token ids, each mapped to a 5-dimensional vector.
a = nn.Embedding(num_embeddings=10, embedding_dim=5)
# A (2, 5) batch of random token ids drawn from [1, 10).
b = torch.randint(1, 10, (2, 5))
# Display the ids, the full weight matrix, and the rows the ids select.
b, a.weight, a(b)

(tensor([[1, 9, 8, 1, 6],
         [2, 3, 3, 7, 6]]),
 Parameter containing:
 tensor([[-1.2780,  0.8751, -0.2351, -1.2761,  0.5549],
         [ 0.1548,  0.1161,  0.9773,  1.2093, -0.3735],
         [-0.2943, -2.1374, -0.1426, -0.6374,  0.8016],
         [-2.4155,  0.0477, -2.1817,  0.6429, -0.0706],
         [-0.1482,  0.0647, -0.0669, -0.3310, -0.7109],
         [ 0.2538,  1.6435,  0.5717, -0.1346, -0.7974],
         [ 0.8568, -0.6566,  1.0557, -1.2519,  0.2087],
         [ 1.2182, -0.4036, -0.4436,  0.5534, -1.1589],
         [-1.4049, -2.0428,  3.1682, -0.5440,  1.0847],
         [-1.0299,  0.8130, -0.3291,  1.2895,  0.8362]], requires_grad=True),
 tensor([[[ 0.1548,  0.1161,  0.9773,  1.2093, -0.3735],
          [-1.0299,  0.8130, -0.3291,  1.2895,  0.8362],
          [-1.4049, -2.0428,  3.1682, -0.5440,  1.0847],
          [ 0.1548,  0.1161,  0.9773,  1.2093, -0.3735],
          [ 0.8568, -0.6566,  1.0557, -1.2519,  0.2087]],
 
         [[-0.2943, -2.1374, -0.1426, -0.6374,  0.801

In [33]:
# ---------------------------------
# Positional Encoding
# ---------------------------------

class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal positional encodings to embeddings, then applies dropout.

    Args:
        d_model: Size of each embedding vector (even or odd).
        seq_len: Maximum sequence length the precomputed table supports.
        dropout: Dropout probability applied to the summed output.

    The encoding table is stored as the non-trainable buffer `pe` with shape
    (1, seq_len, d_model), so it follows the module across devices and is saved
    in the state dict without being treated as a parameter.
    """

    def __init__(self, d_model:int, seq_len:int, dropout:float) -> None:
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        # Must be a Dropout module (not the raw float) because forward() calls it.
        self.dropout = nn.Dropout(dropout)

        # Create a matrix of shape (seq_len, d_model)
        pe = torch.zeros(seq_len, d_model)
        # Create a vector of shape (seq_len, 1)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1) # (seq_len, 1)
        # Create a vector of shape (ceil(d_model / 2))
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term # (seq_len, ceil(d_model / 2))
        # Apply sine to even indices
        pe[:, 0::2] = torch.sin(angles) # sin(position / 10000**(2i / d_model))
        # Apply cosine to odd indices. When d_model is odd there is one fewer
        # odd column than even columns, so only the first d_model // 2 are used.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2]) # cos(position / 10000**(2i / d_model))
        # Add a batch dimension to the positional encoding
        pe = pe.unsqueeze(0) # (1, seq_len, d_model)
        # Register the positional encoding as a buffer
        self.register_buffer("pe", pe)

    def forward(self, x):
        """Add the positional encoding to `x` of shape (batch, seq_len, d_model).

        Raises:
            ValueError: If the input sequence is longer than `seq_len`.
        """
        length = x.shape[1]
        if length > self.seq_len:
            raise ValueError(
                f"Input sequence length {length} exceeds the maximum seq_len {self.seq_len}"
            )
        # `pe` is a buffer, so it never requires grad; no extra detaching needed.
        x = x + self.pe[:, :length, :] # (batch, seq_len, d_model)
        return self.dropout(x)

In [35]:
# Embedding lookup on its own: assumes `a` is still the nn.Embedding(10, 5) layer
# and `b` the (2, 5) index tensor from the earlier demo cell, giving one
# 5-dimensional row of `a.weight` per index.
a(b)

tensor([[[ 0.1548,  0.1161,  0.9773,  1.2093, -0.3735],
         [-1.0299,  0.8130, -0.3291,  1.2895,  0.8362],
         [-1.4049, -2.0428,  3.1682, -0.5440,  1.0847],
         [ 0.1548,  0.1161,  0.9773,  1.2093, -0.3735],
         [ 0.8568, -0.6566,  1.0557, -1.2519,  0.2087]],

        [[-0.2943, -2.1374, -0.1426, -0.6374,  0.8016],
         [-2.4155,  0.0477, -2.1817,  0.6429, -0.0706],
         [-2.4155,  0.0477, -2.1817,  0.6429, -0.0706],
         [ 1.2182, -0.4036, -0.4436,  0.5534, -1.1589],
         [ 0.8568, -0.6566,  1.0557, -1.2519,  0.2087]]],
       grad_fn=<EmbeddingBackward0>)

In [None]:
# ---------------------------------
# Multi-Head Attention
# ---------------------------------

In [None]:
# ---------------------------------
# Encoder Block
# ---------------------------------



In [None]:
# ---------------------------------
# Residual Connection
# ---------------------------------

        

In [5]:
import torch 

# Random integer matrix with entries in [1, 10), shape (3, 3).
a = torch.randint(1, 10, (3, 3))
# Random integer column vector with entries in [1, 10), shape (3, 1).
b = torch.randint(1, 10, (3, 1))
# Display both tensors side by side.
a, b

(tensor([[2, 7, 1],
         [6, 3, 5],
         [1, 9, 3]]),
 tensor([[2],
         [9],
         [6]]))

In [32]:
import torch 
import math 

# Embedding width used for this standalone walkthrough.
d_model = 64
# Column vector of positions 0..9, shape (10, 1).
position = torch.arange(10).unsqueeze(-1)
# Inverse frequencies 10000^(-2i / d_model), one per even index: shape (d_model / 2,).
div_term = torch.exp((-math.log(10000.0) / d_model) * torch.arange(0, d_model, 2).float())
# Show the frequencies.
div_term

tensor([1.0000e+00, 7.4989e-01, 5.6234e-01, 4.2170e-01, 3.1623e-01, 2.3714e-01,
        1.7783e-01, 1.3335e-01, 1.0000e-01, 7.4989e-02, 5.6234e-02, 4.2170e-02,
        3.1623e-02, 2.3714e-02, 1.7783e-02, 1.3335e-02, 1.0000e-02, 7.4989e-03,
        5.6234e-03, 4.2170e-03, 3.1623e-03, 2.3714e-03, 1.7783e-03, 1.3335e-03,
        1.0000e-03, 7.4989e-04, 5.6234e-04, 4.2170e-04, 3.1623e-04, 2.3714e-04,
        1.7783e-04, 1.3335e-04])

In [34]:
# Broadcast the (10, 1) `position` column against the (32,) `div_term` vector
# from the previous cell, giving a (10, 32) table of position * frequency values
# (the arguments that the positional encoding feeds to sin/cos).
position * div_term

tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00],
        [1.0000e+00, 7.4989e-01, 5.6234e-01, 4.2170e-01, 3.1623e-01, 2.3714e-01,
         1.7783e-01, 1.3335e-01, 1.0000e-01, 7.4989e-02, 5.6234e-02, 4.2170e-02,
         3.1623e-02, 2.3714e-02, 1.7783e-02, 1.3335e-02, 1.0000e-02, 7.4989e-03,
         5.6234e-03, 4.2170e-03, 3.1623e-03, 2.3714e-03, 1.7783e-03, 1.3335e-03,
         1.0000e-03, 7.4989e-04, 5.6234e-04, 4.2170e-04, 3.1623e-04, 2.3714e-04,
         1.7783e-04, 1.3335e-04],
        [2.0000e+00, 1.4998e+00, 1.1247e+00, 8.4339e-01, 6.3246e-01, 4.7427e-01,
         3.5566e-01, 2.6670e-01, 2.0000e-