In [2]:
import torch
import torch.nn as nn
import numpy as np

In [None]:
class InputEmbedding(nn.Module):
    """Token-id to vector lookup whose output is scaled by sqrt(d_model).

    Args:
        d_model: Size of each embedding vector.
        vocab_size: Number of distinct token ids the lookup table holds.
    """

    def __init__(self, d_model: int, vocab_size: int):
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        # One learnable row of length d_model per token id.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self, x):
        """Return the embedding rows selected by ``x``, multiplied by sqrt(d_model)."""
        scale = np.sqrt(self.d_model)
        looked_up = self.embedding(x)
        return looked_up * scale


In [None]:
# Small demo instance: 3-dimensional embeddings over a 5-token vocabulary.
inp = InputEmbedding(d_model=3, vocab_size=5)

In [None]:
# Token ids 1 and 4 to look up with the `inp` embedding module.
input_indices = torch.tensor([1, 4])

# Calling the module runs its forward pass on the index tensor.
output = inp(input_indices)
print("Output Embedding: \n", output)

Output Embedding: 
 tensor([[-2.1684, -0.9900,  0.2864],
        [ 0.1355, -0.8526,  2.6743]], grad_fn=<MulBackward0>)


In [None]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to an input and applies dropout.

    A table of shape ``(1, seq_len, d_model)`` is built once in ``__init__``:
    even columns hold ``sin(position * freq)`` and odd columns hold
    ``cos(position * freq)``, with ``freq = exp(-2i * ln(10000) / d_model)``.
    The table is stored as a buffer, so it follows the module across devices
    and is saved in the state dict, but is not a trainable parameter.

    Args:
        d_model: Size of the last dimension of the inputs to ``forward``.
        seq_len: Number of positions for which encodings are precomputed.
        dropout: Dropout probability applied after the encodings are added.
    """

    def __init__(self, d_model: int, seq_len: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)

        pe = torch.zeros(seq_len, d_model)
        # Column vector of positions: (seq_len, 1).
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
        # One frequency per even column index: (ceil(d_model / 2),).
        deno = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))

        # Broadcasts to (seq_len, ceil(d_model / 2)).
        angles = position * deno
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns,
        # so the last frequency is dropped for the cosine half.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Leading axis of size 1 lets the table broadcast over the batch.
        self.register_buffer("pe", pe.unsqueeze(0))

    def forward(self, x):
        """Add the first ``x.shape[1]`` position encodings to ``x``, then apply dropout.

        ``x`` is expected to be ``(batch, length, d_model)`` with
        ``length <= seq_len``; a longer input fails on the broadcasted add.
        """
        x = x + self.pe[:, : x.shape[1], :]
        return self.dropout(x)


In [None]:
# Scratch check: float tensor holding the indices 0 through 4.
position = torch.arange(5, dtype=torch.float)
position

tensor([0., 1., 2., 3., 4.])

In [None]:
# unsqueeze(1) inserts a new axis at index 1, giving a column-shaped tensor (see the output below).
position.unsqueeze(1)

tensor([[0.],
        [1.],
        [2.],
        [3.],
        [4.]])

In [None]:
class LayerNormalization(nn.Module):
    """Normalizes the last dimension of the input, then applies a scalar scale and shift.

    Each vector along the last axis is centred on its mean and divided by
    ``std + eps``. The result is multiplied by the learnable scalar ``alpha``
    (initialised to 1) and shifted by the learnable scalar ``bias``
    (initialised to 0).

    Args:
        eps: Small constant added to the standard deviation to avoid
            division by zero.
    """

    def __init__(self, eps: float = 10**-6) -> None:
        # Must be __init__: without it nn.Module is not set up and the
        # nn.Parameter assignments below cannot be registered.
        super().__init__()
        self.eps = eps
        self.alpha = nn.Parameter(torch.ones(1))
        self.bias = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        """Return ``alpha * (x - mean) / (std + eps) + bias`` over the last axis."""
        # keepdim=True keeps a size-1 last axis so the statistics broadcast against x.
        mean = x.mean(dim=-1, keepdim=True)
        std = x.std(dim=-1, keepdim=True)
        return self.alpha * (x - mean) / (std + self.eps) + self.bias



In [3]:
class FeedForwardBlock(nn.Module):
    """Two-layer feed-forward network: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: Input and output feature size.
        d_ff: Hidden feature size between the two linear layers.
        dropout: Dropout probability applied after the ReLU.
    """

    def __init__(self, d_model: int, d_ff: int, dropout: float) -> None:
        # Must be __init__: nn.Module has to be initialised before submodules
        # can be assigned as attributes.
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """Map ``(..., d_model)`` to ``(..., d_ff)`` and back to ``(..., d_model)``."""
        hidden = torch.relu(self.linear_1(x))
        return self.linear_2(self.dropout(hidden))

In [22]:
class MultiHeadAttentionBlock(nn.Module):
    """Multi-head scaled dot-product attention.

    The inputs are projected by ``w_q``, ``w_k`` and ``w_v``, split into ``h``
    heads of size ``d_k = d_model // h``, attended per head, merged back to
    ``d_model`` features and projected by ``w_o``.

    Args:
        d_model: Feature size of the inputs and of the output.
        h: Number of attention heads; must divide ``d_model``.
        dropout: Dropout probability applied to the attention weights.

    Raises:
        AssertionError: If ``d_model`` is not divisible by ``h``.
    """

    def __init__(self, d_model: int, h: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.h = h
        assert d_model % h == 0, "d_model must be divisible by h"

        self.d_k = d_model // h
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)

        self.w_o = nn.Linear(h * self.d_k, d_model)
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(query, key, values, mask, dropout: nn.Dropout):
        """Compute scaled dot-product attention.

        Args:
            query, key, values: Tensors whose last two axes are
                ``(length, d_k)``; leading axes must be broadcast-compatible.
            mask: Optional tensor broadcastable to the score matrix. Positions
                where ``mask == 0`` receive a score of -1e9 before the softmax.
            dropout: Optional dropout module applied to the attention weights.

        Returns:
            Tuple ``(output, attention_weights)``, where ``output`` is
            ``attention_weights @ values``.
        """
        d_k = query.shape[-1]

        attention_scores = (query @ key.transpose(-2, -1)) / np.sqrt(d_k)
        if mask is not None:
            # Out-of-place fill: same values as the in-place variant, but it
            # never modifies a tensor that autograd may still need.
            attention_scores = attention_scores.masked_fill(mask == 0, -1e9)

        attention_scores = attention_scores.softmax(dim=-1)

        if dropout is not None:
            attention_scores = dropout(attention_scores)

        return (attention_scores @ values), attention_scores

    def forward(self, q, k, v, mask):
        """Run multi-head attention.

        Args:
            q, k, v: Tensors of shape ``(batch, length, d_model)``.
            mask: Optional mask forwarded to :meth:`attention` (``None`` for no mask).

        Returns:
            Tensor of shape ``(batch, query_length, d_model)``. The attention
            weights of the last call are kept in ``self.attention_scores``.
        """
        query = self.w_q(q)
        key = self.w_k(k)
        values = self.w_v(v)

        # (batch, length, d_model) -> (batch, length, h, d_k) -> (batch, h, length, d_k)
        query = query.view(query.shape[0], query.shape[1], self.h, self.d_k).transpose(1, 2)
        key = key.view(key.shape[0], key.shape[1], self.h, self.d_k).transpose(1, 2)
        values = values.view(values.shape[0], values.shape[1], self.h, self.d_k).transpose(1, 2)

        # Call the static attention helper (not the class constructor) and pass the mask on.
        x, self.attention_scores = MultiHeadAttentionBlock.attention(
            query, key, values, mask, self.dropout
        )

        # (batch, h, length, d_k) -> (batch, length, h * d_k). x.shape[1] is the
        # head count at this point, so the length axis is inferred with -1.
        x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.h * self.d_k)

        return self.w_o(x)



In [None]:
class ResidualConnection(nn.Module):
    """Skip connection around a sublayer: ``x + dropout(sublayer(norm(x)))``.

    Args:
        dropout: Dropout probability applied to the sublayer output before it
            is added back to the input.
    """

    def __init__(self, dropout: float) -> None:
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.norm = LayerNormalization()

    def forward(self, x, sublayer):
        """Normalize ``x``, apply ``sublayer``, apply dropout, and add the result to ``x``.

        Args:
            x: Input tensor.
            sublayer: Callable taking the normalized tensor and returning a
                tensor that can be added to ``x``.
        """
        return x + self.dropout(sublayer(self.norm(x)))


In [23]:
class EncoderBlock(nn.Module):
    """One encoder layer: self-attention followed by a feed-forward block.

    Each of the two sublayers is invoked through its own ResidualConnection.

    Args:
        self_attention_block: Attention module called as ``(x, x, x, src_mask)``.
        feed_forward_block: Module applied in the second residual connection.
        dropout: Dropout probability for this block's own dropout layer and
            for both residual connections.
    """

    def __init__(self, self_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block
        self.dropout = nn.Dropout(dropout)
        # Index 0 wraps self-attention, index 1 wraps the feed-forward block.
        self.residual_connections = nn.ModuleList(
            [ResidualConnection(dropout), ResidualConnection(dropout)]
        )

    def forward(self, x, src_mask):
        """Pass ``x`` through the attention sublayer, then the feed-forward sublayer."""
        attention_residual, feed_forward_residual = self.residual_connections

        def self_attend(inner):
            # Query, key and value all come from the same tensor.
            return self.self_attention_block(inner, inner, inner, src_mask)

        attended = attention_residual(x, self_attend)
        return feed_forward_residual(attended, self.feed_forward_block)
