In [1]:
import numpy as np

import torch
import torch.nn as nn
import math
from torch.autograd import Variable

import torch.nn.functional as F

# Embedding 

In [54]:
class Embedder(nn.Module):
    """Lookup table that maps integer token ids to dense embedding vectors."""

    def __init__(self, vocab_size:int, emb_dim:int):
        """
        input:
            vocab_size: number of rows in the embedding table
            emb_dim: width of every embedding vector (== d_model)
        """
        super().__init__()
        self.emb_dim = emb_dim
        self.embed = nn.Embedding(vocab_size, emb_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        input:
            x: size
                bs * seq_len
               (integer token ids, each one a row index of the table)
        output:
            size:
                bs * seq_len * emb_dim

        Note: emb_dim == d_model
        """
        # `torch.Tensor` is the tensor type; `torch.tensor` is only the
        # factory function and must not be used as an annotation.
        return self.embed(x)

# Why use `register_buffer`

- Ans [link](https://discuss.pytorch.org/t/what-is-the-difference-between-register-buffer-and-register-parameter-of-nn-module/32723/11)

An example where I find this distinction difficult is in 
the context of fixed positional encodings in the Transformer 
model. Typically I see implementations where the fixed positional 
encodings are registered as buffers but I’d consider these tensors 
as non-learnable parameters (that should show up in the list of 
model parameters), especially when comparing between methods 
that don’t rely on such injection of fixed tensors.

So in general:
- buffers = `fixed tensors / non-learnable parameters / stuff that does not require gradient`
- parameters = `learnable parameters, requires gradient`

![image](https://discuss.pytorch.org/user_avatar/discuss.pytorch.org/ptrblck/90/1823_2.png)
Piotr Bialecki

If you have parameters in your model, which should be saved and restored in the state_dict, but not trained by the optimizer, you should register them as buffers. Buffers won’t be returned in model.parameters(), so that the optimizer won’t have a chance to update them.

Both approaches work the same regarding training etc.
There are some differences in the function calls however. Using register_parameter you have to pass the name as a string, which can make the creation of a range of parameters convenient. Besides that I think it’s just coding style which one you prefer.

If your `self.some_params` are `nn.Parameter` objects, then you don’t have to worry about this. If they’re tensors, then they won’t be in the `state_dict` (unless registered as buffer).

> simple `torch.tensor` will not be available under `state_dict`

one reason to register the tensor as a buffer is to be able to serialize the model and restore all internal states.
Another one is that all buffers and parameters will be pushed to the device, if called on the parent model:

```python
class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        self.my_tensor = torch.randn(1)
        self.register_buffer('my_buffer', torch.randn(1))
        self.my_param = nn.Parameter(torch.randn(1))
        
    def forward(self, x):
            return x

model = MyModel()
print(model.my_tensor)
> tensor([0.9329])
print(model.state_dict())
> OrderedDict([('my_param', tensor([-0.2471])), ('my_buffer', tensor([1.2112]))])

model.cuda()
print(model.my_tensor)
> tensor([0.9329])
print(model.state_dict())
> OrderedDict([('my_param', tensor([-0.2471], device='cuda:0')), ('my_buffer', tensor([1.2112], device='cuda:0'))])
```

As you can see, model.my_tensor is still on the CPU, where it was created, while all parameters and buffers were pushed to the GPU after calling `model.cuda()`.

<center>
    <img src="https://miro.medium.com/max/566/1*B-VR6R5vJl3Y7jbMNf5Fpw.png" width="400">
</center>

# Make embedding relatively larger by scaling the values. WHY?

The reason we increase the embedding values before 
addition is to make the positional encoding relatively 
smaller. This means the original meaning in the embedding 
vector won’t be lost when we add them together

```python
x = x*math.sqrt(self.emb_dim)
```

In [68]:
class PositionalEmbedding(nn.Module):
    """Scale word embeddings and add fixed sinusoidal positional encodings.

    The encoding table follows "Attention Is All You Need":

        PE[pos, 2j]   = sin(pos / 10000 ** (2j / emb_dim))
        PE[pos, 2j+1] = cos(pos / 10000 ** (2j / emb_dim))

    It is stored as the non-trainable buffer ``pe`` of size
    1 * max_seq_len * emb_dim, so it is saved in the ``state_dict`` and
    follows the module through ``.to()`` / ``.cuda()``.
    """

    def __init__(self, emb_dim:int, max_seq_len:int = 200, dropout_pct:float = 0.1):
        """
        input:
            emb_dim: embedding width (== d_model); odd widths are supported
            max_seq_len: longest sequence the table can serve
            dropout_pct: dropout probability applied to the summed output
        """
        super().__init__()

        self.emb_dim = emb_dim
        self.dropout = nn.Dropout(dropout_pct)

        # word positions as a column vector: max_seq_len * 1
        position = torch.arange(max_seq_len, dtype=torch.float32).unsqueeze(1)

        # even channel indices 0, 2, 4, ... ; each (sin, cos) pair shares one
        # frequency 1 / 10000 ** (even_idx / emb_dim), computed in log-space
        even_idx = torch.arange(0, emb_dim, 2, dtype=torch.float32)
        inv_freq = torch.exp(even_idx * (-math.log(10000.0) / emb_dim))

        # max_seq_len * ceil(emb_dim / 2)
        angles = position * inv_freq

        # create constant 'pe' matrix with size max_seq_len * emb_dim
        pe = torch.zeros(max_seq_len, emb_dim)
        pe[:, 0::2] = torch.sin(angles)
        # for an odd emb_dim there is one cosine column fewer than sine columns
        pe[:, 1::2] = torch.cos(angles[:, : emb_dim // 2])

        # Adding one extra dimension at the beginning to
        # make the size
        #              1 * max_seq_len * emb_dim
        # later while adding to x, 1st dimension of pe will
        # be broadcast to match the batch_size of x
        pe = pe.unsqueeze(0)

        self.register_buffer('pe', pe)


    def forward(self, x):
        """
        input:
            x: size
                bs * seq_len * d_model

        output:
            size: bs * seq_len * d_model
        """

        # scale values so the positional signal stays relatively small
        x = x*math.sqrt(self.emb_dim)

        seq_len = x.size(1)

        # truncate max_seq_len to the actual seq_len before adding to x.
        # `pe` is a buffer: it never requires grad and already lives on the
        # module's device, so no Variable wrapper or manual .cuda() is needed.
        x = x + self.pe[:, :seq_len, :]
        return self.dropout(x)

## Test module

In [69]:
VOCAB_SIZE = 20000
EMB_DIM = 512

In [70]:
e = Embedder(VOCAB_SIZE, EMB_DIM)

In [79]:
bs = 3
seq_len = 6
src = torch.randint(3, 100, (bs,seq_len))

In [72]:
emb = e(src)

In [73]:
emb.size()

torch.Size([3, 6, 512])

In [74]:
p = PositionalEmbedding(emb_dim=EMB_DIM)

In [75]:
p.pe.size()

torch.Size([1, 200, 512])

In [76]:
p_emb = p(emb)

In [77]:
p_emb.size()

torch.Size([3, 6, 512])

# Batch Normalization

Normalisation is highly important in deep neural networks. It prevents the range of values in the layers changing too much, meaning the model trains faster and has better ability to generalise.

<center>
    <img src="https://miro.medium.com/max/511/1*4w3sQ14caDRkrQsAeK5Flw.png" width="400">
</center>

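We will be normalising our results between each layer in the encoder/decoder, so before building our model let’s define that function. Note that the `Norm` module below computes its mean and standard deviation over the last (feature) dimension of every position, i.e. it is *layer* normalisation (as used in the Transformer) rather than normalisation over the batch: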

- [blog](https://kharshit.github.io/blog/2018/12/28/why-batch-normalization)

<center>
    <img src="https://kharshit.github.io/img/batch_normalization.png" width="400">
</center>

In [78]:
class Norm(nn.Module):
    """Normalise the last dimension of the input, then scale and shift it.

    Each vector along the last axis is centred by its mean and divided by
    (its standard deviation + eps); the learnable per-feature `alpha`
    (initialised to ones) and `bias` (initialised to zeros) then calibrate
    the result.
    """

    def __init__(self, d_model, eps = 1e-6):
        super().__init__()

        self.size = d_model
        self.eps = eps

        # learnable gain and offset, one value per feature
        self.alpha = nn.Parameter(torch.ones(d_model))
        self.bias = nn.Parameter(torch.zeros(d_model))

    def forward(self, x):
        """Return the normalised tensor; same size as `x`."""
        centered = x - x.mean(dim=-1, keepdim=True)
        # eps is added to the standard deviation to avoid dividing by zero
        spread = x.std(dim=-1, keepdim=True) + self.eps

        # scale and shift
        return self.alpha * (centered / spread) + self.bias

## Test module

In [82]:
d_model = 512
bs = 3
seq_len = 6
x = torch.rand(size=(bs,seq_len, d_model))
x.size()

torch.Size([3, 6, 512])

In [84]:
n = Norm(d_model=d_model)

In [86]:
n_out = n(x)
n_out.size()

torch.Size([3, 6, 512])

# Attention

<center>
    <img src="https://miro.medium.com/max/445/1*evdACdTOBT5j1g1nXialBg.png" width="400">
</center>


<center>
    <img src="https://miro.medium.com/max/140/1*15E9qKg9bKnWdSRWCyY2iA.png" width="100">
</center>

- Initially we must multiply $Q$ by the transpose of $K$. This is then `scaled` by dividing the output by the square root of $d_k$.
- A step that’s not shown in the equation is the `masking operation`. Before we perform `Softmax`, we apply our mask and hence reduce values where the input is padding (or in the decoder, also where the input is ahead of the current word).

Another step not shown is `dropout`, which we will apply after `Softmax`.

Finally, the last step is doing a `dot` product between the result so far and $V$.

In [87]:
def attention(q, k, v, d_k, mask=None, dropout=None):
    """Scaled dot-product attention.

    input:
        q, k, v: query / key / value tensors; q is multiplied with k
                 transposed over its last two dimensions
        d_k: the scores are divided by sqrt(d_k)
        mask: optional; a dimension is inserted at index 1 and every
              position where the mask equals 0 gets the score -1e9
        dropout: optional callable applied to the softmax weights
    output:
        the softmax weights multiplied with v
    """
    logits = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)

    if mask is not None:
        # extra dimension at index 1 lets the mask broadcast against logits
        logits = logits.masked_fill(mask.unsqueeze(1) == 0, -1e9)

    weights = F.softmax(logits, dim=-1)
    if dropout is not None:
        weights = dropout(weights)

    return torch.matmul(weights, v)

# Multi-Headed Attention

Once we have our embedded values (with positional encodings) and our masks, we can start building the layers of our model.

Here is an overview of the multi-headed attention layer:


<center>
    <img src="https://miro.medium.com/max/523/1*1tsRtfaY9z6HxmERYhw8XQ.png" width="300">
</center>

- $V$, $K$ and $Q$ stand for `value`, `key` and `query`. These are terms used in attention functions

- In the case of the **Encoder**, $V, K$ and $Q$ will simply be **identical copies** of the `emb_vector + pos_encoding`. 
- They will have the dimensions `Batch_size * seq_len * d_model`

<center>
    <img src="images/tensor_dimension.png" width="400">
</center>

- In multi-head attention we `split the embedding vector` into `N heads`. $d_{model}^{new} = \frac{d_{model}}{N}$. so they will then have the dimensions `batch_size * N * seq_len * (d_model / N)`.
- This final dimension `(d_model / N )` we will refer to as `d_k`

- Drawing tool [Excalidraw](https://excalidraw.com/)

In [88]:
class MultiHeadAttention(nn.Module):
    """Multi-headed attention: project q/k/v, attend per head, merge heads.

    `d_model` is split evenly across `heads`, so every head works on
    vectors of width d_k = d_model / heads.
    """

    def __init__(self, heads, d_model, dropout = 0.1):
        """
        input:
            heads: number of attention heads (N)
            d_model: model width; must be divisible by `heads`
            dropout: dropout probability applied to the attention weights

        raises:
            ValueError: if `heads` is not positive or does not divide
                        `d_model` evenly
        """
        super().__init__()

        # Without this check `d_model // heads` truncates silently and the
        # `.view(bs, -1, h, d_k)` calls in forward either fail with an obscure
        # shape error or reshape into a wrong sequence length.
        if heads <= 0 or d_model % heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by heads ({heads})"
            )

        self.d_model = d_model
        self.d_k = d_model // heads
        self.h = heads

        self.q_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)

        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        """
        input:
            q, k, v: size bs * seq_len * d_model
            mask: optional mask handed through to `attention`
        output:
            size: bs * seq_len * d_model (seq_len taken from q)
        """

        bs = q.size(0)

        # perform linear operation and split into N heads
        k = self.k_linear(k).view(bs, -1, self.h, self.d_k)
        q = self.q_linear(q).view(bs, -1, self.h, self.d_k)
        v = self.v_linear(v).view(bs, -1, self.h, self.d_k)

        # transpose to get dimensions bs * N * sl * d_k
        k = k.transpose(1,2)
        q = q.transpose(1,2)
        v = v.transpose(1,2)

        # calculate attention with the module-level `attention` function
        scores = attention(q, k, v, self.d_k, mask, self.dropout)

        # concatenate heads (bs * sl * d_model) and put through final linear layer
        concat = scores.transpose(1,2).contiguous()\
        .view(bs, -1, self.d_model)
        output = self.out(concat)

        return output

## Test module

In [89]:
d_model = 512
heads = 4
bs = 3
seq_len = 6

In [90]:
mha = MultiHeadAttention(heads=heads, d_model=d_model)

In [91]:
q = torch.rand(size=(bs,seq_len, d_model))
k = torch.rand(size=(bs,seq_len, d_model))
v = torch.rand(size=(bs,seq_len, d_model))

In [92]:
output = mha(q,k,v)
output.size() # batch_size x sequence_length x model_dimension 

torch.Size([3, 6, 512])

## Feedforward Network

In [93]:
class FeedForward(nn.Module):
    """Position-wise feed-forward block: Linear -> ReLU -> Dropout -> Linear.

    Expands the last dimension from d_model to d_ff and projects it back,
    so the output has the same size as the input.
    """

    def __init__(self, d_model:int, d_ff:int = 2048, dropout_pct:float = 0.1):
        super().__init__()

        # expansion followed by projection back to the model width
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout_pct)

    def forward(self, x):
        """Apply the block to `x` (last dimension d_model)."""
        hidden = F.relu(self.linear1(x))
        # dropout is applied after the non-linearity
        hidden = self.dropout(hidden)
        return self.linear2(hidden)

## Is dropout applied before or after the non-linear activation function?

Typically, dropout is applied after the non-linear activation function (a). However, when using rectified linear units (ReLUs), it might make sense to apply dropout before the non-linear activation (b) for reasons of computational efficiency depending on the particular code implementation.

- (a): Fully connected, `linear activation` -> `ReLU` -> `Dropout` -> … [traditional use]
- (b): Fully connected, `linear activation` -> `Dropout` -> `ReLU` -> …

**Reference:**
- [Sebastian Raschka](https://sebastianraschka.com/faq/docs/dropout-activation.html)

## Test module

In [94]:
d_model = 512
bs = 3
seq_len = 6

In [95]:
x = torch.rand(size=(bs,seq_len, d_model))

In [96]:
ff = FeedForward(d_model=d_model)

In [97]:
output = ff(x)

In [98]:
output.size()

torch.Size([3, 6, 512])


<center>
    <img src="images/attention_layer.jpg" width="700">
</center>

# Basic Encoder Layer

In [99]:
class EncoderLayer_basic(nn.Module):
    """One encoder layer: self-attention and feed-forward sub-layers.

    Each sub-layer output goes through dropout, is added to the sub-layer
    input (residual connection) and the sum is normalised (post-norm order).
    """

    def __init__(self, d_model, heads, dropout_pct = 0.1):
        super().__init__()

        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.attn = MultiHeadAttention(heads, d_model, dropout=dropout_pct)

        self.ff = FeedForward(d_model, dropout_pct = dropout_pct)
        self.dropout_1 = nn.Dropout(dropout_pct)
        self.dropout_2 = nn.Dropout(dropout_pct)

    def forward(self, x, mask):
        """Run `x` through the layer; `mask` is handed to the attention module."""
        # self-attention: query, key and value are all x
        attn_out = self.attn(x, x, x, mask)
        x = self.norm_1(x + self.dropout_1(attn_out))

        # position-wise feed-forward on the normalised attention result
        ff_out = self.ff(x)
        return self.norm_2(x + self.dropout_2(ff_out))

**Note:** The above implementation creates the `forward` function based on the above encoder image. This is slightly different from the blog implementation [link](https://github.com/SamLynnEvans/Transformer/blob/master/Layers.py), where the order of the `add` and `norm` operations is slightly different (the input is normalised *before* each sub-layer instead of after the residual addition):

```python
class EncoderLayer(nn.Module):
    def __init__(self, d_model, heads, dropout=0.1):
        pass
        
    def forward(self, x, mask):
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn(x2,x2,x2,mask))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.ff(x2))
        return x
```

## Test Module

In [100]:
# Smoke test for EncoderLayer_basic: push one random batch through a
# single layer without a mask and inspect the size of the result.
d_model = 512
heads = 4
bs = 3
seq_len = 6

el = EncoderLayer_basic(d_model, heads)

# random input of size bs * seq_len * d_model
x = torch.rand(size=(bs,seq_len, d_model))

output = el(x,mask=None)
# last expression of the cell: displays the output size
output.size()

torch.Size([3, 6, 512])

# Basic Decoder Layer 

In [101]:
class DecoderLayer_basic(nn.Module):
    """One decoder layer: masked self-attention, encoder attention, feed-forward.

    Each sub-layer output goes through dropout, is added to the sub-layer
    input (residual connection) and the sum is normalised (post-norm order).
    `dropout_pct` is used for every dropout in the layer, including the ones
    inside the attention and feed-forward sub-modules.
    """

    def __init__(self, d_model, heads, dropout_pct = 0.1):
        super().__init__()

        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.norm_3 = Norm(d_model)

        # forward dropout_pct so the sub-modules do not silently fall back to
        # their own defaults (consistent with EncoderLayer_basic)
        self.attn_decoder = MultiHeadAttention(heads, d_model, dropout=dropout_pct)
        self.attn_encoder = MultiHeadAttention(heads, d_model, dropout=dropout_pct)

        self.ff = FeedForward(d_model, dropout_pct=dropout_pct)

        self.dropout_1 = nn.Dropout(dropout_pct)
        self.dropout_2 = nn.Dropout(dropout_pct)
        self.dropout_3 = nn.Dropout(dropout_pct)

    def forward(self, x, encoder_output, src_mask, trg_mask):

        """
        x: this x comes from the target language
        encoder_output: used as key and value of the second attention
        src_mask: mask handed to the attention over encoder_output
        trg_mask: mask handed to the self-attention over x
        """

        # self-attention over the target sequence
        q = k = v = x
        x = x + self.dropout_1(self.attn_decoder(q,k,v,trg_mask))
        x = self.norm_1(x)

        # attention over the encoder output: query from the decoder,
        # key and value from the encoder
        k_enc = v_enc = encoder_output
        x = x + self.dropout_2(self.attn_encoder(x, k_enc, v_enc, src_mask))
        x = self.norm_2(x)

        # position-wise feed-forward
        x = x + self.dropout_3(self.ff(x))
        x = self.norm_3(x)

        return x

## Test Module

In [102]:
d_model = 512
heads = 4
bs = 3
seq_len = 6

In [103]:
el = EncoderLayer_basic(d_model, heads)

In [104]:
x_src = torch.rand(size=(bs,seq_len, d_model))
x_trg = torch.rand(size=(bs,seq_len, d_model))

In [105]:
enc_output = el(x_src,mask=None)
enc_output.size()

torch.Size([3, 6, 512])

In [106]:
# Smoke test for DecoderLayer_basic: feed the random target batch together
# with the encoder-layer output computed in the cell above, without masks.
dl = DecoderLayer_basic(d_model, heads)

dec_output = dl(x_trg, enc_output, src_mask=None, trg_mask=None)

# last expression of the cell: displays the output size
dec_output.size()

torch.Size([3, 6, 512])

# Add-Norm Module

The above implementation of `EncoderLayer_basic()` and `DecoderLayer_basic()` is correct. However, we can create an `Add-Norm` module to make the code more modular and get rid of the repetitive code blocks.

In [107]:
class AddNorm(nn.Module):
    """Residual "add & norm" step shared by the encoder and decoder layers.

    Applies dropout to a sub-layer output, adds it to the sub-layer input
    and normalises the sum with `Norm`.
    """

    def __init__(self, d_model, dropout_pct = 0.1):
        super().__init__()
        self.norm = Norm(d_model)
        self.dropout = nn.Dropout(dropout_pct)

    def forward(self, x, attn_output):
        """
        x: input of the sub-layer (the residual branch)
        attn_output: output of the sub-layer; the layers in this file pass
                     both attention and feed-forward results here
        """
        # add, then normalize
        residual_sum = x + self.dropout(attn_output)
        return self.norm(residual_sum)

# Improved Encoder Layer with AddNorm module

In [108]:
class EncoderLayer(nn.Module):
    """Encoder layer built from `AddNorm` blocks.

    Self-attention followed by a feed-forward network; each sub-layer
    result is merged back into the stream by its own `AddNorm`.
    """

    def __init__(self, d_model, heads, dropout_pct = 0.1):
        super().__init__()

        self.attn = MultiHeadAttention(heads, d_model, dropout=dropout_pct)
        self.ff = FeedForward(d_model, dropout_pct = dropout_pct)
        self.add_norm_1 = AddNorm(d_model, dropout_pct)
        self.add_norm_2 = AddNorm(d_model, dropout_pct)

    def forward(self, x, mask):
        """Run `x` through the layer; `mask` is handed to the attention module."""
        # self-attention: query, key and value are all x
        attended = self.attn(x, x, x, mask)
        x = self.add_norm_1(x, attended)

        # feed-forward on the normalised attention result
        transformed = self.ff(x)
        return self.add_norm_2(x, transformed)

## Test Module

In [109]:
# Smoke test for the AddNorm-based EncoderLayer: push one random batch
# through a single layer without a mask and inspect the size of the result.
d_model = 512
heads = 4
bs = 3
seq_len = 6

el_1 = EncoderLayer(d_model, heads)

# random input of size bs * seq_len * d_model
x = torch.rand(size=(bs,seq_len, d_model))

output = el_1(x,mask=None)
# last expression of the cell: displays the output size
output.size()

torch.Size([3, 6, 512])

In [110]:
class DecoderLayer(nn.Module):
    """Decoder layer built from `AddNorm` blocks.

    Masked self-attention, attention over the encoder output and a
    feed-forward network; each sub-layer result is merged back into the
    stream by its own `AddNorm`. `dropout_pct` is used for every dropout in
    the layer, including the ones inside the attention and feed-forward
    sub-modules.
    """

    def __init__(self, d_model, heads, dropout_pct = 0.1):
        super().__init__()

        # forward dropout_pct so the sub-modules do not silently fall back to
        # their own defaults (consistent with EncoderLayer)
        self.attn_decoder = MultiHeadAttention(heads, d_model, dropout=dropout_pct)
        self.attn_encoder = MultiHeadAttention(heads, d_model, dropout=dropout_pct)

        self.ff = FeedForward(d_model, dropout_pct=dropout_pct)

        self.add_norm_1 = AddNorm(d_model, dropout_pct)
        self.add_norm_2 = AddNorm(d_model, dropout_pct)
        self.add_norm_3 = AddNorm(d_model, dropout_pct)

    def forward(self, x, encoder_output, src_mask, trg_mask):

        """
        x: this x comes from the target language
        encoder_output: used as key and value of the second attention
        src_mask: mask handed to the attention over encoder_output
        trg_mask: mask handed to the self-attention over x
        """

        # self-attention over the target sequence
        q = k = v = x
        x = self.add_norm_1(x ,self.attn_decoder(q,k,v,trg_mask))

        # attention over the encoder output: query from the decoder,
        # key and value from the encoder
        k_enc = v_enc = encoder_output
        x = self.add_norm_2(x, self.attn_encoder(x, k_enc, v_enc, src_mask))

        # position-wise feed-forward
        x = self.add_norm_3(x, self.ff(x))

        return x

## Test Module

In [111]:
# Smoke test for the AddNorm-based layers: run a random source batch through
# one EncoderLayer, then feed its output plus a random target batch to one
# DecoderLayer (no masks) and inspect the size of the result.
d_model = 512
heads = 4
bs = 3
seq_len = 6

el = EncoderLayer(d_model, heads)
x_src = torch.rand(size=(bs,seq_len, d_model))
enc_output = el(x_src,mask=None)


dl = DecoderLayer(d_model, heads)
x_trg = torch.rand(size=(bs,seq_len, d_model))
dec_output = dl(x_trg, enc_output, src_mask=None, trg_mask=None)

# last expression of the cell: displays the output size
dec_output.size()

torch.Size([3, 6, 512])

# Replication of EncoderLayer and DecoderLayer

One last Variable: If you look at the diagram closely you can see a `Nx` next to the encoder and decoder architectures. In reality, the encoder and decoder in the diagram above represent one layer of an encoder and one of the decoder. `N` is the variable for the number of layers there will be. Eg. if `N=6`, the data goes through six encoder layers (with the architecture seen above), then these outputs are passed to the decoder which also consists of six repeating decoder layers.

In [112]:
import copy

In [113]:
def get_clones(module, N):
    """Return an `nn.ModuleList` holding N independent deep copies of `module`.

    The copies do not share parameters with `module` or with each other.
    """
    clones = nn.ModuleList()
    for _ in range(N):
        clones.append(copy.deepcopy(module))
    return clones

In [114]:
class Encoder(nn.Module):
    """Transformer encoder: embedding, positional encoding, stacked layers."""

    def __init__(self, vocab_size, d_model, n_encoder_layer, heads, dropout):
        """
        vocab_size: size of the source vocabulary
        d_model: model / embedding width
        n_encoder_layer: number of stacked EncoderLayer copies
        heads: number of attention heads per layer
        dropout: dropout probability used by the positional embedding and
                 by every encoder layer
        """
        super().__init__()
        self.n_encoder_layer = n_encoder_layer
        self.embed = Embedder(vocab_size, d_model)
        # forward `dropout` so the positional embedding does not silently
        # keep its own default rate
        self.pe = PositionalEmbedding(d_model, dropout_pct=dropout)
        enc_layer = EncoderLayer(d_model, heads, dropout)
        # stack enc_layers (independent deep copies of enc_layer)
        self.enc_layers = get_clones(enc_layer, n_encoder_layer)

    def forward(self, src_token, mask):
        """
        src_token: source token ids, size bs * seq_len
        mask: mask handed to every encoder layer
        returns: size bs * seq_len * d_model
        """
        x = self.embed(src_token)
        x = self.pe(x)

        for layer in self.enc_layers:
            x = layer(x, mask)

        # the EncoderLayer return normalized x
        # so no need to pass through Norm() again

        return x

In [121]:
# Hyper-parameters for the full-encoder smoke test; the cells below reuse
# them for the decoder and the complete Transformer as well.
d_model = 512
heads = 4
bs = 3
seq_len = 6
vocab_size = 2000

# six stacked encoder layers
enc = Encoder(vocab_size, d_model, 
              n_encoder_layer=6, heads=heads, dropout=0.1)

In [122]:
x_src = torch.randint(3, vocab_size, (bs,seq_len))
x_src.size()

torch.Size([3, 6])

In [123]:
enc_output = enc(x_src,mask=None)

In [124]:
enc_output.size()

torch.Size([3, 6, 512])

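### Why does the encoder output have this shape?

Every one of the `bs * seq_len` input tokens is embedded into a `d_model`-sized vector, and each encoder layer maps `bs * seq_len * d_model` to the same size, so the encoder output is `bs * seq_len * d_model`.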

In [125]:
class Decoder(nn.Module):
    """Transformer decoder: embedding, positional encoding, stacked layers."""

    def __init__(self, vocab_size, d_model, n_decoder_layer, heads, dropout):
        """
        vocab_size: size of the target vocabulary
        d_model: model / embedding width
        n_decoder_layer: number of stacked DecoderLayer copies
        heads: number of attention heads per layer
        dropout: dropout probability used by the positional embedding and
                 handed to every decoder layer
        """
        super().__init__()
        self.n_decoder_layer = n_decoder_layer
        self.embed = Embedder(vocab_size, d_model)
        # forward `dropout` so the positional embedding does not silently
        # keep its own default rate
        self.pe = PositionalEmbedding(d_model, dropout_pct=dropout)
        dec_layer = DecoderLayer(d_model, heads, dropout_pct=dropout)
        # stack dec_layers (independent deep copies of dec_layer)
        self.dec_layers = get_clones(dec_layer, n_decoder_layer)
        # NOTE: not used by forward(); kept so the module's parameters and
        # state_dict keys stay the same.
        self.norm = Norm(d_model)

    def forward(self,trg, enc_output, src_mask, trg_mask):
        """
        trg: target token ids, size bs * seq_len
        enc_output: encoder output handed to every decoder layer
        src_mask, trg_mask: masks handed to every decoder layer
        returns: size bs * seq_len * d_model
        """
        x = self.pe(self.embed(trg))
        for layer in self.dec_layers:
            x = layer(x, enc_output, src_mask, trg_mask)

        # the DecoderLayer returns normalized x
        # so no need to pass through Norm() again

        return x

In [126]:
dec = Decoder(vocab_size, d_model, 
              n_decoder_layer=6, heads=heads, dropout=0.1)

In [128]:
x_trg = torch.randint(low=3, high=vocab_size, size= (bs,seq_len))

In [129]:
dec_output = dec(x_trg, enc_output, src_mask=None, trg_mask=None)

In [130]:
dec_output.size()

torch.Size([3, 6, 512])

In [131]:
class Transformer(nn.Module):
    """Full encoder-decoder model with a final projection onto the target vocabulary.

    The output is left as unnormalised scores of size
    bs * seq_len * trg_vocab; no softmax is applied here.
    """

    def __init__(self, src_vocab, trg_vocab, d_model, n_layers, heads, dropout = 0.1):
        super().__init__()
        self.encoder = Encoder(src_vocab, d_model, n_layers, heads, dropout)
        self.decoder = Decoder(trg_vocab, d_model, n_layers, heads, dropout)
        # maps every d_model-sized decoder vector to trg_vocab scores
        self.linear = nn.Linear(d_model, trg_vocab)

    def forward(self, src, trg, src_mask, trg_mask):
        """
        src, trg: source / target token ids
        src_mask: handed to the encoder and to the decoder
        trg_mask: handed to the decoder
        """
        memory = self.encoder(src, src_mask)
        decoded = self.decoder(trg, memory, src_mask, trg_mask)
        return self.linear(decoded)
    
# we don't perform softmax on the output as this will be handled 
# automatically by our loss function

In [132]:
tfmr = Transformer(vocab_size, vocab_size, d_model, 
                   n_layers=6, heads=heads, dropout=0.1)

In [133]:
x_src = torch.randint(low=3, high=vocab_size, size= (bs,seq_len))
x_trg = torch.randint(low=3, high=vocab_size, size= (bs,seq_len))

In [134]:
output = tfmr(x_src, x_trg, src_mask=None, trg_mask=None)

In [135]:
output.size()

torch.Size([3, 6, 2000])