<a href="https://colab.research.google.com/github/lorrespz/Transformers-Language-Models-Pytorch-implementation/blob/main/Transformers_Encoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Encoder architecture from scratch

This code is from Lazy Programmer's Transformers course

https://www.udemy.com/course/data-science-transformers-nlp/

In [1]:

import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import dataset
import numpy as np
import matplotlib.pyplot as plt

# Multihead Attention Block

Recall the formula:

   Attention($Q, K, V$) = softmax$\left(\dfrac{QK^T}{\sqrt{d_k}}\right)V$

  where:

   $Q = W^Q Q_{input}$

   $K = W^K K_{input}$

   $V = W^V V_{input}$


In [3]:
class MultiHeadAttention(nn.Module):
  """Multi-head scaled dot-product attention.

  Computes softmax(Q K^T / sqrt(d_k)) V independently for each head, then
  concatenates the heads and projects the result back to ``d_model``.

  Args:
    d_k: size of each head's query/key vectors (d_v is assumed equal to d_k).
    d_model: feature size of the input and of the output.
    n_heads: number of attention heads.
  """

  def __init__(self, d_k, d_model, n_heads):
    super().__init__()

    # Assume d_v = d_k (len(Q) = len(K) = d_k, len(V) = d_v)
    self.d_k = d_k
    self.n_heads = n_heads

    # One linear layer produces the projections for all heads at once.
    self.key = nn.Linear(d_model, d_k * n_heads)
    self.query = nn.Linear(d_model, d_k * n_heads)
    self.value = nn.Linear(d_model, d_k * n_heads)

    # final linear layer: concatenated heads -> d_model
    self.fc = nn.Linear(d_k * n_heads, d_model)

  def forward(self, q, k, v, mask=None):
    """Apply attention.

    Args:
      q: query input of shape (N, T_q, d_model).
      k: key input of shape (N, T_k, d_model).
      v: value input of shape (N, T_k, d_model).
      mask: optional (N, T_k) tensor; key positions where it equals 0 are
        excluded from attention. If every key of a sample is masked, the
        softmax row is all -inf and the output for that sample is NaN.

    Returns:
      Tensor of shape (N, T_q, d_model).
    """
    q = self.query(q)   # N x T_q x (h*d_k)
    k = self.key(k)     # N x T_k x (h*d_k)
    v = self.value(v)   # N x T_k x (h*d_v)
    # h = n_heads

    # N = batch size
    N = q.shape[0]
    # T = query sequence length
    T = q.shape[1]

    # change the shape to:
    # (N, T, h, d_k) --> (N, h, T, d_k)
    # -1 lets keys/values keep their own length, which may differ from T.
    q = q.view(N, T, self.n_heads, self.d_k).transpose(1, 2)
    k = k.view(N, -1, self.n_heads, self.d_k).transpose(1, 2)
    v = v.view(N, -1, self.n_heads, self.d_k).transpose(1, 2)

    # compute attention scores: q * k^T / sqrt(d_k)
    # (N, h, T_q, d_k) x (N, h, d_k, T_k) --> (N, h, T_q, T_k)
    # (transposing the last 2 dimensions of k)
    attn_scores = q @ k.transpose(-2, -1) / math.sqrt(self.d_k)

    # apply the mask, which is a tensor of size (N, T_k) of values 0, 1:
    # for each of the N samples it says which key tokens are real.
    # Indexing with None adds size-1 dims so it broadcasts over heads and
    # query positions: (N, T_k) --> (N, 1, 1, T_k)
    if mask is not None:
      # masked_fill(cond, value): write value wherever cond is True;
      # -inf scores become weight 0 after the softmax
      attn_scores = attn_scores.masked_fill(
          mask[:, None, None, :] == 0, float('-inf'))
    attn_weights = F.softmax(attn_scores, dim=-1)

    # compute attention-weighted values
    # (N, h, T_q, T_k) x (N, h, T_k, d_k) --> (N, h, T_q, d_k)
    A = attn_weights @ v

    # reshape it back before the final linear layer
    A = A.transpose(1, 2)  # (N, T_q, h, d_k)
    # transpose returns a non-contiguous view, so make it contiguous for view()
    A = A.contiguous().view(N, T, self.d_k * self.n_heads)  # (N, T_q, h*d_k)

    # final step is to project A with the Linear layer to
    # get the same feature size as the input sequence
    return self.fc(A)


# Transformer Block

In [2]:
class TransformerBlock(nn.Module):
  """One encoder block: self-attention and a feed-forward net, each followed
  by a residual connection and layer normalisation.

  Args:
    d_k: per-head query/key size passed to MultiHeadAttention.
    d_model: feature size of the input and of the output.
    n_heads: number of attention heads.
    dropout_prob: dropout probability used inside the feed-forward net and
      on the block output.
  """

  def __init__(self, d_k, d_model, n_heads, dropout_prob=0.1):
    super().__init__()

    self.ln1 = nn.LayerNorm(d_model)
    self.ln2 = nn.LayerNorm(d_model)
    self.mha = MultiHeadAttention(d_k, d_model, n_heads)
    # position-wise feed-forward net: expand to 4*d_model, then project back
    self.ann = nn.Sequential(
        nn.Linear(d_model, d_model * 4),
        nn.GELU(),
        nn.Linear(d_model * 4, d_model),
        nn.Dropout(dropout_prob),
    )
    self.dropout = nn.Dropout(p=dropout_prob)

  def forward(self, x, mask=None):
    """Run the block.

    Args:
      x: input sequence of size (N x T x D), with D = d_model.
      mask: optional tensor of size (N x T), forwarded to the attention layer.

    Returns:
      Tensor of the same size as ``x``.
    """
    # FIRST LAYER NORM:
    # pass x in as the query, key, value into the multihead attention block,
    # then add the output to the residual 'x' and apply the 1st layer norm
    x = self.ln1(x + self.mha(x, x, x, mask))
    # SECOND LAYER NORM: ann + x
    x = self.ln2(x + self.ann(x))
    x = self.dropout(x)
    return x

# Positional Encoding Block


$PE_{(pos, 2i)} = \sin(pos/10000^{2i/d_{model}})$

$PE_{(pos, 2i+1)} = \cos(pos/10000^{2i/d_{model}})$

In [4]:
class PositionalEncoding(nn.Module):
  """Add fixed sinusoidal position encodings to a sequence, then apply dropout.

  PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
  PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))

  Args:
    d_model: feature size of the sequence the encoding is added to.
    max_len: longest sequence length the precomputed table covers.
    dropout_prob: dropout probability applied after adding the encoding.
  """

  def __init__(self, d_model, max_len=2048, dropout_prob=0.1):
    super().__init__()
    self.dropout = nn.Dropout(p=dropout_prob)
    # unsqueeze(1) inserts a size-1 dim at index 1, giving a column
    # of shape (max_len, 1) that broadcasts against div_term
    position = torch.arange(max_len).unsqueeze(1)
    exp_term = torch.arange(0, d_model, 2)
    # 1 / 10000^(2i / d_model), computed via exp/log
    div_term = torch.exp(exp_term * (-math.log(10000.0) / d_model))
    angles = position * div_term  # (max_len, ceil(d_model / 2))
    pe = torch.zeros(1, max_len, d_model)
    pe[0, :, 0::2] = torch.sin(angles)
    # For odd d_model there is one fewer odd column than even columns,
    # so trim the cosine table to fit (no-op when d_model is even).
    pe[0, :, 1::2] = torch.cos(angles)[:, : d_model // 2]
    # a buffer (not a parameter): moves with the module across devices and is
    # saved in the state dict, but is not returned by parameters()
    self.register_buffer('pe', pe)

  def forward(self, x):
    """Return ``x`` plus the encoding of its first T positions, after dropout.

    Args:
      x: tensor of shape N x T x D, with T <= max_len and D = d_model.
    """
    # slice the table to the actual sequence length: (1, T, D)
    x = x + self.pe[:, :x.size(1), :]
    return self.dropout(x)

# Transformer Encoder

In [5]:
class Encoder(nn.Module):
  """Transformer encoder for many-to-one (sequence classification) tasks.

  Pipeline: token embedding -> positional encoding -> n_layers transformer
  blocks -> take the first token -> layer norm -> linear classifier.

  Args:
    vocab_size: number of rows in the embedding table.
    max_len: maximum sequence length supported by the positional encoding.
    d_k: per-head query/key size in every transformer block.
    d_model: embedding / hidden feature size.
    n_heads: number of attention heads per block.
    n_layers: number of stacked transformer blocks.
    n_classes: size of the output layer.
    dropout_prob: dropout probability used throughout.
  """

  def __init__(self, vocab_size,
               max_len, d_k, d_model, n_heads, n_layers, n_classes, dropout_prob):
    super().__init__()

    self.embedding = nn.Embedding(vocab_size, d_model)
    self.pos_encoding = PositionalEncoding(d_model, max_len, dropout_prob)
    transformer_blocks = [
        TransformerBlock(d_k, d_model, n_heads, dropout_prob)
        for _ in range(n_layers)
    ]
    # Stored in a Sequential only to register the blocks as submodules;
    # forward() iterates over it manually because each block also takes a mask.
    self.transformer_blocks = nn.Sequential(*transformer_blocks)
    self.ln = nn.LayerNorm(d_model)
    self.fc = nn.Linear(d_model, n_classes)

  def forward(self, x, mask=None):
    """Compute class scores.

    Args:
      x: integer token ids of shape N x T.
      mask: optional N x T tensor passed to every block's attention layer.

    Returns:
      Tensor of shape N x n_classes.
    """
    x = self.embedding(x)
    x = self.pos_encoding(x)
    for block in self.transformer_blocks:
      x = block(x, mask)

    # depends on the kind of task that we need, here:
    # many-to-one (x has the shape N x T x D), so keep only the first token
    x = x[:, 0, :]
    x = self.ln(x)
    x = self.fc(x)

    return x

# Test the encoder

In [7]:
#model = Encoder(20000, 1024, 16, 64, 4, 2, 5, 0.1)