<a href="https://colab.research.google.com/github/lorrespz/NLP-text-analysis/blob/main/Transformers_Pytorch_implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Lazy Programmer's Transformers course
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import dataset
import numpy as np
import matplotlib.pyplot as plt

# Multihead Attention Block

Recall the formula:

   Attention($Q, K, V$) = softmax$\left(\dfrac{QK^T}{\sqrt{d_k}}\right)V$

  where:

   $ Q = W^Q Q_{input}$

   $ K = W^K K_{input}$

   $V = W^V V_{input}$


In [None]:
class MultiHeadAttention(nn.Module):
  """Multi-head scaled dot-product attention.

  Projects the query/key/value inputs into ``n_heads`` heads of size ``d_k``,
  computes softmax(Q K^T / sqrt(d_k)) V per head, concatenates the heads and
  projects the result back to ``d_model`` features.

  Args:
    d_k: per-head dimension of the queries/keys (values use the same size,
      i.e. d_v = d_k).
    d_model: feature size of the input and output sequences.
    n_heads: number of attention heads (h).
  """

  def __init__(self, d_k, d_model, n_heads):
    super().__init__()

    # Assume d_v = d_k (len(Q) = len(K) = d_k, len(V) = d_v)
    self.d_k = d_k
    self.n_heads = n_heads

    self.key = nn.Linear(d_model, d_k * n_heads)
    self.query = nn.Linear(d_model, d_k * n_heads)
    self.value = nn.Linear(d_model, d_k * n_heads)

    # final linear layer: maps the concatenated heads back to d_model
    self.fc = nn.Linear(d_k * n_heads, d_model)

  def forward(self, q, k, v, mask=None):
    """Apply multi-head attention.

    Args:
      q: query input of shape (N, T_q, d_model).
      k: key input of shape (N, T_k, d_model).
      v: value input of shape (N, T_k, d_model).
      mask: optional tensor of shape (N, T_k); positions equal to 0 are
        excluded from attention (their scores are set to -inf before the
        softmax). Note that a sample whose mask is all zeros yields NaNs.

    Returns:
      Tensor of shape (N, T_q, d_model).
    """
    q = self.query(q)   # N x T_q x (h*d_k)
    k = self.key(k)     # N x T_k x (h*d_k)
    v = self.value(v)   # N x T_k x (h*d_v)

    # N = batch size
    N = q.shape[0]
    # Sequence lengths: keys/values may differ from queries (cross-attention),
    # so each tensor is reshaped with its own length.
    T_q = q.shape[1]
    T_k = k.shape[1]

    # change the shape to:
    # (N, T, h, d_k) --> (N, h, T, d_k)
    q = q.view(N, T_q, self.n_heads, self.d_k).transpose(1, 2)
    k = k.view(N, T_k, self.n_heads, self.d_k).transpose(1, 2)
    v = v.view(N, T_k, self.n_heads, self.d_k).transpose(1, 2)

    # compute attention scores: q * k^T / sqrt(d_k)
    # (N, h, T_q, d_k) x (N, h, d_k, T_k) --> (N, h, T_q, T_k)
    attn_scores = q @ k.transpose(-2, -1) / math.sqrt(self.d_k)

    if mask is not None:
      # (N, T_k) --> (N, 1, 1, T_k) so the mask broadcasts over heads and
      # query positions; softmax(-inf) = 0 removes the masked keys.
      attn_scores = attn_scores.masked_fill(
          mask[:, None, None, :] == 0, float('-inf'))
    attn_weights = F.softmax(attn_scores, dim=-1)

    # compute attention-weighted values
    # (N, h, T_q, T_k) x (N, h, T_k, d_k) --> (N, h, T_q, d_k)
    A = attn_weights @ v

    # reshape it back before the final linear layer
    A = A.transpose(1, 2)  # (N, T_q, h, d_k)
    # transpose makes the tensor non-contiguous; view requires contiguity
    A = A.contiguous().view(N, T_q, self.d_k * self.n_heads)  # (N, T_q, h*d_k)

    # project back so the output has the same feature size as the input
    return self.fc(A)


# Transformer Block

In [None]:
class TransformerBlock(nn.Module):
