<a href="https://colab.research.google.com/github/martinpius/SEQUENCE-MODELS-FINAL/blob/main/Transformers_Network_Attention_is_All_You_Need!_Implementation_from_scratch_with_Pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Prepare the Colab environment: mount Google Drive when running inside Colab
# and select the GPU device when one is available.
import torch

try:
  # google.colab can only be imported inside a Colab runtime.
  from google.colab import drive
except ImportError as e:
  print(f">>>> {type(e)} {e}\n>>>> please correct {type(e)} and reload your device")
  COLAB = False
else:
  drive.mount("/content/drive", force_remount = True)
  COLAB = True
  print(f">>>> You are using Google CoLaB with torch version: {torch.__version__}")
# torch is imported unconditionally above, so this lookup cannot fail with a
# NameError when the Colab-specific import is unavailable.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def time_fmt(t: float = 231.123)->str:
  """Format a duration given in seconds as ``hrs: H, min: MM, sec: SS.ss``.

  Args:
    t: elapsed time in seconds.

  Returns:
    A string with whole hours, zero-padded minutes and seconds shown with
    two decimals.
  """
  # Work in whole hundredths of a second so the fractional seconds survive
  # and rounding can never display "60.00" seconds.
  centis = round(t * 100)
  h, centis = divmod(centis, 60 * 60 * 100)
  m, centis = divmod(centis, 60 * 100)
  s = centis / 100
  return f"hrs: {h}, min: {m:>02}, sec: {s:>05.2f}"
print(f">>>> testing the time formating function...........\n>>>> time elapsed\t{time_fmt()}")

Mounted at /content/drive
>>>> You are using Google CoLaB with torch version: 1.9.0+cu102
>>>> testing the time formating function...........
>>>> time elapsed	hrs: 0, min: 03, sec: 51.00


In [2]:
# In this notebook we implement a machine-translation network (the Transformer) from
# scratch in PyTorch. The inspiration comes from the paper "Attention Is All You Need" (https://arxiv.org/abs/1706.03762).
# Finally, an application to the Multi30k dataset will be demonstrated.


In [3]:
import torch
from torch import nn
from torch import optim
from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field, BucketIterator
import numpy as np
from tensorflow import summary
from tqdm import tqdm
import spacy, math
import random, time, datetime
import os, sys
%load_ext tensorboard

In [4]:
# Seed every random number generator in use with the same value so runs are
# reproducible, and ask cuDNN to pick deterministic kernels.
seed = 1234
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
  _seed_rng(seed)
torch.backends.cudnn.deterministic = True

In [5]:
# We start with the multi-head attention: this is the most important part
# of the network, where all the magic happens. Transformer networks achieve
# their best performance mainly because of multi-head attention.

In [6]:
class MultiHeadAttention(nn.Module):
  """Multi-head dot-product attention.

  The embedding dimension is split into ``heads`` chunks of size
  ``head_dim``; the same three bias-free linear maps (values, keys, queries)
  are applied to every chunk, attention is computed per head, and the heads
  are concatenated and projected back to ``embed_size``.

  Args:
    embed_size: size of the last dimension of the inputs; must be divisible
      by ``heads``.
    heads: number of attention heads.
  """
  def __init__(self, embed_size, heads):
    super(MultiHeadAttention, self).__init__()
    self.embed_size = embed_size
    self.heads = heads
    self.head_dim = embed_size // heads
    assert (self.head_dim * heads == embed_size), "Embedding size needs to be divisible by heads"
    self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)
    self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False)
    self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)
    self.fc_out = nn.Linear(heads * self.head_dim, embed_size)

  def forward(self, values, keys, query, mask):
    """Attend from ``query`` over ``keys``/``values``.

    Args:
      values: tensor of shape (N, value_len, embed_size).
      keys: tensor of shape (N, key_len, embed_size).
      query: tensor of shape (N, query_len, embed_size).
      mask: optional tensor broadcastable to (N, heads, query_len, key_len);
        positions where it equals 0 are excluded from attention. ``None``
        means every position may be attended to.

    Returns:
      Tensor of shape (N, query_len, embed_size).
    """
    # Number of examples in the batch
    N = query.shape[0]
    value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]
    # Split the embedding into self.heads pieces and project each piece
    values = self.values(values.reshape(N, value_len, self.heads, self.head_dim))
    keys = self.keys(keys.reshape(N, key_len, self.heads, self.head_dim))
    queries = self.queries(query.reshape(N, query_len, self.heads, self.head_dim))
    # Dot product of every query with every key, per example and per head:
    # (N, query_len, heads, head_dim) x (N, key_len, heads, head_dim)
    #   -> (N, heads, query_len, key_len)
    energy = torch.einsum("nqhd,nkhd->nhqk", queries, keys)
    # Push masked positions to a very negative value so softmax gives them ~0 weight
    if mask is not None:
      energy = energy.masked_fill(mask == 0, float("-1e20"))
    # The steps below must run whether or not a mask was supplied; they used to
    # be nested under the mask check, which broke mask=None.
    # NOTE(review): the scale is sqrt(embed_size); the paper scales by
    # sqrt(head_dim). Left unchanged so existing results are not altered.
    attention = torch.softmax(energy / (self.embed_size ** (1 / 2)), dim=3)
    # (N, heads, query_len, key_len) x (N, value_len, heads, head_dim)
    #   -> (N, query_len, heads, head_dim), then flatten the last two dims
    out = torch.einsum("nhql,nlhd->nqhd", attention, values).reshape(
          N, query_len, self.heads * self.head_dim)
    # The output projection keeps the shape (N, query_len, embed_size)
    return self.fc_out(out)

In [7]:
# The transformer block: this block is basically a combination of attention and feed-forward nets;
# we have fc layers, residual connections and normalization layers:
class TransformerBlock(nn.Module):
  """Attention sub-layer followed by a position-wise feed-forward sub-layer.

  Each sub-layer adds a residual connection, applies layer normalization and
  then dropout.

  Args:
    embeded: embedding size of the inputs and outputs.
    heads: number of attention heads.
    dropout: dropout probability used after each normalization.
    f_expansion: width multiplier of the hidden feed-forward layer.
  """
  def __init__(self, embeded, heads, dropout, f_expansion):
    super().__init__()
    hidden = f_expansion * embeded
    self.dropout = nn.Dropout(dropout)
    self.attention = MultiHeadAttention(embeded, heads)
    self.norm1 = nn.LayerNorm(embeded)
    self.norm2 = nn.LayerNorm(embeded)
    self.fc = nn.Sequential(
        nn.Linear(embeded, hidden),
        nn.ReLU(),
        nn.Linear(hidden, embeded))

  def forward(self, values, keys, query, mask):
    """Run attention and the feed-forward net, each with a skip connection."""
    # multi-head attention, then residual connection with the query
    attended = self.attention(values, keys, query, mask)
    x = self.norm1(attended + query)
    x = self.dropout(x)
    # feed-forward net, then a second residual connection
    expanded = self.fc(x)
    return self.dropout(self.norm2(expanded + x))



In [8]:
# We can now build our network (the encoder, decoder) using the transformer block
# which is going to be repeated several times

In [9]:
class Encoder(nn.Module):
  """Transformer encoder: token + learned positional embeddings followed by
  a stack of ``num_layers`` transformer blocks.

  Args:
    input_vocab_size: number of rows of the source token embedding.
    embeded_dim: embedding size.
    num_layers: number of stacked transformer blocks.
    heads: number of attention heads per block.
    device: stored on the instance as ``self.device``.
    f_expansion: width multiplier of each block's feed-forward layer.
    dropout: dropout probability.
    max_len: number of positions the positional embedding can represent.
  """
  def __init__(self, input_vocab_size,
               embeded_dim,num_layers,
               heads, device,f_expansion,
               dropout, max_len):
    super(Encoder, self).__init__()
    self.embeded_dim = embeded_dim
    self.dropout = nn.Dropout(dropout)
    self.device = device
    # encoder's embedding (the input texts)
    self.source_embd = nn.Embedding(input_vocab_size, embeded_dim)
    # learned positional embedding: one vector per position index
    self.pos_embd = nn.Embedding(max_len, embeded_dim)
    # the stack of transformer blocks that makes up the encoder
    self.layers = nn.ModuleList([
        TransformerBlock(embeded_dim, heads, dropout, f_expansion)
        for _ in range(num_layers)])

  def forward(self, input_tensor, mask):
    """Encode a batch of token indices.

    Args:
      input_tensor: integer tensor of shape (batch_size, seq_len).
      mask: attention mask handed unchanged to every block.

    Returns:
      Tensor of shape (batch_size, seq_len, embeded_dim).
    """
    batch_size, seq_len = input_tensor.shape
    # Build the position indices on the input's own device rather than on a
    # notebook-level global, so the module works wherever its input lives.
    pos = torch.arange(0, seq_len, device = input_tensor.device).expand(batch_size, seq_len)
    out = self.dropout(self.source_embd(input_tensor) + self.pos_embd(pos))
    for layer in self.layers:
      # self-attention: values, keys and query are all the same tensor
      out = layer(out, out, out, mask)
    return out


In [10]:
# The decoder class: it uses masked multi-head attention over the decoder input and
# the output of the encoder network to produce predictions at every time step
class DecoderBlock(nn.Module):
  """One decoder layer: attention of the decoder input over itself, followed
  by a transformer block whose query is the result of that first step.

  Args:
    embeded_size: embedding size.
    heads: number of attention heads.
    f_expansion: width multiplier of the feed-forward layer.
    dropout: dropout probability.
    device: stored on the instance as ``self.device``.
  """
  def __init__(self, embeded_size, heads, f_expansion, dropout, device):
    super().__init__()
    self.attention = MultiHeadAttention(embeded_size, heads)
    self.transformerblock = TransformerBlock(embeded_size, heads, dropout, f_expansion)
    self.dropout = nn.Dropout(dropout)
    self.device = device
    self.norm1 = nn.LayerNorm(embeded_size)

  def forward(self, x, values, keys, src_mask, trg_mask):
    """Return the block output for decoder input ``x`` given ``values``/``keys``."""
    # attention of x over itself under the target mask, with a skip connection
    self_attended = self.attention(x, x, x, trg_mask)
    query = self.norm1(self_attended + x)
    query = self.dropout(query)
    # the transformer block attends from that query over values/keys
    return self.transformerblock(values, keys, query, src_mask)

# We can now create our decoder network with the aid of the above block
class Decoder(nn.Module):
  """Transformer decoder: token + learned positional embeddings, a stack of
  ``num_layers`` decoder blocks and a final projection onto the target
  vocabulary.

  Args:
    trg_voc_size: target vocabulary size (embedding rows and output width).
    embeded_size: embedding size.
    num_layers: number of stacked decoder blocks.
    heads: number of attention heads per block.
    f_expansion: width multiplier of each block's feed-forward layer.
    dropout: dropout probability.
    device: stored on the instance as ``self.device``.
    max_len: number of positions the positional embedding can represent.
  """
  def __init__(self, trg_voc_size,embeded_size, num_layers,
               heads, f_expansion,dropout, device, max_len):
    super(Decoder, self).__init__()
    self.dropout = nn.Dropout(dropout)
    self.device = device
    self.dec_embed = nn.Embedding(trg_voc_size, embeded_size)
    self.pos_embd = nn.Embedding(max_len, embeded_size)
    self.layers = nn.ModuleList([
        DecoderBlock(embeded_size, heads, f_expansion, dropout, device)
        for _ in range(num_layers)])

    self.fc_out = nn.Linear(embeded_size, trg_voc_size)

  def forward(self, x, enc_out, src_mask, trg_mask):
    """Decode a batch of target token indices.

    Args:
      x: integer tensor of shape (batch_size, seq_len).
      enc_out: encoder output, used as both values and keys in every block.
      src_mask: mask handed to each block together with ``enc_out``.
      trg_mask: mask handed to each block for the attention of ``x`` over itself.

    Returns:
      Tensor of shape (batch_size, seq_len, trg_voc_size).
    """
    batch_size, seq_len = x.shape
    # Position indices are created on the input's own device instead of a
    # notebook-level global, so the module works wherever its input lives.
    pos = torch.arange(0, seq_len, device = x.device).expand(batch_size, seq_len)
    x = self.dropout(self.dec_embed(x) + self.pos_embd(pos))
    # decoder layers
    for layer in self.layers:
      x = layer(x, enc_out, enc_out, src_mask, trg_mask)
    return self.fc_out(x)


In [11]:
# We now combine the above classes to build the final model-class:
class TrfModel(nn.Module):
  """Full encoder-decoder transformer.

  Args:
    src_voc_size: source vocabulary size.
    trg_voc_size: target vocabulary size.
    src_pad_idx: index of the padding token in source sequences.
    trg_pad_idx: index of the padding token in target sequences (stored only).
    embed_size: embedding size shared by encoder and decoder.
    num_layers: number of layers in the encoder and in the decoder.
    f_expansion: width multiplier of the feed-forward layers.
    heads: number of attention heads.
    dropout: dropout probability.
    device: device handed to the encoder and decoder; ``None`` selects CUDA
      when available and the CPU otherwise.
    max_len: number of positions the positional embeddings can represent.
  """
  def __init__(self, src_voc_size,
               trg_voc_size, src_pad_idx,
               trg_pad_idx,
               embed_size = 256,
               num_layers = 6,
               f_expansion = 4,
               heads = 8, dropout = 0,
               device = None,
               max_len = 100):
    super(TrfModel,self).__init__()
    # Resolve the default here instead of binding a notebook-level global at
    # class-definition time, so the class does not depend on cell order.
    if device is None:
      device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.encoder = Encoder(src_voc_size,
                           embed_size,
                           num_layers,
                           heads,
                           device,
                           f_expansion,
                           dropout, max_len)
    self.decoder = Decoder(trg_voc_size, embed_size,
                           num_layers, heads,
                           f_expansion, dropout,
                           device, max_len)
    self.src_pad_idx = src_pad_idx
    self.trg_pad_idx = trg_pad_idx
    self.device = device

  def src_mask_build(self, src_input):
    """Return a boolean mask of shape [batch_size, 1, 1, src_len] that is
    False wherever ``src_input`` equals the source padding index."""
    # The comparison result already lives on src_input's device, so the mask
    # always matches the tensors it is applied to.
    return (src_input != self.src_pad_idx).unsqueeze(1).unsqueeze(2)

  def trg_mask_build(self, trg_input):
    """Return a lower-triangular mask of ones with shape
    [batch_size, 1, trg_len, trg_len]."""
    batch_size, trg_len = trg_input.shape
    # Allocate directly on the input's device instead of building on the CPU
    # and copying afterwards.
    lower = torch.tril(torch.ones(trg_len, trg_len, device = trg_input.device))
    return lower.expand(batch_size, 1, trg_len, trg_len)

  def forward(self, src, trg):
    """Encode ``src``, decode ``trg`` against it and return the decoder output."""
    src_mask = self.src_mask_build(src) # build the mask for the input sequence
    trg_mask = self.trg_mask_build(trg) # build the mask for the output sequence
    enc_src = self.encoder(src, src_mask)
    return self.decoder(trg, enc_src, src_mask, trg_mask)


In [12]:
# Smoke test: build the model on a toy vocabulary and check the output shape.
src_pad_idx = trg_pad_idx = 0
src_voc_size = trg_voc_size = 10
source_input = torch.tensor([
    [1, 5, 6, 4, 3, 9, 5, 0],
    [1, 8, 7, 3, 4, 5, 6, 7],
]).to(device = device)
target = torch.tensor([
    [1, 5, 6, 4, 3, 9, 5, 0],
    [1, 8, 7, 3, 4, 5, 7, 2],
]).to(device = device)
model = TrfModel(src_voc_size, trg_voc_size, src_pad_idx, trg_pad_idx)
model = model.to(device = device)
# The decoder is fed the target with its last token dropped.
outputs = model(source_input, target[:, :-1])
print(f">>>> the desired output shape: {outputs.shape}")


>>>> the desired output shape: torch.Size([2, 7, 10])
