<a href="https://colab.research.google.com/github/martinpius/PYTORCH/blob/main/MTL_with_Attention_in_Pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount("/content/drive", force_remount = True)
try:
  COLAB = True
  import torch
  print(f"You are on CoLaB with Pytorch Version: {torch.__version__}")
except Exception as e:
  print(f"{type(e)}: {e}\n>>>>please correct {type(e)} and reload your device")
  COLAB = False
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')
def time_fmt(t: float = 123.891)->float:
  h = int(t / (60 *60))
  m = int(t % (60 * 60) / 60)
  s = int(t % 60)
  return f"{h}: {m:>02}: {s:>05.2f}"
print(f">>>>>time formating:\tplease wait....\n>>>>>time elapsed:\t{time_fmt()}")

Mounted at /content/drive
You are on CoLaB with Pytorch Version: 1.8.1+cu101
>>>>>time formating:	please wait....
>>>>>time elapsed:	0: 02: 03.00


In [2]:
#In this notbook we are going to train a Machine translation we are going to train a machine translation Model
#with an attention mechanism in Pytorch. For demonstration we will apply data from torchtext (Multi30k)

In [3]:
#Importing the required modules:
import torch
import torch.optim as optim
import torch.nn as nn
import numpy as np
from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field, BucketIterator
import math, sys, time, random, spacy


In [4]:
#Set the random seed for reproducability and device (gpu to deterministic to avoid errors)

In [5]:
seed = 2134
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

In [6]:
#We start by installing and loading the tokenizers for data preprocessing:
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm
spacy_en = spacy.load('en_core_web_sm')
spacy_de = spacy.load('de_core_news_sm')#restart for effective loading


[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')


In [7]:
#define the tokenizer functions to be used in the Field module:
def en_tokenizer(text):
  return [tok.text for tok in spacy_en.tokenizer(text)]
def de_tokenizer(text):
  return [tok.text for tok in spacy_de.tokenizer(text)]

In [8]:
#Define the field objects for both languages to be used in Multi30k module:
germany = Field(tokenize = de_tokenizer, init_token = '<sos>', eos_token = '<eos>', lower = True)
english = Field(tokenize = en_tokenizer, init_token = '<sos>', eos_token = '<eos>', lower = True)



In [9]:
#Load and preprocess the data using the above fields (we load the data with the help of Multi30k.splits module)
train_data, validation_data, test_data = Multi30k.splits(exts = ('.de', '.en'), fields = (germany, english))


downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:01<00:00, 961kB/s]


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 276kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 263kB/s]


In [10]:
#print out number of examples in each dataset
print(f"Total samples in train dataset: {len(train_data.examples)}\nTotal samples in validation dataset: {len(validation_data.examples)}\nTotal samples in test dataset: {len(test_data.examples)}")

Total samples in train dataset: 29000
Total samples in validation dataset: 1014
Total samples in test dataset: 1000


In [11]:
#Print and examine the first example (paired data) for the training set:
print(f">>>>>The first sample in a train data is a paired sentences:\n{vars(train_data.examples[0])}")

>>>>>The first sample in a train data is a paired sentences:
{'src': ['zwei', 'junge', 'weiße', 'männer', 'sind', 'im', 'freien', 'in', 'der', 'nähe', 'vieler', 'büsche', '.'], 'trg': ['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']}


In [12]:
#We then build the vocabulary for the train dataset only to allow generalization 
#We only pick the tokens which appeared at least twice in a tokenized list:
germany.build_vocab(train_data, min_freq = 2)
english.build_vocab(train_data, min_freq = 2)

In [13]:
#We finaly build our iterator object ready to stream-in data during training and validation later after building the model
#We use the BucketIterator.splits method to do the proper splitting for us. we consider batch_size of 128
batch_size = 128
train_iter, validation_iter, test_iter = BucketIterator.splits(
    (train_data, validation_data, test_data),
    batch_size = batch_size,
    device = device)

In [14]:
#We start building our Model step by step by firstly defining the decoder class, Attention class, encoder class and finaly
#combining them to get the model class

In [15]:
class Encoder(nn.Module):
  def __init__(self, enc_input, enc_embd, enc_hidden, dec_hidden, dropout):
    super().__init__()
    self.dropout = nn.Dropout(dropout)
    self.embedding = nn.Embedding(enc_input, enc_embd)
    self.rnn = nn.GRU(enc_embd, enc_hidden, bidirectional = True)
    self.fc = nn.Linear(enc_hidden * 2, dec_hidden)
  
  def forward(self, enc_input):
    '''
    enc_input shape = [enc_input_len = seq_len, batch_size]
    embeded shape = [enc_input_len = seq_len, batch_size, embd_dim]
    enc_output shape = [enc_input_len, batch_size, enc_hidden*2]
    enc_hidden shape = [num_layers*2, batch_size, enc_hidden_dim]
    enc_hidden[-1,:,:] = is the last of the backward rnn
    enc_hidden[-2,:,:] = is the last of the forward rnn
    #We need to stack backward and forward rnn hidden results for future use in the decoder
    '''
    embeded = self.dropout(self.embedding(enc_input))
    enc_output, enc_hidden = self.rnn(embeded)
    hidden = torch.tanh(self.fc(torch.cat((enc_hidden[-2,:,:], enc_hidden[-1,:,:]), dim = 1)))
    return enc_output, hidden


In [16]:
#The attention class: To compute information about the decoder's hidden layer with respect
#to the input tokens we define the following class:
class Attention(nn.Module):
  def __init__(self, enc_hidden, dec_hidden):
    super().__init__()
    self.attn = nn.Linear((2*enc_hidden) + dec_hidden, dec_hidden)
    self.v = nn.Linear(dec_hidden, 1, bias = False)
  
  def forward(self, hidden, enc_outputs):
    '''
    hidden shape = [batch_size, dec_hid_dim]
    enc_output shape = [input_len, batch_size, enc_hidden*2]
    we repeat the decoder hidden times the length of encoder's input
    before concatenating with encoder outputs we need to reshape the encoder's output
    '''
    batch_size = enc_outputs.shape[1]
    enc_len = enc_outputs.shape[0]
    hidden = hidden.unsqueeze(1).repeat(1,enc_len,1) # new shape [batch_size, enc_len, dec_hidden_dim]
    enc_outputs = enc_outputs.permute(1,0,2) #new shape [batch_size, enc_len, enc_hiden*2]
    energy = torch.tanh(self.attn(torch.cat((hidden, enc_outputs), dim = 2))) #shape = [batch_size, enc_len, dec_hidden]
    attention = self.v(energy).squeeze(2) #shape = [batch_size, enc_input_len]
    return nn.functional.softmax(attention, dim = 1)



In [17]:
#We then defines our decoder class that takes as the input the outputs of the above classes (encoder and attention)

In [18]:
class Decoder(nn.Module):
  def __init__(self,output_dim,dec_embd, dec_hidden, enc_hidden, attention, dropout):
    super().__init__()
    self.attention = attention
    self.output_dim = output_dim
    self.dropout = nn.Dropout(dropout)
    self.embedding = nn.Embedding(output_dim,dec_embd)
    self.rnn = nn.GRU((enc_hidden * 2) + dec_embd, dec_hidden)
    self.fc = nn.Linear((enc_hidden * 2) + dec_embd + dec_hidden, output_dim)
  
  def forward(self, input, hidden, enc_output):
    '''
    this method will compute the predictions using as inputs attention, dec_hidden and enc outputs
    input shape: [batch_size]
    hidden shape : [batch_size, dec_hidden_dim]
    enc_output shape : [enc_input_len, batch_size, enc_hidden*2]
    '''
    input = input.unsqueeze(0) # new shape: [batch_size, 1]
    embeded = self.dropout(self.embedding(input)) # shape [batch_size, 1, embeded_dim]
    a = self.attention(hidden, enc_output) #shape: [batch_size, enc_input_len]
    a = a.unsqueeze(1) # new shape is [batch_size, 1, enc_input_len]
    enc_output = enc_output.permute(1,0,2) #new shape [batch_size, enc_input_len, enc_hidden*2]
    weighted = torch.bmm(a, enc_output) #shape [1, batch_size, enc_hidden *2]
    weighted = weighted.permute(1,0,2) #new shape = [batch_size, 1, enc_hidden*2]
    rnn_input = torch.cat((embeded, weighted), dim = 2) #shape = [batch, 1, enc_hidden*2 + embeded_dim]
    output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
    #output shape: [dec_out_len = 1, batch_size, dec_hidden*2]
    #hidden = [1, batch_size, dec_hidden_dim]
    output = output.squeeze(0)
    embeded = embeded.squeeze(0)
    weighted = weighted.squeeze(0)
    #compute the predictions
    preds = self.fc(torch.cat((output, weighted, embeded), dim = 1)) #shape = [batch_size, output_dim]
    return preds, hidden.squeeze(0)


In [19]:
#The model Class: We finally build the model by combining the above classes:

In [20]:
class MTL(nn.Module):
  def __init__(self, encoder, decoder, device):
    super().__init__()
    self.encoder = encoder
    self.decoder = decoder
    self.device = device
  def forward(self, input, target, teacher_force_ratio = 0.5):
    '''
    input shape: [input_len, batch_size]
    target shape: [target_len, batch_size]
    We will apply techer force learning technique 50% of times == ground truth else best guess
    '''
    batch_size = input.shape[1]
    target_len = target.shape[0]
    target_voc_size = self.decoder.output_dim
    #define a storage container to keep the predictions
    outputs = torch.zeros(target_len, batch_size, target_voc_size).to(self.device)
    enc_outputs, hidden = self.encoder(input)
    #Get the first decoder's input = 'sos' token
    inp_first = target[0,:]
    #We the iterate over the decoder's input length (english list of tokens)
    for t in range(1, target_len):
      output, hidden = self.decoder(inp_first, hidden, enc_outputs)
      outputs[t] = output # storing the prediction at every time step
      teacher_force = random.random() < teacher_force_ratio
      best_guess = output.argmax(1) #grab the best prediction (with max proba)
      input = target[t] if teacher_force else best_guess
    return outputs
    

In [21]:
#Writing the training and validation loops for the model:


In [22]:
#Hyperparameters to be used in this model
input_dim = len(germany.vocab)
output_dim = len(english.vocab)
enc_embd = 300
dec_embd = 300
enc_hidden = 512
dec_hidden = 512
enc_dropout = 0.5
dec_dropout = 0.5

attn = Attention(enc_hidden, dec_hidden)
enc = Encoder(input_dim, enc_embd, enc_hidden, dec_hidden, enc_dropout)
dec = Decoder(output_dim, dec_embd,dec_hidden, enc_hidden, attn, dec_dropout)
model = MTL(enc, dec, device).to(device)

In [23]:
#Initialize the parameters to random-normal
def wt_initializer(model):
  for name, par in model.named_parameters():
    if 'weight' in name:
      nn.init.normal_(par.data, mean = 0, std = 0.01)
    else:
      nn.init.constant_(par.data, 0)
model.apply(wt_initializer)

MTL(
  (encoder): Encoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(7855, 300)
    (rnn): GRU(300, 512, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=1536, out_features=512, bias=True)
      (v): Linear(in_features=512, out_features=1, bias=False)
    )
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(5893, 300)
    (rnn): GRU(1324, 512)
    (fc): Linear(in_features=1836, out_features=5893, bias=True)
  )
)

In [24]:
#Count the number of trainable parameters in the model:
def par_counts(model):
  return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"TOTAL trainable parameters in this model are: {par_counts(model):,} parameters")

TOTAL trainable parameters in this model are: 21,585,873 parameters


In [25]:
#get the loss and optimizers objects
lr = 1e-3
padded_idx = english.vocab.stoi[english.pad_token]
optimizer = optim.Adam(params = model.parameters(), lr = lr)
loss_obj = nn.CrossEntropyLoss(ignore_index = padded_idx)


In [26]:
#We now define our training loop using the following function:
def training_loop(model, iterator, optimizer, loss_obj, clip):
  model.train()
  epoch_loss = 0
  for i, batch in enumerate(iterator):
    input = batch.src
    target = batch.trg
    optimizer.zero_grad()
    preds = model(input, target)
    #target shape: [target_len, batch_size], preds shape: [target_len, batch_size, output_dim]
    output_dim = preds.shape[-1]
    #reshaping the predictions and select from the 2nd token (ignore 'sos' for loss computation)
    preds = preds[1:].view(-1, output_dim) # new shape = [(target_len-1) * batch_size, output_dim]
    target = target[1:].view(-1) # new shape = [(target_len - 1 * batch_size)]
    loss = loss_obj(preds, target)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
    optimizer.step()
    epoch_loss += loss.item()
  return (epoch_loss/len(iterator))


In [27]:
#The validation loop is given by the following function:
def validation_loop(model, iterator, loss_obj):
  model.eval() #turn off regularizers
  epoch_loss = 0
  with torch.no_grad():
    for i, batch in enumerate(iterator):
      input = batch.src
      target = batch.trg
      preds = model(input, target,0) #no teacher forcing
      output_dim = preds.shape[-1]
      preds = preds[1:].view(-1, output_dim)
      target = target[1:].view(-1)
      loss = loss_obj(preds, target)
      epoch_loss+=loss.item()
  return (epoch_loss / (len(iterator)))

In [None]:
#We now train the model for 100 epochs: We record both loss and exp(loss) = PPl
clip = 1
best_val_loss = float('inf') # to save best weights
num_epochs = 100
for epoch in range(num_epochs):
  tic = time.time()
  print(f"\n>>>>Training starts for epoch: {epoch + 1}\n>>>>Please wait and keep your screen active all the time.....")
  train_loss = training_loop(model, train_iter, optimizer, loss_obj, clip)
  valid_loss = validation_loop(model, validation_iter, loss_obj)
  if valid_loss < best_val_loss:
    best_val_loss = valid_loss
    torch.save(model.state_dict(), 'mtl_prack.pt')
  toc = time.time()
  print(f"\n>>>>time elapsed for this epoch is: {time_fmt(toc - tic)}")
  print(f">>>>train loss: {float(train_loss):.4f}::PPL {math.exp(train_loss):7.4f}")
  print(f"\n>>>>validation loss: {float(valid_loss):.4f}::PPL {math.exp(valid_loss):7.4f}")
  


>>>>Training starts for epoch: 1
>>>>Please wait and keep your screen active all the time.....

>>>>time elapsed for this epoch is: 0: 01: 20.00
>>>>train loss: 5.2150::PPL 184.0026

>>>>validation loss: 5.0365::PPL 153.9240

>>>>Training starts for epoch: 2
>>>>Please wait and keep your screen active all the time.....

>>>>time elapsed for this epoch is: 0: 01: 24.00
>>>>train loss: 4.6178::PPL 101.2725

>>>>validation loss: 4.4201::PPL 83.1042

>>>>Training starts for epoch: 3
>>>>Please wait and keep your screen active all the time.....

>>>>time elapsed for this epoch is: 0: 01: 25.00
>>>>train loss: 4.0837::PPL 59.3656

>>>>validation loss: 3.6224::PPL 37.4286

>>>>Training starts for epoch: 4
>>>>Please wait and keep your screen active all the time.....

>>>>time elapsed for this epoch is: 0: 01: 27.00
>>>>train loss: 3.5194::PPL 33.7641

>>>>validation loss: 3.2377::PPL 25.4750

>>>>Training starts for epoch: 5
>>>>Please wait and keep your screen active all the time.....

>>>>