<a href="https://colab.research.google.com/github/martinpius/PYTORCH/blob/main/Machine_translation_with_Attention_Pytorch_implementantion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
from google.colab import drive
drive.mount(f"/content/drive", force_remount = True)
try:
  COLAB = True
  import torch 
  print(f"You are on CoLaB with torch version: {torch.__version__}")
except Exception as e:
  print(f"{type(e)}: {e}\n>>>>>please correct {type(e)} and re-load your device")
  COLAB = False
def time_fmt(t: float = 231.718)->float:
  h = int(t / (60 * 60))
  m = int(t % (60 * 60) / 60)
  s = int(t % 60)
  return f"{h}: {m:>02}: {s:>05.2f}"
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')
print(f">>>>>time testing:\tplease wait....\n>>>>time elapsed:\t{time_fmt()}")

Mounted at /content/drive
You are on CoLaB with torch version: 1.8.1+cu101
>>>>>time testing:	please wait....
>>>>time elapsed:	0: 03: 51.00


In [10]:
#In this notebook we are going to train a neural machine translation using RNN
#The model architecture involves attention mechanism.


In [224]:
#The dataset to be used for this model comes from torchtext (Multi30k) class:
import torch, random, time, spacy, sys, math
import torch.optim as optim
import torch.nn as nn
import numpy as np
from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field, BucketIterator

In [12]:
#Set the device to avoid cuda errors:
seed = 2324
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

In [13]:
#Installing and loading the tokenizers for data pre-processing
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm
#Loading the tokenizers
spacy_en = spacy.load('en_core_web_sm')
spacy_de = spacy.load('de_core_news_sm') #restart the kernel for effective loading

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')


In [14]:
#Functions to perform tokenization with the aid of the tokenizers above and we also
#reversing the sentences for the case of germany language to improve generalization
def en_tokenizer(text):
  return  [tok.text for tok in spacy_en.tokenizer(text)]
def de_tokenizer(text):
  return [tok.text for tok in spacy_de.tokenizer(text)][::-1]


In [15]:
#Apply the above function to preprocess the data by tokenize them, 
#adding start and end of the sentence,and lower cases each token
germany_ln = Field(tokenize = de_tokenizer, lower = True, init_token = '<sos>', eos_token = '<eos>',include_lengths = True)
english_ln = Field(tokenize = en_tokenizer, lower = True, init_token = '<sos>', eos_token = '<eos>')

In [16]:
#We can now load and preprocess our data using Multi30k class:
#The download and preprocessing may take a while depend with internet and computing power:
tic = time.time()
print(f">>>>Please wait while the data is downloaded to CoLaB......")
train_data, validation_data, test_data = Multi30k.splits(exts = ('.de', '.en'), fields = (germany_ln, english_ln))
toc = time.time()
print(f">>>>Time elapsed for downloading and preprocessing the data is: {time_fmt(toc - tic)}")

>>>>Please wait while the data is downloaded to CoLaB......
>>>>Time elapsed for downloading and preprocessing the data is: 0: 24: 46.00


In [17]:
#We can also print number of examples in each dataset to see if we have downloaded the correct files
print(f"num_train_examples: {len(train_data.examples)}\nnum_valid_examples: {len(validation_data.examples)}\nnum_test_examples: {len(test_data.examples)}")

num_train_examples: 29000
num_valid_examples: 1014
num_test_examples: 1000


In [18]:
#We can also investigate a single data example to see if the sentences are well tokenized:
print(f"sample token list: {vars(train_data.examples[0])}") #The first example in our training data

sample token list: {'src': ['.', 'büsche', 'vieler', 'nähe', 'der', 'in', 'freien', 'im', 'sind', 'männer', 'weiße', 'junge', 'zwei'], 'trg': ['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']}


In [19]:
#We build-up vocabulary list for our training data only to avoid falacy in validation and test set
germany_ln.build_vocab(train_data, min_freq = 2) #We take those tokens which repeated for at least 2 times
english_ln.build_vocab(train_data, min_freq = 2) 

In [20]:
#We can finally build our iterator that will be streamed into our model later during training-validation stage
batch_size = 64
train_iter, validation_iter, test_iter = BucketIterator.splits(
    (train_data, validation_data, test_data),
    batch_size = batch_size,
    sort_within_batch = True,
    sort_key = lambda x: len(x.src),
    device = device
)

In [21]:
#Now we are good to go for the model Building. The Encoder is a ussual rnn with GRU architecture 
#With modification in the call method to force the model to accept information from both tokens 
#and length of each sentences

In [204]:
#This class will use one Bi-directional GRU layer for encoder: Note that shape for output of encoder
#Is the same as hidden layer size for decoder
class Encoder(nn.Module):
  def __init__(self, input_dim, embd_dim, enc_hidden, dec_hidden, dropout):
    super().__init__()
    self.embedding = nn.Embedding(input_dim, embd_dim)
    self.rnn = nn.GRU(embd_dim, enc_hidden, bidirectional = True)
    self.fc =  nn.Linear(2*enc_hidden, dec_hidden)
    self.dropout = nn.Dropout(dropout)

  def forward(self, input, input_len):
    '''
    Note: input dim = [input_len = germany_vocab_len, batch_size]
    input_len dim = [batch_size]
    '''
    embedded = self.dropout(self.embedding(input)) # embedded shape = [input_len, batch_size, embd_dim]
    #rnn.pack_padded_seq is used to produce pack-padded sequence of the padded imbedded sequence
    packed_embd = nn.utils.rnn.pack_padded_sequence(embedded, input_len.to('cpu'))#We exclusively store sentences length to cpu
    #The packed output = packed sequence that consists of all hidden states
    packed_output, hidden = self.rnn(packed_embd) #Here hidden is from non-padded sequence of the batch
    #we then run rnn_pad_packed_seq to with the packed input to  unpack 
    #Outputs are unpacked states where the padded inputs are now not considered in computation (all zeros)
    output, _ = nn.utils.rnn.pad_packed_sequence(packed_output) #shape for the output [seq_len, batch, hidden*2(bidirectional)]
    #We neeed to concatenate output of both forwrd direction and backward direction rnn hidden states
    #for the last element of backward direction rnn = [-1,:,:] and for the forward direction rnn: [-2,:,:]
    #Initial decoder hidden size correspond to the output of linear layer that use the concatenated hidden layers above
    hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)))
    #output shape = [input_len, batch_size, 2*hidden], hidden_dim = [batch_size, dec_hidden_dim]
    return output, hidden


In [205]:
#We now build the attention mechanism to be utilized by our network
#In this case we do not pay attention to the padded indices (zeros)
#We force the attention to be over only real tokens (ignore padding effect) 
#using a mask. We supply a mask of shape [batch_size, len_sentence] 
#which take 1 if no padded and zero else where is supplied as additional
#details to the foward method of our attention class.


In [206]:
class Attention(nn.Module):
  def __init__(self, enc_hidden_size, dec_hidden_size):
    super().__init__()
    self.attn = nn.Linear(enc_hidden_size*2 + dec_hidden_size, dec_hidden_size)
    self.v = nn.Linear(dec_hidden_size, 1, bias = False)
  
  def forward(self, dec_hidden,enc_outputs, mask):
    '''
    Note that: dec_hidden_dim = [batch_size, dec_hidden_dim]
    enc_outputs_dim = [input_len, batch_size, enc_hidden*2]
    '''
    batch_size = enc_outputs.shape[1]
    input_len = enc_outputs.shape[0]
    #Since decoder prev is merged with enc_hidden we have to repeat it times the len_input to equalize the shape
    dec_hidden = dec_hidden.unsqueeze(1).repeat(1, input_len,1)
    enc_outputs = enc_outputs.permute(1,0,2) #to allow dot-product we interchange first and 2nd dim for the encoder out
    #enc_outputs_shape: [batch_size, input_len, enc_hidden*2], dec_hidden_shape = [batch_size, input_len, dec_hidden_dim]
    e_values = torch.tanh(self.attn(torch.cat((dec_hidden, enc_outputs),dim = 2))) #shape = [batch_size, input_len, dec_hidden]
    attention = self.v(e_values).squeeze(2) #Shape = [batch_size, input_len]
    #Now we apply the mask to the attention before the softmax transformation
    attention = attention.masked_fill(mask == 0, 1e-10)
    return nn.functional.softmax(attention, dim = 1) #accros the input_len dimension (the sentence entered)


In [207]:
#The decoder Network is the typical rnn with GRU architecture but
#We write the class in such a way that it will accept the mask over the input_sentence and pass
#it to the attention class to compute by ignore the indices for the padded tokens
#We also returns the attention tensor for vissualization later during inference


In [208]:
class Decoder(nn.Module):
  def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
    super().__init__()
    self.output_dim = output_dim
    self.attention = attention
    self.embedding = nn.Embedding(output_dim, emb_dim)
    self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)
    self.fc_out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)
    self.dropout = nn.Dropout(dropout)
  
  def forward(self, input, hidden, encoder_outputs, mask):
    input = input.unsqueeze(0)
    embedded = self.dropout(self.embedding(input))
    a = self.attention(hidden, encoder_outputs, mask)
    a = a.unsqueeze(1)
    encoder_outputs = encoder_outputs.permute(1, 0, 2)
    weighted = torch.bmm(a, encoder_outputs)
    weighted = weighted.permute(1, 0, 2)
    rnn_input = torch.cat((embedded, weighted), dim = 2)
    output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
    assert (output == hidden).all()
    embedded = embedded.squeeze(0)
    output = output.squeeze(0)
    weighted = weighted.squeeze(0)
    prediction = self.fc_out(torch.cat((output, weighted, embedded), dim = 1))
    return prediction, hidden.squeeze(0), a.squeeze(1)

In [209]:
#We now combine the above classes (encoder , decoder) to build our seq2seq model:
class AutoEncoder(nn.Module):
  def __init__(self, encoder, decoder,input_pad_idx, device):
    super().__init__()
    self.encoder = encoder
    self.decoder = decoder
    self.input_pad_idx = input_pad_idx
    self.device = device
  
  def mask_build(self, input):
    '''
    mask will have 1 if no padded index and 0 otherwise
    '''
    mask = (input != self.input_pad_idx).permute(1,0)
    return mask
  
  def forward(self, input, input_len, target, teacher_force_ratio = 0.5):
    '''
    We are making the ussage of teacher-forcing technique to input the ground 
    truth 50% of times and the predicted token otherwise at each time stamp
    input = source sentence = germany: shape = [input_len = len(germany_vocab), batch_size]
    input_len = length of input = sequence length : shape = [batch_size]
    target = english: shape = [target_len, batch_size]
    '''
    target_len = target.shape[0]
    batch_size = input.shape[1]
    target_voc_size = self.decoder.output_dim
    #Create container to hold the predictions and assign it to the gpu if available
    outputs = torch.zeros(target_len, batch_size, target_voc_size).to(self.device)
    enc_outputs, hidden = self.encoder(input, input_len) #Run the encoder
    #Grab the first token to enter the decoder (first input = '<sos>' token)
    dec_input = target[0,:]
    #get the masked input(general)
    mask = self.mask_build(input) #shape = [batch_size, input_len]
    #We now iterate over the target sentence (english) to do prediction one token at a time
    for t in range(1, target_len):
      #Here we insert in embeded input token,prev_state, enc_hidden(2),mask-->prediction, new hidden state
      output, hidden, _ = self.decoder(dec_input, hidden, enc_outputs, mask)
      outputs[t] = output #store the prediction at every time step
      #We make decission if we will use teacher forcing tech or ground truth
      teacher_force = random.random() < teacher_force_ratio
      #Get the best guess from the output (target)
      best_guess = output.argmax(1)
      dec_input = target[t] if teacher_force else best_guess
    return outputs


In [210]:
#We can now build the training and validation loops for the above model

In [211]:
#Hyperparameters
input_dim = len(germany_ln.vocab)
output_dim = len(english_ln.vocab)
enc_embd_dim = 256
dec_embd_dim = 256
enc_hidden = 512
dec_hidden = 512
enc_dropout = 0.5
dec_dropout = 0.5
learning_rate = 1e-3
input_pad_idx = germany_ln.vocab.stoi[germany_ln.pad_token] #Get all indices for the pad-token
attn = Attention(enc_hidden, dec_hidden)
encoder = Encoder(input_dim, enc_embd_dim,enc_hidden,dec_hidden,enc_dropout)
decoder = Decoder(output_dim, dec_embd_dim, enc_hidden,dec_hidden,dec_dropout,attn)
model = AutoEncoder(encoder, decoder,input_pad_idx, device).to(device)

In [212]:
#We initialize the parameters of the model(both trainable-->normal dist), and non trainable = zeros
def wt_initializer(m):
  for name, param in m.named_parameters():
    if 'weight' in name:
      nn.init.normal_(param.data, mean = 0.0, std = 0.01)
    else:
      nn.init.constant_(param.data, 0)

In [213]:
model.apply(wt_initializer)

AutoEncoder(
  (encoder): Encoder(
    (embedding): Embedding(7855, 256)
    (rnn): GRU(256, 512, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=1536, out_features=512, bias=True)
      (v): Linear(in_features=512, out_features=1, bias=False)
    )
    (embedding): Embedding(5893, 256)
    (rnn): GRU(1280, 512)
    (fc_out): Linear(in_features=1792, out_features=5893, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [214]:
#We may count number of trainable parameters in the model using the following function:
def count_params(model):
  return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Total number of trainable parameters in this model is: {count_params(model):,}")

Total number of trainable parameters in this model is: 20,518,917


In [215]:
#Get the optimizer and loss objects: for loss functions we are not going to encurs cost for padded indices
pad_idx = english_ln.vocab.stoi[english_ln.pad_token] #grab those indices related to padding tokens
loss_obj = nn.CrossEntropyLoss(ignore_index = pad_idx)
optimizer = optim.Adam(params = model.parameters(), lr = learning_rate)

In [216]:
print(model)

AutoEncoder(
  (encoder): Encoder(
    (embedding): Embedding(7855, 256)
    (rnn): GRU(256, 512, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=1536, out_features=512, bias=True)
      (v): Linear(in_features=512, out_features=1, bias=False)
    )
    (embedding): Embedding(5893, 256)
    (rnn): GRU(1280, 512)
    (fc_out): Linear(in_features=1792, out_features=5893, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)


In [217]:
#The training loop is aided by the following function:
def train_loop(model, iterator, optimizer, loss_obj, clip):
  model.train() #Turn on the regularization layers such as batch-norm and dropout
  loss_per_epoch = 0
  for i, batch in enumerate(iterator):
    #we fetch input and target from the batch item
    input,input_len = batch.src
    target = batch.trg
    optimizer.zero_grad() #initialize the grads to zero
    output = model(input, input_len, target)
    #target shape: [target_len, batch_size], output_dim = [target_len, batch_size, output_dim]
    output_dim = output.shape[-1]
    #reshape the prediction before use for cross-entropy loss: (also skip the first token (sos))
    output = output[1:].view(-1, output_dim) #View will reshape for us = [target_len-1*batch, output_dim]
    target = target[1:].view(-1) #shape: [target_len-1 * batch_size]
    loss = loss_obj(output, target)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters, clip)
    optimizer.step()
    loss_per_epoch += loss.item()
  return (loss_per_epoch/len(iterator))


In [218]:
def train(model, iterator, optimizer, criterion, clip):
  model.train()
  epoch_loss = 0
  for i, batch in enumerate(iterator):
    src, src_len = batch.src
    trg = batch.trg
    optimizer.zero_grad()
    output = model(src, src_len, trg)
    output_dim = output.shape[-1]
    output = output[1:].view(-1, output_dim)
    trg = trg[1:].view(-1)
    loss = criterion(output, trg)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
    optimizer.step()
    epoch_loss += loss.item()
  return epoch_loss / len(iterator)

In [222]:
#Evaluation loop will be done through the following function
def validation_loop(model, iterator, loss_obj):
  model.eval()#turn off the regularization layers
  loss_per_epoch = 0
  #We do not train the model again
  with torch.no_grad():
    for i, batch in enumerate(iterator):
      input, input_len = batch.src
      target = batch.trg #shape is: [target_len, batch_size]
      #we add 0 to turn-off teacher forcing
      output = model(input, input_len, target, 0)# shape: [target_len, batch_size, output_dim]
      output_dim = output.shape[-1] # grab the output dimension
      #Reshape the output to compute cross-entropy loss: Also ignore the first token
      output = output[1:].view(-1, output_dim) #shape = [target_len-1 * batch_size, output_dim]
      target = target[1:].view(-1)# shape: [target_len-1* batch_size]
      #In cross entropy loss we only need dim-0 for the predictor and actual to be the same dim
      loss = loss_obj(output, target)
      loss_per_epoch += loss.item()
  return (loss_per_epoch/len(iterator))




In [225]:
#Finaly we train our network as follows:
num_epochs = 100
clip = 1
best_validation_loss = float('inf')
for epoch in range(num_epochs):
  print(f"\n>>>>Training start for epoch {epoch + 1}\n>>>>Please wait while model is training....")
  tic = time.time()
  train_loss = train(model, train_iter, optimizer, loss_obj, clip)
  validation_loss = validation_loop(model, validation_iter, loss_obj)
  toc = time.time()
  #saving the best weights
  if validation_loss < best_validation_loss:
    best_validation_loss = validation_loss
    torch.save(model.state_dict(),'translation_with_attention.pt')
  print(f"\n>>>>Epoch: {epoch + 1}: Time elapsed: {time_fmt(toc - tic)}")
  print(f"\n>>>>Training loss: {float(train_loss):.4f}\n>>>>Validation loss: {float(validation_loss):.4f}")
  print(f"\n>>>>Training PPL: {math.exp(train_loss):7.4f}\n>>>>Validation PPL: {math.exp(validation_loss):7.4f}")




>>>>Training start for epoch 1
>>>>Please wait while model is training....

>>>>Epoch: 1: Time elapsed: 0: 00: 39.00

>>>>Training loss: 2.6738
>>>>Validation loss: 3.2985

>>>>Training PPL: 14.4944
>>>>Validation PPL: 27.0711

>>>>Training start for epoch 2
>>>>Please wait while model is training....

>>>>Epoch: 2: Time elapsed: 0: 00: 39.00

>>>>Training loss: 2.2689
>>>>Validation loss: 3.1946

>>>>Training PPL:  9.6687
>>>>Validation PPL: 24.3999

>>>>Training start for epoch 3
>>>>Please wait while model is training....

>>>>Epoch: 3: Time elapsed: 0: 00: 39.00

>>>>Training loss: 1.9632
>>>>Validation loss: 3.2423

>>>>Training PPL:  7.1223
>>>>Validation PPL: 25.5914

>>>>Training start for epoch 4
>>>>Please wait while model is training....

>>>>Epoch: 4: Time elapsed: 0: 00: 39.00

>>>>Training loss: 1.7235
>>>>Validation loss: 3.2237

>>>>Training PPL:  5.6039
>>>>Validation PPL: 25.1197

>>>>Training start for epoch 5
>>>>Please wait while model is training....

>>>>Epoch: 