<a href="https://colab.research.google.com/github/martinpius/PYTORCH/blob/main/Machine_Translation_A_seq2Seq_Model_with_Pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount = True)
try:
  COLAB = True
  import torch
  print(f"You are on CoLaB with torch version: {torch.__version__}")
except Exception as e:
  COLAB = False
  print(f"{type(e)}: {e}\n>>>>Please correct {type(e)} and re-load your drive")
def time_fmt(t:float = 123.842)->float:
  h = int(t / (60 * 60))
  m = int(t % (60 * 60) / 60)
  s = int(t%60)
  return f"{h}: {m:>02}: {s:>05.2f}"
print(f">>>>time formating:\tplease wait....\n>>>>time elapsed\t:{time_fmt()}")
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')

Mounted at /content/drive
You are on CoLaB with torch version: 1.8.1+cu101
>>>>time formating:	please wait....
>>>>time elapsed	:0: 02: 03.00


In [2]:
#In this notebook we are going to train a Machine translation model
#This is a typical sequence to sequence model (where the input is a sequence and output is another sequence)
#The architecture of this model is a typical encoder-decoder
#We will apply Multi30k datasets from torchtext (using only germany and english columns)

In [3]:
import torch, spacy, time, random, sys,math
from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field, BucketIterator #For data preprocessing
import numpy as np#to set-up the device
import torch.optim as optim
import torch.nn as nn

In [4]:
#Setup the gpu device to avoid cuda error!!!
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True


In [5]:
#We start by installing the tokenizers to be used in pre-processing our texts
!python -m spacy download en_core_web_sm #For english
!python -m spacy download de_core_news_sm #For germany

#Loading the tokenizer ready to be used
spacy_de = spacy.load('de_core_news_sm') # Restart the kernel to run it effectively
spacy_en = spacy.load('en_core_web_sm')

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')


In [6]:
#The following simple functions will take the raw texts and return the list of tokens.
#Example: [I love eating -->>> 'I' 'love' 'eating']

def tokenize_de(text):
  #We also reverse the input sentence to make the model predict more for unseen data (not necessarily mandatory)
  return [tok.text for tok in spacy_de.tokenizer(text)][::-1]

def tokenize_en(text):
  return [tok.text for tok in spacy_en.tokenizer(text)]

In [7]:
#We preprocess the text by applying the above functions, also we mark the start and end of each sentence
#To know where to start and where to terminate the predictions ['start of sentence = sos', end of sentence = eos]
#We use Field method to perform this tasks(Field also has more parameters (like lower = Bool), etc to add whenever necessary)

germany_ln = Field(tokenize = tokenize_de, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)

english_ln = Field(tokenize = tokenize_en, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)

In [8]:
#We first load our data using Multi30k class (This may take some time to download depending with your internet speed)
tic = time.time()
print(f"Please wait while the data is downloaded to CoLaB.....")
train_data, validation_data, test_data = Multi30k.splits(exts = ('.de', '.en'),fields = (germany_ln, english_ln))
toc = time.time()
print(f"Time elapsed: {time_fmt(toc - tic)}")

Please wait while the data is downloaded to CoLaB.....
downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:04<00:00, 284kB/s]


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 91.9kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 85.9kB/s]


Time elapsed: 0: 31: 31.00


In [9]:
#We build vocabulary for only training data to avoid falacy during validation and testing
#We keep those tokens which occurs at least twice
germany_ln.build_vocab(train_data, min_freq = 2)
english_ln.build_vocab(train_data, min_freq = 2)

In [10]:
#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [11]:
#We can then build our iterator that will be used to stream in our model ready for training
#We also alocate the iterator to the gpu when available
batch_size = 64
train_iterator, validation_iterator, test_iterator = BucketIterator.splits(
    (train_data, validation_data, test_data), 
    batch_size = batch_size, 
    device = device)

In [12]:
#We build our model in 3 stages: Encoder, Decoder and AutoEncoder(Model)


In [13]:
#The Encoder Class: Is the ussual RNN with 2 LSTM layers
#The input dimension = len(germary_vocabulary), We consider 
#The embedding layer of 300 dimension and the hidden size of 1024 neurons
#The return from this class is the context vectors(hidden and cell states)
#To be used as additional inputs to our decoder network later on

class Encoder(nn.Module):
  def __init__(self, input_dim, embd_dim, hidden_size, num_layers, dropout):
    super(Encoder, self).__init__()
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    
    self.dropout = nn.Dropout(dropout)
    self.embedding = nn.Embedding(input_dim, embd_dim)
    self.rnn = nn.LSTM(embd_dim, hidden_size, num_layers, dropout = dropout)
  
  def forward(self, input_tensor):
    #input_tensor_shape: [len_germany_vocab, batch_size]
    x = self.dropout(self.embedding(input_tensor))
    output, (hidden, cell) = self.rnn(x)
    #Hidden, cell shape = [num_layers, batch_size, hidden_dim]
    return hidden, cell


In [14]:
#The decoder class: Is also an rnn with 2 LSTM layers that takes
#inputs of size [1-token at a time] and context vector from the encoder
#We have an additional fully connected layer for the output
#The output of this class will be the predictions, cell and hidden states
#To be fed to the next prediction as input (since we will predict one token at a time)

class Decoder(nn.Module):
  def __init__(self, output_dim, embd_dim, hidden_size, num_layers, dropout):
    super(Decoder, self).__init__()
    self.output_dim = output_dim
    self.num_layers = num_layers
    self.hidden_size = hidden_size
    
    self.dropout = nn.Dropout(dropout)
    self.embedding = nn.Embedding(output_dim, embd_dim)
    self.rnn = nn.LSTM(embd_dim, hidden_size, num_layers, dropout = dropout)
    self.fc = nn.Linear(hidden_size, output_dim)
  
  def forward(self, input_tensor, hidden, cell):
    #input tensor shape [batch_size]
    #We need to make sure we enter one token at a time
    input_tensor = input_tensor.unsqueeze(0) #Force the shape to be [1,batch_size]
    embedded = self.dropout(self.embedding(input_tensor)) #shape = (1, batch_size, emb_dim)
    output, (hidden, cell) = self.rnn(embedded, (hidden, cell)) # output shape: (1, batch_size, hidden_dm*1d))
    predictions = self.fc(output.squeeze(0))# predictions (unsqueezed = [batch_size, hidden_dim]) so we squeeze back to [batch_size]
    return predictions, hidden, cell

In [15]:
#We are ready to construct the model class by simply linking the two classes above
class AutoEncoder(nn.Module):
  def __init__(self, encoder, decoder, device):
    super(AutoEncoder, self).__init__()
    self.encoder = encoder
    self.decoder = decoder
    self.device = device
  
  def forward(self, input_tensor, target, teacher_forcing_ratio = 0.5):
    '''
    this method will build a prediction with the help of teacher-forcing
    technique(we provide the ground truth 50% chance as input to the next 
    time stamp) else we feed the predicted output as input for the next token
    prediction. if prob = 1 then the model wont be able to generalize for the test/validation
    '''
    batch_size = target.shape[1] # we know the shape of target = [len(english_voc), batch_size]
    target_len = target.shape[0] #target = [len(english_voc), batch_size]
    target_voc_size = self.decoder.output_dim #we grab the dimension of target (len(english_vocabulary)
    outputs = torch.zeros(target_len, batch_size, target_voc_size).to(self.device) #Container for the predictions
    hidden, cell = self.encoder(input_tensor) #We simply run the encoder to get the context for the initial input to decoder
    input = target[0,:] # grab one token at a time for the decoder
    for t in range(1, target_len):
      output, hidden, cell = self.decoder(input, hidden, cell) #We now run the decoder for every token until the end os a sentence
      outputs[t] = output #assigning to the container
      #Applying teacher forcing if the condition is satisfied else use best guess = previous output
      teacher_force = random.random() < teacher_forcing_ratio
      best_guess = output.argmax(1)
      input = target[t] if teacher_force else best_guess
    return outputs



In [16]:
#Hyper-parameters for the Model
input_dim = len(germany_ln.vocab)
output_dim = len(english_ln.vocab)
enc_embd_dim = 256
dec_embd_dim = 256
hidden_size = 512
num_layers = 2
enc_dropout, dec_dropout = 0.5, 0.5

In [17]:
enc = Encoder(input_dim,enc_embd_dim,hidden_size,num_layers, enc_dropout)
dec = Decoder(output_dim, dec_embd_dim,hidden_size, num_layers, dec_dropout)

model = AutoEncoder(enc, dec, device).to(device)

In [18]:

#We initialize all parameters to uniform distribution in range[-0.08, 0.08]
def init_weights(m):
  for name, param in m.named_parameters():
    nn.init.uniform_(param.data, -0.08, 0.08)

model.apply(init_weights) #apply the function to initialize the parameters of the model to a uniform distribution

AutoEncoder(
  (encoder): Encoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(7855, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
  )
  (decoder): Decoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(5893, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc): Linear(in_features=512, out_features=5893, bias=True)
  )
)

In [19]:
#Get the optimizer and loss objects
#We exclude padded indices (we are not going to incur any cost due to padding)
optimizer = optim.Adam(model.parameters())
pad_idx = english_ln.vocab.stoi[english_ln.pad_token] #grab all padded indices
loss_obj = nn.CrossEntropyLoss(ignore_index = pad_idx)

In [23]:
#The training function help us for the training loop
def train(model, iterator, optimizer, loss_obj, clip):
  model.train() #to activate the regularizer layers such as batchnorm/dropout
  epoch_loss = 0
  for i, batch in enumerate(iterator):
    src = batch.src #grab the input tensor (related to germany tokens)
    trg = batch.trg #grab the target tensor (related to english token)
    optimizer.zero_grad() # initialize the grads for the wt to zeros
    output = model(src, trg) #Ussual forward pass 
    output_dim = output.shape[-1] # output_shap: [1,batch, target_dim]
    output = output[1:].view(-1, output_dim) #grab from 2nd token and shrink the dimension to [1*batch, target_dim]
    trg = trg[1:].view(-1) #do the same as above (reshaping the target)
    loss = loss_obj(output, trg) #compute train loss
    loss.backward() #backward pass as ussual
    torch.nn.utils.clip_grad_norm_(model.parameters(), clip)#cliping the gradient to avoid explosion 
    optimizer.step()
    epoch_loss += loss.item()
  return epoch_loss / len(iterator)

In [24]:
#Evaluation loop
def evaluate(model, iterator, loss_obj):
  model.eval() # turn off regularizers
  epoch_loss = 0
  with torch.no_grad():#No need to re-compute the gradient again
    for i, batch in enumerate(iterator):
      src = batch.src
      trg = batch.trg
      output = model(src, trg, 0) #turn off teacher forcing
      output_dim = output.shape[-1]
      output = output[1:].view(-1, output_dim)
      trg = trg[1:].view(-1)
      loss = loss_obj(output, trg)
      epoch_loss += loss.item()
  return epoch_loss / len(iterator)

In [None]:
#We finally training our model for 100 epochs 
num_epochs = 100
clip = 1 #for clipping the gradient
best_valid_loss = float('inf') #to check if an epoch has achieved a best validation loss
for epoch in range(num_epochs):
  tic = time.time()
  train_loss = train(model, train_iterator, optimizer,loss_obj,clip)
  valid_loss = evaluate(model, validation_iterator, loss_obj)
  toc = time.time()
  epoch_time = time_fmt(toc-tic)
  if valid_loss < best_valid_loss:
    best_valid_loss = valid_loss
    torch.save(model.state_dict(), 'mtl_model.pt')
    
  print(f'Epoch: {epoch+1:02} | Time elapsed: {epoch_time}')
  print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
  print(f'\tValidation. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time elapsed: 0: 01: 32.00
	Train Loss: 4.172 | Train PPL:  64.825
	Validation. Loss: 4.514 |  Val. PPL:  91.329
Epoch: 02 | Time elapsed: 0: 01: 32.00
	Train Loss: 3.879 | Train PPL:  48.384
	Validation. Loss: 4.317 |  Val. PPL:  74.945
Epoch: 03 | Time elapsed: 0: 01: 31.00
	Train Loss: 3.668 | Train PPL:  39.173
	Validation. Loss: 4.096 |  Val. PPL:  60.105
Epoch: 04 | Time elapsed: 0: 01: 32.00
	Train Loss: 3.455 | Train PPL:  31.667
	Validation. Loss: 4.007 |  Val. PPL:  55.003
Epoch: 05 | Time elapsed: 0: 01: 31.00
	Train Loss: 3.276 | Train PPL:  26.469
	Validation. Loss: 3.922 |  Val. PPL:  50.508
Epoch: 06 | Time elapsed: 0: 01: 32.00
	Train Loss: 3.128 | Train PPL:  22.838
	Validation. Loss: 3.835 |  Val. PPL:  46.284
Epoch: 07 | Time elapsed: 0: 01: 31.00
	Train Loss: 2.996 | Train PPL:  20.007
	Validation. Loss: 3.769 |  Val. PPL:  43.337
Epoch: 08 | Time elapsed: 0: 01: 32.00
	Train Loss: 2.871 | Train PPL:  17.654
	Validation. Loss: 3.810 |  Val. PPL:  45.168
