<a href="https://colab.research.google.com/github/martinpius/MachineTranslation/blob/main/Neural_Machine_Translation_From_Scratch_With_Pytorch_Application_on_Multi30k_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# torch is needed by the rest of the notebook whether or not we are on Colab,
# so import it unconditionally instead of only inside the Colab probe below.
import torch

try:
  # google.colab only exists on a Colab runtime. Importing and mounting inside
  # the try block lets the except branch handle every other environment
  # (and a failed mount) instead of aborting the cell before COLAB is set.
  from google.colab import drive
  drive.mount("/content/drive", force_remount = True)
  COLAB = True
  print(f">>>> You are on Google CoLaB with Pytorch version {torch.__version__}")
except Exception as e:
  print(f">>>> {type(e)} {e}\n>>>> please correct {type(e)} and reload your drive")
  COLAB = False

def time_fmt(t: float = 123.981)->str:
  """Format a duration in seconds as ``hrs: H min: MM sec: SS.ss``.

  Args:
    t: elapsed time in seconds (e.g. the difference of two ``time.time()`` calls).

  Returns:
    A string such as ``"hrs: 0 min: 02 sec: 03.98"``. Minutes are zero-padded
    to two digits and seconds are shown with two decimals.
  """
  # Round once to whole centiseconds and split with integer arithmetic. This keeps
  # the fractional seconds (they used to be truncated, so the two decimals were
  # always ".00") and makes values like 59.999 carry into the minutes instead of
  # being displayed as "60.00".
  centiseconds = int(round(t * 100))
  h, remainder = divmod(centiseconds, 60 * 60 * 100)
  m, remainder = divmod(remainder, 60 * 100)
  s = remainder / 100
  return f"hrs: {h} min: {m:>02} sec: {s:05.2f}"
# Run on the GPU when PyTorch can see a CUDA device, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Smoke-test the formatter using its default argument.
print(f">>>> testing the time formating function........\n>>>> time elapsed\t{time_fmt()}")

Mounted at /content/drive
>>>> You are on Google CoLaB with Pytorch version 1.9.0+cu102
>>>> testing the time formating function........
>>>> time elapsed	hrs: 0 min: 02 sec: 03.00


In [None]:
# In this notebook we build a simple Google-Translate-style translator and train it on the Multi30k dataset.
# We focus on translating German sentences to English. The network is the usual encoder-decoder architecture,
# in which both the encoder and the decoder are RNNs built from LSTM layers.

In [None]:
import torch, spacy, math
from torch import nn
from torch import optim
import numpy as np
import pandas as pd
from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field, BucketIterator
import time, random, sys, datetime
from tqdm import tqdm
from tensorflow import summary
%load_ext tensorboard


In [None]:
# Download the small spaCy pipelines for English and German (the `!` lines are shell
# commands run by the notebook), then load each pipeline. The loaded objects are bound
# to eng_tokenizer / ger_tokenizer for use by the tokenizer functions in this notebook.
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm
eng_tokenizer = spacy.load("en_core_web_sm")
ger_tokenizer = spacy.load("de_core_news_sm")


[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')


In [None]:
# Seed every random number generator the notebook relies on (Python, NumPy, PyTorch CPU
# and CUDA) with the same value for reproducibility, and ask cuDNN for deterministic algorithms.
seed = 1234
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
  seed_fn(seed)
torch.backends.cudnn.deterministic = True

In [None]:
# We start by loading and preprocessing the data to be used in training our network:
# we first define a function for performing tokenization
def germany_tokenizer(text):
  '''Split a German string into a list of token strings using the spaCy German tokenizer.'''
  tokens = []
  for token in ger_tokenizer.tokenizer(text):
    tokens.append(token.text)
  return tokens

def english_tokenizer(text):
  '''Split an English string into a list of token strings using the spaCy English tokenizer.'''
  tokens = []
  for token in eng_tokenizer.tokenizer(text):
    tokens.append(token.text)
  return tokens
# Both languages share the same preprocessing: lower-case the text, tokenize it and
# wrap every sentence in "<sos>" ... "<eos>" markers. Only the tokenizer differs.
shared_field_options = dict(sequential = True, use_vocab = True, init_token = "<sos>", eos_token = "<eos>", lower = True)
english = Field(tokenize = english_tokenizer, **shared_field_options)
germany = Field(tokenize = germany_tokenizer, **shared_field_options)

# Load the Multi30k splits. The ".de" files are processed with the German field and the
# ".en" files with the English field; the whole step is timed.
tic = time.time()
train_data, validation_data, test_data = Multi30k.splits(exts = (".de", ".en"), fields = (germany, english))
print(f">>>> number of train examples: {len(train_data.examples)}\n>>> number of validation examples: {len(validation_data.examples)}\n>>>> number of test examples: {len(test_data.examples)}")
print(f">>> the first paired sample in the train data: {vars(train_data.examples[0])}")
toc = time.time()
print(f">>>> time elapsed for loading and preprocessing the data is : {time_fmt(toc - tic)}")

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:01<00:00, 971kB/s]


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 164kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 161kB/s]


>>>> number of train examples: 29000
>>> number of validation examples: 1014
>>>> number of test examples: 1000
>>> the first paired sample in the train data: {'src': ['zwei', 'junge', 'weiße', 'männer', 'sind', 'im', 'freien', 'in', 'der', 'nähe', 'vieler', 'büsche', '.'], 'trg': ['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']}
>>>> time elapsed for loading and preprocessing the data is : hrs: 0 min: 00 sec: 09.00


In [None]:
# Build the vocabulary of each language from the training split only, using the
# Field objects defined earlier. min_freq = 2 keeps only the tokens that occur at
# least twice in the training data (not twice within a single sentence).
for field in (english, germany):
  field.build_vocab(train_data, min_freq = 2)


In [None]:
# Build one batch iterator per split to stream the data during training,
# then pull a single training batch to inspect what a batch looks like.
batch_size = 128
all_splits = (train_data, validation_data, test_data)
train_iter, valid_iter, test_iter = BucketIterator.splits(datasets = all_splits, batch_size = batch_size)
train_batch = next(iter(train_iter))
print(f">>>> the first train batch is: {train_batch}")

>>>> the first train batch is: 
[torchtext.legacy.data.batch.Batch of size 128 from MULTI30K]
	[.src]:[torch.LongTensor of size 23x128]
	[.trg]:[torch.LongTensor of size 21x128]


In [None]:
# Our data is ready to be used to train a machine translation network. We now build our network step by step

In [None]:
# The encoder: an embedding layer followed by a multi-layer LSTM
# (the number of layers is set by num_layers).
class Encoder(nn.Module):
  '''
  Encodes a batch of source token indices into the final LSTM states.

  input_dim: size of the source vocabulary (number of embedding rows)
  embedding_dim: size of each embedding vector
  hidden_dim: size of the LSTM hidden state
  num_layers: number of stacked LSTM layers
  dropout: dropout probability, applied to the embeddings and passed to the LSTM
  '''
  def __init__(self, input_dim, embedding_dim, hidden_dim, num_layers, dropout):
    super().__init__()
    self.embedding = nn.Embedding(num_embeddings = input_dim, embedding_dim = embedding_dim)
    self.dropout = nn.Dropout(p = dropout)
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers = num_layers, dropout = dropout)

  def forward(self, input_tensor):
    '''
    input_tensor has the shape [seq_len, batch_size]
    the embedded input has the shape [seq_len, batch_size, embedding_dim]
    the LSTM returns (output, (hidden_state, cell_state)); the per-step output
    is discarded and only the final hidden and cell states are returned,
    in that order: (hidden_state, cell_state)
    '''
    embedded = self.embedding(input_tensor)
    embedded = self.dropout(embedded)
    _, final_states = self.lstm(embedded)
    hidden_state, cell_state = final_states
    return hidden_state, cell_state


In [None]:
# The decoder: an embedding layer, a multi-layer LSTM and a linear layer.
# It consumes one target token per call, together with the LSTM states from the
# previous step (initially the encoder's states). The linear layer produces one
# score per vocabulary entry; no softmax is applied inside this module.
class Decoder(nn.Module):
  '''
  Single-step decoder.

  input_dim: size of the target vocabulary (number of embedding rows)
  embedding_dim: size of each embedding vector
  hidden_dim: size of the LSTM hidden state
  num_layers: number of stacked LSTM layers
  output_dim: number of scores produced per token by the final linear layer
  dropout: dropout probability, applied to the embeddings and passed to the LSTM
  '''
  def __init__(self, input_dim, embedding_dim, hidden_dim, num_layers,output_dim, dropout):
    super().__init__()
    self.hidden_dim = hidden_dim
    self.num_layers = num_layers
    self.dropout = nn.Dropout(p = dropout)
    self.embedding = nn.Embedding(num_embeddings = input_dim, embedding_dim = embedding_dim)
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers = num_layers, dropout = dropout)
    self.fc = nn.Linear(hidden_dim, output_dim)

  def forward(self, input_tensor, cell_state, hidden_state):
    '''
    Decode one time step.

    Note the argument order: cell_state comes BEFORE hidden_state, both in the
    parameters and in the returned tuple.

    input_tensor holds one token index per batch element; a leading dimension of
    size 1 is added so the LSTM sees a sequence of length one: [1, batch_size]
    embedded input shape: [1, batch_size, embedding_dim]
    dec_out shape: [1, batch_size, hidden_dim]
    predictions shape after dropping the leading dimension: [batch_size, output_dim]

    returns: (predictions, cell_state, hidden_state)
    '''
    step_input = input_tensor.unsqueeze(0) # sequence of length one
    embedded = self.embedding(step_input)
    embedded = self.dropout(embedded)
    # nn.LSTM takes and returns its states as (hidden, cell)
    dec_out, (hidden_state, cell_state) = self.lstm(embedded, (hidden_state, cell_state))
    predictions = self.fc(dec_out).squeeze(0) # one score per vocabulary entry
    return predictions, cell_state, hidden_state



In [None]:
# We now combine the encoder and the decoder classes to build the final network.
class Seq2Seq(nn.Module):
  '''
  Encoder-decoder wrapper that decodes the target sequence one token at a time.

  encoder: module called as encoder(source) and returning (hidden_state, cell_state)
  decoder: module called as decoder(token, cell_state=..., hidden_state=...) and
           returning (predictions, cell_state, hidden_state). It must expose a
           linear layer `fc` whose `out_features` is the number of scores per token.
  '''
  def __init__(self, encoder, decoder):
    super(Seq2Seq, self).__init__()
    self.encoder = encoder
    self.decoder = decoder

  def forward(self, source, target, teacher_force_ratio = 0.5):
    '''
    Run the encoder on `source`, then decode `target` step by step with teacher forcing.

    source: [source_len, batch_size] token indices
    target: [target_len, batch_size] token indices; target[0] is fed as the first
            decoder input (the "<sos>" token)
    teacher_force_ratio: at every step, with this probability the ground-truth token
            target[t] is fed to the decoder next; otherwise the decoder's own most
            probable token is fed

    returns: tensor of shape [target_len, batch_size, vocab_size]; row 0 is left as
             zeros because decoding starts at position 1
    '''
    batch_size = source.shape[1] # grab the batch size
    target_dim = target.shape[0] # grab the target length
    # Read the vocabulary size from the decoder's output layer and the device from
    # the input, so the module does not depend on notebook-level globals.
    target_vocab_len = self.decoder.fc.out_features
    # container holding the decoder scores of every step
    outputs = torch.zeros(target_dim, batch_size, target_vocab_len, device = source.device)
    # run the encoder on the source language to obtain the initial decoder states
    hidden_state, cell_state = self.encoder(source)
    x = target[0] # grab the "<SOS>" token from the target
    for t in range(1, target_dim):
      # The decoder takes and returns the states in (cell, hidden) order. Pass them
      # by keyword and unpack in that same order; passing (hidden, cell) positionally
      # swapped the encoder's two states on the first decoding step.
      out, cell_state, hidden_state = self.decoder(x, cell_state = cell_state, hidden_state = hidden_state)
      outputs[t] = out
      # grab the highest scoring token
      best_guess = out.argmax(1)
      # teacher forcing: feed the true token with probability teacher_force_ratio
      x = target[t] if random.random() < teacher_force_ratio else best_guess
    return outputs


In [None]:
# Hyper-parameters of the network and of the training run
EPOCHS = 30             # number of passes over the training iterator
learning_rate = 1e-3    # step size handed to the optimizer
hidden_dim = 1024       # LSTM hidden size, shared by encoder and decoder
embedding_dim = 300     # size of the token embeddings
dropout = 0.5           # dropout probability
num_layers = 2          # stacked LSTM layers in encoder and decoder
# Vocabulary sizes: the encoder reads German tokens, the decoder reads and emits English tokens.
enc_input_dim = len(germany.vocab)
dec_input_dim = dec_output_dim = len(english.vocab)

In [None]:
# Instantiate the encoder, the decoder and the combined model, each moved to the selected device.
encoder = Encoder(
    input_dim = enc_input_dim,
    embedding_dim = embedding_dim,
    hidden_dim = hidden_dim,
    num_layers = num_layers,
    dropout = dropout,
).to(device)
decoder = Decoder(
    input_dim = dec_input_dim,
    embedding_dim = embedding_dim,
    hidden_dim = hidden_dim,
    num_layers = num_layers,
    output_dim = dec_output_dim,
    dropout = dropout,
).to(device)
model = Seq2Seq(encoder, decoder).to(device)
print(f">>>>encoder's graph\n\n: {encoder}\n\n decoder's graph:\n\n: {decoder}\n\nmodel's graph:\n\n{model}")

>>>>encoder's graph

: Encoder(
  (embedding): Embedding(7855, 300)
  (dropout): Dropout(p=0.5, inplace=False)
  (lstm): LSTM(300, 1024, num_layers=2, dropout=0.5)
)

 decoder's graph:

: Decoder(
  (dropout): Dropout(p=0.5, inplace=False)
  (embedding): Embedding(5893, 300)
  (lstm): LSTM(300, 1024, num_layers=2, dropout=0.5)
  (fc): Linear(in_features=1024, out_features=5893, bias=True)
)

model's graph:

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7855, 300)
    (dropout): Dropout(p=0.5, inplace=False)
    (lstm): LSTM(300, 1024, num_layers=2, dropout=0.5)
  )
  (decoder): Decoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(5893, 300)
    (lstm): LSTM(300, 1024, num_layers=2, dropout=0.5)
    (fc): Linear(in_features=1024, out_features=5893, bias=True)
  )
)


In [None]:
# Loss and optimizer used to train the network.
# Positions holding the "<pad>" token are excluded from the loss via ignore_index,
# so the network is not penalized for padding.
pad_idx = english.vocab.stoi["<pad>"]
loss_obj = nn.CrossEntropyLoss(ignore_index = pad_idx)
optimizer = optim.Adam(model.parameters(), lr = learning_rate)


In [None]:
# Create the summary writer used for TensorBoard logging. The current timestamp is
# appended to the log path, so every run writes to its own sub-directory.
current_time = datetime.datetime.now().timestamp()
my_dir = "".join(("logs/tensorboard/MTL_multi30k/", str(current_time)))
writer = summary.create_file_writer(logdir = my_dir)


In [None]:
# the training loop:
tic = time.time()
step = 0
for epoch in range(EPOCHS):
  print(f"\n>>>> training starts for epoch {epoch + 1}\n>>>> training please wait................................")
  epoch_loss = 0
  for idx, batch in enumerate(tqdm(train_iter)):
    source = batch.src.to(device = device) # grabbing the source language == germany
    target = batch.trg.to(device = device) # grabbing the target language == english
    output = model(source, target) # shape == [len(english_voc, batch, output_dim)]
    # to use cross-entropy object we need to reshape the predictions
    output = output[1:].reshape(-1, output.shape[2]) # we also eliminate the sos token
    target = target[1:].reshape(-1)
    optimizer.zero_grad()
    loss = loss_obj(output, target)
    loss.backward()
    #clipping the gradients to avoid explosion
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm = 1)
    optimizer.step()
    epoch_loss+=loss.item()
    mean_loss = f"{epoch_loss/len(train_iter):.4f}"
    train_PPL = f"{math.exp(loss):7.4f}"
    if idx % 100 == 0:
      print(f"\n>>>> end of training for epoch {epoch + 1}, train loss: {mean_loss}, train PPL: {train_PPL}")
      with writer.as_default():
        summary.scalar("train_loss", loss.item(), step = step)
        step+=1
toc = time.time()
print(f"\n>>>> time elapse for training a simple google translator in 20 epochs: {time_fmt(toc - tic)}")
%tensorboard --logdir logs/tensorboard





  0%|          | 0/227 [00:00<?, ?it/s]


>>>> training starts for epoch 1
>>>> training please wait................................


  0%|          | 1/227 [00:22<1:23:21, 22.13s/it]


>>>> end of training for epoch 1, train loss: 0.0382, train PPL: 5860.0914


 44%|████▍     | 101/227 [37:23<51:15, 24.40s/it]


>>>> end of training for epoch 1, train loss: 2.2956, train PPL: 128.4347


 89%|████████▊ | 201/227 [1:17:41<10:02, 23.19s/it]


>>>> end of training for epoch 1, train loss: 4.3695, train PPL: 87.2428


100%|██████████| 227/227 [1:27:31<00:00, 23.13s/it]
  0%|          | 0/227 [00:00<?, ?it/s]


>>>> training starts for epoch 2
>>>> training please wait................................


  0%|          | 1/227 [00:29<1:49:22, 29.04s/it]


>>>> end of training for epoch 2, train loss: 0.0206, train PPL: 107.6500


 44%|████▍     | 101/227 [41:31<50:17, 23.95s/it]


>>>> end of training for epoch 2, train loss: 1.9874, train PPL: 99.4584


 89%|████████▊ | 201/227 [1:22:20<12:14, 28.26s/it]


>>>> end of training for epoch 2, train loss: 3.8944, train PPL: 79.6257


 99%|█████████▊| 224/227 [1:32:29<01:12, 24.15s/it]