# Neural Machine Translation

The NMT method performs translation via deep learning. 

This methodology comprises an encoder and a decoder. It vectorizes the source sentence through the encoder, condenses the information as a context vector, and generates the translated target sentence in the decoder based on the condensed information. 

Methodologies utilized for NMT include recurrent neural networks (RNNs), convolutional neural networks (CNNs), and the
Transformer model.
The Transformer model has exhibited better performance than the other approaches.

## Encoder-Decoder GRU & GloVe Embeddings

1. one vocab for english, one different vocab for french
2. one embedding matrix for english, one different embedding matrix for french
3. add the END token to english sentences, and french sentences
4. the START token is only used in the decoder as the first token

We pad all sequences in the batch with 0s up to the length of the longest sequence (this is a classic process for variable-length batches, and you can find plenty of posts on this subject online). 

## Import Packages

In [8]:
# !python -m spacy download en_core_web_lg

In [9]:
# !python -m spacy download fr_core_news_lg

In [10]:
import os                                 # to create 'serialised' directory
import pandas as pd
import numpy as np
import re
import time
import random
import unicodedata                        # for data cleaning
import spacy                              # for tokenisation
from collections import Counter           # this is for tokens dictionary function
from pprint import pprint

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pad_packed_sequence, pack_padded_sequence
from torch.nn.utils import clip_grad_value_, clip_grad_norm_
from torchtext.data.metrics import bleu_score

import matplotlib.pyplot as plt

In [11]:
# Notebook display settings. Full option names are used on purpose: abbreviated
# keys such as 'precision' are resolved by pattern matching and become ambiguous
# (OptionError) when pandas has more than one option containing that text.
pd.set_option('display.max_rows', None)             # show all rows of a dataframe
pd.set_option('display.max_columns', None)          # show all columns of a dataframe
pd.set_option('display.max_colwidth', None)         # show the full width of columns
pd.set_option('display.precision', 2)               # round to 2 decimal points
pd.options.display.float_format = '{:,.2f}'.format  # comma separators and two decimal points: 4756.7890 => 4,756.79 and 4656 => 4,656.00
torch.set_printoptions(profile='full')              # prints the whole tensor
# torch.set_printoptions(profile="default")         # reset to printing the truncated tensor

Load the spaCy language models (English and French)

In [12]:
# spaCy pipelines used for tokenisation only: S_TOK is the English model,
# T_TOK the French one. They are loaded in that order.
S_TOK, T_TOK = (spacy.load(model_name) for model_name in ('en_core_web_lg', 'fr_core_news_lg'))

In [13]:
# Mini-batch size shared by the dataloaders and the training loop.
batch_size = 32                                                             # for dataloader and training

In [14]:
# Notebook cell output: how many CUDA devices PyTorch can see.
torch.cuda.device_count()

In [15]:
# Report the name of the first CUDA device. The name is printed explicitly
# because an expression inside a `try` block is not echoed by the notebook.
# Only the errors PyTorch raises when CUDA is missing/unusable are caught, so
# unrelated failures are no longer hidden by a bare `except`.
try:
    print(torch.cuda.get_device_name(0))
except (AssertionError, RuntimeError):
    print('no cuda available')

In [16]:
# Use the GPU when PyTorch can reach one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')      # pytorch cuda
device

## Functions

### Random Seeds

https://pytorch.org/docs/stable/notes/randomness.html

In [17]:
seed_value = 9

def random_seeding(seed_value=seed_value):
    """Seed every random number generator used in this notebook.

    Args:
        seed_value: integer seed applied to Python's `random`, NumPy's global
            generator and PyTorch (CPU and all CUDA devices).
    """
    random.seed(seed_value)                            # python
    np.random.seed(seed_value)                         # numpy - global seeding
    torch.manual_seed(seed_value)                      # pytorch cpu
#     torch.set_deterministic(True)                    # this raises an error when running the decoder
    # Seed the GPUs unconditionally: the previous `device == 'cuda'` test compared a
    # torch.device with a string and depended on a global defined elsewhere.
    # manual_seed_all is documented as a silent no-op when CUDA is not available.
    torch.cuda.manual_seed_all(seed_value)             # pytorch gpu

### Data Cleaning

In [18]:
# Strip accents from a Unicode string: e.g. 'garçon' becomes 'garcon'.
# The text is decomposed (NFD) so that accents become separate combining
# marks (category 'Mn'), which are then dropped.
def unicodeToAscii(text):
    decomposed = unicodedata.normalize('NFD', text)
    kept = [char for char in decomposed if unicodedata.category(char) != 'Mn']
    return ''.join(kept)

# Normalise a raw sentence and tokenise it with a spaCy pipeline.
# Returns the list of tokens framed as ['<SOS>', ..., '<EOS>'].
def clean_tokenise(text, TOK):                            # to be used when tokenizing with spacy
    normalised = unicodeToAscii(text.lower())
    # put a white space on both sides of every punctuation mark captured by the group
    normalised = re.sub(r"([.,!?])", r" \1 ", normalised)
    # collapse anything that is not a letter or kept punctuation into one space, e.g. "he's" becomes "he s"
    normalised = re.sub(r"[^a-zA-Z.,!?]+", r" ", normalised)
    words = [token.text for token in TOK.tokenizer(normalised)]   # for tokenization with spacy
    return ['<SOS>', *words, '<EOS>']

### Vocabulary from the Training Corpus

In [19]:
def tokens_dict(series, min_freq=3):
    """Build the word<->index vocabularies from a series of token lists.

    Args:
        series: pandas Series whose elements are lists of tokens.
        min_freq: minimum number of occurrences a token needs to be kept in the
            vocabulary (default 3, the cut-off that used to be hard-coded).

    Returns:
        (unique_words_before, unique_words_after, final_unique_words,
        tokens_dictionary, reversed_dict) where the first three are vocabulary
        sizes before filtering, after filtering, and after adding the special
        entries; `tokens_dictionary` maps word -> index and `reversed_dict`
        maps index -> word.
    """
    tokens = Counter(series.explode().tolist())                 # frequency distribution. This basically is a dictionary

    unique_words_before = len(tokens)

    # Keep first-seen order; rare tokens are filtered out instead of being deleted in place.
    kept_words = [word for word, count in tokens.items() if count >= min_freq]

    # word2index dictionary. Indices start at 1 because index 0 is reserved for padding.
    tokens_dictionary = {word: i for i, word in enumerate(kept_words, start=1)}

    unique_words_after = len(tokens_dictionary)

    # Special entries: unknown words, and the empty token used by encoding() for empty sequences.
    tokens_dictionary['<UNK>'] = len(tokens_dictionary) + 1
    tokens_dictionary[''] = len(tokens_dictionary) + 1

    final_unique_words = len(tokens_dictionary)

    reversed_dict = {int(v): k for k, v in tokens_dictionary.items()}

    return unique_words_before, unique_words_after, final_unique_words, tokens_dictionary, reversed_dict

### All GloVe Embeddings

In [20]:
def load_glove_from_file(glove_filepath):
    """Read a whitespace-separated embeddings text file into a dictionary.

    Each non-blank line is expected to hold a word followed by its vector
    components.

    Args:
        glove_filepath: path of the UTF-8 text file.

    Returns:
        dict mapping each word to a float32 numpy vector.
    """
    glove_embeddings_dict = {}

    with open(glove_filepath, mode='r', encoding="utf-8") as f:
        for line in f:
            line_split = line.split()
            if not line_split:                 # blank line (e.g. trailing newline): nothing to parse
                continue
            word = line_split[0]
            glove_embeddings_dict[word] = np.array(line_split[1:], 'float32')

    return glove_embeddings_dict

### Embedding Matrix

In [21]:
def make_embedding_matrix(glove_embeddings_dict, tokens_dictionary):
    """Assemble the embedding matrix for a vocabulary.

    Row `i` holds the vector of the word whose index is `i` in
    `tokens_dictionary`. Row 0 stays a zero vector: it is the extra row
    reserved for the padding index. Words without a pretrained vector get a
    row drawn with PyTorch's xavier_uniform initialiser.

    Returns:
        (not_in_glove, embeddings_matrix): the list of words that had no
        pretrained vector, and the numpy matrix of shape
        (len(tokens_dictionary) + 1, embedding_size).
    """
    first_vector = next(iter(glove_embeddings_dict.values()))
    embedding_size = len(first_vector)                      # length of each word embedding (e.g. 300 dimensions)

    n_rows = len(tokens_dictionary) + 1                     # vocabulary size + the zero row used for padding
    embeddings_matrix = np.zeros((n_rows, embedding_size))

    not_in_glove = []
    for token, i in tokens_dictionary.items():
        pretrained = glove_embeddings_dict.get(token)
        if pretrained is not None:
            embeddings_matrix[i, :] = pretrained
            continue

        # word is not in the pretrained embeddings: random vector from xavier_uniform
        not_in_glove.append(token)
        embeddings_matrix[i, :] = torch.nn.init.xavier_uniform_(torch.ones(1, embedding_size))

    return not_in_glove, embeddings_matrix

### Encoding & Padding

In [22]:
def encoding(input_sequence, tokens_dictionary, max_len):
    """Turn a list of tokens into a fixed-length array of vocabulary indices.

    Unknown words map to the '<UNK>' index, an empty sequence maps to the
    single '' entry, and the result is truncated or right-padded with 0s to
    exactly `max_len` positions.
    """
    if input_sequence:
        ids = [tokens_dictionary.get(word, tokens_dictionary['<UNK>']) for word in input_sequence]
    else:
        ids = [tokens_dictionary['']]
    ids = np.array(ids)

    padded = np.zeros(max_len, dtype=int)
    keep = min(max_len, len(ids))
    padded[:keep] = ids[:keep]
    return padded

### Encoder-Decoder Model

In [23]:
class Encoder(nn.Module):
    """GRU encoder: embeds padded source token ids and runs them through a GRU.

    Args:
        emb_dim: size of each embedding vector (also the GRU input size).
        emb_num: number of rows of the embedding table (used only when no
            pretrained matrix is supplied).
        hidden_size: GRU hidden size per direction.
        num_layers: number of stacked GRU layers.
        dropout_p: dropout probability, applied to the embeddings and passed to nn.GRU.
        batch_first, bias, bidirectional: forwarded to nn.GRU.
            NOTE(review): forward() counts lengths along dim 1 and reads the batch
            size from dim 0, so it appears to assume batch_first=True - confirm
            before using batch_first=False.
        pretrained_embeddings: optional numpy embedding matrix; when given it is
            loaded as a frozen embedding layer.
        padding_idx: index of the padding vector (row 0 of the embedding matrix),
            which is never updated during training.
    """

    def __init__(self, emb_dim, emb_num, hidden_size, num_layers, dropout_p, batch_first=True, bias=True,
                 bidirectional=True, pretrained_embeddings=None, padding_idx=0):

        super(Encoder, self).__init__()

        self.emb_dim = emb_dim
        self.emb_num = emb_num
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout_p = dropout_p
        self.batch_first = batch_first
        self.bias = bias
        self.bidirectional = bidirectional
        self.pretrained_embeddings = pretrained_embeddings
        self.padding_idx = padding_idx

        # Number of GRU directions
        self.D = 1
        if self.bidirectional==True:
            self.D = 2

        # Initialise embedding layer
        if self.pretrained_embeddings is None:
            # trainable embeddings learnt from scratch
            self.emb = nn.Embedding(num_embeddings=self.emb_num,
                                    embedding_dim=self.emb_dim,
                                    padding_idx=self.padding_idx)
        else:
            self.pretrained_embeddings = torch.from_numpy(self.pretrained_embeddings).float()
            # freeze=True : the embedding weights are NOT updated in the learning process.
            # Equivalent to self.emb.weight.requires_grad = False
            self.emb = nn.Embedding.from_pretrained(self.pretrained_embeddings, freeze=True, padding_idx=self.padding_idx)

        # Initialise GRU model
        self.gru = nn.GRU(input_size=self.emb_dim, hidden_size=self.hidden_size, num_layers=self.num_layers,
                          dropout=self.dropout_p, bidirectional=self.bidirectional, bias=self.bias,
                          batch_first=self.batch_first)

        # Regularization parameter
        self.dropout = nn.Dropout(self.dropout_p)

    def forward(self, x):
        """Encode a batch of padded token ids.

        Args:
            x: tensor of token ids, one row per sentence, padded with 0s,
               e.g. [[3386, 603, 1112, 0], [176, 40, 97, 0]].

        Returns:
            outputs: (longest_length_in_batch, batch, hidden_size) - the last
                `hidden_size` features of the GRU output at every time step
                (padded steps are zero vectors).
            hidden_state: (1, batch, hidden_size) - the last entry of the final
                GRU hidden state.
        """
        # length of each sequence in the batch, ignoring the padding. This is needed for Packed Sequence
        # (pack_padded_sequence wants the lengths on the CPU)
        lengths = (x > 0).sum(dim=1).cpu()

        embedding = self.dropout(self.emb(x.long()))

        batch_size = embedding.shape[0]

        # Allocate the initial state on the same device as the embeddings, so the
        # module no longer depends on a module-level `device` global.
        hidden_state = torch.zeros(self.D*self.num_layers, batch_size, self.hidden_size,
                                   device=embedding.device)

        packed = pack_padded_sequence(embedding, lengths, batch_first=self.batch_first, enforce_sorted=False)
        packed_output, hidden_state = self.gru(packed, hidden_state)

        # pad_packed_sequence is called with its default batch_first=False: (seq_len, batch, D*hidden_size)
        unpacked_out, unpacked_lens = pad_packed_sequence(packed_output)

        hidden_state = hidden_state[-1,:,:].unsqueeze(0)   # last entry of the stacked hidden states (multi-layer / bidirectional GRUs)
        outputs = unpacked_out[:, :, -self.hidden_size:]    # last hidden_size features (the second direction's states for a bidirectional GRU)

        return outputs, hidden_state

In [24]:
class Decoder_Bahdanau_GRU(nn.Module):
    """One-step GRU decoder with additive (Bahdanau-style) attention.

    Args:
        emb_dim: size of each target embedding vector.
        emb_num: target vocabulary size (embedding rows and output logits).
        encoder_hidden_size: feature size of the encoder outputs.
        decoder_hidden_size: hidden size of the decoder GRU.
        v_dim: size of the intermediate attention space.
        num_layers, dropout_p, batch_first, bias, bidirectional: forwarded to nn.GRU
            (dropout_p is also applied to the attention weights).
        pretrained_embeddings: optional numpy embedding matrix, loaded frozen.
        padding_idx: index of the padding vector in the embedding matrix.
    """

    def __init__(self, emb_dim, emb_num, encoder_hidden_size, decoder_hidden_size,
                 v_dim=3, num_layers=1, dropout_p=0.1, batch_first=True, bias=True,
                 bidirectional=False, pretrained_embeddings=None, padding_idx=0):

        super(Decoder_Bahdanau_GRU, self).__init__()

        self.emb_dim = emb_dim
        self.emb_num = emb_num
        self.encoder_hidden_size = encoder_hidden_size
        self.decoder_hidden_size = decoder_hidden_size
        self.v_dim = v_dim
        self.num_layers = num_layers
        self.dropout_p = dropout_p
        self.batch_first = batch_first
        self.bias = bias
        self.bidirectional = bidirectional
        self.pretrained_embeddings = pretrained_embeddings
        self.padding_idx = padding_idx

        self.D = 1
        if self.bidirectional==True:
            self.D = 2

        # Initialise Embedding Layer
        if self.pretrained_embeddings is None:

            self.emb = nn.Embedding(embedding_dim=self.emb_dim,
                                    num_embeddings=self.emb_num,
                                    padding_idx=self.padding_idx)
        else:
            self.pretrained_embeddings = torch.from_numpy(self.pretrained_embeddings).float()
            # freeze=True : the embedding weights are NOT updated in the learning process.
            # Equivalent to self.emb.weight.requires_grad = False
            self.emb = nn.Embedding.from_pretrained(self.pretrained_embeddings, freeze=True, padding_idx=self.padding_idx)

        # Initialise Attention
        self.v = nn.Linear(self.v_dim, 1, bias=False)
        self.w1 = nn.Linear(self.decoder_hidden_size, self.v_dim, bias=False)                # decoder_hidden_state
        self.w2 = nn.Linear(self.encoder_hidden_size, self.v_dim, bias=False)                # encoder_output

        # Initialise GRU. Its input is the token embedding concatenated with the context
        # vector, and the context vector has encoder_hidden_size features (it is a
        # weighted sum of encoder outputs), hence emb_dim + encoder_hidden_size.
        self.gru = nn.GRU(input_size=self.emb_dim+self.encoder_hidden_size, hidden_size=self.decoder_hidden_size,
                          num_layers=self.num_layers, dropout=self.dropout_p, bidirectional=self.bidirectional,
                          bias=self.bias, batch_first=self.batch_first)

        # Initialise Linear Layer
        self.fc1 = nn.Linear(self.decoder_hidden_size, self.emb_num, bias=False)

        # Regularization parameter
        self.dropout = nn.Dropout(self.dropout_p)

    def forward(self, x, hidden_state, output):
        """Decode one target token for every sentence of the batch.

        Args:
            x: tensor of shape (batch,) with the previous target token ids.
            hidden_state: decoder hidden state, (1, batch, decoder_hidden_size).
            output: encoder outputs, (src_len, batch, encoder_hidden_size);
                padded source steps are expected to be all-zero vectors.

        Returns:
            predictions: (batch, emb_num) unnormalised scores over the target vocabulary.
            hidden_state: updated decoder hidden state.
        """
        x = x.unsqueeze(0)
        embedding = self.emb(x.long())

        # Additive attention: v . tanh(w1 . hidden + w2 . encoder_output)
        w1h = self.w1(hidden_state)
        w1h = w1h.permute(1,0,2)
        w2eo = self.w2(output.permute(1,0,2))
        sum_ = w2eo + w1h.expand_as(w2eo)
        tanh_ = torch.tanh(sum_)
        alignment_scores = self.v(tanh_)

        # Mask the padded source steps so that the softmax ignores them. The mask is
        # derived from the encoder outputs (all-zero vectors at padded steps) because,
        # with additive attention, the score of a padded step is not 0: the decoder
        # term w1h is still added, so testing `alignment_scores == 0` never fires.
        pad_mask = output.abs().sum(dim=2).eq(0).permute(1, 0).unsqueeze(2)
        masked_alignments = alignment_scores.masked_fill(mask=pad_mask, value=-np.inf)

        attn_weights = F.softmax(masked_alignments, dim=1)                # softmax over the source steps
        attn_weights = self.dropout(attn_weights)

        context_vector = torch.matmul(output.permute(1,2,0), attn_weights)
        # context_vector = torch.bmm(output.permute(1,2,0), attn_weights)  #equivalent to line above

        concat = torch.cat((embedding.permute(1,2,0), context_vector), dim=1)
        concat = concat.permute(0,2,1)

        # Cannot use packing in the decoder because tokens are processed one at a time
        outputs, hidden_state = self.gru(concat, hidden_state)

        predictions = self.fc1(outputs)
        predictions = predictions.squeeze(1)

        return predictions, hidden_state

In [25]:
class Decoder_dot_GRU(nn.Module):
    """One-step GRU decoder with dot-product attention over the encoder outputs.

    The GRU consumes the embedded previous token; its new hidden state is scored
    against every encoder output with a dot product, and the output layer reads
    the hidden state concatenated with the resulting context vector.
    """

    def __init__(self, emb_dim, emb_num, encoder_hidden_size, decoder_hidden_size, num_layers=1, dropout_p=0.1,
                 batch_first=True, bias=True, bidirectional=False, pretrained_embeddings=None, padding_idx=0):
        super().__init__()

        # Keep the constructor arguments on the instance
        self.emb_dim, self.emb_num = emb_dim, emb_num
        self.encoder_hidden_size, self.decoder_hidden_size = encoder_hidden_size, decoder_hidden_size
        self.num_layers, self.dropout_p = num_layers, dropout_p
        self.batch_first, self.bias, self.bidirectional = batch_first, bias, bidirectional
        self.pretrained_embeddings = pretrained_embeddings
        self.padding_idx = padding_idx

        # Number of GRU directions
        self.D = 2 if self.bidirectional == True else 1  # noqa: E712 - same comparison as before

        # Embedding layer: frozen pretrained vectors when a matrix is supplied,
        # otherwise a trainable table learnt from scratch
        if self.pretrained_embeddings is not None:
            self.pretrained_embeddings = torch.from_numpy(self.pretrained_embeddings).float()
            # freeze=True keeps the weights out of the learning process
            self.emb = nn.Embedding.from_pretrained(self.pretrained_embeddings,
                                                    freeze=True,
                                                    padding_idx=self.padding_idx)
        else:
            self.emb = nn.Embedding(num_embeddings=self.emb_num,
                                    embedding_dim=self.emb_dim,
                                    padding_idx=self.padding_idx)

        # Recurrent layer
        self.gru = nn.GRU(input_size=self.emb_dim,
                          hidden_size=self.decoder_hidden_size,
                          num_layers=self.num_layers,
                          dropout=self.dropout_p,
                          bidirectional=self.bidirectional,
                          bias=self.bias,
                          batch_first=self.batch_first)

        # Output layer over [decoder hidden state ; context vector]
        self.fc1 = nn.Linear(self.encoder_hidden_size + self.decoder_hidden_size, self.emb_num, bias=False)

        self.dropout = nn.Dropout(self.dropout_p)

    def forward(self, x, hidden_state, output):
        """Decode one step.

        `x` holds the previous token id of every sentence in the batch,
        `hidden_state` is the current decoder state and `output` the encoder
        outputs. Returns the vocabulary scores and the new hidden state.
        """
        token_ids = x.unsqueeze(0).long()
        embedded = self.emb(token_ids)

        # Tokens are processed one at a time, so no packing here
        _, hidden_state = self.gru(embedded.permute(1, 0, 2), hidden_state)

        # Dot-product attention scores between the new hidden state and each encoder output
        encoder_states = output.permute(1, 2, 0)
        scores = torch.matmul(hidden_state.permute(1, 0, 2), encoder_states).permute(0, 2, 1)

        # Scores that are exactly 0 are treated as padding and excluded from the softmax
        scores = scores.masked_fill(mask=scores.eq(0), value=-np.inf)
        attn_weights = self.dropout(F.softmax(scores, dim=1))

        context_vector = torch.matmul(encoder_states, attn_weights)
        combined = torch.cat((hidden_state.permute(1, 2, 0), context_vector), dim=1).permute(0, 2, 1)

        predictions = self.fc1(combined.squeeze(0)).squeeze(1)
        return predictions, hidden_state

In [26]:
class Decoder_Luong_GRU(nn.Module):
    """One-step GRU decoder with multiplicative ("general", Luong-style) attention.

    Same pipeline as the dot-product decoder, except that the decoder hidden
    state is first projected by the linear layer `w` into the encoder feature
    space before being scored against the encoder outputs.
    """

    def __init__(self, emb_dim, emb_num, encoder_hidden_size, decoder_hidden_size, num_layers=1, dropout_p=0.1,
                 batch_first=True, bias=True, bidirectional=False, pretrained_embeddings=None, padding_idx=0):
        super().__init__()

        # Keep the constructor arguments on the instance
        self.emb_dim, self.emb_num = emb_dim, emb_num
        self.encoder_hidden_size, self.decoder_hidden_size = encoder_hidden_size, decoder_hidden_size
        self.num_layers, self.dropout_p = num_layers, dropout_p
        self.batch_first, self.bias, self.bidirectional = batch_first, bias, bidirectional
        self.pretrained_embeddings = pretrained_embeddings
        self.padding_idx = padding_idx

        # Number of GRU directions
        self.D = 2 if self.bidirectional == True else 1  # noqa: E712 - same comparison as before

        # Embedding layer: frozen pretrained vectors when a matrix is supplied,
        # otherwise a trainable table learnt from scratch
        if self.pretrained_embeddings is not None:
            self.pretrained_embeddings = torch.from_numpy(self.pretrained_embeddings).float()
            # freeze=True keeps the weights out of the learning process
            self.emb = nn.Embedding.from_pretrained(self.pretrained_embeddings,
                                                    freeze=True,
                                                    padding_idx=self.padding_idx)
        else:
            self.emb = nn.Embedding(num_embeddings=self.emb_num,
                                    embedding_dim=self.emb_dim,
                                    padding_idx=self.padding_idx)

        # Recurrent layer
        self.gru = nn.GRU(input_size=self.emb_dim,
                          hidden_size=self.decoder_hidden_size,
                          num_layers=self.num_layers,
                          dropout=self.dropout_p,
                          bidirectional=self.bidirectional,
                          bias=self.bias,
                          batch_first=self.batch_first)

        # Attention projection: decoder space -> encoder space
        self.w = nn.Linear(self.decoder_hidden_size, self.encoder_hidden_size, bias=False)

        # Output layer over [decoder hidden state ; context vector]
        self.fc1 = nn.Linear(self.encoder_hidden_size + self.decoder_hidden_size, self.emb_num, bias=False)

        self.dropout = nn.Dropout(self.dropout_p)

    def forward(self, x, hidden_state, output):
        """Decode one step.

        `x` holds the previous token id of every sentence in the batch,
        `hidden_state` is the current decoder state and `output` the encoder
        outputs. Returns the vocabulary scores and the new hidden state.
        """
        token_ids = x.unsqueeze(0).long()
        embedded = self.emb(token_ids)

        # Tokens are processed one at a time, so no packing here
        _, hidden_state = self.gru(embedded.permute(1, 0, 2), hidden_state)

        # Attention scores: projected hidden state against each encoder output
        encoder_states = output.permute(1, 2, 0)
        projected = self.w(hidden_state).permute(1, 0, 2)
        scores = torch.matmul(projected, encoder_states).permute(0, 2, 1)

        # Scores that are exactly 0 are treated as padding and excluded from the softmax
        scores = scores.masked_fill(mask=scores.eq(0), value=-np.inf)
        attn_weights = self.dropout(F.softmax(scores, dim=1))

        context_vector = torch.matmul(encoder_states, attn_weights)
        combined = torch.cat((hidden_state.permute(1, 2, 0), context_vector), dim=1).permute(0, 2, 1)

        predictions = self.fc1(combined.squeeze(0)).squeeze(1)
        return predictions, hidden_state

In [27]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes a whole target sequence step by step.

    Args:
        Encoder: module called as `Encoder(source)` and returning
            (encoder_outputs, hidden_state).
        Decoder: module called as `Decoder(tokens, hidden_state, encoder_outputs)`
            and returning (scores over the target vocabulary, hidden_state).
        target_vocab_size: size of the last dimension of the decoder scores.
    """

    def __init__(self, Encoder, Decoder, target_vocab_size):
        super(Seq2Seq, self).__init__()
        self.Encoder = Encoder
        self.Decoder = Decoder
        self.target_vocab_size = target_vocab_size

    def forward(self, source, target, tfr=0.5):
        """Run the encoder once, then the decoder for every target position.

        Args:
            source: batch of source token ids, passed to the encoder as is.
            target: (batch, target_len) target token ids; column 0 is the
                trigger token fed to the first decoding step.
            tfr: teacher-forcing ratio - probability of feeding the true target
                token (instead of the model's own prediction) to the next step.

        Returns:
            Tensor of shape (target_len, batch, target_vocab_size) with the
            decoder scores; position 0 is left at zero because decoding starts
            from the second target token.
        """
        batch_size, target_len = target.shape

        e_output, hidden_state = self.Encoder(source)

        # Allocate the result next to the encoder outputs instead of relying on a
        # module-level `device` global.
        outputs = torch.zeros(target_len, batch_size, self.target_vocab_size, device=e_output.device)

        x = target[:,0] # Trigger token <SOS>

        for i in range(1, target_len):
            d_output, hidden_state = self.Decoder(x, hidden_state, e_output)
            outputs[i] = d_output
            best_guess = d_output.argmax(1) # 0th dimension is the batch, 1st dimension is the target vocabulary
            x = target[:,i] if random.random() < tfr else best_guess # Either pass the next word correctly from the dataset or use the earlier predicted word

        return outputs

### Parameters

In [28]:
def params(full_embedding_layer_name):
    """Return the first parameter tensor of a layer, or None when it has none."""
    return next(iter(full_embedding_layer_name.parameters()), None)


def check_params(classifier_name):
    """Print a report of every direct child layer of a model.

    For each child: its name and repr, its total number of parameters, then for
    every parameter tensor its length (size of the first dimension), its values
    and whether it is trainable, followed by the number of parameter tensors.
    """
    for name, child in classifier_name.named_children():
        child_params = list(child.parameters())

        print('Layer name: {} --- {}'.format(name, child), end='\n\n')
        print('ToT Params: {:,}'.format(sum(p.numel() for p in child_params)), end='\n\n')

        for param in child_params:
            print('Param length: {:,}'.format(len(param)), end='\n\n')
            print(param, end='\n\n')
            print('Are parameters being updated during backprop? {}'.format(param.requires_grad), end='\n\n')

        print('Total Sets of Parameters: {}'.format(len(child_params)), end='\n\n')
        print('*' * 90)
    

def num_params(classifier_name):
    """Count the scalar parameters of a model.

    Returns:
        (trainable_parameters, all_parameters): the number of elements in the
        parameters that require gradients, and in all parameters.
    """
    trainable_parameters = 0
    all_parameters = 0
    for param in classifier_name.parameters():
        n_elements = param.numel()          # total number of elements in the tensor
        all_parameters += n_elements
        if param.requires_grad:
            trainable_parameters += n_elements

    return trainable_parameters, all_parameters

### Translation

In [29]:
def translate(encoded_s, max_length, tgt_vocab_w2i, tgt_vocab_i2w):
    """Greedily translate one encoded source sentence with the global `model`.

    Args:
        encoded_s: sequence of source token ids (already padded/encoded).
        max_length: maximum number of target tokens to generate.
        tgt_vocab_w2i: target vocabulary, word -> index (must contain '<SOS>' and '<EOS>').
        tgt_vocab_i2w: target vocabulary, index -> word.

    Returns:
        List of generated target words, without the leading '<SOS>'. The list
        ends with '<EOS>' when the model predicted it before `max_length` steps.

    Raises:
        KeyError: if a predicted index is missing from `tgt_vocab_i2w`.
    """
    # Build the id tensor directly as integers (no float round-trip): shape (1, len)
    tensor = torch.as_tensor(encoded_s, dtype=torch.long).unsqueeze(0).to(device)

    eos_idx = tgt_vocab_w2i['<EOS>']
    decoded = [tgt_vocab_w2i['<SOS>']]

    # Inference must run with dropout disabled and without building the autograd
    # graph; the previous train/eval mode is restored afterwards so calling this
    # in the middle of training does not change the model's state.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outp, hidden = model.Encoder(tensor)

            for _ in range(max_length):
                previous_word = torch.tensor([decoded[-1]], device=device)
                output, hidden = model.Decoder(previous_word, hidden, outp)
                best_guess = output.argmax(1).item()
                decoded.append(best_guess)

                # Model predicts it's the end of the sentence
                if best_guess == eos_idx:
                    break
    finally:
        model.train(was_training)

    translated_sentence = [tgt_vocab_i2w[idx] for idx in decoded]
    return translated_sentence[1:]

In [30]:
def translation_e2e(sentences, max_length):
    """Translate raw source sentences end to end: clean -> encode -> translate.

    Relies on the notebook globals S_TOK, X_dict_w2i, s_max_len, y_dict_w2i and
    y_dict_i2w. Returns a pandas Series with one list of target words per sentence.
    """
    # 1. Anything that is not already a pandas Series is converted to one
    if type(sentences) is not pd.core.series.Series:
        sentences = pd.Series(sentences)

    def _clean(raw_sentence):
        # 2. Clean and tokenise with the source-language tokenizer
        return clean_tokenise(text=raw_sentence, TOK=S_TOK)

    def _encode(tokens):
        # 3. Map tokens to source-vocabulary indices, padded to s_max_len
        return encoding(input_sequence=tokens, tokens_dictionary=X_dict_w2i, max_len=s_max_len)

    def _translate(token_ids):
        # 4. Decode with the model into target-language words
        return translate(encoded_s=token_ids,
                         max_length=max_length,
                         tgt_vocab_w2i=y_dict_w2i,
                         tgt_vocab_i2w=y_dict_i2w)

    return sentences.map(_clean).map(_encode).map(_translate)

### Inference

In [31]:
def infer(source_samples, max_length=20):
    """Translate source sentences, generating at most `max_length` tokens each.

    Accepts a pandas Series or anything pd.Series() can wrap; returns the
    Series of translations produced by translation_e2e().
    """
    is_series = type(source_samples) is pd.core.series.Series
    samples = source_samples if is_series else pd.Series(source_samples)
    return translation_e2e(samples, max_length=max_length)

### BLEU Score

In [32]:
def bleu(source_samples, references_corpus, max_length=20, TOK=T_TOK):
    """Translate ``source_samples`` and score them against reference translations.

    Args:
        source_samples: Raw source-language sentences (passed to ``infer``).
        references_corpus: Raw reference translations, one per source sentence,
            in the same order as ``source_samples`` (Series or list of strings).
        max_length: Forwarded to ``infer`` (default 20).
        TOK: Tokeniser handed to ``clean_tokenise`` for the references.

    Returns:
        A tuple ``(candidate_corpus, references_corpus, score)``: the two
        token-list Series that were compared and the corpus-level BLEU score.
    """
    candidate_corpus = infer(source_samples, max_length=max_length)
    # Drop the last token of every candidate.
    # NOTE(review): presumably the <EOS> marker; a translation cut off at max_length
    # would lose a real word here instead — confirm against `translate`.
    candidate_corpus = candidate_corpus.map(lambda x: x[:-1])

    if not isinstance(references_corpus, pd.Series):
        references_corpus = pd.Series(references_corpus)
    references_corpus = references_corpus.map(lambda x: clean_tokenise(text=x, TOK=TOK))
    # Drop the first and last token of every reference (presumably the start/end markers
    # added by `clean_tokenise` — TODO confirm)
    references_corpus = references_corpus.map(lambda x: x[1:-1])

    # torchtext's bleu_score expects, for EACH candidate, a list of reference translations
    # (each one a token list). Passing the flat token list would make it treat every token
    # as a "reference" and count n-grams over characters, so wrap each reference in a list.
    score = bleu_score(candidate_corpus.tolist(),
                       [[reference] for reference in references_corpus])

    return candidate_corpus, references_corpus, score

# Code

### Random Seeds

In [33]:
# Fix the random seeds before any data shuffling or model initialisation happens below.
# NOTE(review): `random_seeding` is defined earlier in the notebook; which libraries it
# seeds is not visible from this cell — confirm it covers numpy (used by DataFrame.sample).
random_seeding()

### Read / Split / Clean Datasets

In [34]:
# Input file locations:
#   dir_csv - comma-separated corpus loaded into `data` (train / val / test splits)
#   dir_txt - tab-separated sentence pairs loaded into `inf_data` (used for inference)
dir_csv = '../input/simplest-eng-fra/simplest_eng_fra.csv'  # Kaggle directory
dir_txt = '../input/engfra/eng-fra.txt'                     # Kaggle directory

In [None]:
# cwd = os.getcwd()  # get current directory on CPU (i.e. where the notebook is saved)
# print(cwd)

# dir_csv = cwd + 'data\simplest_eng_fra.csv'
# dir_txt = cwd + '\data\eng-fra.txt'
# print(dir_csv)
# print(dir_txt)

In [None]:
# `data`: main corpus; the first row supplies the column names and the first column the index.
data = pd.read_csv(dir_csv, encoding='utf-8', sep=',', header=0, index_col=0)
# `inf_data`: tab-separated sentence pairs, columns renamed to 'eng' / 'fra'.
# NOTE(review): header=0 combined with names=[...] discards the file's first line as a header row.
# If eng-fra.txt has no header line, its first sentence pair is silently dropped
# (header=None would keep it) — confirm against the file.
inf_data = pd.read_csv(dir_txt, encoding='utf-8', sep='\t', header=0, names=['eng', 'fra'])   # for inference

In [35]:
# Inference corpus: dimensions first, then the leading rows
print(inf_data.shape, inf_data.head(), sep='\n')

In [36]:
# Main corpus: dimensions first, then the leading rows
print(data.shape, data.head(), sep='\n')

In [37]:
# Keep only the first occurrence of every fully identical row, then inspect the result
data = data.drop_duplicates()
print(data.shape, data.head(), sep='\n')

In [38]:
# Randomly reorder all rows (frac=1 samples the whole frame) so that no split is biased
# by the original file order, and renumber the index from 0.
# NOTE(review): no random_state is given, so reproducibility relies on the global seeding
# done earlier — confirm.
data = data.sample(frac=1).reset_index(drop=True)
print(data.shape, data.head(), sep='\n')

In [39]:
# Discard every row that contains at least one missing value, then inspect the result
data = data.dropna()
print(data.shape, data.head(), sep='\n')

In [40]:
# Source sentences per split (independent deep copies of the selected column)
X_train_full, X_val_full, X_test_full = (
    data.loc[data['split'] == split_name, 'source_language'].copy(deep=True)
    for split_name in ('train', 'val', 'test')
)
# Target sentences per split, selected the same way
y_train_full, y_val_full, y_test_full = (
    data.loc[data['split'] == split_name, 'target_language'].copy(deep=True)
    for split_name in ('train', 'val', 'test')
)

In [41]:
# One line per split: container type followed by its shape
print(
    *(
        f'{type(part)} {part.shape}'
        for part in (X_train_full, X_val_full, X_test_full,
                     y_train_full, y_val_full, y_test_full)
    ),
    sep='\n',
)

### Samples

In [42]:
# Number of rows taken from each split. The full splits are used here; for a quick
# smoke test, replace with small constants, e.g.:
# train_rows = 5
# val_rows = 5
# test_rows = 5
train_rows, val_rows, test_rows = map(len, (X_train_full, X_val_full, X_test_full))

In [43]:
# First `train_rows` training pairs, copied so later edits cannot touch the full split
X_train_subset = X_train_full.iloc[:train_rows].copy()
y_train_subset = y_train_full.iloc[:train_rows].copy()
print(X_train_subset.shape, y_train_subset.shape,
      X_train_subset.head(), y_train_subset.head(), sep='\n')

In [44]:
# First `val_rows` validation pairs, copied so later edits cannot touch the full split
X_val_subset = X_val_full.iloc[:val_rows].copy()
y_val_subset = y_val_full.iloc[:val_rows].copy()
print(X_val_subset.shape, y_val_subset.shape,
      X_val_subset.head(), y_val_subset.head(), sep='\n')

In [45]:
# First `test_rows` test pairs, copied so later edits cannot touch the full split
X_test_subset = X_test_full.iloc[:test_rows].copy()
y_test_subset = y_test_full.iloc[:test_rows].copy()
print(X_test_subset.shape, y_test_subset.shape,
      X_test_subset.head(), y_test_subset.head(), sep='\n')

### Data Cleaning And Tokenization

In addition to creating a subset that has three partitions for training, validation, and testing, we also minimally clean the data by adding whitespace around punctuation symbols and removing extraneous symbols that aren’t punctuation for all the splits.


1. **apply** works on a row / column basis of a DataFrame 
2. **applymap** works element-wise on a DataFrame
3. **map** works element-wise on a Series

In [46]:
# Clean + tokenise every sentence: source splits use the source tokeniser S_TOK ...
X_train, X_val, X_test = (
    subset.map(lambda sentence: clean_tokenise(text=sentence, TOK=S_TOK))
    for subset in (X_train_subset, X_val_subset, X_test_subset)
)
# ... and target splits use the target tokeniser T_TOK
y_train, y_val, y_test = (
    subset.map(lambda sentence: clean_tokenise(text=sentence, TOK=T_TOK))
    for subset in (y_train_subset, y_val_subset, y_test_subset)
)

In [47]:
# Preview the first rows of every tokenised split, source first and target second.
# The extra '\n' / '\n\n' arguments only insert blank lines between the previews.
print(X_train.head(), '\n')
print(y_train.head(), '\n\n')

print(X_val.head(), '\n')
print(y_val.head(), '\n\n')

print(X_test.head(), '\n')
print(y_test.head())

### Max length of source and target sentences

In [48]:
# Longest token sequence in each split (source side, then target side)
X_train_max_len = X_train.map(len).max()
X_val_max_len = X_val.map(len).max()
X_test_max_len = X_test.map(len).max()
y_train_max_len = y_train.map(len).max()
y_val_max_len = y_val.map(len).max()
y_test_max_len = y_test.map(len).max()

In [49]:
# Longest sequence per split: source train / val / test, then target train / val / test
print(X_train_max_len, X_val_max_len, X_test_max_len,
      y_train_max_len, y_val_max_len, y_test_max_len, sep='\n')

In [50]:
# Common sequence lengths used for encoding and padding: the longest sentence found
# in ANY split, separately for the source and the target language
s_max_len = max([X_train_max_len, X_val_max_len, X_test_max_len])
t_max_len = max([y_train_max_len, y_val_max_len, y_test_max_len])

In [51]:
# Padding length for source sentences, then for target sentences
print(s_max_len, t_max_len, sep='\n')

In [52]:
# Total number of target tokens per split (intended for a per-token epoch loss).
# NOTE(review): every token returned by `clean_tokenise` is counted; whether that
# includes a <SOS> marker cannot be told from this cell — confirm.
y_train_tot_tokens = y_train.map(len).sum()
y_val_tot_tokens = y_val.map(len).sum()
y_test_tot_tokens = y_test.map(len).sum()

In [53]:
# Target token totals: train, val, test
print(y_train_tot_tokens, y_val_tot_tokens, y_test_tot_tokens, sep='\n')

### Glove

In [54]:
# Load the pre-trained GloVe vectors from the text file into a dictionary
# where KEY is a WORD and VALUE is a NUMPY ARRAY (as produced by `load_glove_from_file`,
# defined earlier in the notebook).
# Local (non-Kaggle) alternative:
# glove_embeddings = load_glove_from_file('C:/GloVe/glove.6B.300d.txt')
glove_embeddings = load_glove_from_file('../input/glove6b300dtxt/glove.6B.300d.txt')

In [55]:
# Container type and number of loaded GloVe entries
print(type(glove_embeddings), len(glove_embeddings), sep='\n')

### Create Vocabulary From Training Corpus

The embedding matrix (see later) is created only from the training dataset.

The training dataset should be rich and representative enough to cover all the data you expect to see in the future.

New data must have the same integer encoding as the training data prior to being mapped onto the embedding when making a prediction.

In [56]:
# Build the vocabularies from the TRAINING sentences only (one per language).
# `tokens_dict` (defined earlier in the notebook) returns five values; the last two are the
# word->index and index->word lookups used below for encoding and for decoding translations.
# NOTE(review): the first three return values look like word counts / word lists
# before and after filtering — confirm against `tokens_dict`.
X_words_before, X_words_after, X_final_words, X_dict_w2i, X_dict_i2w = tokens_dict(X_train)
y_words_before, y_words_after, y_final_words, y_dict_w2i, y_dict_i2w = tokens_dict(y_train)

In [57]:
# Source vocabulary statistics, a blank line, then the target vocabulary statistics
print(X_words_before, X_words_after, len(X_dict_w2i), sep='\n')
print()
print(y_words_before, y_words_after, len(y_dict_w2i), sep='\n')

### Create Embedding Matrix

The embedding is created from the training dataset.

It should be rich and representative enough to cover all the data you expect to see in the future.

New data must have the same integer encoding as the training data prior to being mapped onto the embedding when making a prediction.

In [58]:
# Build one embedding matrix per language from the GloVe vectors and that language's
# word->index vocabulary. `make_embedding_matrix` (defined earlier in the notebook) also
# returns an `*_excluded` value.
# NOTE(review): presumably the vocabulary words that have no GloVe vector — confirm.
# NOTE(review): the same GloVe file is used for the target vocabulary as for the source one.
X_excluded, X_embeddings_matrix = make_embedding_matrix(glove_embeddings, X_dict_w2i) # embedding matrix is a numpy array
y_excluded, y_embeddings_matrix = make_embedding_matrix(glove_embeddings, y_dict_w2i) # embedding matrix is a numpy array

In [59]:
# print(X_excluded)
# print(len(X_excluded))
# print(type(X_embeddings_matrix))
# print(X_embeddings_matrix.shape)
# print()
# print(y_excluded)
# print(len(y_excluded))
# print(type(y_embeddings_matrix))
# print(y_embeddings_matrix.shape)

### Encoding Training Dataset

We need to convert our text into a numerical form that can be fed to our model as input.

1. We have created a vocabulary (see section 'Create Vocabulary From Training Corpus') where each key is a unique word from the training corpus, and each value is the index of that word in the 'tokens' dictionary.
2. Choose the maximum length of any sentence.
3. Encode each list of tokens by replacing each word with its index from the 'tokens' dictionary.

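Note: **s_max_len** and **t_max_len** (see above) are the lengths of the longest tokenised source and target sentences across the training, validation and test sets. We set the max length of the encoded source sentences equal to s_max_len and that of the encoded target sentences equal to t_max_len.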

In [60]:
# Replace source tokens by their vocabulary indices, using the common source length s_max_len
X_train_encoded, X_val_encoded, X_test_encoded = (
    tokenised.map(lambda tokens: encoding(input_sequence=tokens,
                                          tokens_dictionary=X_dict_w2i,
                                          max_len=s_max_len))
    for tokenised in (X_train, X_val, X_test)
)

In [61]:
# Replace target tokens by their vocabulary indices, using the common target length t_max_len
y_train_encoded, y_val_encoded, y_test_encoded = (
    tokenised.map(lambda tokens: encoding(input_sequence=tokens,
                                          tokens_dictionary=y_dict_w2i,
                                          max_len=t_max_len))
    for tokenised in (y_train, y_val, y_test)
)

In [62]:
# Encoded training pairs: source preview, blank line, target preview
print(X_train_encoded.head(), '', y_train_encoded.head(), sep='\n')

In [63]:
# Encoded validation pairs: source preview, blank line, target preview
print(X_val_encoded.head(), '', y_val_encoded.head(), sep='\n')

In [64]:
# Encoded test pairs: source preview, blank line, target preview
print(X_test_encoded.head(), '', y_test_encoded.head(), sep='\n')

### PyTorch Dataset

In [65]:
# Turn every encoded pd.Series into one PyTorch tensor.
# The Series values are first materialised as a plain list of per-sentence sequences
# (a list of arrays rather than an array of arrays) before being handed to torch.Tensor.
# torch.Tensor yields float tensors; the training / evaluation loops cast them with .long().
x_train_tensor, x_val_tensor, x_test_tensor = (
    torch.Tensor(list(encoded.values))
    for encoded in (X_train_encoded, X_val_encoded, X_test_encoded)
)
y_train_tensor, y_val_tensor, y_test_tensor = (
    torch.Tensor(list(encoded.values))
    for encoded in (y_train_encoded, y_val_encoded, y_test_encoded)
)

In [66]:
# Pair every source tensor with its target tensor so that indexing a dataset
# yields one (source, target) example
train_dataset, val_dataset, test_dataset = (
    TensorDataset(features, labels)
    for features, labels in (
        (x_train_tensor, y_train_tensor),
        (x_val_tensor, y_val_tensor),
        (x_test_tensor, y_test_tensor),
    )
)

### PyTorch DataLoader

In [67]:
# Mini-batch iterators over the three datasets.
# Feeding the whole training set at every step (batch gradient descent) only works for
# small datasets; DataLoader slices each dataset into mini-batches of `batch_size` for us
# and behaves like an iterator that yields a different mini-batch on every step.
# Only the training data is reshuffled each epoch; validation and test keep their order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=1)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=1)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=1)

# A sample mini-batch (a list with the source tensor and the target tensor) can be fetched with:
# next(iter(train_loader))

### Initialise Classifier

At its core, the training routine is responsible for instantiating the model, iterating over the dataset, computing the output of the model when given the data as input, computing the loss (how wrong the model is), and updating the model proportional to the loss. 

Although this may seem like a lot of details to manage, there are not many places to change the training routine, and as such it will become habitual in your deep learning development process.

In [68]:
# Source side: vocabulary size next to the embedding matrix shape
print(len(X_dict_w2i), X_embeddings_matrix.shape, sep='\n')

In [69]:
# Target side: vocabulary size next to the embedding matrix shape
print(len(y_dict_w2i), y_embeddings_matrix.shape, sep='\n')

In [70]:
# Build the encoder and the (Bahdanau attention) decoder

# --- Encoder settings -------------------------------------------------------------------
e_emb_num, e_emb_dim = X_embeddings_matrix.shape   # rows / columns of the source embedding matrix
e_hidden_size = 1024
e_num_layers = 1
e_dropout_p = 0.5
e_batch_first = True
e_bias = True
e_bidirectional = True
e_pretrained_embeddings = X_embeddings_matrix

# --- Decoder settings -------------------------------------------------------------------
d_emb_num, d_emb_dim = y_embeddings_matrix.shape   # rows / columns of the target embedding matrix
v_dim = 3
d_num_layers = 1
d_hidden_size = 1024
d_dropout_p = 0.5
d_batch_first = True
d_bias = True
d_bidirectional = False
d_pretrained_embeddings = y_embeddings_matrix

# Index of the padding token, shared by both embedding layers and by the loss
padding_idx = 0

encoder_gru = Encoder(
    emb_dim=e_emb_dim,
    emb_num=e_emb_num,
    hidden_size=e_hidden_size,
    num_layers=e_num_layers,
    dropout_p=e_dropout_p,
    batch_first=e_batch_first,
    bias=e_bias,
    bidirectional=e_bidirectional,
    pretrained_embeddings=e_pretrained_embeddings,
    padding_idx=padding_idx,
).to(device)

decoder_gru = Decoder_Bahdanau_GRU(
    emb_dim=d_emb_dim,
    emb_num=d_emb_num,
    encoder_hidden_size=e_hidden_size,
    decoder_hidden_size=d_hidden_size,
    v_dim=v_dim,
    num_layers=d_num_layers,
    dropout_p=d_dropout_p,
    batch_first=d_batch_first,
    bias=d_bias,
    bidirectional=d_bidirectional,
    pretrained_embeddings=d_pretrained_embeddings,
    padding_idx=padding_idx,
).to(device)

# Alternative attention decoders: swap the class above for `Decoder_dot_GRU` or
# `Decoder_Luong_GRU`; both take the same keyword arguments except `v_dim`, e.g.
# decoder_gru = Decoder_Luong_GRU(emb_dim=d_emb_dim, emb_num=d_emb_num, encoder_hidden_size=e_hidden_size,
#                                 decoder_hidden_size=d_hidden_size, num_layers=d_num_layers,
#                                 dropout_p=d_dropout_p, batch_first=d_batch_first, bias=d_bias,
#                                 bidirectional=d_bidirectional, pretrained_embeddings=d_pretrained_embeddings,
#                                 padding_idx=padding_idx).to(device)

# Show both architectures, separated by a blank line
print(encoder_gru, '', decoder_gru, sep='\n')

In [71]:
# Optimisation setup

# Step size used by Adam
learning_rate = 0.001

# Full sequence-to-sequence model wrapping the encoder and the decoder built above
model = Seq2Seq(encoder_gru, decoder_gru, target_vocab_size=d_emb_num).to(device)

# Adam over all model parameters
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

# Cross-entropy over the target vocabulary; padded positions do not contribute to the loss
criterion = nn.CrossEntropyLoss(ignore_index=padding_idx)

In [72]:
# Notebook display of the assembled model (module tree of encoder and decoder)
model

## Training loop

The training loop is composed of two loops: an inner loop over minibatches in the dataset, and an outer loop, which repeats the inner loop a number of times. In the inner loop, losses are computed for each minibatch, and the optimizer is used to
update the model parameters.

#### Encoder parameters before training

In [73]:
# Snapshot of the encoder's embedding layer before training (via the notebook helper `params`),
# kept so it can be compared with the post-training snapshot
encoder_params_before = params(encoder_gru.emb)
# print(encoder_params_before, '\n')

# Optional: inspect the parameters initialised by the module
# print(check_params(encoder_gru), '\n')

# Parameter counts before training: trainable ones first, then all of them
encoder_trainable_params, encoder_all_params = num_params(encoder_gru)
print(f'The encoder has {encoder_trainable_params:,} trainable parameters')
print(f'The encoder has {encoder_all_params:,} parameters overall')

#### Decoder parameters before training

In [74]:
# Snapshot of the decoder's embedding layer before training (via the notebook helper `params`),
# kept so it can be compared with the post-training snapshot
decoder_params_before = params(decoder_gru.emb)
# print(decoder_params_before, '\n')

# Optional: inspect the parameters initialised by the module
# print(check_params(decoder_gru), '\n')

# Parameter counts before training: trainable ones first, then all of them
decoder_trainable_params, decoder_all_params = num_params(decoder_gru)
print(f'The decoder has {decoder_trainable_params:,} trainable parameters')
print(f'The decoder has {decoder_all_params:,} parameters overall')

In [75]:
start = time.time()

num_epochs = 10
num_epoch_freezed = 3   # number of initial epochs during which the embedding layers stay frozen

# Summed batch losses of the CURRENT epoch (reset at the start of every epoch) and
# the history of per-epoch mean batch losses (one entry per completed epoch)
epoch_loss_train = 0.0
epoch_loss_train_lst = []

epoch_loss_val = 0.0
epoch_loss_val_lst = []

# Probe sentence translated after every epoch to watch the model improve
s = 'He is doing a great job!'
# s = ['He is doing a great job!']
# s = pd.Series(['He is doing a great job!'])

# For the first `num_epoch_freezed` epochs the embedding matrices are left as they are
# (expected to be frozen, i.e. requires_grad=False, where the layers are built - outside this cell).
# From then on they are unfrozen, i.e. the embeddings get trained.
for epoch in range(num_epochs):
    if epoch >= num_epoch_freezed:
        encoder_gru.emb.weight.requires_grad = True
        decoder_gru.emb.weight.requires_grad = True

    print("Epoch: {} / {}".format(epoch+1, num_epochs))

    ####################################################### TRAINING ###################################################

    model.train(True)

    # Start every epoch from zero; otherwise the reported value is a running mean over
    # all epochs so far instead of the loss of this epoch.
    epoch_loss_train = 0.0

    # Batch names deliberately differ from the tokenised `y_train` / `y_val` Series,
    # which loop variables of the same name would overwrite; `input` would shadow the builtin.
    for x_batch, y_batch in train_loader:
        source = x_batch.long().to(device)
        target = y_batch.long().to(device)

        # Forward pass; `tfr` is presumably the teacher-forcing ratio of Seq2Seq - TODO confirm
        output = model(source, target, tfr=0.5)

        # Skip the first time step and flatten for the loss.
        # assumes output is (target_len, batch, vocab) and target arrives as (batch, target_len) - TODO confirm
        output = output[1:].reshape(-1, output.shape[2])
        target = target.permute(1, 0)
        target = target[1:].reshape(-1)

        # Clear the gradients accumulated by the previous step
        optimizer.zero_grad()

        # Loss of this mini-batch
        loss = criterion(output, target)

        # Back-propagate to get the gradients of weights & biases
        loss.backward()

        # Rescale the gradients if their overall norm exceeds 1
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Update the weights with the (clipped) gradients
        optimizer.step()
        epoch_loss_train += loss.item()

    train_loss = epoch_loss_train / len(train_loader)   # mean batch loss of this epoch
    epoch_loss_train_lst.append(train_loss)

    ###################################################### VALIDATION ##################################################

    model.eval()
    epoch_loss_val = 0.0

    # torch.no_grad() stops autograd from tracking operations, which reduces memory usage and
    # speeds up computation; it is the recommended way of doing validation
    # (https://discuss.pytorch.org/t/model-eval-vs-with-torch-no-grad/19615/3)
    with torch.no_grad():
        for x_batch, y_batch in val_loader:
            source = x_batch.long().to(device)
            target = y_batch.long().to(device)

            # No teacher forcing during evaluation: with tfr > 0 the decoder would be fed
            # ground-truth tokens at random, making the validation loss optimistic and non-deterministic.
            # NOTE(review): relies on `tfr` being the teacher-forcing ratio - confirm against Seq2Seq.
            output = model(source, target, tfr=0.0)

            output = output[1:].reshape(-1, output.shape[2])
            target = target.permute(1, 0)
            target = target[1:].reshape(-1)

            loss = criterion(output, target)
            epoch_loss_val += loss.item()

    val_loss = epoch_loss_val / len(val_loader)   # mean batch loss of this epoch
    epoch_loss_val_lst.append(val_loss)

    print('Train Loss: {:.6f} | Val Loss: {:.6f}'.format(train_loss, val_loss))

    ###################################################### TRANSLATION ################################################

    translation = translation_e2e(s, 20)
    print('Translated sentence: \n {}'.format(translation))
    print()

end = time.time()

In [76]:
# Wall-clock duration of the training cell, in whole minutes
print(f'It took {(end - start)/60:,.0f} mins to complete')

In [77]:
# Loss curves: one point per recorded epoch, training and validation on the same axes
plt.title('Train and Val Losses by Epoch')
plt.xlabel('Epochs')
plt.ylabel('Losses')

plt.plot(range(len(epoch_loss_train_lst)), epoch_loss_train_lst, label="train losses")
plt.plot(range(len(epoch_loss_val_lst)), epoch_loss_val_lst, label="val losses")

plt.legend()
plt.show()

#### Encoder parameters after training

In [78]:
# Snapshot of the encoder's embedding layer after training (via the notebook helper `params`)
encoder_params_after = params(encoder_gru.emb)
# print(encoder_params_after, '\n')

# Optional: compare with the snapshot taken before training
# print(encoder_params_after == encoder_params_before, '\n')

# Optional: inspect the last gradients of the embedding weights
# print(encoder_gru.emb.weight.grad, '\n')

# Parameter counts after training: trainable ones first, then all of them
encoder_trainable_params, encoder_all_params = num_params(encoder_gru)
print(f'The encoder has {encoder_trainable_params:,} trainable parameters')
print(f'The encoder has {encoder_all_params:,} parameters overall')

#### Decoder parameters after training

In [79]:
# Snapshot of the decoder's embedding layer after training (via the notebook helper `params`)
decoder_params_after = params(decoder_gru.emb)
# print(decoder_params_after, '\n')

# Optional: compare with the snapshot taken before training
# print(decoder_params_after == decoder_params_before, '\n')

# Optional: inspect the last gradients of the embedding weights
# print(decoder_gru.emb.weight.grad, '\n')

# Parameter counts after training: trainable ones first, then all of them
decoder_trainable_params, decoder_all_params = num_params(decoder_gru)
print(f'The decoder has {decoder_trainable_params:,} trainable parameters')
print(f'The decoder has {decoder_all_params:,} parameters overall')

In [80]:
# Evaluate the trained model on the held-out test set (mean loss per mini-batch)
epoch_loss_test = 0.0

model.eval()

# torch.no_grad() stops autograd from tracking operations, which reduces memory usage and
# speeds up computation; it is the recommended way of doing evaluation
# (https://discuss.pytorch.org/t/model-eval-vs-with-torch-no-grad/19615/3)
with torch.no_grad():
    # Batch names deliberately differ from the tokenised `y_test` Series, which a loop
    # variable of the same name would overwrite; `input` would shadow the builtin.
    for x_batch, y_batch in test_loader:
        source = x_batch.long().to(device)
        target = y_batch.long().to(device)

        # No teacher forcing during evaluation: with tfr > 0 the decoder would be fed
        # ground-truth tokens at random, making the test loss optimistic and non-deterministic.
        # NOTE(review): relies on `tfr` being the teacher-forcing ratio - confirm against Seq2Seq.
        output = model(source, target, tfr=0.0)

        # Skip the first time step and flatten for the loss.
        # assumes output is (target_len, batch, vocab) and target arrives as (batch, target_len) - TODO confirm
        output = output[1:].reshape(-1, output.shape[2])
        target = target.permute(1, 0)
        target = target[1:].reshape(-1)

        # Loss of this mini-batch
        loss = criterion(output, target)
        epoch_loss_test += loss.item()

print('Test Loss: {:.6f}'.format(epoch_loss_test / len(test_loader)))

## Inference

In [81]:
# Draw 20 random sentence pairs from the inference corpus and display them.
# NOTE(review): no random_state is passed, so the sample depends on the global numpy RNG state;
# sample(20) raises if the frame has fewer than 20 rows.
inf_data_sample = inf_data.sample(20)
inf_data_sample

In [82]:
# Translate the sampled English sentences, score them against the French references,
# and time the whole call
start = time.time()
candidate, reference, bleuscore = bleu(
    source_samples=inf_data_sample['eng'],
    references_corpus=inf_data_sample['fra'],
    max_length=20,
    TOK=T_TOK,
)
end = time.time()

In [83]:
# Wall-clock duration of the inference + scoring cell, in whole minutes
print(f'It took {(end - start)/60:,.0f} mins to complete')

In [84]:
# Turn every candidate token list back into a single space-separated string for display
candidate = candidate.map(' '.join)

In [88]:
# Side-by-side table (aligned on the sample's index): English source, French reference,
# model translation
comparison = pd.DataFrame(
    dict(
        source=inf_data_sample['eng'],
        reference=inf_data_sample['fra'],
        candidate=candidate,
    )
)

In [89]:
# Notebook display of the source / reference / candidate table
comparison

In [90]:
# Notebook display of the corpus-level BLEU score returned by `bleu` for the sample
bleuscore