In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import os, re, pickle, collections, bcolz, string
import numpy as np
import math

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from tqdm import tqdm_notebook

from gensim.models import KeyedVectors
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


The objective of this project is to build a neural machine translation model that translates French to English. As this project is for study purposes, the scope is limited to questions whose English version starts with 'Wh' (What, Who, Where, Why, etc.).

## Corpus preprocessing

We will use <a href='http://www.statmt.org/wmt10/training-giga-fren.tar'>French-English 109 corpus</a> dataset, crawled from Canadian and European Union sources.

In [2]:
PATH = os.getcwd()
corpus_path = PATH + '/data/fr-en-109-corpus'

In [3]:
fname = 'giga-fren.release2.fixed.'

In [4]:
en_corpus_path = f'{corpus_path}/{fname}en' 
fr_corpus_path = f'{corpus_path}/{fname}fr' 

Let's find all english sentences that start with 'Wh' and end with '?'. On the other hand, we want to find all french sentences that just end with '?'.

In [5]:
# Raw strings keep `\?` as a regex escape; in a plain string literal it is an invalid
# Python escape sequence that triggers a warning.
# English: a line that starts with 'Wh' and reaches its first '?' without any '.' or '!' before it.
re_eq = re.compile(r'^(Wh[^?.!]+\?)')
# French: a line that reaches its first '?' without any '.' or '!' before it.
re_fq = re.compile(r'^([^?.!]+\?)')

In [6]:
def _iter_question_matches(en_path, fr_path):
    """Yield `(english_match, french_match)` for every pair of aligned corpus lines.

    Each element of the pair is the result of `re_eq.search` / `re_fq.search`
    on the corresponding line, i.e. a match object or None.

    Both files are opened inside a `with` block so they are closed once the
    generator is exhausted (or garbage-collected), and they are decoded as
    UTF-8 explicitly instead of relying on the platform default encoding.
    The files are opened lazily, on the first iteration.
    """
    with open(en_path, encoding='utf-8') as en_file, open(fr_path, encoding='utf-8') as fr_file:
        for eq, fq in zip(en_file, fr_file):
            yield re_eq.search(eq), re_fq.search(fq)

lines = _iter_question_matches(en_corpus_path, fr_corpus_path)

In [7]:
questions = [(en.group(), fr.group()) for en, fr in lines if en and fr]

In [8]:
len(questions)

52331

We have 52331 english questions with their corresponding french translations.

In [9]:
questions[:5]

[('What is light ?', 'Qu’est-ce que la lumière?'),
 ('Who are we?', 'Où sommes-nous?'),
 ('Where did we come from?', "D'où venons-nous?"),
 ('What would we do without it?', 'Que ferions-nous sans elle ?'),
 ('What is the absolute location (latitude and longitude) of Badger, Newfoundland and Labrador?',
  'Quelle sont les coordonnées (latitude et longitude) de Badger, à Terre-Neuve-etLabrador?')]

In [20]:
#pickle.dump(questions, open(f'{PATH}/data/translate/en-fr-questions.pkl', 'wb'))

In [3]:
questions = pickle.load(open(f'{PATH}/data/translate/en-fr-questions.pkl', 'rb'))

In [4]:
en_questions, fr_questions = zip(*questions)

## Tokenization

Define tokenizer.

In [5]:
re_apos = re.compile(r"(\w)'s\b")         # split a trailing 's off its word
re_mw_punc = re.compile(r"(\w[’'])(\w)")  # split after any other in-word apostrophe
re_punc = re.compile("([\"().,;:/_?!—])") # punctuation marks that become their own token
re_mult_space = re.compile(r"  *")        # runs of one or more spaces

def tokenize(sent):
    """Split a sentence into lower-cased word and punctuation tokens.

    Possessive 's is detached from its word, other apostrophes split a word in
    two (the apostrophe stays with the left part), the punctuation marks listed
    in `re_punc` are isolated, and hyphens are treated as spaces.
    """
    substitutions = (
        (re_apos, r"\1 's"),
        (re_mw_punc, r"\1 \2"),
        (re_punc, r" \1 "),
    )
    for pattern, replacement in substitutions:
        sent = pattern.sub(replacement, sent)
    without_hyphens = sent.replace('-', ' ')
    single_spaced = re_mult_space.sub(' ', without_hyphens)
    return single_spaced.lower().split()

Tokenize english questions.

In [6]:
en_tokens = list(map(tokenize, en_questions))

In [7]:
en_tokens[:4]

[['what', 'is', 'light', '?'],
 ['who', 'are', 'we', '?'],
 ['where', 'did', 'we', 'come', 'from', '?'],
 ['what', 'would', 'we', 'do', 'without', 'it', '?']]

Tokenize french questions.

In [8]:
fr_tokens = list(map(tokenize, fr_questions))

In [9]:
fr_tokens[:4]

[['qu’', 'est', 'ce', 'que', 'la', 'lumière', '?'],
 ['où', 'sommes', 'nous', '?'],
 ["d'", 'où', 'venons', 'nous', '?'],
 ['que', 'ferions', 'nous', 'sans', 'elle', '?']]

For each language: 
<br/>- Get vocabulary counter.
<br/>- Get vocabulary.
<br/>- Get dictionary that maps each word to an index.
<br/>- Transform tokens to their corresponding ids.

In [10]:
# Reserved vocabulary indices: padding and start-of-sentence.
PAD = 0; SOS = 1

def tokens2ids(sentences):
    """Build a vocabulary from tokenized sentences and encode them as ids.

    Args:
        sentences: iterable of token lists.

    Returns:
        A tuple `(vocab_counter, vocab, w2id, ids)` where
        - `vocab_counter` is a Counter of token frequencies,
        - `vocab` is the list of tokens ordered by decreasing frequency, with
          '<PAD>' at index PAD and '<SOS>' at index SOS,
        - `w2id` maps every entry of `vocab` to its index,
        - `ids` mirrors `sentences` with each token replaced by its index.
    """
    vocab_counter = collections.Counter(word for sent in sentences for word in sent)
    vocab = sorted(vocab_counter, key=vocab_counter.get, reverse=True)
    # Insert the special tokens at their reserved positions; real words therefore start at index 2.
    vocab.insert(PAD, '<PAD>')
    vocab.insert(SOS, '<SOS>')  # was '<SOS' (missing the closing bracket)
    w2id = {word: i for i, word in enumerate(vocab)}
    ids = [[w2id[word] for word in sent] for sent in sentences]
    return vocab_counter, vocab, w2id, ids

In [11]:
en_vocab_counter, en_vocab, en_w2id, en_ids = tokens2ids(en_tokens)
fr_vocab_counter, fr_vocab, fr_w2id, fr_ids = tokens2ids(fr_tokens)

In [12]:
len(en_vocab), len(fr_vocab)

(19549, 26709)

## Word vectors

We will not train word vectors from scratch, we will use <a href='http://nlp.stanford.edu/data/glove.6B.zip'>GloVe</a> for english words and <a href='http://fauconnier.github.io/index.html#wordembeddingmodels'>FrWac2Vec</a> for french words.

#### GloVe preprocessing

We will:
<br/>- Load words.
<br/>- Assign an index to each word.
<br/>- Create a dictionary that maps each word to their ids.
<br/>- Create an array with numeric vectors.

In [13]:
glove_path = f'{PATH}/data/glove'

In [40]:
words = []
idx = 0
word2id = {}
# The on-disk carray is seeded with a single dummy 0.0; every parsed vector is appended after it.
word_vectors = bcolz.carray(np.zeros(1), rootdir=f'{glove_path}/results/6B.100.dat', mode='w')

with open(f'{glove_path}/glove.6B.100d.txt', 'rb') as f:
    for l in f:
        # Each line is "<word> <v1> <v2> ...": the first field is the word, the rest its vector.
        line = l.decode().split()
        word = line[0]
        words.append(word)
        word2id[word] = idx
        idx += 1
        # `np.float` was only an alias of the builtin float and was removed in NumPy 1.24;
        # np.float64 is the same dtype and works on every NumPy version.
        word_vector = np.array(line[1:]).astype(np.float64)
        word_vectors.append(word_vector)

In [41]:
# Drop the dummy leading 0.0 that seeded the carray and reshape the flat buffer to one row per word.
# The row count comes from the parsed word list instead of a hard-coded 400000 x 100.
word_vectors = bcolz.carray(word_vectors[1:].reshape((len(words), -1)), rootdir=f'{glove_path}/results/6B.100.dat', mode='w')
word_vectors.flush()
# `with` guarantees the pickle files are flushed and closed.
with open(f'{glove_path}/results/6B.100_words.pkl', 'wb') as f:
    pickle.dump(words, f)
with open(f'{glove_path}/results/6B.100_idx.pkl', 'wb') as f:
    pickle.dump(word2id, f)

#### Load english words vectors

In [14]:
# English words vectors from GloVe (numeric vectors).
glove_vectors = bcolz.open(f'{glove_path}/results/6B.100.dat')[:]
# English words from GloVe (list of strings).
# NOTE: pickle must only be used on files produced by this notebook, never on untrusted input.
with open(f'{glove_path}/results/6B.100_words.pkl', 'rb') as f:
    glove_words = pickle.load(f)
# Dictionary that maps each english word from GloVe to their corresponding ids.
with open(f'{glove_path}/results/6B.100_idx.pkl', 'rb') as f:
    glove_word2id = pickle.load(f)

We will create a dictionary that maps each glove english word to their corresponding numeric vectors.

In [15]:
glove_word2vec = {word: glove_vectors[glove_word2id[word]] for word in glove_words}
n_glove_vectors, dim_glove_vectors = glove_vectors.shape

In [16]:
n_glove_vectors, dim_glove_vectors

(400000, 100)

We have 400000 glove english words vectors with dimension equal to 100.

In [17]:
glove_word2vec['phone']

array([ 3.1764e-02, -6.0768e-01,  5.2233e-01, -1.1533e-02,  3.6009e-01,
        3.6460e-01, -4.9728e-03, -3.3769e-04,  6.6011e-01, -1.2602e-01,
        2.3832e-01,  5.6113e-02, -1.1328e-01,  3.5199e-01,  2.4070e-01,
       -2.9588e-01, -3.1811e-01,  7.9509e-03,  3.2996e-01, -1.0383e-01,
       -4.0230e-01, -3.7351e-03,  4.7088e-01,  2.2141e-01,  3.3043e-01,
       -4.5048e-01,  3.5376e-01,  5.5943e-01,  2.3509e-01,  5.0190e-02,
        5.7384e-01,  9.1137e-01,  8.9360e-01,  1.3000e-01,  6.7807e-01,
        4.1787e-01, -6.9812e-01, -6.0581e-01,  1.1147e+00, -4.3455e-03,
        4.6439e-01, -3.8663e-01,  9.2078e-02, -3.5278e-01, -9.2302e-01,
        3.7423e-02, -4.9481e-01, -2.0403e-01,  8.0609e-01, -6.7063e-01,
        1.9324e-01,  6.9329e-01,  8.1743e-01,  3.7762e-01,  2.6951e-01,
       -1.7669e+00, -7.0825e-01,  2.7024e-01,  1.9455e+00,  7.2376e-01,
        1.7558e-01, -1.7475e-01,  1.7004e-01, -6.7982e-01, -2.3057e-01,
        8.5733e-02,  7.8184e-01,  3.4410e-01,  8.3690e-01,  2.77

For instance this is 'phone' vector.

#### French word vectors

In [18]:
fr_w2v_path = f'{PATH}/data/frWac_non_lem_no_postag_no_phrase_200_skip_cut100.bin'

In [19]:
fr_w2v = KeyedVectors.load_word2vec_format(fr_w2v_path, binary=True)

In [20]:
fr_w2v.vector_size

200

French word vectors dim is equal to 200.

Now we need to create embedding matrices for the English and French words of the training corpus. If a word appears in GloVe or frWac, we load its pre-trained vector; otherwise we create a random vector.

In [21]:
def create_embedding(w2v, target_vocab, emb_dim):
    """Build an embedding matrix with one row per word of `target_vocab`.

    Args:
        w2v: mapping-like object indexed by word that raises KeyError for
            unknown words.
        target_vocab: sequence of words; row `i` of the result belongs to
            `target_vocab[i]`.
        emb_dim: length of every word vector.

    Returns:
        `(matrix, hits)`: a `(len(target_vocab), emb_dim)` array and the number
        of words whose vector was found in `w2v`. Words that are missing get a
        random vector drawn from a normal distribution with scale 0.6.
    """
    matrix = np.zeros((len(target_vocab), emb_dim))
    hits = 0

    for row, token in enumerate(target_vocab):
        try:
            vector = w2v[token]
        except KeyError:
            # Unknown word: fall back to a random initialisation.
            matrix[row] = np.random.normal(scale=0.6, size=(emb_dim, ))
        else:
            matrix[row] = vector
            hits += 1

    return matrix, hits

In [22]:
en_emb, words_found = create_embedding(glove_word2vec, en_vocab, 100)

In [23]:
en_emb.shape, words_found

((19549, 100), 17251)

In [24]:
fr_emb, words_found = create_embedding(fr_w2v, fr_vocab, 200)

In [25]:
fr_emb.shape, words_found

((26709, 200), 21878)

## Data preparation

Min, max and mean length of english sentences.

In [26]:
len_en_ids = [len(sentence) for sentence in en_ids]
min(len_en_ids), max(len_en_ids), np.mean(len_en_ids)

(2, 202, 13.153904951176168)

Min, max and mean length of french sentences.

In [27]:
len_fr_ids = [len(sentence) for sentence in fr_ids]
min(len_fr_ids), max(len_fr_ids), np.mean(len_fr_ids)

(2, 189, 15.776442261756893)

We set 30 as max length

In [28]:
maxlen = 30

In [29]:
en_padded = pad_sequences(en_ids, maxlen, 'int64', 'post', 'post')
fr_padded = pad_sequences(fr_ids, maxlen, 'int64', 'post', 'post')

In [30]:
fr_padded.shape, en_padded.shape, en_emb.shape, fr_emb.shape

((52331, 30), (52331, 30), (19549, 100), (26709, 200))

We have: 
<br/>- 52331 english questions with their 52331 french translations (both with max length = 30).
<br/>- 19549 english words vectors with dim 100.
<br/>- 26709 french words vectors with dim 200.

We split dataset into 90% training - 10% test

In [31]:
en_train, en_test, fr_train, fr_test = train_test_split(en_padded, fr_padded, test_size=0.1)

In [32]:
[s.shape for s in [fr_train, fr_test, en_train, en_test]]

[(47097, 30), (5234, 30), (47097, 30), (5234, 30)]

For instance, this is the first english question in training set:

In [33]:
en_train[0]

array([  15,    8,    3,   73,    5,   41, 2699,    3,  134,  848,    6,
       1442, 1983,   13,  182,  102,  856,    2,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0])

Each of those integers represents a word. We then look each one up in the embedding matrix to get its word vector.

## Model

The first approach is the creation of a simple sequence to sequence model.

### Seq2Seq

<br/>Encoder:
<br/>
<br/>Inputs: french words sentence and initial hidden state (all zeros).
<br/>1- Look up at an embedding layer to get word vector of each word of the input sentence. 
<br/>2- Pass the word vectors sequence through a RNN.
<br/>3- Return hidden state of last timestep (vector representation of input sentence).
<br/>
<br/>Decoder:
<br/>
<br/>Inputs: 'SOS' word (i.e. start of sentence, is always the first word) and vector representation created by encoder.
<br/>1- Load vector representation as initial hidden state.
<br/>2- Look up at an embedding layer to get word vector of 'SOS'.
<br/>3- Pass the word vector through a RNN.
<br/>4- Generate prediction of next word.
<br/>5- Repeat 2, 3 and 4, always using the previously translated word, until the sentence translation is finished.

In [34]:
torch.cuda.is_available()

True

In [35]:
fr_emb_t = torch.FloatTensor(fr_emb).cuda()
en_emb_t = torch.FloatTensor(en_emb).cuda()

In [36]:
def long_t(arr):
    """Wrap `arr` as an int64 (Long) tensor Variable and move it to the GPU."""
    as_long = torch.LongTensor(arr)
    return Variable(as_long).cuda()

Load pre-trained vectors into an embedding layer.

In [37]:
def create_emb(emb_matrix, non_trainable=False):
    """Create an `nn.Embedding` layer initialised from a pre-trained matrix.

    Args:
        emb_matrix: 2-D tensor with one row per word.
        non_trainable: when True, every parameter of the layer is frozen
            (`requires_grad = False`).

    Returns:
        `(layer, num_embeddings, embedding_dim)`, the two sizes being the row
        and column counts of `emb_matrix`.
    """
    num_embeddings, embedding_dim = emb_matrix.size()
    layer = nn.Embedding(num_embeddings, embedding_dim)
    # Copy the pre-trained vectors over the randomly initialised weights.
    layer.load_state_dict({'weight': emb_matrix})
    if not non_trainable:
        return layer, num_embeddings, embedding_dim
    for weight in layer.parameters():
        weight.requires_grad = False
    return layer, num_embeddings, embedding_dim

Encoding layer

In [105]:
class EncoderRNN(nn.Module):
    """GRU encoder that reads a sequence of word ids through frozen pre-trained embeddings."""

    def __init__(self, emb_matrix, hidden_size, num_layers=2):
        """Build the frozen embedding layer from `emb_matrix` and a batch-first GRU on top of it."""
        super().__init__()
        self.hidden_size, self.num_layers = hidden_size, num_layers
        # Pre-trained vectors are loaded as non-trainable weights.
        self.embedding, _, emb_dim = create_emb(emb_matrix, True)
        self.gru = nn.GRU(emb_dim, hidden_size, num_layers, batch_first=True)

    def forward(self, inp, hidden):
        """Embed `inp` and run it through the GRU; returns the GRU's `(outputs, hidden)` pair."""
        embedded = self.embedding(inp)
        return self.gru(embedded, hidden)

    def init_hidden(self, batch_size):
        """Return an all-zero hidden state of shape `(num_layers, batch_size, hidden_size)`."""
        state_shape = (self.num_layers, batch_size, self.hidden_size)
        return Variable(torch.zeros(*state_shape))

Decoding layer

In [116]:
class DecoderRNN(nn.Module):
    """GRU decoder that predicts the next word from the previous word and a hidden state."""

    def __init__(self, emb_matrix, hidden_size, num_layers=2):
        """Build a trainable embedding from `emb_matrix`, a batch-first GRU and an output projection."""
        super().__init__()
        # Embedding initialised from the pre-trained matrix and left trainable.
        self.emb, vocab_size, emb_dim = create_emb(emb_matrix)
        self.gru = nn.GRU(emb_dim, hidden_size, num_layers, batch_first=True, bidirectional=False)
        # Projects a GRU output onto one score per vocabulary word.
        self.out = nn.Linear(hidden_size, vocab_size)

    def forward(self, inp, hidden):
        """Run one decoding step.

        Returns `(log_probs, hidden)`: log-softmax scores over the vocabulary
        for the single timestep, and the updated hidden state.
        """
        # Add a length-1 sequence axis so the batch-first GRU sees one timestep.
        step_emb = self.emb(inp).unsqueeze(1)
        rnn_out, hidden = self.gru(step_emb, hidden)
        scores = self.out(rnn_out[:, 0])
        return F.log_softmax(scores, dim=1), hidden

In [40]:
def encode(inp, encoder):
    """Encode a batch of source sentences and prepare the first decoder input.

    Args:
        inp: 2-D tensor of word ids, one row per sentence.
        encoder: module exposing `init_hidden(batch_size)` and callable as
            `encoder(inp, hidden)`.

    Returns:
        `(decoder_input, enc_outputs, hidden)`: a GPU Long tensor holding SOS
        once per sentence, the encoder outputs, and the encoder's final hidden
        state.
    """
    batch_size, _ = inp.size()
    hidden = encoder.init_hidden(batch_size).cuda()
    # Call the module itself rather than `.forward()` so registered hooks are run.
    enc_outputs, hidden = encoder(inp, hidden)
    return long_t([SOS]*batch_size), enc_outputs, hidden

Training

We use teacher forcing as the training approach: rather than passing the decoder its previously translated word, we pass it the real target word.

In [127]:
def fit(encoder, decoder, train_dl, valid_dl, n_epochs, enc_optim, dec_optim, criterion, inp_len=30):
    """Train the encoder/decoder pair with teacher forcing.

    Every batch of `train_dl` is a 2-D array of word ids whose first `inp_len`
    columns hold the source sentence and whose remaining columns hold the
    target sentence. After each epoch the epoch number and the bias-corrected
    moving average of the per-timestep loss are printed.

    Args:
        encoder: encoder module, used through `encode`.
        decoder: callable as `decoder(decoder_input, hidden)`, returning
            `(output, hidden)`.
        train_dl: loader of training batches; must support `len()`.
        valid_dl: accepted for interface compatibility; no validation pass is
            run yet, so it is currently unused.
        n_epochs: number of passes over `train_dl`.
        enc_optim: optimizer for the encoder parameters.
        dec_optim: optimizer for the decoder parameters.
        criterion: loss called as `criterion(decoder_output, target_step)`.
        inp_len: number of leading columns of a batch that belong to the source
            sentence. Defaults to 30, the value that used to be hard-coded.
    """
    bar = tqdm_notebook(total=n_epochs)

    avg_mom = 0.98
    avg_loss = 0.
    batch_num = 0
    # Defined up front so the per-epoch report cannot raise NameError on an empty loader.
    debias_loss = float('nan')

    for epoch in range(n_epochs):
        # len(train_dl) is the exact number of batches, including a final partial one.
        bar2 = tqdm_notebook(total=len(train_dl), desc=f'Epoch {epoch}', leave=False)
        for batch in train_dl:
            batch_num += 1
            loss = 0

            inp = long_t(batch[:, :inp_len])
            targ = long_t(batch[:, inp_len:])

            # Encoder creates a vector representation of the input sentence.
            decoder_input, encoder_output, hidden = encode(inp, encoder)

            # Zero the gradients before running the backward pass.
            enc_optim.zero_grad()
            dec_optim.zero_grad()

            targ_length = targ.size()[1]

            for di in range(targ_length):
                decoder_output, hidden = decoder(decoder_input, hidden)
                target_step = targ[:, di]
                # Compute loss of this timestep's prediction against the real target word.
                loss += criterion(decoder_output, target_step)
                # Teacher forcing: the next decoder input is the real target, not the predicted word.
                decoder_input = target_step

            # Backward pass: compute gradient of the loss with respect to all the learnable parameters of the model.
            loss.backward()

            # Calling the step function on an Optimizer makes an update to its parameters.
            enc_optim.step()
            dec_optim.step()

            # `loss.data[0]` fails on 0-dim tensors in PyTorch >= 0.5; prefer `.item()` when it exists.
            loss_value = loss.item() if hasattr(loss, 'item') else loss.data[0]

            # Exponentially weighted moving average, to make the reported loss more stable.
            avg_loss = avg_loss * avg_mom + (loss_value / targ_length) * (1-avg_mom)

            # Compute bias-corrected loss estimate.
            debias_loss = avg_loss / (1 - avg_mom**batch_num)

            bar2.update()

        bar2.close()

        print(np.round([epoch, debias_loss], 6))
        bar.update()

    bar.close()

In [128]:
def req_grad_params(o):
    """Return a generator over the parameters of `o` whose `requires_grad` is true."""
    every_param = o.parameters()
    return (p for p in every_param if p.requires_grad)

In [129]:
hidden_size = 128
encoder = EncoderRNN(fr_emb_t, hidden_size).cuda()
decoder = DecoderRNN(en_emb_t, hidden_size).cuda()

In [130]:
lr = 1e-2

In [131]:
enc_opt = optim.Adam(req_grad_params(encoder), lr=lr)
dec_opt = optim.Adam(decoder.parameters(), lr=lr)
criterion = nn.NLLLoss().cuda()

In [132]:
batch_size = 64

In [133]:
train_dl = DataLoader(np.concatenate([fr_train, en_train], 1), batch_size, shuffle=True, num_workers=1)
valid_dl = DataLoader(np.concatenate([fr_test, en_test], 1), batch_size * 2, shuffle=False, num_workers=1)

In [134]:
fit(encoder, decoder, train_dl, valid_dl, 15, enc_opt, dec_opt, criterion)

[0.       2.029788]


[1.       1.828452]


[2.       1.687602]


[3.      1.60677]


[4.       1.502757]


[5.       1.457319]


[6.       1.445323]


[7.       1.418944]


[8.       1.384115]


[9.       1.361875]


[10.        1.370231]


[11.        1.333297]


[12.        1.312195]


[13.        1.317822]


[14.        1.332063]


In order to generate the translation of a French sentence:
<br/>1- Tokenize.
<br/>2- Transform words to their ids.
<br/>3- Pad or truncate the sentence to length 30.
<br/>4- Encode.
<br/>5- Decode the next translated word until the decoder generates the padding word (which here marks the end of the sentence) or until the max length = 30 is reached.

In [None]:
def sent2ids(sent):
    """Turn a French sentence string into a `(1, maxlen)` int64 array of word ids.

    The sentence is tokenized, each token is looked up in `fr_w2id` (a token
    missing from that dictionary raises KeyError), and the id list is padded
    or truncated at the end to `maxlen`.
    """
    token_ids = []
    for token in tokenize(sent):
        token_ids.append(fr_w2id[token])
    return pad_sequences([token_ids], maxlen=maxlen, dtype='int64', padding='post', truncating='post')

In [75]:
def evaluate(inp):
    """Greedily decode an English translation for one encoded French sentence.

    Uses the module-level `encoder` and `decoder`. At every step the
    highest-scoring word of the first (only) batch row is taken and fed back
    as the next decoder input. Decoding stops when PAD is predicted or after
    `maxlen` steps.

    Args:
        inp: word-id tensor of the source sentence, as accepted by `encode`.

    Returns:
        List of decoded words looked up in `en_vocab`.
    """
    decoder_input, _, hidden = encode(inp, encoder)

    decoded_words = []
    for _ in range(maxlen):
        decoder_output, hidden = decoder(decoder_input, hidden)
        _, topi = decoder_output.data.topk(1)
        # topi[0][0] is a plain int on old PyTorch but a 0-dim tensor on >= 0.4;
        # int() makes the comparison, the list lookup and the re-wrapping work on both.
        ni = int(topi[0][0])
        if ni == PAD:
            break
        decoded_words.append(en_vocab[ni])
        decoder_input = long_t([ni])

    return decoded_words

In [117]:
def fr2en(sent):
    """Translate a French sentence string and return the decoded words joined by spaces."""
    source_ids = long_t(sent2ids(sent))
    return ' '.join(evaluate(source_ids))

In [156]:
i = 2
fr_questions[i], en_questions[i], fr2en(fr_questions[i])

("D'où venons-nous?", 'Where did we come from?', 'where do we come from ?')

In [164]:
i = 21
fr_questions[i], en_questions[i], fr2en(fr_questions[i])

('Quelle est la densité de population au Canada ?',
 "What is Canada's population density?",
 'what is the size of the population of canadians ?')

In [165]:
i = 32
fr_questions[i], en_questions[i], fr2en(fr_questions[i])

('Quels en sont les avantages pour moi?',
 "What's in it for me?",
 'what are the benefits of joining ?')

Some short sentences, as the examples above show, get reasonable translations, but for long sentences the performance is much worse. This could be improved using attention, which will be the next step.

In [172]:
torch.save(encoder.state_dict(), f'{PATH}/results/simple_encoder.pth')
torch.save(decoder.state_dict(), f'{PATH}/results/simple_decoder.pth')