# Sequence to Sequence Learning for NMT

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from tensorboardX import SummaryWriter
from torch import optim
from torch.autograd import Variable
from torch.nn.utils import clip_grad_norm
from torch.nn.utils.rnn import pack_padded_sequence
from torch.utils.data.dataloader import DataLoader
from torch.utils.data.dataset import Dataset
from tqdm import tqdm, trange

In [2]:
# Summary writer used by the training loop to log the per-step loss.
writer = SummaryWriter()
# Global flag: True when a CUDA device is usable. Read later to decide whether
# tensors and the model are moved to the GPU.
cuda = torch.cuda.is_available()
print(cuda)

True


## Data Preparation

In [3]:
class TranslationData(Dataset):
    """Dataset of tab-separated sentence pairs read from a text file.

    The whole file is lower-cased, the characters ``.,;"!?`` are removed,
    hyphens are replaced by a space and every apostrophe gets a trailing
    space (so ``c'est`` becomes ``c' est``). Each line is then stripped and
    split on tabs; the resulting list of fields is one item of the dataset.

    Args:
        file_path: Path of the text file, one tab-separated pair per line.
        reverse: Accepted for backward compatibility; it currently has no
            effect on the loaded pairs.
        max_pairs: Keep only the first ``max_pairs`` lines of the file.
            ``None`` keeps every line. Defaults to 50000.
    """

    # Character clean-up table, built once for the class instead of on
    # every construction.
    _CLEANUP_TABLE = str.maketrans({
        **dict.fromkeys('.,;"!?'),  # mapped to None -> character is dropped
        "-": " ",
        "'": "' ",
    })

    def __init__(self, file_path, reverse=False, max_pairs=50000):
        super(TranslationData, self).__init__()
        # Decode explicitly as UTF-8 so accented characters do not depend on
        # the platform's default locale encoding.
        with open(file_path, encoding="utf-8") as data_file:
            cleaned_data = data_file.read().lower().translate(self._CLEANUP_TABLE)
        pairs = [line.strip().split('\t') for line in cleaned_data.splitlines()]
        # Slicing with None keeps the full list.
        self.pairs = pairs[:max_pairs]

    def __len__(self):
        """Return the number of loaded pairs."""
        return len(self.pairs)

    def __getitem__(self, index):
        """Return the pair (or, for a slice, the list of pairs) at ``index``."""
        return self.pairs[index]

In [4]:
# Load the cleaned, tab-split sentence pairs and spot-check a few entries.
data = TranslationData("data/fra.txt")
print(len(data))
print(data[0:5])
print(data[40000])

50000
[['go', 'va'], ['run', 'cours'], ['run', 'courez'], ['wow', 'ça alors'], ['fire', 'au feu']]
["his idea wasn' t usable", "son idée n' était pas exploitable"]


In [5]:
class Language():
    """Word <-> integer-id vocabulary built from whitespace-separated lines.

    Ids 0-3 are reserved for ``<PAD>``, ``<SOS>``, ``<EOS>`` and ``<UNK>``;
    every other word receives the next free id in order of first appearance.

    Attributes are plain dicts/ints:
        indices: word -> id
        words:   id -> word
        count:   number of entries, including the four reserved tokens
    """

    def __init__(self, lines):
        reserved = ("<PAD>", "<SOS>", "<EOS>", "<UNK>")
        self.indices = {token: idx for idx, token in enumerate(reserved)}
        self.words = {idx: token for idx, token in enumerate(reserved)}
        self.count = len(reserved)

        for sentence in lines:
            for token in sentence.split():
                if token in self.indices:
                    continue
                # First sighting: hand out the next unused id.
                new_id = self.count
                self.indices[token] = new_id
                self.words[new_id] = token
                self.count = new_id + 1

    def __len__(self):
        """Return the vocabulary size, reserved tokens included."""
        return self.count

In [6]:
# Build one vocabulary per side of the pairs: field 0 of each pair feeds
# `english`, field 1 feeds `french`.
english = Language([sentence[0] for sentence in data])
french = Language([sentence[1] for sentence in data])
print(len(english), len(french))

6070 10952


## Modelling

In [7]:
class Seq2Seq(nn.Module):
    """GRU encoder-decoder that scores target tokens for one sentence pair.

    The encoder's final hidden state initialises the decoder, which is fed the
    target-side ids and projected onto the target vocabulary.

    Args:
        src: Source vocabulary; only its ``count`` attribute is read, to size
            the source embedding table.
        trg: Target vocabulary; its ``count`` attribute sizes the target
            embedding table and the output projection.
        emb_size: Embedding dimension shared by both embedding tables.
        hidden_size: Hidden-state size of the encoder and decoder GRUs.
        num_layers: Number of stacked GRU layers in encoder and decoder.
    """

    def __init__(self, src, trg, emb_size=128, hidden_size=256, num_layers=2):
        super(Seq2Seq, self).__init__()
        self.src_emb = nn.Embedding(src.count, emb_size)
        self.trg_emb = nn.Embedding(trg.count, emb_size)
        self.encoder = nn.GRU(emb_size, hidden_size, num_layers, batch_first=True)
        self.decoder = nn.GRU(emb_size, hidden_size, num_layers, batch_first=True)
        self.to_trg = nn.Linear(hidden_size, trg.count)

    def forward(self, src_sen_ids, trg_sen_ids):
        """Return log-probabilities over the target vocabulary per decoder step.

        Args:
            src_sen_ids: Source word ids for a single sentence; expected to be
                a 1-D id tensor, since a batch dimension of size 1 is added here.
            trg_sen_ids: Decoder input ids for the same sentence, same layout.

        Returns:
            Tensor with one row per entry of ``trg_sen_ids`` and one column
            per target-vocabulary word, holding log-softmax scores.
        """
        # Both GRUs are batch_first, so add a leading batch dimension of 1.
        src_sen_emb = self.src_emb(src_sen_ids).unsqueeze(0)
        enc_output, enc_hidden = self.encoder(src_sen_emb)
        trg_sen_emb = self.trg_emb(trg_sen_ids).unsqueeze(0)
        dec_output, dec_hidden = self.decoder(trg_sen_emb, enc_hidden)
        # Drop the batch dimension again before projecting to the vocabulary.
        preds = F.log_softmax(self.to_trg(dec_output.squeeze(0)), dim=1)
        return preds

In [8]:
def prepare_sentence(sentence, language, sos=False, eos=False, reverse=False, verbose=False):
    """Turn a whitespace-separated sentence into a tensor of vocabulary ids.

    Words missing from ``language.indices`` are replaced by ``<UNK>``.
    ``<SOS>`` / ``<EOS>`` are added at the front / back when requested, and
    only afterwards is the token list optionally reversed. With ``verbose``
    the token list is printed as it is *before* any reversal. The result is
    moved to the GPU when the module-level ``cuda`` flag is true.
    """
    vocab = language.indices
    tokens = []
    for token in sentence.split():
        tokens.append(token if token in vocab else "<UNK>")
    if sos:
        tokens.insert(0, "<SOS>")
    if eos:
        tokens.append("<EOS>")
    if verbose:
        print(tokens)
    if reverse:
        tokens.reverse()
    ids_var = Variable(torch.LongTensor([vocab[token] for token in tokens]))
    return ids_var.cuda() if cuda else ids_var

## Training

In [9]:
# Instantiate the model and move it to the GPU when one is available.
eng_fr_model = Seq2Seq(english, french)
if cuda: eng_fr_model.cuda()
# Adam with its library-default settings; NLLLoss matches the log_softmax
# output produced by Seq2Seq.forward.
optimizer = optim.Adam(eng_fr_model.parameters())
criterion = nn.NLLLoss()

In [10]:
# Train for three passes over the data, one sentence pair per optimizer step.
pairs_per_epoch = len(data)
for epoch in range(3):
    for i, pair in enumerate(tqdm(data)):
        # Source is fed reversed; the decoder input starts with <SOS> and the
        # expected output is the same sentence ending with <EOS>.
        src_sen = prepare_sentence(pair[0], english, sos=True, reverse=True)
        trg_sen_teacher = prepare_sentence(pair[1], french, sos=True)
        trg_sen = prepare_sentence(pair[1], french, eos=True)
        preds = eng_fr_model(src_sen, trg_sen_teacher)
        loss = criterion(preds, trg_sen)
        # Use a step that keeps increasing across epochs; logging with the
        # per-epoch index alone makes each epoch overwrite the previous curve.
        # loss.item() replaces loss.data[0], which fails on 0-dim tensors.
        writer.add_scalar('data/loss', loss.item(), epoch * pairs_per_epoch + i)
        optimizer.zero_grad()
        loss.backward()
        # In-place gradient clipping (non-deprecated spelling of clip_grad_norm).
        nn.utils.clip_grad_norm_(eng_fr_model.parameters(), 5.0)
        optimizer.step()

100%|██████████| 50000/50000 [09:36<00:00, 86.67it/s]
100%|██████████| 50000/50000 [09:34<00:00, 87.00it/s]
100%|██████████| 50000/50000 [09:37<00:00, 86.60it/s]


## Evaluation

In [11]:
def evaluate(i):
    """Print source tokens, reference tokens and predicted tokens for pair ``i``.

    The decoder is fed the reference French sentence (prefixed with <SOS>), so
    the printed prediction is the most likely next word at each position given
    the reference so far, not a free-running translation.
    """
    pair = data[i]
    english_sentence = prepare_sentence(pair[0], english, sos=True, reverse=True, verbose=True)
    french_sentence = prepare_sentence(pair[1], french, sos=True, verbose=True)
    # No gradients are needed for inspection.
    with torch.no_grad():
        preds = eng_fr_model(english_sentence, french_sentence)
    # Convert to plain ints: tensor elements do not hash like the int keys of
    # french.words, so they cannot be used for the dictionary lookup.
    best_ids = preds.topk(1)[1].squeeze(1).tolist()
    print([french.words[word_id] for word_id in best_ids])

def evaluate_sample(size):
    """Run ``evaluate`` on ``size`` distinct, randomly chosen pairs of ``data``.

    A blank line is printed after each evaluated pair. ``random.sample``
    raises ``ValueError`` when ``size`` exceeds the number of pairs.
    """
    import random
    # Sample from the actual dataset size rather than a hard-coded 50000, so
    # indices stay valid (and all pairs reachable) whatever was loaded.
    for i in random.sample(range(len(data)), size):
        evaluate(i)
        print()

In [12]:
# Print source, reference and predicted tokens for 10 randomly chosen pairs.
evaluate_sample(10)

['<SOS>', 'it', 'is', 'i', 'who', 'am', 'to', 'blame']
['<SOS>', "c'", 'est', 'moi', 'qui', 'suis', 'à', 'blâmer']
["c'", 'est', 'tout', 'avec', 'y', "l'", 'la', '<EOS>']

['<SOS>', 'i', 'have', 'my', 'own', 'room']
['<SOS>', 'je', 'dispose', 'de', 'ma', 'propre', 'chambre']
['je', 'ma', 'que', 'ma', 'chambre', 'chambre', '<EOS>']

['<SOS>', 'be', 'calm']
['<SOS>', 'sois', 'calme']
['soyez', 'sont', '<EOS>']

['<SOS>', 'the', 'door', 'blew', 'shut']
['<SOS>', 'la', 'porte', 'claqua', 'dans', 'un', 'souffle']
['la', 'voiture', 'ont', '<EOS>', 'la', 'moment', '<EOS>']

['<SOS>', 'i', 'doubt', 'if', 'he', 'is', 'honest']
['<SOS>', 'je', 'doute', "qu'", 'il', 'soit', 'honnête']
['je', "n'", "qu'", 'il', 'est', 'honnête', '<EOS>']

['<SOS>', 'i', 'rang', 'the', 'bell']
['<SOS>', "j'", 'ai', 'fait', 'sonner', 'la', 'cloche']
['je', 'ai', 'je', 'quelque', 'le', 'raison', '<EOS>']

['<SOS>', 'i', 'turned', 'on', 'the', 'radio']
['<SOS>', "j'", 'ai', 'allumé', 'la', 'radio']
["j'", 'ai', 'radio