## PyTorch Implementation of Seq2Seq with RNN (for Translation)

In [21]:
import torch
from sklearn.model_selection import train_test_split
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torchtext import data
import spacy

In [7]:
# Read in the dataset. encoding="utf-8" must be given explicitly because the
# Spanish text contains non-ASCII characters.
sentences_english = []
sentences_spanish = []
# The context manager closes the file handle even if a line fails to parse.
with open('../datasets/spa.txt', 'r', encoding='utf-8') as corpus:
    for line in corpus:
        # Every line must hold exactly three tab-separated fields; only the
        # first two (English, Spanish) are used, the third is discarded.
        s_english, s_spanish, _ = line.rstrip('\n').split('\t')
        sentences_english.append(s_english)
        sentences_spanish.append(s_spanish)

# NumPy arrays allow selecting the sample below with an index array.
sentences_english = np.array(sentences_english)
sentences_spanish = np.array(sentences_spanish)
# print to check
print(sentences_english[0:10])
print()
print(sentences_spanish[0:10])
print()
print('In total: ' + str(len(sentences_spanish)) + ' pairs of sentences.')

# The original data is quite large, and may result in high memory usage and
# long training time, so keep a random sample of 15000 pairs. The same index
# array is applied to both languages so the pairs stay aligned.
idx = np.random.choice(len(sentences_spanish), size=15000, replace=False)
sentences_english = sentences_english[idx]
sentences_spanish = sentences_spanish[idx]

['Go.' 'Go.' 'Go.' 'Go.' 'Hi.' 'Run!' 'Run!' 'Run!' 'Run!' 'Run.']

['Ve.' 'Vete.' 'Vaya.' 'Váyase.' 'Hola.' '¡Corre!' '¡Corran!' '¡Corra!'
 '¡Corred!' 'Corred.']

In total: 128084 pairs of sentences.


In [8]:
# Text preprocessing and vectorization for the English-Spanish sentence pairs.
# Each language gets its own Field: spaCy tokenization with the matching
# language model, lowercasing, <start>/<end> markers and batch-first layout.
TEXT_eng = data.Field(
    sequential=True,
    init_token='<start>',
    eos_token='<end>',
    tokenize='spacy',
    tokenizer_language='en_core_web_sm',
    lower=True,
    batch_first=True,
)
TEXT_spa = data.Field(
    sequential=True,
    init_token='<start>',
    eos_token='<end>',
    tokenize='spacy',
    tokenizer_language='es_core_news_sm',
    lower=True,
    batch_first=True,
)
fields = [('English', TEXT_eng), ('Spanish', TEXT_spa)]
# One Example per aligned (English, Spanish) sentence pair.
examples = [
    data.Example.fromlist([english, spanish], fields)
    for english, spanish in zip(sentences_english, sentences_spanish)
]
dataset = data.Dataset(examples, fields)
# Build one vocabulary per language from the dataset.
TEXT_eng.build_vocab(dataset)
TEXT_spa.build_vocab(dataset)


In [9]:
# Inspect both vocabularies (English first, then Spanish): total size, the ten
# most frequent tokens with their counts, and the first ten index-to-token entries.
for field in (TEXT_eng, TEXT_spa):
    vocab = field.vocab
    print(len(vocab))
    print(vocab.freqs.most_common(10))
    print(vocab.itos[:10])

5947
[('.', 13045), ('i', 4387), ('the', 3543), ('to', 3435), ('you', 3083), ('tom', 2535), ('a', 2209), ('?', 1989), ("n't", 1876), ('is', 1875)]
['<unk>', '<pad>', '<start>', '<end>', '.', 'i', 'the', 'to', 'you', 'tom']
9796
[('.', 12990), ('de', 2868), ('que', 2752), ('no', 2597), ('a', 2498), ('tom', 2428), ('la', 2251), ('¿', 1992), ('?', 1992), ('el', 1934)]
['<unk>', '<pad>', '<start>', '<end>', '.', 'de', 'que', 'no', 'a', 'tom']


In [17]:
# Construct train_iterator and valid_iterator.
# Each iterator yields batches of paired English and Spanish sentences.
# 80% of the examples go to training; the rest are held out for validation.
train_data, valid_data = dataset.split(split_ratio=0.8)
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data),
    batch_size=128,
    # Examples are keyed by the length of their Spanish sentence.
    sort_key=lambda example: len(example.Spanish),
    sort_within_batch=False,
)


### Seq2Seq with RNN

In [18]:
# define an encoder-decoder model with a GRU-based RNN to translate English to Spanish
class Encoder(nn.Module):
    """GRU encoder that condenses a batch of source sentences into a hidden state.

    Args:
        input_dim: number of rows in the embedding table (source vocabulary size).
        emb_dim: size of each token embedding.
        hid_dim: size of the GRU hidden state.
    """

    def __init__(self, input_dim, emb_dim, hid_dim):
        super().__init__()
        # Exposed so a wrapping model can check it against the decoder's size.
        self.hid_dim = hid_dim
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hid_dim, batch_first=True)

    def forward(self, src):
        """Encode ``src`` and return only the final GRU hidden state.

        Args:
            src: token indices, shape [batch size, src len].

        Returns:
            Tensor of shape [1, batch size, hid dim] (single-layer,
            unidirectional GRU).
        """
        # [batch size, src len] -> [batch size, src len, emb dim]
        token_vectors = self.embedding(src)
        # The per-step outputs are not needed; only the last hidden state is kept.
        _, final_hidden = self.rnn(token_vectors)
        return final_hidden
    
class Decoder(nn.Module):
    """Single-step GRU decoder: one target token in, next-token scores out.

    Args:
        output_dim: target vocabulary size (embedding rows and score width).
        emb_dim: size of each token embedding.
        hid_dim: size of the GRU hidden state.
    """

    def __init__(self, output_dim, emb_dim, hid_dim):
        super().__init__()
        # Exposed so a wrapping model can check it against the encoder's size.
        self.hid_dim = hid_dim
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hid_dim, batch_first=True)
        self.fc_out = nn.Linear(hid_dim, output_dim)

    def forward(self, input, hidden):
        """Advance the decoder by one time step.

        Args:
            input: current token indices, shape [batch size].
            hidden: previous hidden state, shape [1, batch size, hid dim].

        Returns:
            Tuple ``(prediction, hidden)`` where ``prediction`` has shape
            [batch size, output dim] and ``hidden`` is the updated state.
        """
        # Add a length-1 time axis so the batch-first GRU sees one step:
        # [batch size] -> [batch size, 1] -> [batch size, 1, emb dim]
        step_vectors = self.embedding(input[:, None])
        step_output, next_hidden = self.rnn(step_vectors, hidden)
        # Drop the time axis again before projecting to vocabulary scores.
        scores = self.fc_out(step_output[:, 0, :])
        return scores, next_hidden
    
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes with full teacher forcing.

    Args:
        encoder: module mapping ``src`` to an initial decoder hidden state;
            must expose ``hid_dim``.
        decoder: module called as ``decoder(input, hidden)`` returning
            ``(prediction, hidden)``; must expose ``hid_dim`` and an
            ``fc_out`` linear layer whose ``out_features`` is the target
            vocabulary size.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

        # The encoder's final hidden state is fed straight into the decoder,
        # so the two hidden sizes have to match.
        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"

    def forward(self, src, trg):
        """Score every target position given the gold previous token.

        Args:
            src: source token indices, shape [batch size, src len].
            trg: target token indices, shape [batch size, trg len]; column 0
                is fed to the decoder as its first input.

        Returns:
            Tensor of shape [batch size, trg len, trg vocab size]. Position 0
            along the time axis is left as zeros because nothing is predicted
            for the first target token.
        """
        batch_size, trg_len = trg.shape[0], trg.shape[1]
        # Read the vocabulary size from the decoder itself rather than from a
        # module-level global, so the model does not depend on notebook state.
        trg_vocab_size = self.decoder.fc_out.out_features
        # Allocate the result buffer on the same device as the targets so the
        # per-step assignment below never has to cross devices.
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size, device=trg.device)
        # Last hidden state of the encoder initialises the decoder.
        hidden = self.encoder(src)
        # First decoder input is column 0 of the target.
        input = trg[:, 0]
        for t in range(1, trg_len):
            # One decoder step: previous token + hidden state -> scores for step t.
            output, hidden = self.decoder(input, hidden)
            outputs[:, t, :] = output
            # Teacher forcing: always feed the ground-truth token next.
            input = trg[:, t]
        return outputs


In [19]:
# Model hyper-parameters: vocabulary sizes come from the fitted fields,
# embedding and hidden sizes are fixed by hand.
INPUT_DIM = len(TEXT_eng.vocab)
OUTPUT_DIM = len(TEXT_spa.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512

# Assemble the encoder-decoder model; both halves share HID_DIM.
enc = Encoder(input_dim=INPUT_DIM, emb_dim=ENC_EMB_DIM, hid_dim=HID_DIM)
dec = Decoder(output_dim=OUTPUT_DIM, emb_dim=DEC_EMB_DIM, hid_dim=HID_DIM)
model = Seq2Seq(encoder=enc, decoder=dec)

# Training setup: Adam optimizer, and a cross-entropy loss that skips
# positions holding the Spanish padding token.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss(
    ignore_index=TEXT_spa.vocab.stoi[TEXT_spa.pad_token],
)


In [20]:
# Train the model and print the training and validation loss after each epoch.
def _batch_loss(batch):
    """Run the model on one batch and return its loss.

    The forward pass, reshape and loss computation are shared by the training
    and validation phases, so they live in one place.
    """
    src = batch.English
    trg = batch.Spanish
    output = model(src, trg)
    # output = [batch size, trg len, output dim]
    # trg = [batch size, trg len]
    output_dim = output.shape[-1]
    # Drop position 0 (the first target token, which is only ever an input)
    # and flatten batch and time so the loss sees one row per predicted token.
    output = output[:, 1:, :].reshape(-1, output_dim)
    trg = trg[:, 1:].reshape(-1)
    # output = [batch size * (trg len - 1), output dim]
    # trg = [batch size * (trg len - 1)]
    return criterion(output, trg)


epochs = 10
for epoch in range(epochs):
    # Training phase: update the weights on every batch.
    model.train()
    epoch_loss = 0
    for batch in train_iterator:
        optimizer.zero_grad()
        loss = _batch_loss(batch)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    # Report the mean per-batch loss.
    print('Epoch: ' + str(epoch) + ' Loss: ' + str(epoch_loss / len(train_iterator)))

    # Validation phase: same loss, but with no gradient tracking and no updates.
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for batch in valid_iterator:
            loss = _batch_loss(batch)
            epoch_loss += loss.item()
    print('Epoch: ' + str(epoch) + ' Validation Loss: ' + str(epoch_loss / len(valid_iterator)))

Epoch: 0 Loss: 5.496793838257485
Epoch: 0 Validation Loss: 4.830464164415996
Epoch: 1 Loss: 4.3519443095998565
Epoch: 1 Validation Loss: 4.229296763737996
Epoch: 2 Loss: 3.659259408078295
Epoch: 2 Validation Loss: 3.9169470767180123
Epoch: 3 Loss: 3.141818868353012
Epoch: 3 Validation Loss: 3.706241011619568
Epoch: 4 Loss: 2.7004515693542803
Epoch: 4 Validation Loss: 3.600416898727417
Epoch: 5 Loss: 2.3049231666199703
Epoch: 5 Validation Loss: 3.535680582125982
Epoch: 6 Loss: 1.9472848227683535
Epoch: 6 Validation Loss: 3.503709683815638
Epoch: 7 Loss: 1.6276886120755623
Epoch: 7 Validation Loss: 3.4893744190533957
Epoch: 8 Loss: 1.3456305506381583
Epoch: 8 Validation Loss: 3.4803479512532554
Epoch: 9 Loss: 1.0968241171633943
Epoch: 9 Validation Loss: 3.5109322170416513


In [31]:
# Finally, implement the translate function to translate English to Spanish
# Cached spaCy English pipeline; loaded on first use by _english_nlp().
_SPACY_EN = None


def _english_nlp():
    """Return the spaCy English pipeline, loading it only on the first call."""
    global _SPACY_EN
    if _SPACY_EN is None:
        _SPACY_EN = spacy.load('en_core_web_sm')
    return _SPACY_EN


def translate(sentence, src_field=TEXT_eng, trg_field=TEXT_spa, model=model, max_len=50):
    """Greedily translate one English sentence into Spanish tokens.

    Args:
        sentence: a raw string (tokenized with spaCy) or an iterable of
            already-split string tokens. Tokens are lowercased either way.
        src_field: field providing the source vocabulary and the
            <start>/<end> markers.
        trg_field: field providing the target vocabulary and the
            <start>/<end> markers.
        model: model exposing ``encoder`` and ``decoder`` sub-modules.
        max_len: maximum number of decoding steps.

    Returns:
        List of predicted target tokens without the leading start token. The
        end token is included when the decoder emits it within ``max_len``
        steps.
    """
    model.eval()
    if isinstance(sentence, str):
        # Reuse the cached pipeline rather than reloading it on every call.
        tokens = [token.text.lower() for token in _english_nlp()(sentence)]
    else:
        tokens = [token.lower() for token in sentence]
    tokens = [src_field.init_token] + tokens + [src_field.eos_token]
    # NOTE(review): presumably stoi maps out-of-vocabulary tokens to <unk>
    # rather than raising -- confirm against the vocabulary implementation.
    src_indexes = [src_field.vocab.stoi[token] for token in tokens]

    # Build inputs on the model's device so this also works once the model has
    # been moved off the CPU; fall back to the CPU for a parameter-free model.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    # src_tensor = [1, src len] (batch of one, batch-first layout)
    src_tensor = torch.tensor(src_indexes, dtype=torch.long, device=device).unsqueeze(0)

    # Looked up once; it does not change between decoding steps.
    eos_index = trg_field.vocab.stoi[trg_field.eos_token]
    trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]

    with torch.no_grad():
        hidden = model.encoder(src_tensor)
        for _ in range(max_len):
            # trg_tensor = [1]: the most recently produced token, so every
            # prediction is fed back in to generate the next one.
            trg_tensor = torch.tensor([trg_indexes[-1]], dtype=torch.long, device=device)
            output, hidden = model.decoder(trg_tensor, hidden)
            # output = [1, output dim]; greedy choice of the best-scoring token.
            pred_token = output.argmax(1).item()
            trg_indexes.append(pred_token)
            if pred_token == eos_index:
                break

    # Skip the start token that seeded the decoder.
    return [trg_field.vocab.itos[i] for i in trg_indexes[1:]]

In [34]:
# Try the translator on a short English sentence; the call returns the list of
# predicted Spanish tokens.
translate('Hello world!')

['¡', 'hola', 'a', 'todos', '!', '<end>']