## Pytorch Implementation of Seq2Seq with RNN (for Translation)

In [9]:
import torch
from sklearn.model_selection import train_test_split
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch.nn import Transformer
from torchtext import data
import spacy
import math

In [2]:
# Read in the dataset - note that we need to specify encoding="utf-8" when the language contains non ascii words.
sentences_english = []
sentences_spanish = []
for line in open('../datasets/spa.txt', 'r', encoding = 'utf-8'):
    s_english, s_spanish, other = line.rstrip('\n').split('\t')
    sentences_english.append(s_english)
    sentences_spanish.append(s_spanish)   

sentences_english = np.array(sentences_english)
sentences_spanish = np.array(sentences_spanish)
# print to check
print(sentences_english[0:10])
print()
print(sentences_spanish[0:10])
print()
print('In total: ' + str(len(sentences_spanish)) + ' pairs of sentences.')

# The original data is quite large, and may result in high memory usage and long training time. Let's take a sample of 15000
idx = np.random.choice(list(range(len(sentences_spanish))), size = 15000, replace = False)
sentences_english = sentences_english[idx]
sentences_spanish = sentences_spanish[idx]

['Go.' 'Go.' 'Go.' 'Go.' 'Hi.' 'Run!' 'Run!' 'Run!' 'Run!' 'Run.']

['Ve.' 'Vete.' 'Vaya.' 'Váyase.' 'Hola.' '¡Corre!' '¡Corran!' '¡Corra!'
 '¡Corred!' 'Corred.']

In total: 128084 pairs of sentences.


In [5]:
# text preprocessing and vectorization for English-Spanish sentence pairs
TEXT_eng = data.Field(sequential=True, init_token = '<start>', eos_token = '<end>', tokenize='spacy', tokenizer_language='en_core_web_sm', lower=True, batch_first=True)
TEXT_spa = data.Field(sequential=True, init_token = '<start>', eos_token = '<end>', tokenize='spacy', tokenizer_language='es_core_news_sm', lower=True, batch_first=True)
fields = [('English', TEXT_eng), ('Spanish', TEXT_spa)]
examples = []
for i in range(len(sentences_english)):
    examples.append(data.Example.fromlist([sentences_english[i], sentences_spanish[i]], fields))
dataset = data.Dataset(examples, fields)
TEXT_eng.build_vocab(dataset)
TEXT_spa.build_vocab(dataset)


In [6]:
# inspect the vocabulary
print(len(TEXT_eng.vocab))
print(TEXT_eng.vocab.freqs.most_common(10))
print(TEXT_eng.vocab.itos[:10])

print(len(TEXT_spa.vocab))
print(TEXT_spa.vocab.freqs.most_common(10))
print(TEXT_spa.vocab.itos[:10])

5909
[('.', 13004), ('i', 4342), ('the', 3471), ('to', 3303), ('you', 3140), ('tom', 2552), ('a', 2238), ('?', 2028), ("n't", 1870), ('is', 1845)]
['<unk>', '<pad>', '<start>', '<end>', '.', 'i', 'the', 'to', 'you', 'tom']
9682
[('.', 12962), ('que', 2752), ('de', 2748), ('no', 2579), ('a', 2563), ('tom', 2420), ('la', 2283), ('¿', 2027), ('?', 2027), ('el', 1826)]
['<unk>', '<pad>', '<start>', '<end>', '.', 'que', 'de', 'no', 'a', 'tom']


In [7]:
# construct train_iterator and valid_iterator
# each iterator should constain pairs of Enblish sentences and Spanish sentences
train_data, valid_data = dataset.split(split_ratio=0.8)
train_iterator, valid_iterator = data.BucketIterator.splits((train_data, valid_data), batch_size=128,
                                                            sort_key=lambda x: len(x.Spanish),
                                                            sort_within_batch=False)


### Seq2Seq with Transformer

In [34]:
# First, we need to define the positional encoding layer
# The positional encoding layer is used to add positional information to the input embeddings.
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        """
        :param d_model: the embedding dimension
        :param max_len: the maximum length of the sentence
        """
        super(PositionalEncoding, self).__init__()

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)  # pe.shape = (max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)  # position.shape = (max_len, 1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))  # div_term.shape = (d_model/2, )
        pe[:, 0::2] = torch.sin(position * div_term)  # pe[:, 0::2].shape = (max_len, d_model/2)
        pe[:, 1::2] = torch.cos(position * div_term)  # pe[:, 1::2].shape = (max_len, d_model/2)
        pe = pe.unsqueeze(0)  # pe.shape = (1, max_len, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        :param x: the input tensor with shape (batch_size, seq_len, d_model)
        :return: the tensor after adding positional encoding with shape (batch_size, seq_len, d_model)
        """
        x = x + self.pe[:, :x.size(1)]
        return x

# helper Module to convert tensor of input indices into corresponding tensor of token embeddings
# https://pytorch.org/tutorials/beginner/translation_transformer.html#seq2seq-network-using-transformer
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size, d_model):
        """
        :param vocab_size: the size of the vocabulary
        :param d_model: the embedding dimension
        """
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.d_model = d_model

    def forward(self, tokens):
        """
        :param tokens: the input tensor with shape (batch_size, seq_len)
        :return: the tensor after token embedding with shape (batch_size, seq_len, d_model)
        """
        return self.embedding(tokens.long()) * math.sqrt(self.d_model)

# Second, we need to define the transformer layer
class Seq2SeqTransformer(nn.Module):
    def __init__(self, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, src_vocab_size, tgt_vocab_size):
        """
        :param d_model: the embedding dimension
        :param nhead: the number of heads in the multiheadattention models
        :param num_encoder_layers: the number of sub-encoder-layers in the encoder
        :param num_decoder_layers: the number of sub-decoder-layers in the decoder
        :param dim_feedforward: the dimension of the feedforward network model
        """
        super(Seq2SeqTransformer, self).__init__()
        self.transformer = Transformer(d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, batch_first=True)
        self.generator = nn.Linear(d_model, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, d_model)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model)

    def forward(self, src, tgt, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, memory_key_padding_mask):
        """
        :param src: the sequence to the encoder (required). with shape (batch_size, seq_len, d_model)
        :param tgt: the sequence to the decoder (required). with shape (batch_size, seq_len, d_model)
        :param src_mask: the additive mask for the src sequence (optional). with shape (batch_size, seq_len, seq_len)
        :param tgt_mask: the additive mask for the tgt sequence (optional). with shape (batch_size, seq_len, seq_len)
        :param src_padding_mask: the additive mask for the src sequence (optional). with shape (batch_size, 1, seq_len)
        :param tgt_padding_mask: the additive mask for the tgt sequence (optional). with shape (batch_size, 1, seq_len)
        :param memory_key_padding_mask: the additive mask for the encoder output (optional). with shape (batch_size, 1, seq_len)
        :return: the decoder output tensor with shape (batch_size, seq_len, d_model)
        """
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(tgt))
        outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None, src_padding_mask, tgt_padding_mask, memory_key_padding_mask)
        return self.generator(outs)
    
    # Next, we construct the encoder and decoder layers
    # The transformer model is a standard encoder-decoder architecture with multi-head attention.
    def encode(self, src):
        """
        :param src: the sequence to the encoder (required). with shape (batch_size, seq_len, d_model)
        :return: the encoder output tensor with shape (batch_size, seq_len, d_model)
        """
        return self.transformer.encoder(self.positional_encoding(self.src_tok_emb(src)))
    
    def decode(self, tgt, memory):
        """
        :param tgt: the sequence to the decoder (required). with shape (batch_size, seq_len, d_model)
        :param memory: the sequence from the last layer of the encoder (required). with shape (batch_size, seq_len, d_model)
        :return: the decoder output tensor with shape (batch_size, seq_len, d_model)
        """
        return self.transformer.decoder(self.positional_encoding(self.tgt_tok_emb(tgt)), memory)


In [40]:
# specify model parameters and training parameters
SRC_VOCAB_SIZE = len(TEXT_eng.vocab)
TGT_VOCAB_SIZE = len(TEXT_spa.vocab)
EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
BATCH_SIZE = 128
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3

model = Seq2SeqTransformer(EMB_SIZE, NHEAD, NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, FFN_HID_DIM, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss(ignore_index = TEXT_spa.vocab.stoi[TEXT_spa.pad_token])

In [41]:
# train the model and print out validation loss after each epoch
epochs = 10
for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    for batch in train_iterator:
        src = batch.English
        tgt = batch.Spanish
        optimizer.zero_grad()
        output = model(src, tgt, None, None, None, None, None)
        output = output.reshape(-1, output.shape[2])
        tgt = tgt.reshape(-1)
        loss = criterion(output, tgt)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    print(f"Epoch {epoch} loss: {epoch_loss/len(train_iterator)}")
    
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for batch in valid_iterator:
            src = batch.English
            tgt = batch.Spanish
            output = model(src, tgt, None, None, None, None, None)
            output = output.reshape(-1, output.shape[2])
            tgt = tgt.reshape(-1)
            loss = criterion(output, tgt)
            epoch_loss += loss.item()
    print(f"Epoch {epoch} validation loss: {epoch_loss/len(valid_iterator)}")

Epoch 0 loss: 2.7334858242501605
Epoch 0 validation loss: 1.0722072223822277
Epoch 1 loss: 0.6545650594412012
Epoch 1 validation loss: 0.5788673646748066
Epoch 2 loss: 0.23410689228392662
Epoch 2 validation loss: 0.4507502367099126
Epoch 3 loss: 0.052486242607552955
Epoch 3 validation loss: 0.4169566494723161
Epoch 4 loss: 0.005499295015977894
Epoch 4 validation loss: 0.42874907702207565
Epoch 5 loss: 0.0029417891525960666
Epoch 5 validation loss: 0.4334271351496379
Epoch 6 loss: 0.002171048550232452
Epoch 6 validation loss: 0.4338444421688716
Epoch 7 loss: 0.0016843575434362951
Epoch 7 validation loss: 0.433768833676974
Epoch 8 loss: 0.0013666233540750405
Epoch 8 validation loss: 0.4354255435367425
Epoch 9 loss: 0.0011327030308088883
Epoch 9 validation loss: 0.4363349253932635


In [42]:
# Finally, implement the translate function to translate English to Spanish
def translate(sentence, src_field=TEXT_eng, trg_field=TEXT_spa, model=model, max_len=50):
    model.eval()
    if isinstance(sentence, str):
        nlp = spacy.load('en_core_web_sm')
        tokens = [token.text.lower() for token in nlp(sentence)]
    else:
        tokens = [token.lower() for token in sentence]
    tokens = [src_field.init_token] + tokens + [src_field.eos_token]
    src_indexes = [src_field.vocab.stoi[token] for token in tokens]
    src_tensor = torch.LongTensor(src_indexes).unsqueeze(0)
    with torch.no_grad():
        encoder_outputs = model.encode(src_tensor)
    trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]
    for i in range(max_len):
        trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0)
        with torch.no_grad():
            output = model.decode(trg_tensor, encoder_outputs)
        pred_token = output.argmax(2)[:,-1].item()
        trg_indexes.append(pred_token)
        if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:
            break
    trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]
    return trg_tokens[1:]

In [45]:
translate('Hello world!')

['volver',
 'cuánto',
 'tom',
 'mundo',
 'hablar',
 'ni',
 'debes',
 'pregunto',
 'dicho',
 'fuerte',
 'volver',
 'pasar',
 'estado',
 'cansado',
 'era',
 'a',
 'recuerdo',
 'última',
 'escuela',
 'otro',
 'nosotros',
 'jugar',
 '¿',
 'ayudar',
 'ningún',
 'debe',
 'lleva',
 'vi',
 'noche',
 'gracias',
 'tener',
 'madre',
 'policía',
 'nosotros',
 'debes',
 'a',
 'recuerdo',
 'ni',
 'haga',
 'visto',
 'preguntas',
 'favor',
 'vive',
 'pensar',
 'lugar',
 'importa',
 'un',
 'dicho',
 'fuerte',
 'mucho']