# Machine Translation with Seq2Seq
Based on [Aladdin Persson](https://www.youtube.com/@AladdinPersson)'s [Tutorial](https://www.youtube.com/watch?v=EoGUlvhRYpk)

### Downloads and imports

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.datasets import Multi30k # German to English dataset
"""
Had a problem with downloading the dataset due to down servers. Had to substitute the urls with the ones below.

urls = ['https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/training.tar.gz',
        'https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/validation.tar.gz',
        'https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/mmt16_task1_test.tar.gz',
        'https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/mmt_task1_test2016.tar.gz']
"""


from torchtext.data import Field, BucketIterator
import numpy as np
import spacy # Tokenizer
import random
from torch.utils.tensorboard.writer import SummaryWriter  # to print to tensorboard

In [2]:
DEVICE = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {DEVICE} device")

Using cuda device


### Utils

Imported from [Aladdin Persson](https://www.youtube.com/@AladdinPersson)'s [Utils File](https://github.com/aladdinpersson/Machine-Learning-Collection/blob/558557c7989f0b10fee6e8d8f953d7269ae43d4f/ML/Pytorch/more_advanced/Seq2Seq/utils.py)

In [3]:
import torch
import spacy
from torchtext.data.metrics import bleu_score
import sys


def translate_sentence(model, sentence, german, english, device, max_length=50):
    # print(sentence)

    # sys.exit()

    # Load german tokenizer
    spacy_ger = spacy.load('de_core_news_sm') # changed from de to de_core_news_sm

    # Create tokens using spacy and everything in lower case (which is what our vocab is)
    if type(sentence) == str:
        tokens = [token.text.lower() for token in spacy_ger(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    # print(tokens)

    # sys.exit()
    # Add <SOS> and <EOS> in beginning and end respectively
    tokens.insert(0, german.init_token)
    tokens.append(german.eos_token)

    # Go through each german token and convert to an index
    text_to_indices = [german.vocab.stoi[token] for token in tokens]

    # Convert to Tensor
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    # Build encoder hidden, cell state
    with torch.no_grad():
        hidden, cell = model.encoder(sentence_tensor)

    outputs = [english.vocab.stoi["<sos>"]]

    for _ in range(max_length):
        previous_word = torch.LongTensor([outputs[-1]]).to(device)

        with torch.no_grad():
            output, hidden, cell = model.decoder(previous_word, hidden, cell)
            best_guess = output.argmax(1).item()

        outputs.append(best_guess)

        # Model predicts it's the end of the sentence
        if output.argmax(1).item() == english.vocab.stoi["<eos>"]:
            break

    translated_sentence = [english.vocab.itos[idx] for idx in outputs]

    # remove start token
    return translated_sentence[1:]


def bleu(data, model, german, english, device):
    targets = []
    outputs = []

    for example in data:
        src = vars(example)["src"]
        trg = vars(example)["trg"]

        prediction = translate_sentence(model, src, german, english, device)
        prediction = prediction[:-1]  # remove <eos> token

        targets.append([trg])
        outputs.append(prediction)

    return bleu_score(outputs, targets)


def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    print("=> Saving checkpoint")
    torch.save(state, filename)


def load_checkpoint(checkpoint, model, optimizer):
    print("=> Loading checkpoint")
    model.load_state_dict(checkpoint["state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer"])

### Data

In [4]:
spacy_de = spacy.load('de_core_news_sm') # German tokenizer
spacy_en = spacy.load('en_core_web_sm') # English tokenizer

In [5]:
def tokenize_de(text):
    """
    Tokenizes German text from a string into a list of strings (tokens)
    """
    return [tok.text for tok in spacy_de.tokenizer(text)]

def tokenize_en(text):
    """
    Tokenizes English text from a string into a list of strings (tokens)
    """
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [6]:
deutsch = Field(tokenize=tokenize_de, lower=True,
                init_token='<sos>', eos_token='<eos>')

english = Field(tokenize=tokenize_en, lower=True,
                init_token='<sos>', eos_token='<eos>')

In [7]:
train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'),
                                                    fields=(deutsch, english))

In [8]:
deutsch.build_vocab(train_data, max_size=10000, min_freq=2)
english.build_vocab(train_data, max_size=10000, min_freq=2)

In [9]:
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers=1, dropout_p=0.):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.dropout = nn.Dropout(dropout_p)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=dropout_p)

    def forward(self, x):
        # x shape: (seq_length, N) where N is batch size
        
        embedding = self.dropout(self.embedding(x))
        # embedding shape: (seq_length, N, embedding_size)

        outputs, (hidden, cell) = self.rnn(embedding)
        # outputs shape: (seq_length, N, hidden_size)
        # hidden shape: (num_layers, N, hidden_size)
        # cell shape: (num_layers, N, hidden_size)
        
        return hidden, cell

In [10]:
class Decoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size,
                 output_size, num_layers=1, dropout_p=0.):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.dropout = nn.Dropout(dropout_p)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=dropout_p)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden, cell):
        # x shape: (N) where N is for batch size, we want it to be (1, N), seq_length
        x = x.unsqueeze(0)
        # x shape: (1, N)

        embedding = self.dropout(self.embedding(x))
        # embedding shape: (1, N, embedding_size)

        outputs, (hidden, cell) = self.rnn(embedding, (hidden, cell))
        # outputs shape: (1, N, hidden_size)
        # hidden shape: (num_layers, N, hidden_size)
        # cell shape: (num_layers, N, hidden_size)

        predictions = self.fc(outputs)
        # predictions shape: (1, N, length_target_vocabulary) to send it to loss function
        predictions = predictions.squeeze(0)
        # predictions shape: (N, length_target_vocabulary)
        
        return predictions, hidden, cell

In [11]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        # source shape: (seq_length, N) where N is batch size
        # teacher_force_ratio: how often the model uses its own predictions
        # during training versus relying on the target output

        batch_size = source.shape[1]
        target_len = target.shape[0]
        target_vocab_size = len(english.vocab)

        outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(DEVICE)
        # outputs shape: (target_len, batch_size, target_vocab_size)

        hidden, cell = self.encoder(source)
        # hidden shape: (num_layers, N, hidden_size)
        # cell shape: (num_layers, N, hidden_size)

        x = target[0] # Grab start token <SOS>
        # x shape: (N)

        for t in range(1, target_len):
            output, hidden, cell = self.decoder(x, hidden, cell)
            # output shape: (batch_size, target_vocab_size)
            outputs[t] = output
            best_guess = output.argmax(1) # argmax of output on dimension 1

            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs

Training Hyperparameters

In [12]:
num_epochs = 50
learning_rate = 1e-3
batch_size = 64

Model Hyperparameters

In [13]:
load_model = False
input_size_encoder = len(deutsch.vocab)
input_size_decoder = len(english.vocab)
output_size = len(english.vocab)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024
num_layers = 2
enc_dropout = 0.5
dec_dropout = 0.5

Tensorboard

In [14]:
writer = SummaryWriter(f'runs/loss_plot')
step = 0

In [15]:
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=batch_size,
    sort_within_batch = True,
    sort_key = lambda x: len(x.src),
    device=DEVICE)

In [16]:
encoder_net = Encoder(input_size_encoder, encoder_embedding_size,
                      hidden_size, num_layers=num_layers, dropout_p=enc_dropout).to(DEVICE)

decoder_net = Decoder(input_size_decoder, decoder_embedding_size,
                      hidden_size, output_size, num_layers=num_layers, dropout_p=dec_dropout).to(DEVICE)

In [17]:
model = Seq2Seq(encoder_net, decoder_net).to(DEVICE)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [18]:
pad_idx = english.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [19]:
if load_model:
    load_checkpoint(torch.load('my_checkpoint.pth.tar'), model, optimizer)

(model, sentence, german, english, device, max_length=50)

In [20]:
sentence = "ein boot mit mehreren männern darauf wird von einem großen pferdegespann ans ufer gezogen."

In [21]:
for epoch in range(num_epochs):
    print(f'Epoch [{epoch} / {num_epochs}]')

    checkpoint = {'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()}
    save_checkpoint(checkpoint)

    model.eval()

    translated_sentence = translate_sentence(model, sentence, deutsch, english, DEVICE)

    print(f"Translated example sentence: \n {translated_sentence}")

    model.train()

    for batch_idx, batch in enumerate(train_iterator):
        inp_data = batch.src.to(DEVICE)
        target = batch.trg.to(DEVICE)

        output = model(inp_data, target)
        # output shape: (trg_len, batch_size, output_dim)
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output, target)

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        optimizer.step()

        writer.add_scalar('Training loss', loss, global_step=step)
        step += 1

Epoch [0 / 50]
=> Saving checkpoint
Translated example sentence: 
 ['sells', 'dancing', 'streaming', 'woodworking', 'mac', 'gas', 'gas', 'throws', 'throws', 'throws', 'offers', 'offers', 'letters', 'sodas', 'trampoline', 'letters', 'looking', 'pane', 'after', 'after', 'soda', 'soda', 'shines', 'demonstration', 'thought', 'oven', 'sporting', 'oven', 'building', 'tomato', 'sporting', 'die', 'die', 'ring', 'blanket', 'blanket', 'dancing', 'site', 'waving', 'gentlemen', 'paddles', 'after', 'after', 'inner', 'tournament', 'cloth', 'gambling', 'maneuvers', 'streaming', 'snowflakes']
Epoch [1 / 50]
=> Saving checkpoint
Translated example sentence: 
 ['a', 'couple', 'in', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', '.', '<eos>']
Epoch [2 / 50]
=> Saving checkpoint
Translated example sentence: 
 ['a', 'baseball', 'player', 'with', 'a', '<unk>', '<unk>', 'a', 'a', 'a', 'a', 'a', 'a', 'a', '.', '<eos>']
Epoch [3 / 50]
=> Saving checkpoint
Translated example sentence: 
 ['a', 'football', 'pl

In [23]:
score = bleu(test_data[1:100], model, deutsch, english, DEVICE)
print(f"Bleu score {score*100:.2f}")

Bleu score 19.27
