In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
import numpy as np
import spacy
import random

from utils import translate_sentence, save_checkpoint, load_checkpoint

In [2]:
 # Quick sanity check: report whether PyTorch can see a CUDA device.
 torch.cuda.is_available()

True

## **Loading the dataset**

In [3]:
import importlib

import spacy.cli


def _ensure_spacy_model(package_name):
    """Return the imported spaCy model package, downloading it if it is missing.

    The model package is imported first; only when that import fails is
    ``spacy.cli.download`` invoked, after which the import is retried. This
    keeps the cell runnable on a fresh environment (where importing the
    package before downloading it would raise ImportError) and avoids a
    network round-trip when the model is already installed.

    Args:
        package_name: Name of the spaCy model package, e.g. "en_core_web_sm".

    Returns:
        The imported model package module (exposes ``.load()``).
    """
    try:
        return importlib.import_module(package_name)
    except ImportError:
        spacy.cli.download(package_name)
        # The package was installed after the interpreter started; refresh the
        # import system's caches so the retry can find it.
        importlib.invalidate_caches()
        return importlib.import_module(package_name)


# Keep the module names bound at notebook level, as the original imports did.
en_core_web_sm = _ensure_spacy_model("en_core_web_sm")
de_core_news_sm = _ensure_spacy_model("de_core_news_sm")


spacy_de = de_core_news_sm.load()
spacy_eng = en_core_web_sm.load()

[38;5;3m⚠ Skipping model package dependencies and setting `--no-deps`. You
don't seem to have the spaCy package itself installed (maybe because you've
built from source?), so installing the model dependencies would cause spaCy to
be downloaded, which probably isn't what you want. If the model package has
other dependencies, you'll have to install them manually.[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;3m⚠ Skipping model package dependencies and setting `--no-deps`. You
don't seem to have the spaCy package itself installed (maybe because you've
built from source?), so installing the model dependencies would cause spaCy to
be downloaded, which probably isn't what you want. If the model package has
other dependencies, you'll have to install them manually.[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')


----
<font size="3"> With the help of the spaCy tokenizers, a tokenizer is created for English and for German. A tokenizer splits a sentence into a list of its individual tokens. A custom tokenizer is used here instead of Python's `.split(" ")` because in some cases distinct tokens are not separated by a space, e.g. punctuation or the possessive 's. </font>

In [4]:
def tokenizer_de(text):
  """Tokenize ``text`` with the German spaCy pipeline and return the token strings."""
  tokens = []
  for token in spacy_de.tokenizer(text):
    tokens.append(token.text)
  return tokens

def tokenizer_eng(text):
  """Tokenize ``text`` with the English spaCy pipeline and return the token strings."""
  tokens = []
  for token in spacy_eng.tokenizer(text):
    tokens.append(token.text)
  return tokens

In [5]:
# Demo: the possessive "'s" becomes its own token while "O'neal" stays whole.
tokenizer_eng("I ate my friends's O'neal apple yesterday")

['I', 'ate', 'my', 'friends', "'s", "O'neal", 'apple', 'yesterday']

<font size="3"> A field is a place holder for the dataset that we'll be loading; using it, we specify special tokens (beginning and end tokens). </font>

In [6]:
# Source-language (German) field: spaCy tokenization, lowercasing, and
# explicit start/end-of-sentence tokens.
german = Field(
    tokenize=tokenizer_de,
    lower=True,
    init_token='<sos>',
    eos_token='<eos>',
)

# Target-language (English) field, configured the same way.
english = Field(
    tokenize=tokenizer_eng,
    lower=True,
    init_token='<sos>',
    eos_token='<eos>',
)

<font size="3">This loads the Multi30k machine translation dataset ([link](https://github.com/multi30k/dataset)). The return value is three objects containing the train, validation, and test data, with respective sizes 29000, 1014, and 1000. Each example has two attributes: `.src` contains the tokens of the original phrase in German and `.trg` contains the tokens of the target phrase in English (the order follows `exts=('.de', '.en')`). These words are later encoded as indices.</font>

In [7]:
# The order of `exts` / `fields` makes German ('.de') the source (`.src`)
# and English ('.en') the target (`.trg`).
train_data, validation_data, test_data = Multi30k.splits(
    exts=('.de', '.en'),
    fields=(german, english),
)

<font size="3">Build the vocabulary object for both languages. Writing `english.vocab` will yield this object. It contains the frequency of every word extracted from the training data (`english.vocab.freqs`), a dictionary mapping every word to an index (`english.vocab.stoi`), and a list mapping every index back to its word (`english.vocab.itos`).</font>

In [8]:
# Vocabularies are built from the training split only; both languages use
# the same size cap and minimum-frequency threshold.
german.build_vocab(
    train_data,
    max_size=10000,
    min_freq=2,
)
english.build_vocab(
    train_data,
    max_size=10000,
    min_freq=2,
)

In [9]:
# `vocab.freqs` is a Python Counter, so `most_common` gives the top entries directly.
print(f"Five most common words in the dataset: {english.vocab.freqs.most_common(5)}")

Five most common words in the dataset: [('a', 49165), ('.', 27623), ('in', 14886), ('the', 10955), ('on', 8035)]


## **Building the translation model using pytorch**
<font size="3"> The Encoder-Decoder model is split into an encoder and a decoder, hence the name. The encoder maps the input sentence into a representation vector of a predefined length. Intuitively speaking, when a person translates a text, they read the text, understand it, then translate it. We mimic this in a neural network by representing the meaning in this feature vector. This feature vector is then used by the decoder to turn it back into text, but text in a different language. The implementation of the Encoder-Decoder is split into three modules: Encoder, Decoder, and Seq2Seq (which uses the Encoder and the Decoder).</font>

<img src="encoder.png" width="600">

<font size="3"> The encoder consists of an embedding layer that converts a sequence of word indices (the phrase to be translated) into embeddings of shape (sequence_length, batch_size, embedding_size), which are then fed sequentially to the LSTM layers. Every LSTM layer has both a hidden state and a cell state, which keep track of what the LSTM unit remembers. The final hidden and cell states are returned as the output of the Encoder model. </font>

In [32]:
class Encoder(nn.Module):
    """Embeds a source token sequence and encodes it with a multi-layer LSTM.

    Args:
        input_size: Size of the source vocabulary (number of embedding rows).
        embedding_size: Dimension of each token embedding.
        hidden_size: Number of features in the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        p_dropout: Dropout probability applied to the embeddings and between
            LSTM layers.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p_dropout):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Sub-modules are created in the order dropout -> embedding -> rnn so
        # parameter initialisation consumes the RNG in the same sequence.
        self.dropout = nn.Dropout(p=p_dropout)
        self.embedding = nn.Embedding(
            num_embeddings=input_size,
            embedding_dim=embedding_size,
        )
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=p_dropout,
        )

    def forward(self, x):
        """Encode ``x`` and return the LSTM's final ``(hidden, cell)`` states.

        ``x`` holds token indices; the LSTM is not batch-first, so it is
        expected as (seq_len, batch). The per-step LSTM outputs are discarded.
        """
        embedded = self.embedding(x)
        embedded = self.dropout(embedded)

        _, final_state = self.rnn(embedded)
        hidden, cell = final_state

        return (hidden, cell)


<font size="3"> The decoder receives the previous word of the sentence as input (<SOS\> if it is the first word in the sentence), and its LSTM is initialized with the cell state and the hidden state of the encoder. The previous word is converted to an embedding and then fed to the LSTM layers. The returned output is then fed to a dense layer. The output of the dense layer holds a score for each word in the vocabulary, representing how likely it is to be the next word of the translated sentence. Conventionally, we pick the word with the highest score using argmax(1).</font>

In [33]:
class Decoder(nn.Module):
    """Single-step LSTM decoder that scores the next target token.

    Args:
        input_size: Size of the target vocabulary (number of embedding rows).
        embedding_size: Dimension of each token embedding.
        hidden_size: Number of features in the LSTM hidden state.
        output_size: Number of scores produced per step by the linear layer.
        num_layers: Number of stacked LSTM layers.
        p_dropout: Dropout probability applied to the embeddings and between
            LSTM layers.
    """

    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, p_dropout):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Creation order (dropout, embedding, rnn, Fc) is kept so parameter
        # initialisation draws from the RNG in the same sequence.
        self.dropout = nn.Dropout(p=p_dropout)
        self.embedding = nn.Embedding(
            num_embeddings=input_size,
            embedding_dim=embedding_size,
        )
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=p_dropout,
        )
        self.Fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x, hidden, cell):
        """Run one decoding step.

        Args:
            x: Token indices for the current step, one per batch element.
            hidden: LSTM hidden state carried in from the previous step.
            cell: LSTM cell state carried in from the previous step.

        Returns:
            ``(predictions, hidden, cell)``: the per-token scores for this
            step (leading step dimension removed) and the updated LSTM states.
        """
        # Add a leading sequence dimension of length 1 for the LSTM.
        step_input = x.unsqueeze(0)

        embedded = self.dropout(self.embedding(step_input))
        rnn_out, (hidden, cell) = self.rnn(embedded, (hidden, cell))

        # Project to vocabulary scores and drop the length-1 step dimension.
        predictions = self.Fc(rnn_out).squeeze(0)

        return predictions, hidden, cell

<font size="3">**Seq2Seq** is the model that combines the Encoder and the Decoder. It takes as input instances of the encoder and the decoder model. It uses the encoder to get the latent representation of the input text. After that, it loops over the length of the target sequence. In every step of the loop, the decoder is fed the previous word and outputs, for every word in the vocabulary, the likelihood of it being the next word. Argmax(1) is used to select the word with the highest likelihood.</font>

<font size="3">*Teacher Forcing* is feeding the decoder the actual previous word instead of the predicted previous word; according to the original paper, this is said to help improve the training. Here it is applied at each step with probability `teacher_force_ratio`.</font>

In [34]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes a target sequence step by step.

    Args:
        encoder: Module mapping a source batch to ``(hidden, cell)`` states.
        decoder: Module with signature ``decoder(x, hidden, cell)`` returning
            ``(scores, hidden, cell)``; it must expose its output projection
            as ``decoder.Fc`` (an ``nn.Linear``), as ``Decoder`` does.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        """Decode ``target`` given ``source``, with optional teacher forcing.

        Args:
            source: Source token indices, shaped (source_len, batch).
            target: Target token indices, shaped (target_len, batch).
            teacher_force_ratio: Probability, per step, of feeding the
                ground-truth token to the decoder instead of its own
                best guess.

        Returns:
            Tensor of shape (target_len, batch, target_vocab_size) holding the
            decoder scores for each step; index 0 is left as zeros because
            decoding starts from the first target token.
        """
        batch_size = source.shape[1]
        target_len = target.shape[0]
        # Read the vocabulary size from the decoder's own output layer and the
        # device from the input, so the module does not depend on notebook
        # globals (`english`, `device`) being defined.
        target_vocab_size = self.decoder.Fc.out_features

        outputs = torch.zeros(
            target_len, batch_size, target_vocab_size, device=source.device
        )

        hidden, cell = self.encoder(source)

        # First decoder input is the first target token (the start token).
        x = target[0]

        for t in range(1, target_len):
            output, hidden, cell = self.decoder(x, hidden, cell)

            outputs[t] = output
            best_guess = output.argmax(1)

            # Teacher forcing: use the true token with the given probability,
            # otherwise feed back the model's own prediction.
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs


In [35]:
# Training hyperparameters
num_epochs = 100  # number of passes of the training loop
lr = 1e-3         # learning rate handed to the Adam optimizer
batch_size = 64   # batch size handed to BucketIterator

In [36]:
# Model hyperparameters
load_model = False  # set True to restore weights from a saved checkpoint
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Vocabulary-derived sizes: the encoder reads German, the decoder reads and
# emits English.
input_size_encoder = len(german.vocab)
input_size_decoder = len(english.vocab)
output_size = len(english.vocab)

# Embedding and LSTM dimensions.
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024
num_layers = 2

# Dropout probabilities for the encoder and decoder.
enc_dropout = 0.5
dec_dropout = 0.5

In [37]:
# Global counter of optimizer steps taken by the training loop.
step = 0

# One iterator per split; within each batch, examples are sorted by source
# sentence length, and batches are placed on `device`.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, validation_data, test_data),
    batch_size=batch_size,
    sort_within_batch=True,
    sort_key=lambda example: len(example.src),
    device=device,
)

In [38]:
# Encoder over the German vocabulary, moved to the selected device.
encoder_net = Encoder(
    input_size=input_size_encoder,
    embedding_size=encoder_embedding_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    p_dropout=enc_dropout,
).to(device)

# Decoder over the English vocabulary, moved to the selected device.
decoder_net = Decoder(
    input_size=input_size_decoder,
    embedding_size=decoder_embedding_size,
    hidden_size=hidden_size,
    output_size=output_size,
    num_layers=num_layers,
    p_dropout=dec_dropout,
).to(device)

In [39]:
# Wrap encoder and decoder into the full translation model.
model = Seq2Seq(encoder=encoder_net, decoder=decoder_net).to(device)

# Adam over all model parameters with the configured learning rate.
optimizer = optim.Adam(params=model.parameters(), lr=lr)

In [40]:
# Index of the padding token in the target (English) vocabulary.
pad_idx = english.vocab.stoi['<pad>']

# Cross-entropy over target tokens; padded positions do not contribute to the loss.
criterion = nn.CrossEntropyLoss(
    ignore_index=pad_idx,
)

In [41]:
# Optionally restore model and optimizer state from a saved checkpoint.
# NOTE(review): the '.pth.ptar' extension looks unusual -- confirm it matches
# the filename that `save_checkpoint` in utils actually writes.
if load_model:
    saved_checkpoint = torch.load('my_checkpoint.pth.ptar')
    load_checkpoint(saved_checkpoint, model, optimizer)

In [42]:
# Fixed German example sentence, translated at the start of every epoch to
# eyeball training progress.
sentence = "ein boot mit mehreren männern darauf wird von einem großen pferdegespann ans ufer gezogen."

for epoch in range(num_epochs):
    
    print(f'Epoch [{epoch} / {num_epochs}]')

    # Save model and optimizer state. Both keys are plain strings: the
    # original used the optimizer *object* as the second key, so a lookup of
    # checkpoint['optimizer'] could never succeed.
    checkpoint = {'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()}
    save_checkpoint(checkpoint)

    # Switch to eval mode (disables dropout) for the qualitative check.
    model.eval()
    translated_sentence = translate_sentence(model, sentence, german, english, device, max_length=50)

    print(f"Translated example sentence: \n {translated_sentence}")

    # Back to training mode for the optimization steps below.
    model.train()

    for batch_idx, batch in enumerate(train_iterator):
        
        inp_data = batch.src.to(device)
        target = batch.trg.to(device)

        output = model(inp_data, target)
        # Drop the first step (the start token, never predicted) and flatten
        # (steps, batch, vocab) -> (steps * batch, vocab) for the loss.
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output, target)

        loss.backward()

        # Clip the gradient norm before the optimizer update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()

        step = step + 1

Epoch [0 / 100]
=> Saving checkpoint
torch.Size([17, 1, 300])
Translated example sentence: 
 ['couple', 'hovering', 'hovering', 'smock', 'smock', 'smock', 'smock', 'smock', 'nap', '2008', '2008', 'daring', 'daring', 'adult', 'adult', 'lavender', 'lavender', 'hotel', 'lavender', 'wraps', 'saw', 'ponchos', 'ponchos', 'bucked', 'chat', 'chat', 'lavender', 'lavender', 'canoe', 'wraps', 'wraps', 'boarding', 'boarding', 'shown', 'shelves', 'shelves', 'lavender', 'crutches', 'crutches', 'sorting', 'sorting', 'sorting', 'taste', 'taste', 'snorkeling', 'helps', 'helps', 'demonstration', 'cords', 'robed']
torch.Size([13, 64, 300])
torch.Size([13, 64, 300])
torch.Size([11, 64, 300])
torch.Size([18, 64, 300])
torch.Size([22, 64, 300])
torch.Size([13, 64, 300])
torch.Size([17, 64, 300])
torch.Size([14, 64, 300])
torch.Size([12, 64, 300])
torch.Size([19, 64, 300])
torch.Size([11, 64, 300])
torch.Size([10, 64, 300])
torch.Size([10, 64, 300])


KeyboardInterrupt: 

In [None]:
# The training loop leaves the model in train mode (dropout active), notably
# when it is interrupted; switch to eval mode before running inference.
model.eval()
translate_sentence(model, "Diese Jungs schauen sich jeden Tag die Nachrichten an", german, english, device, max_length=50)

'.'