# Machine Translation with Seq2Seq
Based on [Aladdin Persson](https://www.youtube.com/@AladdinPersson)'s [Tutorial](https://www.youtube.com/watch?v=EoGUlvhRYpk)

### Downloads and imports

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.datasets import Multi30k # German to English dataset
"""
Downloading the dataset failed because the original servers were down, so the URLs had to be replaced with the mirrors below.

urls = ['https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/training.tar.gz',
        'https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/validation.tar.gz',
        'https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/mmt16_task1_test.tar.gz',
        'https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/mmt_task1_test2016.tar.gz']
"""


from torchtext.data import Field, BucketIterator
import numpy as np
import spacy # Tokenizer
import random
from torch.utils.tensorboard.writer import SummaryWriter  # to print to tensorboard

In [2]:
# Load the spaCy pipelines whose tokenizers split the German and English
# sentences (German first, English second).
spacy_de, spacy_en = map(spacy.load, ('de_core_news_sm', 'en_core_web_sm'))

In [3]:
def tokenize_de(text):
    """
    Split a German string into a list of token strings using the
    tokenizer of the loaded German spaCy pipeline.
    """
    tokens = spacy_de.tokenizer(text)
    return [token.text for token in tokens]

def tokenize_en(text):
    """
    Split an English string into a list of token strings using the
    tokenizer of the loaded English spaCy pipeline.
    """
    tokens = spacy_en.tokenizer(text)
    return [token.text for token in tokens]

In [4]:
# The German and English fields share the same configuration: lowercase the
# text and use '<sos>' / '<eos>' as the init and end-of-sequence tokens.
# Only the tokenizer differs between the two languages.
deutsch, english = (
    Field(tokenize=tokenizer, lower=True,
          init_token='<sos>', eos_token='<eos>')
    for tokenizer in (tokenize_de, tokenize_en)
)

In [5]:
# Load the Multi30k train/validation/test splits. The '.de' files are
# processed with the `deutsch` field and the '.en' files with the `english`
# field (extensions and fields are paired by position).
train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'),
                                                    fields=(deutsch, english))

In [6]:
# Build both vocabularies from the training split only.
# NOTE(review): max_size/min_freq presumably cap the vocabulary at 10000
# entries and drop tokens seen fewer than 2 times — confirm against the
# torchtext version in use.
deutsch.build_vocab(train_data, max_size=10000, min_freq=2)
english.build_vocab(train_data, max_size=10000, min_freq=2)

In [7]:
class Encoder(nn.Module):
    """LSTM encoder that reduces an input sequence to its final LSTM states.

    Token indices are embedded, passed through dropout and fed to an LSTM.
    Only the final hidden and cell states are returned; the per-step outputs
    of the LSTM are discarded.

    Args:
        input_size: Number of entries in the embedding table.
        embedding_size: Dimension of each token embedding.
        hidden_size: Number of features in the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        dropout_p: Dropout probability applied to the embeddings and, when
            ``num_layers > 1``, between the stacked LSTM layers.
    """

    def __init__(self, input_size: int, embedding_size: int, hidden_size: int,
                 num_layers: int = 1, dropout_p: float = 0):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.dropout = nn.Dropout(dropout_p)
        self.embedding = nn.Embedding(input_size, embedding_size)
        # nn.LSTM only applies dropout between stacked layers. With a single
        # layer a non-zero value has no effect and just raises a UserWarning,
        # so pass 0 in that case; the embedding dropout above still applies.
        rnn_dropout = dropout_p if num_layers > 1 else 0
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers,
                           dropout=rnn_dropout)

    def forward(self, x: torch.Tensor):
        """Encode a batch of sequences.

        Args:
            x: Token indices of shape (seq_length, N), N being the batch size.

        Returns:
            A ``(hidden, cell)`` tuple, each of shape
            (num_layers, N, hidden_size).
        """
        embedding = self.dropout(self.embedding(x))
        # embedding shape: (seq_length, N, embedding_size)

        # The per-step outputs (seq_length, N, hidden_size) are not needed;
        # only the final states are handed on to the caller.
        _, (hidden, cell) = self.rnn(embedding)
        # hidden shape: (num_layers, N, hidden_size)
        # cell shape: (num_layers, N, hidden_size)

        return hidden, cell

In [8]:
class Decoder(nn.Module):
    """LSTM decoder that predicts the next token one time step at a time.

    Each call to ``forward`` consumes one token per batch element together
    with the previous LSTM states, and returns scores over the output
    vocabulary plus the updated states.

    Args:
        input_size: Number of entries in the embedding table.
        embedding_size: Dimension of each token embedding.
        hidden_size: Number of features in the LSTM hidden state.
        output_size: Number of output features of the final linear layer
            (one score per target-vocabulary entry).
        num_layers: Number of stacked LSTM layers.
        dropout_p: Dropout probability applied to the embeddings and, when
            ``num_layers > 1``, between the stacked LSTM layers.
    """

    def __init__(self, input_size: int, embedding_size: int, hidden_size: int,
                 output_size: int, num_layers: int = 1, dropout_p: float = 0):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.dropout = nn.Dropout(dropout_p)
        self.embedding = nn.Embedding(input_size, embedding_size)
        # nn.LSTM only applies dropout between stacked layers. With a single
        # layer a non-zero value has no effect and just raises a UserWarning,
        # so pass 0 in that case; the embedding dropout above still applies.
        rnn_dropout = dropout_p if num_layers > 1 else 0
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers,
                           dropout=rnn_dropout)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x: torch.Tensor, hidden: torch.Tensor, cell: torch.Tensor):
        """Run a single decoding step.

        Args:
            x: Token indices of shape (N,), one token per batch element.
            hidden: Previous hidden state, shape (num_layers, N, hidden_size).
            cell: Previous cell state, shape (num_layers, N, hidden_size).

        Returns:
            A ``(predictions, hidden, cell)`` tuple where ``predictions`` has
            shape (N, output_size) and the states keep their input shapes.
        """
        # The LSTM expects a leading sequence dimension, so turn (N,) into
        # (1, N): a sequence of length one.
        x = x.unsqueeze(0)

        embedding = self.dropout(self.embedding(x))
        # embedding shape: (1, N, embedding_size)

        outputs, (hidden, cell) = self.rnn(embedding, (hidden, cell))
        # outputs shape: (1, N, hidden_size)
        # hidden shape: (num_layers, N, hidden_size)
        # cell shape: (num_layers, N, hidden_size)

        predictions = self.fc(outputs)
        # predictions shape: (1, N, output_size)

        # Drop the length-one sequence dimension so the result can be passed
        # to the loss function as (N, output_size).
        predictions = predictions.squeeze(0)

        return predictions, hidden, cell