### Seq2Seq for Neural Machine Translation (NMT)

Heavily based on PyTorch's torchtext translation tutorial and the GitHub repo bentrevett/pytorch-seq2seq:
- https://pytorch.org/tutorials/beginner/torchtext_translation_tutorial.html
- https://github.com/bentrevett/pytorch-seq2seq/tree/master

In [None]:
import io
import random

import torch
import torchtext
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from typing import Tuple
from collections import Counter

from torch import Tensor
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

from torchtext.vocab import vocab
from torchtext.data.utils import get_tokenizer
from torchtext.utils import download_from_url, extract_archive

### Load data

Download French and English sentence pairs.

In [None]:
# Multi30k task-1 raw data: every split is a (French, English) pair of gzipped files.
url_base = "https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/"
train_urls = ("train.fr.gz", "train.en.gz")
val_urls = ("val.fr.gz", "val.en.gz")
test_urls = ("test_2016_flickr.fr.gz", "test_2016_flickr.en.gz")


def _fetch_split(archive_names):
    """Download and extract each archive; return the first extracted path of each, in order."""
    local_paths = []
    for archive_name in archive_names:
        downloaded = download_from_url(url_base + archive_name)
        local_paths.append(extract_archive(downloaded)[0])
    return local_paths


train_filepaths = _fetch_split(train_urls)
val_filepaths = _fetch_split(val_urls)
test_filepaths = _fetch_split(test_urls)

Sample sentences for both languages.

In [None]:
# Read explicitly as UTF-8 (the encoding build_vocab and data_process use for
# the same files). Without it, open() falls back to the platform's default
# encoding, and the accented French characters can be garbled or raise
# UnicodeDecodeError.
with open(train_filepaths[0], "r", encoding="utf8") as f:
    out = f.readlines()

print("-- French --")
print(out[:5])

with open(train_filepaths[1], "r", encoding="utf8") as f:
    out = f.readlines()

print("-- English -- ")
print(out[:5])

Build the vocabulary. First load a pretrained spaCy tokenizer for each language, then split each sentence into its corresponding tokens and count token frequencies.

In [None]:
# One spaCy-backed tokenizer per language, created in (French, English) order.
fr_tokenizer, en_tokenizer = (
    get_tokenizer("spacy", language=spacy_model)
    for spacy_model in ("fr_core_news_sm", "en_core_web_sm")
)


def build_vocab(filepath, tokenizer, min_freq=2):
    """Build a torchtext vocabulary from the tokens of a UTF-8 text file.

    Args:
        filepath: Path of the text file; it is read line by line.
        tokenizer: Callable applied to each line, returning that line's tokens.
        min_freq: Forwarded to ``vocab`` as its ``min_freq`` argument.

    Returns:
        The object returned by ``vocab`` for the collected token counts, built
        with the specials ``<unk>``, ``<pad>``, ``<bos>`` and ``<eos>``.
    """
    with open(filepath, encoding="utf8") as lines:
        # Tokens are counted in reading order, one line after another.
        token_counts = Counter(token for line in lines for token in tokenizer(line))

    special_tokens = ["<unk>", "<pad>", "<bos>", "<eos>"]
    return vocab(token_counts, specials=special_tokens, min_freq=min_freq)


# Vocabularies are built from the training split only: index 0 of
# train_filepaths is the French file, index 1 the English file.
fr_vocab = build_vocab(train_filepaths[0], fr_tokenizer)
en_vocab = build_vocab(train_filepaths[1], en_tokenizer)

# Tokens missing from a vocabulary resolve to that vocabulary's own <unk> index.
for language_vocab in (fr_vocab, en_vocab):
    language_vocab.set_default_index(language_vocab["<unk>"])

# Quick sanity check of the word -> index lookup in each language.
print(f'French, word to index mapping: fenêtre -> {fr_vocab["fenêtre"]}')
print(f'English, word to index mapping: window -> {en_vocab["window"]}')

Tokenize each sentence, map its tokens to vocabulary indices, and wrap the indices in a Tensor.

In [None]:
def data_process(filepaths):
    """Convert a pair of parallel text files into tensors of vocabulary indices.

    Args:
        filepaths: Sequence whose first item is the French file path and whose
            second item is the English file path. Both files are read as UTF-8
            and paired line by line.

    Returns:
        A list of ``(fr_tensor, en_tensor)`` tuples of 1-D ``torch.long``
        tensors, one tuple per line pair. Lines are paired with ``zip``, so
        processing stops at the end of the shorter file.
    """
    data = []
    # Context managers close both files even if tokenization raises, so the
    # handles are not left open until garbage collection.
    with io.open(filepaths[0], encoding="utf8") as fr_file, io.open(
        filepaths[1], encoding="utf8"
    ) as en_file:
        for raw_fr, raw_en in zip(fr_file, en_file):
            # Each token is looked up in the module-level vocabulary of its language.
            fr_tensor_ = torch.tensor(
                [fr_vocab[token] for token in fr_tokenizer(raw_fr)], dtype=torch.long
            )
            en_tensor_ = torch.tensor(
                [en_vocab[token] for token in en_tokenizer(raw_en)], dtype=torch.long
            )
            data.append((fr_tensor_, en_tensor_))
    return data


# Convert the three splits to index tensors in train, validation, test order.
train_data, val_data, test_data = (
    data_process(split_filepaths)
    for split_filepaths in (train_filepaths, val_filepaths, test_filepaths)
)

Create batched data loaders from the tokenized samples.

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

BATCH_SIZE = 128

# Special-token indices are looked up in the French vocabulary only.
# NOTE(review): generate_batch applies them to the English batches as well,
# which presumably relies on both vocabularies placing the specials at the
# same indices — confirm.
PAD_IDX, BOS_IDX, EOS_IDX = (
    fr_vocab[special] for special in ("<pad>", "<bos>", "<eos>")
)


def generate_batch(data_batch):
    """Collate (French, English) tensor pairs into two padded batch tensors.

    Every sequence is wrapped as ``<bos> ... <eos>`` using ``BOS_IDX`` and
    ``EOS_IDX``, then the sequences of each language are padded with
    ``PAD_IDX`` via ``pad_sequence`` (called without ``batch_first``).

    Args:
        data_batch: Iterable of ``(fr_item, en_item)`` 1-D index tensors.

    Returns:
        A ``(fr_batch, en_batch)`` tuple of padded tensors.
    """

    def _wrap(sequence):
        # Prepend <bos> and append <eos> to a single sequence.
        return torch.cat(
            [torch.tensor([BOS_IDX]), sequence, torch.tensor([EOS_IDX])], dim=0
        )

    wrapped_pairs = [(_wrap(fr_item), _wrap(en_item)) for fr_item, en_item in data_batch]
    fr_padded = pad_sequence([pair[0] for pair in wrapped_pairs], padding_value=PAD_IDX)
    en_padded = pad_sequence([pair[1] for pair in wrapped_pairs], padding_value=PAD_IDX)
    return fr_padded, en_padded


# Only the training loader shuffles. Reshuffling the validation and test sets
# adds nothing to evaluation and makes batch composition (and therefore
# padding and any per-batch metric) differ from run to run.
train_iter = DataLoader(
    train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=generate_batch
)
valid_iter = DataLoader(
    val_data, batch_size=BATCH_SIZE, shuffle=False, collate_fn=generate_batch
)
test_iter = DataLoader(
    test_data, batch_size=BATCH_SIZE, shuffle=False, collate_fn=generate_batch
)