In [1]:
import torch
from torch import nn
import random
import numpy as np
import spacy
import datasets
import torchtext
import tqdm

import html
import re


In [2]:
seed = 4488

# Seed every random number generator the notebook imports so that the
# shuffled DataLoader order and the model weight initialisation are
# repeatable after a kernel restart. The same value is passed explicitly to
# train_test_split in get_dataset.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

In [3]:
def split_texts(rows):
    """Split a batch of tab-separated lines into parallel "en" / "fr" columns.

    Args:
        rows: Batch dict whose "text" entry is a list of strings; the first
            tab-separated field is the English sentence and the second is the
            French one. Any further fields are ignored.

    Returns:
        Dict with "en" and "fr" lists, aligned with the input order.
    """
    english, french = [], []
    for line in rows["text"]:
        fields = line.split("\t")
        english.append(fields[0])
        french.append(fields[1])
    return {"en": english, "fr": french}

In [4]:
from unidecode import unidecode
def clean_text(batch):
    """Run unidecode over every string of every column in a batch.

    Args:
        batch: Batch dict mapping column names to lists of strings.

    Returns:
        New dict with the same keys, each value being the list of
        unidecode-transliterated strings (e.g. accents are stripped).
    """
    cleaned = {}
    for column, values in batch.items():
        cleaned[column] = list(map(unidecode, values))
    return cleaned

In [5]:
def get_dataset(path: str, data_files: str) -> datasets.DatasetDict:
    """Load the parallel corpus and split it into train/validation/test.

    Each raw line is split into "en" and "fr" columns (split_texts), the raw
    "text" column is dropped, and both columns are transliterated with
    clean_text.

    Args:
        path: Passed to datasets.load_dataset as ``path``.
        data_files: Passed to datasets.load_dataset as ``data_files``.

    Returns:
        A DatasetDict with "train", "validation" and "test" splits. With two
        nested 80/20 splits this is roughly 64% / 16% / 20% of the data.
        Both splits use the module-level ``seed``.
    """
    raw = datasets.load_dataset(path=path, data_files=data_files)
    pairs = raw.map(split_texts, batched=True).remove_columns("text")
    pairs = pairs.map(clean_text, batched=True)

    # First split: hold out 20% of all pairs as the final test set.
    outer = pairs["train"].train_test_split(train_size=0.8, seed=seed)
    # Second split: carve the validation set out of the remaining 80%.
    tvt_ds = outer["train"].train_test_split(train_size=0.8, seed=seed)
    tvt_ds["validation"] = tvt_ds.pop("test")
    tvt_ds["test"] = outer["test"]
    return tvt_ds

In [6]:
# Build the train/validation/test splits from the local en-fr.txt file.
dataset = get_dataset(path="./data", data_files="en-fr.txt")

In [7]:
# Peek at the first five training pairs (a slice returns a dict of column lists).
dataset["train"][:5]

{'en': ['Could you speak more slowly?',
  "It makes me really happy that you're here.",
  'We were right.',
  'They fight like cat and dog.',
  "I'm not worried about losing my job."],
 'fr': ['Pouvez-vous parler plus lentement ?',
  'Je me rejouis vraiment que tu sois ici.',
  'Nous eumes raison.',
  'Ils se disputent comme chien et chat.',
  'Je ne suis pas inquiet de perdre mon emploi.']}

In [8]:
# Bind each split to its own name for the preprocessing steps that follow.
train_data, valid_data, test_data = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

In [9]:
# !python -m spacy download en_core_web_sm
# !pip uninstall en-core-news-sm

# !python -m spacy download fr_core_news_sm
# !pip uninstall fr-core-news-sm

# !pip uninstall de-core-news-sm

In [10]:
# Load the spaCy pipelines; tokenize_ds uses their tokenizers.
en_nlp = spacy.load("en_core_web_sm")
fr_nlp = spacy.load("fr_core_news_sm")

In [11]:
def tokenize_ds(data, en_nlp, fr_nlp, max_length, sos_token, eos_token):
    """Tokenize one example's English and French sentences.

    Args:
        data: Single example with "en" and "fr" strings.
        en_nlp: Object whose ``tokenizer`` splits the English sentence.
        fr_nlp: Object whose ``tokenizer`` splits the French sentence.
        max_length: Maximum number of tokens kept per sentence, not counting
            the start/end markers.
        sos_token: Marker prepended to each token list.
        eos_token: Marker appended to each token list.

    Returns:
        Dict with "en_tokens" and "fr_tokens" lists of lower-cased tokens
        wrapped in the start/end markers.
    """

    def wrap(nlp, sentence):
        # Lower-case, truncate, then add the boundary markers.
        words = [tok.text.lower() for tok in nlp.tokenizer(sentence)]
        return [sos_token, *words[:max_length], eos_token]

    return {
        "en_tokens": wrap(en_nlp, data["en"]),
        "fr_tokens": wrap(fr_nlp, data["fr"]),
    }

In [12]:
# Sentence-boundary markers that tokenize_ds puts around every token list.
sos_token = "<sos>"
eos_token = "<eos>"
# Maximum number of tokens kept per sentence (the markers are added afterwards).
max_length = 1_000

In [13]:
# Keyword arguments forwarded to tokenize_ds for every example.
fn_kwargs = dict(
    en_nlp=en_nlp,
    fr_nlp=fr_nlp,
    max_length=max_length,
    sos_token=sos_token,
    eos_token=eos_token,
)

# Add the "en_tokens" / "fr_tokens" columns to each split.
train_data, valid_data, test_data = (
    split.map(tokenize_ds, fn_kwargs=fn_kwargs)
    for split in (train_data, valid_data, test_data)
)

In [14]:
min_freq = 2
unk_token = "<unk>"
pad_token = "<pad>"

# Listed in this order so both vocabularies give the specials indices 0-3.
special_tokens = [unk_token, pad_token, sos_token, eos_token]

# Vocabularies are built from the training split only.
en_vocab, fr_vocab = (
    torchtext.vocab.build_vocab_from_iterator(
        train_data[token_column],
        min_freq=min_freq,
        specials=special_tokens,
    )
    for token_column in ("en_tokens", "fr_tokens")
)

In [15]:
# First ten entries of the English vocabulary; the four specials come first.
en_vocab.get_itos()[:10]

['<unk>', '<pad>', '<sos>', '<eos>', '.', 'i', 'you', 'to', 'the', '?']

In [16]:
# First ten entries of the French vocabulary; the four specials come first.
fr_vocab.get_itos()[:10]

['<unk>', '<pad>', '<sos>', '<eos>', '.', 'je', 'a', 'de', '?', 'pas']

In [17]:
# The special tokens must share indices across both vocabularies, because a
# single unk/pad index is used for both languages below.
assert all(en_vocab[tok] == fr_vocab[tok] for tok in (unk_token, pad_token))

unk_index, pad_index = en_vocab[unk_token], en_vocab[pad_token]

In [18]:
# Out-of-vocabulary tokens resolve to <unk> instead of raising an error.
en_vocab.set_default_index(unk_index)
fr_vocab.set_default_index(unk_index)

In [19]:
# Index 0 is the unknown token.
en_vocab.get_itos()[0]

'<unk>'

In [20]:
# "The" is not in the vocabulary (tokens were lower-cased), so the default index comes back.
en_vocab["The"]

0

In [21]:
# Map a list of tokens to their vocabulary indices.
tokens = ["i", "love", "watching", "crime", "shows"]
en_vocab.lookup_indices(tokens)

[5, 137, 546, 1007, 3479]

In [22]:
def numericalize_vocab(vocab: dict, en_vocab, fr_vocab):
    """Convert one example's token lists into vocabulary indices.

    Args:
        vocab: Despite its name, a single dataset example: a mapping with
            "en_tokens" and "fr_tokens" lists of strings.
        en_vocab: Vocabulary used to look up the English tokens.
        fr_vocab: Vocabulary used to look up the French tokens.

    Returns:
        Dict with "en_ids" and "fr_ids", the results of ``lookup_indices`` on
        the respective token lists.
    """
    return {
        "en_ids": en_vocab.lookup_indices(vocab["en_tokens"]),
        "fr_ids": fr_vocab.lookup_indices(vocab["fr_tokens"]),
    }

In [23]:
# Keyword arguments forwarded to numericalize_vocab for every example.
fn_kwargs = dict(en_vocab=en_vocab, fr_vocab=fr_vocab)

# Add the "en_ids" / "fr_ids" columns to each split.
train_data, valid_data, test_data = (
    split.map(numericalize_vocab, fn_kwargs=fn_kwargs)
    for split in (train_data, valid_data, test_data)
)

In [24]:
# Inspect one example: it now carries the raw text, the tokens and the ids.
train_data[0]

{'en': 'Could you speak more slowly?',
 'fr': 'Pouvez-vous parler plus lentement ?',
 'en_tokens': ['<sos>',
  'could',
  'you',
  'speak',
  'more',
  'slowly',
  '?',
  '<eos>'],
 'fr_tokens': ['<sos>',
  'pouvez',
  '-vous',
  'parler',
  'plus',
  'lentement',
  '?',
  '<eos>'],
 'en_ids': [2, 75, 6, 201, 95, 1045, 9, 3],
 'fr_ids': [2, 151, 34, 123, 47, 1143, 8, 3]}

In [25]:
# Round-trip check: the ids map back to the example's tokens.
en_vocab.lookup_tokens(train_data[0]["en_ids"])

['<sos>', 'could', 'you', 'speak', 'more', 'slowly', '?', '<eos>']

In [26]:
datatype = "torch"
columns = ["en_ids", "fr_ids"]

# Return the id columns as tensors; output_all_columns keeps the text and
# token columns available in their original form.
train_data, valid_data, test_data = (
    split.with_format(type=datatype, columns=columns, output_all_columns=True)
    for split in (train_data, valid_data, test_data)
)


In [27]:
# The id columns now come back as tensors; the other columns are unchanged.
train_data[0]

{'en_ids': tensor([   2,   75,    6,  201,   95, 1045,    9,    3]),
 'fr_ids': tensor([   2,  151,   34,  123,   47, 1143,    8,    3]),
 'en': 'Could you speak more slowly?',
 'fr': 'Pouvez-vous parler plus lentement ?',
 'en_tokens': ['<sos>',
  'could',
  'you',
  'speak',
  'more',
  'slowly',
  '?',
  '<eos>'],
 'fr_tokens': ['<sos>',
  'pouvez',
  '-vous',
  'parler',
  'plus',
  'lentement',
  '?',
  '<eos>']}

In [28]:
def get_collate_fn(pad_index):
    """Build a collate function that pads the id columns of a batch.

    Args:
        pad_index: Value used to fill sequences shorter than the longest one.

    Returns:
        A function taking a list of examples (each with "en_ids" and "fr_ids"
        tensors) and returning a dict with the same two keys. Each value is
        the output of ``pad_sequence`` with its default layout, i.e. the
        sequence dimension comes first.
    """
    id_columns = ("en_ids", "fr_ids")

    def collate_fn(batch):
        return {
            column: nn.utils.rnn.pad_sequence(
                [example[column] for example in batch],
                padding_value=pad_index,
            )
            for column in id_columns
        }

    return collate_fn

In [29]:
def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    """Wrap a dataset in a DataLoader that pads each batch with pad_index.

    Args:
        dataset: Dataset handed to the DataLoader.
        batch_size: Number of examples per batch.
        pad_index: Padding value given to get_collate_fn.
        shuffle: Whether the DataLoader shuffles the examples.

    Returns:
        The configured torch.utils.data.DataLoader.
    """
    loader_options = {
        "dataset": dataset,
        "batch_size": batch_size,
        "collate_fn": get_collate_fn(pad_index),
        "shuffle": shuffle,
    }
    return torch.utils.data.DataLoader(**loader_options)

In [30]:
batch_size = 128

# Only the training loader shuffles; the other two keep the default shuffle=False.
train_data_loader = get_data_loader(
    dataset=train_data, batch_size=batch_size, pad_index=pad_index, shuffle=True
)
valid_data_loader = get_data_loader(
    dataset=valid_data, batch_size=batch_size, pad_index=pad_index
)
test_data_loader = get_data_loader(
    dataset=test_data, batch_size=batch_size, pad_index=pad_index
)

In [31]:
class Encoder(nn.Module):
    """LSTM encoder that reduces a batch of source token ids to its final state.

    Args:
        input_vocab: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden and cell states.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied to the embeddings.
    """

    def __init__(self, input_vocab: int, embedding_dim: int, hidden_dim: int, num_layers: int, dropout: float):
        super().__init__()
        self.embedding = nn.Embedding(input_vocab, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input: torch.Tensor) -> "tuple[torch.Tensor, torch.Tensor]":
        """Encode a batch and return the LSTM's final (hidden, cell) state.

        Args:
            input: Integer token ids. The LSTM is built without batch_first,
                so the sequence dimension comes first: [src_len, batch_size].

        Returns:
            ``(hidden, cell)``, each of shape
            [num_layers, batch_size, hidden_dim]. The per-step LSTM outputs
            are discarded.
        """
        embedded = self.dropout(self.embedding(input))
        _, (hidden, cell) = self.lstm(embedded)
        return hidden, cell

In [32]:
class Decoder(nn.Module):
    """LSTM decoder that scores the next target token for one time step.

    Args:
        output_vocab: Number of rows in the embedding table and size of the
            output layer.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden and cell states.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied to the embeddings.
    """

    def __init__(self, output_vocab: int, embedding_dim: int, hidden_dim: int, num_layers: int, dropout: float):
        super().__init__()
        self.embedding = nn.Embedding(output_vocab, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers)
        self.fc = nn.Linear(hidden_dim, output_vocab)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input: torch.Tensor, hidden: torch.Tensor, cell: torch.Tensor) -> torch.Tensor:
        """Run a single decoding step.

        Args:
            input: Token ids for the current step; assumed to be one id per
                batch element, shape [batch_size] (TODO confirm with caller).
            hidden: LSTM hidden state, [num_layers, batch_size, hidden_dim].
            cell: LSTM cell state, same shape as ``hidden``.

        Returns:
            Output-layer scores with the length-1 sequence dimension kept,
            i.e. [1, batch_size, output_vocab] for a [batch_size] input.
        """
        # Add a leading sequence dimension of length 1 for the LSTM.
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        # NOTE(review): the LSTM's updated (hidden, cell) state is discarded
        # here, so a caller cannot carry the decoder state to the next step.
        # Returning it would change the return type, so it is only flagged.
        output, _ = self.lstm(embedded, (hidden, cell))
        return self.fc(output)