In [274]:
import torch
from torch import nn
import random
import numpy as np
import spacy
import datasets
import torchtext
import tqdm

import html
import re


In [223]:
seed = 4488

In [224]:
def split_texts(rows):
    return {
        "en": [text.split("\t")[0] for text in rows["text"]],
        "fr": [text.split("\t")[1] for text in rows["text"]],
    }

In [225]:
from unidecode import unidecode
def clean_text(batch):
    return {k: [unidecode(s) for s in v] for k, v in batch.items()}

In [226]:
def get_dataset(path: str, data_files: str) -> datasets:
    ds = datasets.load_dataset(path=path, data_files=data_files)
    ds = ds.map(split_texts, batched=True).remove_columns("text")
    ds = ds.map(clean_text, batched=True)
    ds = ds["train"].train_test_split(train_size=0.8, seed=seed)
    tvt_ds = ds["train"].train_test_split(train_size=0.8, seed=seed)
    tvt_ds["validation"] = tvt_ds.pop("test")
    tvt_ds["test"] = ds["test"]
    return tvt_ds

In [227]:
dataset = get_dataset(path="./data", data_files="en-fr.txt")

In [228]:
dataset["train"][:5]

{'en': ['Could you speak more slowly?',
  "It makes me really happy that you're here.",
  'We were right.',
  'They fight like cat and dog.',
  "I'm not worried about losing my job."],
 'fr': ['Pouvez-vous parler plus lentement ?',
  'Je me rejouis vraiment que tu sois ici.',
  'Nous eumes raison.',
  'Ils se disputent comme chien et chat.',
  'Je ne suis pas inquiet de perdre mon emploi.']}

In [229]:
train_data, valid_data, test_data = (
    dataset["train"],
    dataset["validation"],
    dataset["test"],
)

In [230]:
# !python -m spacy download en_core_news_sm
# !pip uninstall en-core-news-sm

# !python -m spacy download fr_core_news_sm
# !pip uninstall fr-core-news-sm

# !pip uninstall de-core-news-sm

In [231]:
en_nlp = spacy.load("en_core_web_sm")
fr_nlp = spacy.load("fr_core_news_sm")

In [232]:
def tokenize_ds(data, en_nlp, fr_nlp, max_length, sos_token, eos_token):
    return {
        "en_tokens": [sos_token] + [token.text.lower() for token in en_nlp.tokenizer(data["en"])][:max_length] + [eos_token], 
        "fr_tokens": [sos_token] + [token.text.lower() for token in fr_nlp.tokenizer(data["fr"])][:max_length] + [eos_token]
    }

In [233]:
sos_token = "<sos>"
eos_token = "<eos>"
max_length = 1_000

In [234]:
fn_kwargs = {
    "en_nlp": en_nlp,
    "fr_nlp": fr_nlp,
    "max_length": max_length,
    "sos_token": sos_token,
    "eos_token": eos_token
}

train_data = train_data.map(tokenize_ds, fn_kwargs=fn_kwargs)
valid_data = valid_data.map(tokenize_ds, fn_kwargs=fn_kwargs)
test_data = test_data.map(tokenize_ds, fn_kwargs=fn_kwargs)

In [235]:
min_freq = 2
unk_token = "<unk>"
pad_token = "<pad>"

special_tokens = [
    unk_token,
    pad_token,
    sos_token,
    eos_token,
]

en_vocab = torchtext.vocab.build_vocab_from_iterator(
    train_data["en_tokens"],
    min_freq=min_freq,
    specials=special_tokens
)

fr_vocab = torchtext.vocab.build_vocab_from_iterator(
    train_data["fr_tokens"],
    min_freq=min_freq,
    specials=special_tokens
)

In [236]:
en_vocab.get_itos()[:10]

['<unk>', '<pad>', '<sos>', '<eos>', '.', 'i', 'you', 'to', 'the', '?']

In [237]:
fr_vocab.get_itos()[:10]

['<unk>', '<pad>', '<sos>', '<eos>', '.', 'je', 'a', 'de', '?', 'pas']

In [238]:
assert en_vocab[unk_token] == fr_vocab[unk_token]
assert en_vocab[pad_token] == fr_vocab[pad_token]

unk_index = en_vocab[unk_token]
pad_index = en_vocab[pad_token]

In [239]:
en_vocab.set_default_index(unk_index)
fr_vocab.set_default_index(unk_index)

In [240]:
en_vocab.get_itos()[0]

'<unk>'

In [241]:
en_vocab["The"]

0

In [242]:
tokens = ["i", "love", "watching", "crime", "shows"]
en_vocab.lookup_indices(tokens)

[5, 137, 546, 1007, 3479]

In [243]:
def numericalize_vocab(vocab: str, en_vocab, fr_vocab):
    return {
        "en_ids": en_vocab.lookup_indices(vocab["en_tokens"]),
        "fr_ids": fr_vocab.lookup_indices(vocab["fr_tokens"])
    }

In [244]:
fn_kwargs = {
    "en_vocab": en_vocab, 
    "fr_vocab": fr_vocab
}
train_data = train_data.map(numericalize_vocab, fn_kwargs=fn_kwargs)
valid_data = valid_data.map(numericalize_vocab, fn_kwargs=fn_kwargs)
test_data = test_data.map(numericalize_vocab, fn_kwargs=fn_kwargs)

In [245]:
train_data[0]

{'en': 'Could you speak more slowly?',
 'fr': 'Pouvez-vous parler plus lentement ?',
 'en_tokens': ['<sos>',
  'could',
  'you',
  'speak',
  'more',
  'slowly',
  '?',
  '<eos>'],
 'fr_tokens': ['<sos>',
  'pouvez',
  '-vous',
  'parler',
  'plus',
  'lentement',
  '?',
  '<eos>'],
 'en_ids': [2, 75, 6, 201, 95, 1045, 9, 3],
 'fr_ids': [2, 151, 34, 123, 47, 1143, 8, 3]}

In [246]:
en_vocab.lookup_tokens(train_data[0]["en_ids"])

['<sos>', 'could', 'you', 'speak', 'more', 'slowly', '?', '<eos>']

In [247]:
datatype = "torch"
columns = ["en_ids", "fr_ids"]

train_data = train_data.with_format(type=datatype, columns=columns, output_all_columns=True)
valid_data = valid_data.with_format(type=datatype, columns=columns, output_all_columns=True)
test_data = test_data.with_format(type=datatype, columns=columns, output_all_columns=True)


In [248]:
train_data[0]

{'en_ids': tensor([   2,   75,    6,  201,   95, 1045,    9,    3]),
 'fr_ids': tensor([   2,  151,   34,  123,   47, 1143,    8,    3]),
 'en': 'Could you speak more slowly?',
 'fr': 'Pouvez-vous parler plus lentement ?',
 'en_tokens': ['<sos>',
  'could',
  'you',
  'speak',
  'more',
  'slowly',
  '?',
  '<eos>'],
 'fr_tokens': ['<sos>',
  'pouvez',
  '-vous',
  'parler',
  'plus',
  'lentement',
  '?',
  '<eos>']}

In [249]:
def get_collate_fn(pad_index):
    def collate_fn(batch):
        return {
            "en_ids": nn.utils.rnn.pad_sequence(
                [vocab["en_ids"] for vocab in batch], 
                padding_value=pad_index,
                batch_first=True
            ),
            "fr_ids": nn.utils.rnn.pad_sequence(
                [vocab["fr_ids"] for vocab in batch], 
                padding_value=pad_index,
                batch_first=True
            )
        }
    return collate_fn


In [250]:
def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    collate_fn = get_collate_fn(pad_index)
    return torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
        shuffle=shuffle
    )

In [251]:
batch_size = 128

train_dataloader = get_data_loader(train_data, batch_size, pad_index, shuffle=True)
valid_dataloader = get_data_loader(valid_data, batch_size, pad_index)
test_dataloader = get_data_loader(test_data, batch_size, pad_index)

In [252]:
class Encoder(nn.Module):
    def __init__(self, input_vocab: int, embedding_dim: int, hidden_dim: int, num_layers: int, dropout: int):
        super().__init__()
        self.input_vocab = input_vocab
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_vocab, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, dropout=dropout, batch_first=True)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        embedded = self.embedding(input)
        embedded = self.dropout(embedded)
        _, (hidden, cell) = self.lstm(embedded) 
        return hidden, cell

In [253]:
class Decoder(nn.Module):
    def __init__(self, output_vocab: int, embedding_dim: int, hidden_dim: int, num_layers: int, dropout: int):
        super().__init__()
        self.output_vocab = output_vocab
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.embedding_dim = embedding_dim
        self.embedding = nn.Embedding(output_vocab, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_vocab)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, input: torch.Tensor, hidden: torch.Tensor, cell: torch.Tensor) -> torch.Tensor:
        input = input.unsqueeze(1)
        embedded = self.embedding(input)
        embedded = self.dropout(embedded)
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell)) 
        output = self.fc(output.squeeze(1))
        return output, hidden, cell

In [254]:
enc = Encoder(len(en_vocab), embedding_dim=256, hidden_dim=512, num_layers=4, dropout=0.1)
dec = Decoder(output_vocab=len(fr_vocab), embedding_dim=256, hidden_dim=512, num_layers=4, dropout=0.1)

In [255]:
import random

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        assert encoder.hidden_dim == decoder.hidden_dim
        assert encoder.num_layers == decoder.num_layers
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src: torch.Tensor, target: torch.Tensor, tf_ratio: float) -> torch.Tensor:
        target_length = target.shape[1]

        outputs = torch.zeros(target.shape[0], target_length, self.decoder.output_vocab).to(self.device)
        hidden, cell = self.encoder(src)
        input = target[:, 0]

        for t in range(1, target_length):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[:, t] = output
            input = target[:, t] if random.random() < tf_ratio else output.argmax(1)

        return outputs

In [256]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = Seq2Seq(encoder=enc, decoder=dec, device=device)

In [257]:
for x in train_dataloader:
    src = x["en_ids"]
    trg = x["fr_ids"]
    model(src, trg, 0.6)
    break

In [258]:
model

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7537, 256)
    (lstm): LSTM(256, 512, num_layers=4, batch_first=True, dropout=0.1)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(11200, 256)
    (lstm): LSTM(256, 512, num_layers=4, batch_first=True, dropout=0.1)
    (fc): Linear(in_features=512, out_features=11200, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
)

In [267]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 26,303,680 trainable parameters


In [266]:
loss_fn = nn.CrossEntropyLoss()

In [268]:
x = torch.randn((28, 128, 11200))
x_target = torch.randn((28, 128)).to(torch.long)
x.shape, x_target.shape, x[1:].shape, x_target[1:].shape

(torch.Size([28, 128, 11200]),
 torch.Size([28, 128]),
 torch.Size([27, 128, 11200]),
 torch.Size([27, 128]))

In [269]:
x = x[1:].reshape(-1, x.shape[-1])
x_target = x_target[1:].reshape(-1)
x.shape, x_target.shape

(torch.Size([3456, 11200]), torch.Size([3456]))

In [272]:
loss_fn(x, x_target)

IndexError: Target -2 is out of bounds.

In [270]:
y = torch.randn((128, 28, 11200))
y_target = torch.randn((128, 28)).to(torch.long)
y.shape, y_target.shape, y[:, 1:].shape, y_target[:, 1:].shape

(torch.Size([128, 28, 11200]),
 torch.Size([128, 28]),
 torch.Size([128, 27, 11200]),
 torch.Size([128, 27]))

In [271]:
y = y[:, 1:].reshape(-1, y.shape[-1])
y_target = y_target[:, 1:].reshape(-1)
y.shape, y_target.shape

(torch.Size([3456, 11200]), torch.Size([3456]))

In [273]:
loss_fn(y, y_target)

IndexError: Target -1 is out of bounds.