# Playground

In [1]:
import io
from collections import Counter
from pathlib import Path
from typing import List

import pytorch_lightning as pl
import pytorch_lightning.callbacks as plc
import torch
from icecream import ic
from torchtext.data.utils import get_tokenizer
from torchtext.utils import download_from_url, extract_archive
from torchtext.vocab import FastText, vocab

from zeronmt.models.attention import Attention
from zeronmt.models.decoder import Decoder
from zeronmt.models.encoder import Encoder
from zeronmt.models.seq2seq import Seq2Seq

In [2]:
url_base = "https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/"
train_urls = ("train.de.gz", "train.en.gz")
val_urls = ("val.de.gz", "val.en.gz")
test_urls = ("test_2016_flickr.de.gz", "test_2016_flickr.en.gz")

train_filepaths = [
    extract_archive(download_from_url(url_base + url))[0] for url in train_urls
]
val_filepaths = [
    extract_archive(download_from_url(url_base + url))[0] for url in val_urls
]
test_filepaths = [
    extract_archive(download_from_url(url_base + url))[0] for url in test_urls
]

de_tokenizer = get_tokenizer("basic_english")  # keep it simple
en_tokenizer = get_tokenizer("basic_english")  # keep it simple

In [5]:
# MAPPING_PATH = Path(
#     "/home/maciej/github/bachelor-thesis/project/vecs/le0n8xvt7l/best_mapping.pth"
# )

In [6]:
# # TODO
# mapping = torch.load(MAPPING_PATH)

# cs_vecs = MappedFastTextVectors(language="cs", mapping=None)
# pl_vecs = MappedFastTextVectors(language="pl", mapping=mapping)

In [7]:
class FastTextPretrainedAligned(FastText):
    url_base = (
        "https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.{}.align.vec"
    )
    # url_base = "https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.{}.align.vec"

    def __init__(self, language: str, special_toks: List[str], **kwargs) -> None:
        super().__init__(language, **kwargs)

        # prepend specials tokens
        self.itos[0:0] = special_toks

        # hopefully it is not slow :)
        self.stoi = {
            **dict(zip(special_toks, range(len(special_toks)))),
            **{word: i + len(special_toks) for i, word in enumerate(self.stoi)},
        }

        # the vectors for the special tokens here will not be used by the model
        # we set them to zeros so indexing works flawlessly
        vecs_special_toks = torch.zeros(len(special_toks), self.dim)
        self.vectors = torch.cat((vecs_special_toks, self.vectors), dim=0)
        assert len(self.vectors) == len(self.itos)
        assert len(self.vectors) == len(self.stoi)

In [8]:
VOCAB_SIZE = int(1.5e5)  # top 15K words only

In [9]:
specials = ["<unk>", "<pad>", "<bos>", "<eos>"]

en_vecs = FastTextPretrainedAligned(
    language="en", special_toks=specials, max_vectors=VOCAB_SIZE
)
de_vecs = FastTextPretrainedAligned(
    language="de", special_toks=specials, max_vectors=VOCAB_SIZE
)

en_vocab = vocab(en_vecs.stoi, min_freq=0)
de_vocab = vocab(de_vecs.stoi, min_freq=0)

In [10]:
de_vocab.set_default_index(de_vocab["<unk>"])
en_vocab.set_default_index(en_vocab["<unk>"])

In [14]:
ic(de_vecs.stoi["<unk>"])
ic(de_vecs.stoi["<pad>"])
ic(de_vecs.stoi["<bos>"])
ic(de_vecs.stoi["<eos>"])

ic| de_vecs.stoi['<unk>']: 0
ic| de_vecs.stoi['<pad>']: 1
ic| de_vecs.stoi['<bos>']: 2
ic| de_vecs.stoi['<eos>']: 3


3

In [15]:
BATCH_SIZE = 128

# special tokens are prepended, so these indices are the same for both the languages
PAD_IDX = de_vocab["<pad>"]
BOS_IDX = de_vocab["<bos>"]
EOS_IDX = de_vocab["<eos>"]

In [16]:
ic(PAD_IDX)
ic(BOS_IDX)
ic(EOS_IDX)

ic| PAD_IDX: 1
ic| BOS_IDX: 2
ic| EOS_IDX: 3


3

In [18]:
# TODO
# INPUT_DIM = len(cs_vecs)
# OUTPUT_DIM = len(pl_vecs)

In [19]:
INPUT_DIM = len(de_vecs)
OUTPUT_DIM = len(en_vecs)

In [20]:
ic(INPUT_DIM)
ic(OUTPUT_DIM)

ic| INPUT_DIM: 150004
ic| OUTPUT_DIM: 150004


150004

In [22]:
ENC_HID_DIM = 64
DEC_HID_DIM = 64
ATTN_DIM = 8
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

In [23]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [24]:
def data_process(filepaths):
    raw_de_iter = iter(io.open(filepaths[0], encoding="utf8"))
    raw_en_iter = iter(io.open(filepaths[1], encoding="utf8"))
    data = []
    for raw_de, raw_en in zip(raw_de_iter, raw_en_iter):
        de_tensor_ = torch.tensor(
            [de_vocab[token] for token in de_tokenizer(raw_de)], dtype=torch.long
        )
        en_tensor_ = torch.tensor(
            [en_vocab[token] for token in en_tokenizer(raw_en)], dtype=torch.long
        )
        data.append((de_tensor_, en_tensor_))
    return data


train_data = data_process(train_filepaths)
val_data = data_process(val_filepaths)
test_data = data_process(test_filepaths)

In [25]:
enc = Encoder(
    INPUT_DIM, en_vecs, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT, PAD_IDX, len(specials)
)
attn = Attention(ENC_HID_DIM, DEC_HID_DIM, ATTN_DIM)
dec = Decoder(
    OUTPUT_DIM,
    de_vecs,
    ENC_HID_DIM,
    DEC_HID_DIM,
    DEC_DROPOUT,
    attn,
    PAD_IDX,
    len(specials),
)
model = Seq2Seq(enc, dec, PAD_IDX=PAD_IDX).to(device)

encoder.special_toks_embedding.weight
encoder.pretrained_embedding.weight
encoder.rnn.weight_ih_l0
encoder.rnn.weight_hh_l0
encoder.rnn.bias_ih_l0
encoder.rnn.bias_hh_l0
encoder.rnn.weight_ih_l0_reverse
encoder.rnn.weight_hh_l0_reverse
encoder.rnn.bias_ih_l0_reverse
encoder.rnn.bias_hh_l0_reverse
encoder.fc.weight
encoder.fc.bias
decoder.attention.attn.weight
decoder.attention.attn.bias
decoder.special_toks_embedding.weight
decoder.pretrained_embedding.weight
decoder.rnn.weight_ih_l0
decoder.rnn.weight_hh_l0
decoder.rnn.bias_ih_l0
decoder.rnn.bias_hh_l0
decoder.out.weight
decoder.out.bias


In [26]:
model

Seq2Seq(
  (encoder): Encoder(
    (special_toks_embedding): Embedding(4, 300, padding_idx=1)
    (pretrained_embedding): Embedding(150004, 300)
    (rnn): GRU(300, 64, bidirectional=True)
    (fc): Linear(in_features=128, out_features=64, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=192, out_features=8, bias=True)
    )
    (special_toks_embedding): Embedding(4, 300, padding_idx=1)
    (pretrained_embedding): Embedding(150004, 300)
    (rnn): GRU(428, 64)
    (out): Linear(in_features=492, out_features=150004, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (criterion): CrossEntropyLoss()
)

In [27]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

In [28]:
def collate_batch(data_batch):
    de_batch, en_batch = [], []
    for de_item, en_item in data_batch:
        de_batch.append(
            torch.cat(
                [torch.tensor([BOS_IDX]), de_item, torch.tensor([EOS_IDX])], dim=0
            )
        )
        en_batch.append(
            torch.cat(
                [torch.tensor([BOS_IDX]), en_item, torch.tensor([EOS_IDX])], dim=0
            )
        )
    de_batch = pad_sequence(de_batch, padding_value=PAD_IDX)
    en_batch = pad_sequence(en_batch, padding_value=PAD_IDX)
    return de_batch, en_batch

In [29]:
train_dl = DataLoader(
    train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
)
valid_dl = DataLoader(
    val_data, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch
)
test_dl = DataLoader(
    test_data, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch
)

In [32]:
trainer = pl.Trainer(
    gradient_clip_val=1.0,
    max_epochs=10,
    callbacks=[plc.TQDMProgressBar(refresh_rate=5)],
)
trainer.fit(model, train_dataloaders=[train_dl], val_dataloaders=[valid_dl])

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name      | Type             | Params
-----------------------------------------------
0 | encoder   | Encoder          | 45.2 M
1 | decoder   | Decoder          | 119 M 
2 | criterion | CrossEntropyLoss | 0     
-----------------------------------------------
74.2 M    Trainable params
90.0 M    Non-trainable params
164 M     Total params
656.808   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]



                                                                           

  rank_zero_warn(


Epoch 0: 100%|██████████| 227/227 [4:06:44<00:00, 65.22s/it, v_num=14, train_loss=5.190]  







Epoch 0: 100%|██████████| 227/227 [4:08:30<00:00, 65.68s/it, v_num=14, train_loss=5.260]



Epoch 3:  44%|████▍     | 100/227 [1:47:20<2:16:19, 64.40s/it, v_num=14, train_loss=4.520]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [33]:
de_in = torch.tensor(
    [de_vocab[token] for token in de_tokenizer("<bos> ich liebe kartoffeln <eos>")]
).unsqueeze(1)
en_in = torch.tensor(
    [en_vocab[token] for token in en_tokenizer("<bos> i love potatoes <eos>")]
).unsqueeze(
    1
)  # actually unused
ic(de_in.shape)
ic(de_in)
ic(en_in.shape)
ic(en_in)

ic| de_in.shape: torch.Size([5, 1])
ic| de_in: tensor([[    2],
                   [   47],
                   [  969],
                   [15732],
                   [    3]])
ic| en_in.shape: torch.Size([5, 1])
ic| en_in: tensor([[    2],
                   [   32],
                   [  571],
                   [14391],
                   [    3]])


tensor([[    2],
        [   32],
        [  571],
        [14391],
        [    3]])

In [34]:
en_vocab["<bos>"]

2

In [35]:
ic(torch.tensor(tuple(en_vocab["<bos>"] for _ in de_in[:, 0])).unsqueeze(-1).shape)
ic(torch.tensor(tuple(en_vocab["<bos>"] for _ in de_in[:, 0])).unsqueeze(-1))

ic| torch.tensor(tuple(en_vocab['<bos>'] for _ in de_in[:, 0])).unsqueeze(-1).shape: torch.Size([5, 1])
ic| torch.tensor(tuple(en_vocab['<bos>'] for _ in de_in[:, 0])).unsqueeze(-1): tensor([[2],
                                                                                       [2],
                                                                                       [2],
                                                                                       [2],
                                                                                       [2]])


tensor([[2],
        [2],
        [2],
        [2],
        [2]])

In [36]:
en_in.shape
ic(en_in)

ic| en_in: tensor([[    2],
                   [   32],
                   [  571],
                   [14391],
                   [    3]])


tensor([[    2],
        [   32],
        [  571],
        [14391],
        [    3]])

In [37]:
output = model(de_in, en_in, teacher_forcing_ratio=0)
torch.set_printoptions(profile="full")
predicted_tokens = output.argmax(-1)
ic(output.shape)
ic(predicted_tokens.shape)
ic(predicted_tokens)
ic([en_vocab.get_itos()[t] for t in predicted_tokens])

ic| output.shape: torch.Size([5, 1, 150004])
ic| predicted_tokens.shape: torch.Size([5, 1])
ic| predicted_tokens: tensor([[  0],
                              [ 73],
                              [383],
                              [ 38],
                              [ 10]])
ic| [en_vocab.get_itos()[t] for t in predicted_tokens]: ['<unk>', 'two', 'men', 'are', 'in']


['<unk>', 'two', 'men', 'are', 'in']

In [38]:
output = model(
    de_in,
    torch.tensor(tuple(en_vocab["<bos>"] for _ in de_in[:, 0])).unsqueeze(-1),
    teacher_forcing_ratio=0,
)
torch.set_printoptions(profile="full")
predicted_tokens = output.argmax(-1)
ic(output.shape)
ic(predicted_tokens.shape)
ic(predicted_tokens)
ic([en_vocab.get_itos()[t] for t in predicted_tokens])

ic| output.shape: torch.Size([5, 1, 150004])
ic| predicted_tokens.shape: torch.Size([5, 1])
ic| predicted_tokens: tensor([[   0],
                              [  73],
                              [ 383],
                              [  38],
                              [2258]])
ic| [en_vocab.get_itos()[t] for t in predicted_tokens]: ['<unk>', 'two', 'men', 'are', 'standing']


['<unk>', 'two', 'men', 'are', 'standing']

In [39]:
en_vocab.get_itos()[0]

'<unk>'

In [40]:
torch.set_printoptions(threshold=100)

In [41]:
for de_in, en_in in valid_dl:
    ic(de_in.shape)
    ic(en_in.shape)
    de_in = de_in[:, 0].unsqueeze(-1)  # first item in the batch only
    en_in = en_in[:, 0].unsqueeze(-1)  # first item in the batch only
    ic(de_in.shape)
    ic(en_in.shape)
    ic(de_in[:, 0])
    ic(en_in[:, 0])
    break
output = model(de_in, en_in, teacher_forcing_ratio=0)
torch.set_printoptions(profile="full")
predicted_tokens = output.argmax(-1)
ic(output.shape)
ic(predicted_tokens.shape)
ic(predicted_tokens[:, 0])  # get first batch here
ic(
    [de_vocab.get_itos()[t] for t in de_in[:12]]
)  # limit tokens to first 12 for better presentation
ic(
    [en_vocab.get_itos()[t] for t in predicted_tokens[:12]]
)  # limit tokens to first 12 for better presentation

ic| de_in.shape: torch.Size([35, 128])
ic| en_in.shape: torch.Size([30, 128])
ic| de_in.shape: torch.Size([35, 1])
ic| en_in.shape: torch.Size([30, 1])
ic| de_in[:, 0]: tensor([    2,    29,   347,    14,  3996, 14532, 18363,    25,    58, 22585,
                             3,     1,     1,     1,     1,     1,     1,     1,     1,     1,
                             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
                             1,     1,     1,     1,     1])
ic| en_in[:, 0]: tensor([   2,   16,  168,    8,  383,   38, 7395, 4830, 2604,   16, 4453,    3,
                            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
                            1,    1,    1,    1,    1,    1])
ic| output.shape: torch.Size([30, 1, 150004])
ic| predicted_tokens.shape: torch.Size([30, 1])
ic| predicted_tokens[:, 0]: tensor([  0,  16, 168,   8,  16,  16,  16,  16,  16,   5,   3,   5,   3,   5,
                                      3,   5, 

['<unk>', 'a', 'group', 'of', 'a', 'a', 'a', 'a', 'a', '.', '<eos>', '.']