# Playground

In [None]:
# nasty hack for Colab
![ -n $COLAB_RELEASE_TAG ] && git clone -b madziejm-dev https://github.com/madziejm/0nmt.git
![ -n $COLAB_RELEASE_TAG ] && pip install -r ./0nmt/requirements.txt
try:
  import google.colab
  import sys
  sys.path.insert(0, '/content/0nmt')
except Exception as e:
  print(e)

In [1]:
import io
from collections import Counter
from pathlib import Path
from typing import List

import pytorch_lightning as pl
import pytorch_lightning.callbacks as plc
import torch
from icecream import ic
from torchtext.data.utils import get_tokenizer
from torchtext.utils import download_from_url, extract_archive
from torchtext.vocab import FastText, vocab

from zeronmt.models.datatypes import DimensionSpec, Language, Vectors
from zeronmt.models.seq2seq import Seq2Seq

In [2]:
# Multi30k task 1 raw files; each split is a (German, English) pair of gzipped text files.
url_base = "https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/"
train_urls = ("train.de.gz", "train.en.gz")
val_urls = ("val.de.gz", "val.en.gz")
test_urls = ("test_2016_flickr.de.gz", "test_2016_flickr.en.gz")

# Download and unpack every archive, keeping the first extracted path of each.
# The generator is consumed in order, so the splits are fetched train -> val -> test.
train_filepaths, val_filepaths, test_filepaths = (
    [extract_archive(download_from_url(url_base + name))[0] for name in split_urls]
    for split_urls in (train_urls, val_urls, test_urls)
)

tokenizer = get_tokenizer("basic_english")  # keep it simple

In [3]:
# MAPPING_PATH = Path(
#     "/home/maciej/github/bachelor-thesis/project/vecs/le0n8xvt7l/best_mapping.pth"
# )

In [4]:
# # TODO
# mapping = torch.load(MAPPING_PATH)

# cs_vecs = MappedFastTextVectors(language="cs", mapping=None)
# pl_vecs = MappedFastTextVectors(language="pl", mapping=mapping)

`src` is the source language, German (DE).  
`tgt` is the target language, English (EN).

In [5]:
class FastTextPretrainedAligned(FastText):
    """FastText vectors from the aligned-wiki release with special tokens put in front.

    After the parent class has loaded the vectors, the given special tokens are
    inserted at the start of the vocabulary: they receive indices
    ``0 .. len(special_toks) - 1`` and every pretrained word is shifted up by
    ``len(special_toks)``. ``itos``, ``stoi`` and ``vectors`` are updated together
    so that they stay index-aligned.
    """

    url_base = (
        "https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.{}.align.vec"
    )
    # url_base = "https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.{}.align.vec"

    def __init__(self, language: str, special_toks: List[str], **kwargs) -> None:
        """Load the vectors for ``language`` and prepend ``special_toks``.

        Args:
            language: Language code substituted into ``url_base``.
            special_toks: Tokens to place at the lowest indices, in this order.
            **kwargs: Forwarded unchanged to the parent constructor.
        """
        super().__init__(language, **kwargs)

        n_special = len(special_toks)

        # index -> token: the specials go in front of the pretrained words
        self.itos[0:0] = special_toks

        # token -> index: specials first, then every pretrained word moved up by
        # n_special (this walks the parent's stoi in its own iteration order)
        shifted_words = {
            word: position + n_special for position, word in enumerate(self.stoi)
        }
        remapped = {tok: idx for idx, tok in enumerate(special_toks)}
        remapped.update(shifted_words)
        self.stoi = remapped

        # Zero rows act as placeholders for the specials; per the original author
        # the model does not use them, they only keep row indices aligned.
        placeholder_rows = torch.zeros(n_special, self.dim)
        self.vectors = torch.cat((placeholder_rows, self.vectors), dim=0)

        # All three views of the vocabulary must have the same size.
        assert len(self.vectors) == len(self.itos)
        assert len(self.vectors) == len(self.stoi)

In [6]:
# Upper bound on the number of pretrained vectors loaded per language
# (passed as `max_vectors` when the FastText vectors are created).
VOCAB_SIZE = int(5e4)  # top 50K words only

In [7]:
# Special tokens, in index order: the outputs below show them at indices 0..3.
specials = ["<unk>", "<pad>", "<bos>", "<eos>"]

# Pretrained aligned vectors, target (English) first, then source (German).
tgt_vecs = FastTextPretrainedAligned(
    language="en", special_toks=specials, max_vectors=VOCAB_SIZE
)
src_vecs = FastTextPretrainedAligned(
    language="de", special_toks=specials, max_vectors=VOCAB_SIZE
)

# Build torchtext vocabularies from the token -> index mappings.
# NOTE(review): torchtext's `vocab` factory presumably treats the dict values as
# frequencies; min_freq=0 then looks necessary so the token mapped to 0 is not
# dropped — confirm against the torchtext documentation.
tgt_vocab = vocab(tgt_vecs.stoi, min_freq=0)
src_vocab = vocab(src_vecs.stoi, min_freq=0)

In [8]:
# Tokens missing from a vocabulary are looked up as "<unk>" instead of raising.
src_vocab.set_default_index(src_vocab["<unk>"])
tgt_vocab.set_default_index(tgt_vocab["<unk>"])

In [9]:
# Sanity check: print the indices the source-side vectors assign to the special tokens.
ic(src_vecs.stoi["<unk>"])
ic(src_vecs.stoi["<pad>"])
ic(src_vecs.stoi["<bos>"])
ic(src_vecs.stoi["<eos>"])

ic| src_vecs.stoi["<unk>"]: 0
ic| src_vecs.stoi["<pad>"]: 1
ic| src_vecs.stoi["<bos>"]: 2
ic| src_vecs.stoi["<eos>"]: 3


3

In [10]:
# Number of sentence pairs per batch for every DataLoader in this notebook.
BATCH_SIZE = 128

# special tokens are prepended, so these indices are the same for both the languages
# (they are read from the source vocabulary only)
PAD_IDX = src_vocab["<pad>"]
BOS_IDX = src_vocab["<bos>"]
EOS_IDX = src_vocab["<eos>"]

In [11]:
# Print the padding / begin-of-sentence / end-of-sentence indices for inspection.
ic(PAD_IDX)
ic(BOS_IDX)
ic(EOS_IDX)

ic| PAD_IDX: 1
ic| BOS_IDX: 2
ic| EOS_IDX: 3


3

In [12]:
# TODO
# INPUT_DIM = len(cs_vecs)
# OUTPUT_DIM = len(pl_vecs)

In [13]:
# INPUT_DIM = len(src_vecs)
# OUTPUT_DIM = len(tgt_vecs)

In [14]:
# ic(INPUT_DIM)
# ic(OUTPUT_DIM)

In [15]:
# Model hyperparameters, consumed where the Seq2Seq model is constructed.
ENC_HID_DIM = 64  # encoder hidden size (DimensionSpec.enc_hid)
DEC_HID_DIM = 64  # decoder hidden size (DimensionSpec.dec_hid)
ATTN_DIM = 8  # attention size (DimensionSpec.attention)
ENC_DROPOUT = 0.5  # dropout probability passed for the encoder
DEC_DROPOUT = 0.5  # dropout probability passed for the decoder

In [16]:
# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
# NOTE(review): no active cell in this notebook references `device`; only
# commented-out code does.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [17]:
def data_process(filepaths):
    """Convert a pair of parallel text files into token-index tensor pairs.

    Relies on the module-level ``tokenizer``, ``src_vocab`` and ``tgt_vocab``.
    No begin/end-of-sentence markers are added here.

    Args:
        filepaths: Sequence whose first item is the path of the source-language
            file and whose second item is the path of the target-language file.
            Both files are read as UTF-8, one sentence per line.

    Returns:
        A list with one ``(src_tensor, tgt_tensor)`` tuple per line pair; each
        tensor is 1-D with dtype ``torch.long``. If the files differ in length,
        the extra lines of the longer one are ignored.
    """
    data = []
    # Context managers guarantee both files are closed, even if tokenisation or
    # a vocabulary lookup raises part-way through.
    with open(filepaths[0], encoding="utf8") as src_file, open(
        filepaths[1], encoding="utf8"
    ) as tgt_file:
        for raw_de, raw_en in zip(src_file, tgt_file):
            src_tensor_ = torch.tensor(
                [src_vocab[token] for token in tokenizer(raw_de)], dtype=torch.long
            )
            tgt_tensor_ = torch.tensor(
                [tgt_vocab[token] for token in tokenizer(raw_en)], dtype=torch.long
            )
            data.append((src_tensor_, tgt_tensor_))
    return data


# Tensorise the three splits in order: train, then validation, then test.
train_data, val_data, test_data = (
    data_process(split_paths)
    for split_paths in (train_filepaths, val_filepaths, test_filepaths)
)

In [18]:
# enc = Encoder(
#     INPUT_DIM, tgt_vecs, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT, PAD_IDX, len(specials)
# )
# attn = Attention(ENC_HID_DIM, DEC_HID_DIM, ATTN_DIM)
# dec = Decoder(
#     OUTPUT_DIM,
#     src_vecs,
#     ENC_HID_DIM,
#     DEC_HID_DIM,
#     DEC_DROPOUT,
#     attn,
#     PAD_IDX,
#     len(specials),
# )

# model = Seq2Seq(
#     INPUT_DIM,
#     PAD_IDX=PAD_IDX
#     ).to(device)
# Build the sequence-to-sequence model from the hyperparameters defined above.
# NOTE(review): the arguments are positional and the decoder dropout comes first;
# Seq2Seq's signature is not visible here, and since both dropouts are 0.5 a
# swapped order would go unnoticed — confirm against zeronmt.models.seq2seq.
model = Seq2Seq(
    DEC_DROPOUT,
    ENC_DROPOUT,
    DimensionSpec(
        attention=ATTN_DIM,
        dec_hid=DEC_HID_DIM,
        enc_hid=ENC_HID_DIM,
        nspecial_toks=len(specials),
    ),
    PAD_IDX,
    # source vectors first, target vectors second
    Vectors(src_vecs, tgt_vecs),
)
# src_pretrained_embeddings=src_vecs,
# tgt_pretrained_embeddings=tgt_vecs,

encoder.special_toks_embedding.weight
encoder.rnn.weight_ih_l0
encoder.rnn.weight_hh_l0
encoder.rnn.bias_ih_l0
encoder.rnn.bias_hh_l0
encoder.rnn.weight_ih_l0_reverse
encoder.rnn.weight_hh_l0_reverse
encoder.rnn.bias_ih_l0_reverse
encoder.rnn.bias_hh_l0_reverse
encoder.fc.weight
encoder.fc.bias
decoder.attention.attn.weight
decoder.attention.attn.bias
decoder.special_toks_embedding.weight
decoder.rnn.weight_ih_l0
decoder.rnn.weight_hh_l0
decoder.rnn.bias_ih_l0
decoder.rnn.bias_hh_l0
decoder.output_to_src.weight
decoder.output_to_src.bias
decoder.output_to_tgt.weight
decoder.output_to_tgt.bias


In [19]:
# Display the module tree of the freshly built model.
model

Seq2Seq(
  (encoder): Encoder(
    (special_toks_embedding): Embedding(4, 300, padding_idx=1)
    (rnn): GRU(300, 64, bidirectional=True)
    (fc): Linear(in_features=128, out_features=64, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=192, out_features=8, bias=True)
    )
    (special_toks_embedding): Embedding(4, 300, padding_idx=1)
    (rnn): GRU(428, 64)
    (output_to_src): Linear(in_features=492, out_features=50004, bias=True)
    (output_to_tgt): Linear(in_features=492, out_features=50004, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (criterion): CrossEntropyLoss()
)

In [20]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

In [21]:
def collate_batch(data_batch):
    """Collate ``(src, tgt)`` index tensors into two padded batch tensors.

    Each sequence is framed with ``BOS_IDX`` in front and ``EOS_IDX`` at the end,
    then the sequences of each side are padded with ``PAD_IDX`` to a common
    length. ``pad_sequence`` is called with its default layout, so the results
    have shape ``(longest_sequence, batch_size)``.

    Args:
        data_batch: Iterable of ``(src_item, tgt_item)`` pairs of 1-D tensors.

    Returns:
        A ``(src_batch, tgt_batch)`` tuple of padded tensors.
    """

    def framed(sequence):
        # <bos> + tokens + <eos>
        return torch.cat(
            [torch.tensor([BOS_IDX]), sequence, torch.tensor([EOS_IDX])], dim=0
        )

    # Walk the batch once, framing both sides of every pair.
    framed_pairs = [(framed(src), framed(tgt)) for src, tgt in data_batch]
    src_sequences = [pair[0] for pair in framed_pairs]
    tgt_sequences = [pair[1] for pair in framed_pairs]

    return (
        pad_sequence(src_sequences, padding_value=PAD_IDX),
        pad_sequence(tgt_sequences, padding_value=PAD_IDX),
    )

In [22]:
# One loader per split with identical settings; only the training split is shuffled.
train_dl, valid_dl, test_dl = (
    DataLoader(
        split,
        batch_size=BATCH_SIZE,
        shuffle=reshuffle,
        collate_fn=collate_batch,
        num_workers=0,
    )
    for split, reshuffle in (
        (train_data, True),
        (val_data, False),
        (test_data, False),
    )
)

In [23]:
# Train for at most 10 epochs with gradient clipping at 1.0; the progress bar
# refreshes every 5 batches.
trainer = pl.Trainer(
    gradient_clip_val=1.0,
    max_epochs=10,
    callbacks=[plc.TQDMProgressBar(refresh_rate=5)],
)
# NOTE(review): each dataloader is wrapped in a one-element list; presumably the
# model's training/validation steps expect that form — confirm against Seq2Seq.
trainer.fit(model, train_dataloaders=[train_dl], val_dataloaders=[valid_dl])

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name      | Type             | Params
-----------------------------------------------
0 | encoder   | Encoder          | 150 K 
1 | decoder   | Decoder          | 49.4 M
2 | criterion | CrossEntropyLoss | 0     
-----------------------------------------------
49.6 M    Trainable params
0         Non-trainable params
49.6 M    Total params
198.206   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


                                                                           

  rank_zero_warn(


Epoch 0:   4%|▍         | 10/227 [09:46<3:32:04, 58.64s/it, v_num=27, train_loss=18.80]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [None]:
# Hand-written example pair. The special tokens are typed into the text and looked
# up like ordinary words; unsqueeze(1) turns each 1-D index tensor into shape
# (sequence_length, 1), i.e. a batch holding a single sentence.
src_in = torch.tensor(
    [src_vocab[token] for token in tokenizer("<bos> ich liebe kartoffeln <eos>")]
).unsqueeze(1)
tgt_in = torch.tensor(
    [tgt_vocab[token] for token in tokenizer("<bos> i love potatoes <eos>")]
).unsqueeze(
    1
)  # actually unused
# Inspect shapes and contents of both tensors.
ic(src_in.shape)
ic(src_in)
ic(tgt_in.shape)
ic(tgt_in)

In [None]:
# Preview of a stand-in target: one "<bos>" index per source position, shaped
# (source_length, 1). Printed first as a shape, then as values.
ic(torch.tensor(tuple(tgt_vocab["<bos>"] for _ in src_in[:, 0])).unsqueeze(-1).shape)
ic(torch.tensor(tuple(tgt_vocab["<bos>"] for _ in src_in[:, 0])).unsqueeze(-1))

In [None]:
# Debug print of the target tensor. The bare `.shape` expression on the first line
# is not the cell's last expression, so it is not displayed.
tgt_in.shape
ic(tgt_in)

In [None]:
# Translate the hand-written example without teacher forcing.
# Switch to evaluation mode first: the encoder and decoder both contain Dropout(p=0.5),
# which would otherwise randomly zero activations if the module is still in training
# mode. no_grad() avoids building an autograd graph that is never used.
model.eval()
with torch.no_grad():
    output = model(src_in, tgt_in, Language.src, Language.tgt, teacher_forcing_ratio=0)
torch.set_printoptions(profile="full")  # print tensors in full, without truncation
# Most likely vocabulary index at every output position.
predicted_tokens = output.argmax(-1)
ic(output.shape)
ic(predicted_tokens.shape)
ic(predicted_tokens)
# Map the predicted indices back to target-language tokens.
ic([tgt_vocab.get_itos()[t] for t in predicted_tokens])

In [None]:
# Same translation as before, but the target argument is a stand-in made only of
# "<bos>" indices (one per source position) instead of the reference sentence.
# Evaluation mode disables the Dropout(p=0.5) layers of the encoder and decoder,
# and no_grad() skips building an autograd graph that is never used.
model.eval()
with torch.no_grad():
    output = model(
        src_in,
        torch.tensor(tuple(tgt_vocab["<bos>"] for _ in src_in[:, 0])).unsqueeze(-1),
        Language.src,
        Language.tgt,
        teacher_forcing_ratio=0,
    )
torch.set_printoptions(profile="full")  # print tensors in full, without truncation
# Most likely vocabulary index at every output position.
predicted_tokens = output.argmax(-1)
ic(output.shape)
ic(predicted_tokens.shape)
ic(predicted_tokens)
# Map the predicted indices back to target-language tokens.
ic([tgt_vocab.get_itos()[t] for t in predicted_tokens])

In [None]:
# Take the first validation batch and keep only its first sentence pair,
# restoring the batch dimension so the shapes stay (sequence_length, 1).
for src_in, tgt_in in valid_dl:
    ic(src_in.shape)
    ic(tgt_in.shape)
    src_in = src_in[:, 0].unsqueeze(-1)  # first item in the batch only
    tgt_in = tgt_in[:, 0].unsqueeze(-1)  # first item in the batch only
    ic(src_in.shape)
    ic(tgt_in.shape)
    ic(src_in[:, 0])
    ic(tgt_in[:, 0])
    break
# Evaluation mode disables the Dropout(p=0.5) layers of the encoder and decoder, which
# would otherwise perturb the prediction if the module is still in training mode;
# no_grad() skips building an autograd graph that is never used.
model.eval()
with torch.no_grad():
    output = model(src_in, tgt_in, Language.src, Language.tgt, teacher_forcing_ratio=0)
torch.set_printoptions(profile="full")  # print tensors in full, without truncation
# Most likely vocabulary index at every output position.
predicted_tokens = output.argmax(-1)
ic(output.shape)
ic(predicted_tokens.shape)
ic(predicted_tokens[:, 0])  # get first batch here
ic(
    [src_vocab.get_itos()[t] for t in src_in[:12]]
)  # limit tokens to first 12 for better presentation
ic(
    [tgt_vocab.get_itos()[t] for t in predicted_tokens[:12]]
)  # limit tokens to first 12 for better presentation