# Playground

In [1]:
import io
from collections import Counter
from pathlib import Path

import pytorch_lightning as pl
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.utils import download_from_url, extract_archive
from torchtext.vocab import vocab

from zeronmt.models.attention import Attention
from zeronmt.models.decoder import Decoder
from zeronmt.models.encoder import Encoder
from zeronmt.models.seq2seq import Seq2Seq

In [2]:
# Multi30k task-1 raw corpora: every split is a (German, English) pair of .gz files.
url_base = 'https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/'
train_urls = ('train.de.gz', 'train.en.gz')
val_urls = ('val.de.gz', 'val.en.gz')
test_urls = ('test_2016_flickr.de.gz', 'test_2016_flickr.en.gz')

# Download and extract each archive, keeping the first extracted path of each one.
# The (de, en) order of the URL tuples carries over to the resulting lists.
train_filepaths, val_filepaths, test_filepaths = (
    [extract_archive(download_from_url(url_base + name))[0] for name in split_urls]
    for split_urls in (train_urls, val_urls, test_urls)
)

# Both languages deliberately use the same simple tokenizer.
de_tokenizer = get_tokenizer('basic_english')
en_tokenizer = get_tokenizer('basic_english')

In [3]:
def build_vocab(filepath, tokenizer):
    """Build a vocabulary from all tokens found in a UTF-8 text file.

    Args:
        filepath: Path of the text file; it is read line by line.
        tokenizer: Callable applied to each line, returning the line's tokens.

    Returns:
        The object returned by ``vocab`` for the token counts, created with the
        special tokens ``<unk>``, ``<pad>``, ``<bos>`` and ``<eos>``.
    """
    with io.open(filepath, encoding="utf8") as corpus:
        # Counting a flattened token stream inserts keys in first-seen order,
        # exactly like updating the counter once per line.
        token_counts = Counter(token for line in corpus for token in tokenizer(line))
    return vocab(token_counts, specials=['<unk>', '<pad>', '<bos>', '<eos>'])

In [4]:
de_vocab = build_vocab(train_filepaths[0], de_tokenizer)
en_vocab = build_vocab(train_filepaths[1], en_tokenizer)

In [5]:
# Make lookups of out-of-vocabulary tokens fall back to the <unk> index.
# The vocabularies are built from the training files only, while data_process
# also encodes the validation and test files with them.
de_vocab.set_default_index(de_vocab['<unk>'])
en_vocab.set_default_index(en_vocab['<unk>'])

In [6]:
def data_process(filepaths, vocabs=None, tokenizers=None):
    """Convert a pair of parallel text files into lists of index tensors.

    The German file ``filepaths[0]`` and the English file ``filepaths[1]`` are
    read in lockstep; each line is tokenized and every token is mapped to its
    vocabulary index.

    Args:
        filepaths: Sequence whose first two items are the German and English
            file paths (UTF-8 text, one sentence per line).
        vocabs: Optional ``(de_vocab, en_vocab)`` pair of token -> index
            mappings. Defaults to the notebook-level ``de_vocab`` / ``en_vocab``.
        tokenizers: Optional ``(de_tokenizer, en_tokenizer)`` pair of callables.
            Defaults to the notebook-level ``de_tokenizer`` / ``en_tokenizer``.

    Returns:
        A list of ``(de_tensor, en_tensor)`` tuples of ``torch.long`` tensors,
        one per line pair. Reading stops at the end of the shorter file.
    """
    # Only touch the notebook globals when the caller did not supply overrides.
    src_vocab, tgt_vocab = (de_vocab, en_vocab) if vocabs is None else vocabs
    src_tokenize, tgt_tokenize = (
        (de_tokenizer, en_tokenizer) if tokenizers is None else tokenizers
    )

    data = []
    # Context managers close both files deterministically, even when
    # tokenization or lookup raises; previously the handles were never closed.
    with io.open(filepaths[0], encoding="utf8") as de_file, \
            io.open(filepaths[1], encoding="utf8") as en_file:
        for raw_de, raw_en in zip(de_file, en_file):
            de_tensor_ = torch.tensor(
                [src_vocab[token] for token in src_tokenize(raw_de)],
                dtype=torch.long,
            )
            en_tensor_ = torch.tensor(
                [tgt_vocab[token] for token in tgt_tokenize(raw_en)],
                dtype=torch.long,
            )
            data.append((de_tensor_, en_tensor_))
    return data

train_data = data_process(train_filepaths)
val_data = data_process(val_filepaths)
test_data = data_process(test_filepaths)

In [7]:
# NOTE(review): absolute path into one user's home directory, so it only
# resolves on that machine; consider deriving it from a configurable base
# directory. It is not referenced by any other cell visible in this notebook.
MAPPING_PATH = Path(
    "/home/maciej/github/bachelor-thesis/project/vecs/le0n8xvt7l/best_mapping.pth"
)

In [10]:
INPUT_DIM = len(de_vocab)
OUTPUT_DIM = len(en_vocab)

In [11]:
# Model hyperparameters, passed to Encoder / Attention / Decoder when the
# model is assembled further down.
ENC_EMB_DIM = 32
DEC_EMB_DIM = 32
ENC_HID_DIM = 64
DEC_HID_DIM = 64
ATTN_DIM = 8
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

In [13]:
BATCH_SIZE = 128

# The special-token indices are read from the German vocabulary but are applied
# to both languages (padding and <bos>/<eos> wrapping in collate_batch). That is
# only valid if both vocabularies assign the special tokens the same indices, so
# verify it here instead of silently assuming it.
assert all(
    de_vocab[token] == en_vocab[token] for token in ("<pad>", "<bos>", "<eos>")
), "special tokens must have identical indices in de_vocab and en_vocab"

PAD_IDX = de_vocab["<pad>"]
BOS_IDX = de_vocab["<bos>"]
EOS_IDX = de_vocab["<eos>"]

In [12]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [14]:
# Assemble the encoder, the attention module and the decoder (which receives
# the attention module), then wrap them in the Seq2Seq model.
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
attn = Attention(ENC_HID_DIM, DEC_HID_DIM, ATTN_DIM)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)
# NOTE(review): PAD_IDX is presumably used by Seq2Seq to ignore padding in the
# loss -- confirm in zeronmt.models.seq2seq.
model = Seq2Seq(enc, dec, PAD_IDX=PAD_IDX).to(device)

In [15]:
model

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(18757, 32)
    (rnn): GRU(32, 64, bidirectional=True)
    (fc): Linear(in_features=128, out_features=64, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=192, out_features=8, bias=True)
    )
    (embedding): Embedding(10210, 32)
    (rnn): GRU(160, 64)
    (out): Linear(in_features=224, out_features=10210, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (criterion): CrossEntropyLoss()
)

In [16]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

In [17]:
def collate_batch(data_batch, bos_idx=None, eos_idx=None, pad_idx=None):
    """Wrap every sentence in <bos>/<eos> markers and pad the batch.

    Args:
        data_batch: Iterable of ``(de_item, en_item)`` pairs of 1-D index tensors.
        bos_idx: Index prepended to every sentence; defaults to ``BOS_IDX``.
        eos_idx: Index appended to every sentence; defaults to ``EOS_IDX``.
        pad_idx: Index used for padding; defaults to ``PAD_IDX``.

    Returns:
        ``(de_batch, en_batch)``: two tensors produced by ``pad_sequence``, each
        of shape ``(longest_sequence, batch_size)``.
    """
    # The notebook-level constants are read at call time, as before, unless the
    # caller overrides them.
    bos_idx = BOS_IDX if bos_idx is None else bos_idx
    eos_idx = EOS_IDX if eos_idx is None else eos_idx
    pad_idx = PAD_IDX if pad_idx is None else pad_idx

    # Built once per batch instead of four times per sentence pair; torch.cat
    # copies its inputs, so sharing these two tensors is safe.
    bos = torch.tensor([bos_idx])
    eos = torch.tensor([eos_idx])

    def wrap(item):
        """Return ``item`` with the <bos> and <eos> indices attached."""
        return torch.cat([bos, item, eos], dim=0)

    de_batch, en_batch = [], []
    for de_item, en_item in data_batch:
        de_batch.append(wrap(de_item))
        en_batch.append(wrap(en_item))

    de_batch = pad_sequence(de_batch, padding_value=pad_idx)
    en_batch = pad_sequence(en_batch, padding_value=pad_idx)
    return de_batch, en_batch

In [18]:
# Settings shared by all three loaders; only the training loader shuffles.
_loader_kwargs = dict(batch_size=BATCH_SIZE, collate_fn=collate_batch)

train_dl = DataLoader(train_data, shuffle=True, **_loader_kwargs)
valid_dl = DataLoader(val_data, shuffle=False, **_loader_kwargs)
test_dl = DataLoader(test_data, shuffle=False, **_loader_kwargs)

In [21]:
# Rebinds the global PAD_IDX (previously taken from de_vocab) to the English
# vocabulary's <pad> index. collate_batch reads this global at call time, so
# batches collated after this cell are padded with this value.
PAD_IDX = en_vocab['<pad>']

PAD_IDX

1


In [25]:
from icecream import ic

In [27]:
# Train with gradient clipping at 1.0 for at most 10 epochs.
# NOTE(review): both loaders are wrapped in single-element lists -- confirm that
# Seq2Seq's training/validation steps expect batches in that form.
# The run recorded in this cell's output was interrupted by hand during epoch 0.
trainer = pl.Trainer(gradient_clip_val=1.0, max_epochs=10)
trainer.fit(
    model, train_dataloaders=[train_dl], val_dataloaders=[valid_dl]
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name      | Type             | Params
-----------------------------------------------
0 | encoder   | Encoder          | 646 K 
1 | decoder   | Decoder          | 2.7 M 
2 | criterion | CrossEntropyLoss | 0     
-----------------------------------------------
3.3 M     Trainable params
0         Non-trainable params
3.3 M     Total params
13.260    Total estimated model params size (MB)


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

  rank_zero_warn(


Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:01<00:01,  1.07s/it]



                                                                           

  rank_zero_warn(


Epoch 0:   3%|▎         | 6/227 [00:36<22:16,  6.05s/it, v_num=17, train_loss=9.210]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [None]:
# TODO: serialize the model via its state dict (model.state_dict()) instead of pickling the whole model object

In [102]:
# with open('model', 'wb') as f:
#     torch.save(model, f)

In [119]:
# with open('model', 'rb') as f:
#     model = torch.load(f)

In [29]:
# Encode a hand-written sentence pair as column tensors of shape (seq_len, 1).
de_tokens = de_tokenizer("<bos> ich liebe kartoffeln <eos>")
de_in = torch.tensor([de_vocab[tok] for tok in de_tokens]).unsqueeze(1)

en_tokens = en_tokenizer("<bos> i love potatoes <eos>")
en_in = torch.tensor([en_vocab[tok] for tok in en_tokens]).unsqueeze(1)  # actually unused

ic(de_in.shape)
ic(de_in)
ic(en_in.shape)
ic(en_in)

ic| de_in.shape: torch.Size([5, 1])
ic| de_in: tensor([[    2],
                   [  175],
                   [ 5735],
                   [12184],
                   [    3]])
ic| en_in.shape: torch.Size([5, 1])
ic| en_in: tensor([[   2],
                   [ 174],
                   [4826],
                   [7497],
                   [   3]])


tensor([[   2],
        [ 174],
        [4826],
        [7497],
        [   3]])

In [104]:
en_vocab['<bos>']

2

In [105]:
ic(torch.tensor(tuple(en_vocab['<bos>'] for _ in de_in[:, 0])).unsqueeze(-1).shape)
ic(torch.tensor(tuple(en_vocab['<bos>'] for _ in de_in[:, 0])).unsqueeze(-1))

ic| torch.tensor(tuple(en_vocab['<bos>'] for _ in de_in[:, 0])).unsqueeze(-1).shape: torch.Size([5, 1])
ic| torch.tensor(tuple(en_vocab['<bos>'] for _ in de_in[:, 0])).unsqueeze(-1): tensor([[2],
                                                                                       [2],
                                                                                       [2],
                                                                                       [2],
                                                                                       [2]])


tensor([[2],
        [2],
        [2],
        [2],
        [2]])

In [106]:
en_in.shape
ic(en_in)

ic| en_in: tensor([[   2],
                   [ 174],
                   [4826],
                   [7497],
                   [   3]])


tensor([[   2],
        [ 174],
        [4826],
        [7497],
        [   3]])

In [120]:
# Run the model on the toy sentence with teacher_forcing_ratio=0 and take the
# arg-max token at every position.
# The encoder and decoder both contain Dropout(p=0.5) layers; switch to eval
# mode so dropout is disabled and the predictions are deterministic.
model.eval()
with torch.no_grad():  # forward pass only, no gradients needed
    output = model(de_in, en_in, teacher_forcing_ratio=0)
torch.set_printoptions(profile="full")
predicted_tokens = output.argmax(-1)
ic(output.shape)
ic(predicted_tokens.shape)
ic(predicted_tokens)
en_itos = en_vocab.get_itos()  # fetch the index -> token list once, not per token
ic([en_itos[t] for t in predicted_tokens])

ic| output.shape: torch.Size([5, 1, 10210])
ic| predicted_tokens.shape: torch.Size([5, 1])
ic| predicted_tokens: tensor([[  0],
                              [ 21],
                              [  5],
                              [241],
                              [ 17]])
ic| [en_vocab.get_itos()[t] for t in predicted_tokens]: ['<unk>', 'a', 'young', 'boy', 'in']


['<unk>', 'a', 'young', 'boy', 'in']

In [121]:
# Same toy sentence, but the target tensor holds only <bos> tokens (one row per
# source position), to check what the model produces with teacher_forcing_ratio=0.
bos_only_target = torch.tensor(
    tuple(en_vocab['<bos>'] for _ in de_in[:, 0])
).unsqueeze(-1)
# The encoder and decoder both contain Dropout(p=0.5) layers; switch to eval
# mode so dropout is disabled and the predictions are deterministic.
model.eval()
with torch.no_grad():  # forward pass only, no gradients needed
    output = model(de_in, bos_only_target, teacher_forcing_ratio=0)
torch.set_printoptions(profile="full")
predicted_tokens = output.argmax(-1)
ic(output.shape)
ic(predicted_tokens.shape)
ic(predicted_tokens)
en_itos = en_vocab.get_itos()  # fetch the index -> token list once, not per token
ic([en_itos[t] for t in predicted_tokens])

ic| output.shape: torch.Size([5, 1, 10210])
ic| predicted_tokens.shape: torch.Size([5, 1])
ic| predicted_tokens: tensor([[ 0],
                              [ 4],
                              [ 5],
                              [17],
                              [21]])
ic| [en_vocab.get_itos()[t] for t in predicted_tokens]: ['<unk>', 'two', 'young', 'in', 'a']


['<unk>', 'two', 'young', 'in', 'a']

In [110]:
en_vocab.get_itos()[0]

'<unk>'

In [123]:
torch.set_printoptions(threshold=100)

In [129]:
# Fetch the first validation batch and keep only its first sentence pair.
de_in, en_in = next(iter(valid_dl))
ic(de_in.shape)
ic(en_in.shape)
de_in = de_in[:, 0].unsqueeze(-1) # first item in the batch only
en_in = en_in[:, 0].unsqueeze(-1) # first item in the batch only
ic(de_in.shape)
ic(en_in.shape)
ic(de_in[:, 0])
ic(en_in[:, 0])

# The encoder and decoder both contain Dropout(p=0.5) layers; switch to eval
# mode so dropout is disabled and the predictions are deterministic.
model.eval()
with torch.no_grad():  # forward pass only, no gradients needed
    output = model(de_in, en_in, teacher_forcing_ratio=0)
torch.set_printoptions(profile="full")
predicted_tokens = output.argmax(-1)
ic(output.shape)
ic(predicted_tokens.shape)
ic(predicted_tokens[:, 0]) # get first batch here
# Fetch each index -> token list once instead of once per token.
de_itos = de_vocab.get_itos()
en_itos = en_vocab.get_itos()
ic([de_itos[t] for t in de_in[:12]]) # limit tokens to first 12 for better presentation
ic([en_itos[t] for t in predicted_tokens[:12]]) # limit tokens to first 12 for better presentation


ic| de_in.shape: torch.Size([35, 128])
ic| en_in.shape: torch.Size([30, 128])
ic| de_in.shape: torch.Size([35, 1])
ic| en_in.shape: torch.Size([30, 1])
ic| de_in[:, 0]: tensor([    2,    48,   127,    86,   427,  2249, 14246,    34,    56,  1115,
                             3,     1,     1,     1,     1,     1,     1,     1,     1,     1,
                             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
                             1,     1,     1,     1,     1])
ic| en_in[:, 0]: tensor([   2,   21,  251,   74,   16,    9, 1100, 1324, 1612,   21,  698,    3,
                            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
                            1,    1,    1,    1,    1,    1])
ic| output.shape: torch.Size([30, 1, 10210])
ic| predicted_tokens.shape: torch.Size([30, 1])
ic| predicted_tokens[:, 0]: tensor([  0,  21, 251,  74, 120,   9,  17,  21,  21,  14,  14,   3,   3,  14,
                                      3,  14,  

['<unk>',
 'a',
 'group',
 'of',
 'people',
 'are',
 'in',
 'a',
 'a',
 '.',
 '.',
 '<eos>']