In [1]:
%load_ext autoreload
%autoreload 2

import torchtext
import torch
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab
from torchtext.utils import download_from_url, extract_archive
from torchtext.vocab import build_vocab_from_iterator
import io
from pytorch_models_imp.my_transformer import SelfAttention as mySelfAttention
from pytorch_models_imp.my_transformer import TransformerBlock as myTransformerBlock
from pytorch_models_imp.my_transformer import Encoder as myEncoder
from pytorch_models_imp.my_transformer import DecoderBlock as myDecoderBlock
from pytorch_models_imp.my_transformer import Decoder as myDecoder
from pytorch_models_imp.my_transformer import Transformer as myTransformer
from pytorch_models_imp.transformer import SelfAttention, TransformerBlock, Encoder, DecoderBlock, Decoder, Transformer

import matplotlib.pyplot as plt

url_base = 'https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/'
train_urls = ('train.de.gz', 'train.en.gz')
val_urls = ('val.de.gz', 'val.en.gz')
test_urls = ('test_2016_flickr.de.gz', 'test_2016_flickr.en.gz')

In [2]:
def calculate_params(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [3]:
train_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in train_urls]
val_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in val_urls]
test_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in test_urls]

In [4]:
de_tokenizer = get_tokenizer('spacy', language='de_core_news_sm')
en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

In [5]:
def yield_tokens(filepath, tokenizer):
    with io.open(filepath, encoding="utf8") as f:
        for string_ in f:
            yield tokenizer(string_)
            
de_vocab = build_vocab_from_iterator(yield_tokens(train_filepaths[0], de_tokenizer), specials=['<unk>', '<pad>', '<bos>', '<eos>'])
de_vocab.set_default_index(de_vocab["<unk>"])

en_vocab = build_vocab_from_iterator(yield_tokens(train_filepaths[1], en_tokenizer), specials=['<unk>', '<pad>', '<bos>', '<eos>'])
en_vocab.set_default_index(de_vocab["<unk>"])

In [6]:
def data_process(filepaths):
    raw_de_iter = iter(io.open(filepaths[0], encoding="utf8"))
    raw_en_iter = iter(io.open(filepaths[1], encoding="utf8"))
    data = []
    for (raw_de, raw_en) in zip(raw_de_iter, raw_en_iter):
        de_tensor_ = torch.tensor([de_vocab[token] for token in de_tokenizer(raw_de)],
                                dtype=torch.long)
        en_tensor_ = torch.tensor([en_vocab[token] for token in en_tokenizer(raw_en)],
                                dtype=torch.long)
        data.append((de_tensor_, en_tensor_))
    return data

In [7]:
train_data = data_process(train_filepaths)
val_data = data_process(val_filepaths)
test_data = data_process(test_filepaths)

In [10]:
device = torch.device('cpu')

BATCH_SIZE = 128
PAD_IDX = de_vocab['<pad>']
BOS_IDX = de_vocab['<bos>']
EOS_IDX = de_vocab['<eos>']

TARGET_PAD_INDX = en_vocab["<pad>"]

In [11]:
print(f"PAD: {PAD_IDX}, BOS: {BOS_IDX}, EOS: {EOS_IDX}")

PAD: 1, BOS: 2, EOS: 3


In [12]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

def generate_batch(data_batch):
    de_batch, en_batch = [], []
    for (de_item, en_item) in data_batch:
        de_batch.append(torch.cat([torch.tensor([BOS_IDX]), de_item, torch.tensor([EOS_IDX])], dim=0))
        en_batch.append(torch.cat([torch.tensor([BOS_IDX]), en_item, torch.tensor([EOS_IDX])], dim=0))
    de_batch = pad_sequence(de_batch, padding_value=PAD_IDX)
    en_batch = pad_sequence(en_batch, padding_value=PAD_IDX)
    return de_batch, en_batch

In [13]:
train_iter = DataLoader(train_data, batch_size=BATCH_SIZE,
                        shuffle=True, collate_fn=generate_batch)
valid_iter = DataLoader(val_data, batch_size=BATCH_SIZE,
                        shuffle=True, collate_fn=generate_batch)
test_iter = DataLoader(test_data, batch_size=BATCH_SIZE,
                       shuffle=True, collate_fn=generate_batch)

In [14]:
de, en = next(iter(train_iter))

In [15]:
de_trans = de.T
en_trans = en.T

In [32]:
e = nn.Embedding(len(de_vocab), 256)
de_trans_emb = e(de_trans)
en_trans_emb = e(en_trans)

In [33]:
SRC_VOCAB_SIZE = len(de_vocab)
TRG_VOCAB_SIZE = len(en_vocab)

SRC_PAD_INDX = de_vocab['<pad>']
TRG_PAD_INDX = en_vocab['<pad>']

NUM_LAYERS = 3
HEADS = 8
EMBED_SIZE = 256
FORWARD_EXPANSION = 3
DROPOUT = 0.1
DEVICE = "cpu"
MAX_LENGTH = 100

### Self attention

In [34]:
my_self_attention = mySelfAttention(EMBED_SIZE, heads=HEADS)
self_attention = SelfAttention(EMBED_SIZE, heads=HEADS)

In [35]:
self_attention_out = self_attention(de_trans_emb, de_trans_emb, de_trans_emb, None)
my_self_attention_out = my_self_attention(de_trans_emb, de_trans_emb, de_trans_emb, None)

In [36]:
print("My max output: {0}. Size of the model: {1}".format(my_self_attention_out.max(), calculate_params(my_self_attention)))
print("Their max output: {0}. Size of the model: {1}".format(self_attention_out.max(), calculate_params(self_attention)))

My max output: 0.6589629650115967. Size of the model: 68960
Their max output: 0.6907479763031006. Size of the model: 68864


### Transformer block

In [37]:
my_trans_block = myTransformerBlock(EMBED_SIZE, HEADS, FORWARD_EXPANSION, DROPOUT)
their_trans_block = TransformerBlock(EMBED_SIZE, HEADS, forward_expansion=FORWARD_EXPANSION, dropout=DROPOUT)

In [38]:
my_out = my_trans_block(de_trans_emb, de_trans_emb, de_trans_emb, None)
their_out = their_trans_block(de_trans_emb, de_trans_emb, de_trans_emb, None)

In [39]:
print("My max output: {0}. Size of the model: {1}".format(my_out.max(), calculate_params(my_trans_block)))
print("Their max output: {0}. Size of the model: {1}".format(their_out.max(), calculate_params(their_trans_block)))

My max output: 5.1221418380737305. Size of the model: 464224
Their max output: 5.242747783660889. Size of the model: 464128


### Encoder

In [40]:
my_encoder = myEncoder(SRC_VOCAB_SIZE, EMBED_SIZE, NUM_LAYERS, HEADS, FORWARD_EXPANSION, DROPOUT, MAX_LENGTH)
their_encoder = Encoder(SRC_VOCAB_SIZE, EMBED_SIZE, NUM_LAYERS, HEADS, DEVICE, FORWARD_EXPANSION, DROPOUT, MAX_LENGTH)

In [41]:
my_out = my_encoder(de_trans, None)
their_out = their_encoder(de_trans, None)

In [42]:
print("My max output: {0}. Size of the model: {1}".format(my_out.max(), calculate_params(my_encoder)))
print("Their max output: {0}. Size of the model: {1}".format(their_out.max(), calculate_params(their_encoder)))

My max output: 5.011844158172607. Size of the model: 6337312
Their max output: 5.962292194366455. Size of the model: 6337024


### Decoder block

In [43]:
my_decoder_block = myDecoderBlock(EMBED_SIZE, HEADS, FORWARD_EXPANSION, DROPOUT)
their_decoder_block = DecoderBlock(EMBED_SIZE, HEADS, FORWARD_EXPANSION, DROPOUT, DEVICE)

In [44]:
my_out = my_decoder_block(de_trans_emb, de_trans_emb, de_trans_emb, None, None)
their_out = their_decoder_block(de_trans_emb, de_trans_emb, de_trans_emb, None, None)

In [45]:
print("My max output: {0}. Shape: {1}. Size of the model: {2}".format(my_out.max(), my_out.shape, calculate_params(my_decoder_block)))
print("Their max output: {0}. Shape: {1}. Size of the model: {2}".format(their_out.max(), my_out.shape, calculate_params(their_decoder_block)))

My max output: 4.791199684143066. Shape: torch.Size([128, 36, 256]). Size of the model: 533696
Their max output: 6.233399868011475. Shape: torch.Size([128, 36, 256]). Size of the model: 533504


### Decoder

In [46]:
my_decoder = myDecoder(TRG_VOCAB_SIZE, EMBED_SIZE, NUM_LAYERS, HEADS, FORWARD_EXPANSION, DROPOUT, MAX_LENGTH)
their_decoder = Decoder(TRG_VOCAB_SIZE, EMBED_SIZE, NUM_LAYERS, HEADS, FORWARD_EXPANSION, DROPOUT, DEVICE, MAX_LENGTH)

In [47]:
my_out = my_decoder(en_trans, en_trans_emb, None, None)
their_out = their_decoder(en_trans, en_trans_emb, None, None)

In [48]:
print("My max output: {0}. Shape: {1}. Size of the model: {2}".format(my_out.max(), my_out.shape, calculate_params(my_decoder)))
print("Their max output: {0}. Shape: {1}. Size of the model: {2}".format(their_out.max(), my_out.shape, calculate_params(their_decoder)))

My max output: 3.164764881134033. Shape: torch.Size([128, 36, 10838]). Size of the model: 7186582
Their max output: 3.1691977977752686. Shape: torch.Size([128, 36, 10838]). Size of the model: 7186006


### Transformer

In [49]:
my_transformer = myTransformer(SRC_VOCAB_SIZE, TRG_VOCAB_SIZE, SRC_PAD_INDX, TRG_PAD_INDX, EMBED_SIZE, NUM_LAYERS, FORWARD_EXPANSION, HEADS, DROPOUT, DEVICE, MAX_LENGTH)
treir_transformer = Transformer(SRC_VOCAB_SIZE, TRG_VOCAB_SIZE, SRC_PAD_INDX, TRG_PAD_INDX, EMBED_SIZE, NUM_LAYERS, FORWARD_EXPANSION, HEADS, DROPOUT, DEVICE, MAX_LENGTH)

In [50]:
my_out = my_transformer(de_trans, en_trans)
their_out = treir_transformer(de_trans, en_trans)

In [51]:
print("My max output: {0}. Shape: {1}. Size of the model: {2}".format(my_out.max(), my_out.shape, calculate_params(my_transformer)))
print("Their max output: {0}. Shape: {1}. Size of the model: {2}".format(their_out.max(), my_out.shape, calculate_params(treir_transformer)))

My max output: 2.9482316970825195. Shape: torch.Size([128, 36, 10838]). Size of the model: 13523894
Their max output: 3.751157760620117. Shape: torch.Size([128, 36, 10838]). Size of the model: 13523030


### Model loading

In [58]:
LEARNING_RATE = 3e-4
NUM_LAYERS = 3
HEADS = 8
EMBED_SIZE = 512
FORWARD_EXPANSION = 4
DROPOUT = 0.1
DEVICE = "cpu"
MAX_LENGTH = 100

transformer = myTransformer(SRC_VOCAB_SIZE, TRG_VOCAB_SIZE, SRC_PAD_INDX, TRG_PAD_INDX, EMBED_SIZE, NUM_LAYERS, FORWARD_EXPANSION, HEADS, DROPOUT, DEVICE, MAX_LENGTH)
optimizer = torch.optim.Adam(transformer.parameters(), LEARNING_RATE)
transformer.eval();

In [61]:
CHECKPOINT_PATH = "checkpoint.pth"
checkpoint = torch.load(CHECKPOINT_PATH, map_location='cpu')
transformer.load_state_dict(checkpoint["state_dict"])
optimizer.load_state_dict(checkpoint["optimizer"])

In [None]:
### Forward example

In [62]:
with torch.no_grad():
    out = transformer(de_trans, en_trans[:, :-1])
out = torch.softmax(out, dim=2)
out = out.argmax(dim=2)

In [63]:
example_id = 1
example_de = [de_vocab.lookup_token(idx) for idx in de_trans[example_id]]
example_en = [en_vocab.lookup_token(idx) for idx in en_trans[example_id]]
example_out_en = [en_vocab.lookup_token(idx) for idx in out[example_id]]

In [None]:
### Generation

In [65]:
bos_token = en_vocab.lookup_indices(["<bos>"])
sentence = bos_token
de_input = de_trans[example_id].unsqueeze(0)

In [67]:
for i in range(30):
    sentence_tensor = torch.LongTensor(sentence).unsqueeze(0)
    with torch.no_grad():
        out = transformer(de_input, sentence_tensor)
        
    out = out.argmax(dim=2)
    print(f"STEP {i}: {out}")
    sentence.append(out[0][-1].item())
    

STEP 0: tensor([[20]])
STEP 1: tensor([[ 20, 118]])
STEP 2: tensor([[ 20, 118, 426]])
STEP 3: tensor([[ 20, 118, 426,  48]])
STEP 4: tensor([[ 20, 118, 426,  48,   8]])
STEP 5: tensor([[ 20, 118, 426,  48,   8,   4]])
STEP 6: tensor([[ 20, 118, 426,  48,   8,   4, 326]])
STEP 7: tensor([[ 20, 118, 426,  48,   8,   4, 326,   6]])
STEP 8: tensor([[ 20, 118, 426,  48,   8,   4, 326,   6,   5]])
STEP 9: tensor([[ 20, 118, 426,  48,   8,   4, 326,   6,   5,   3]])
STEP 10: tensor([[ 20, 118, 426,  48,   8,   4, 326,   6,   5,   3,   6]])
STEP 11: tensor([[ 20, 118, 426,  48,   8,   4, 326,   6,   5,   3,   6,   5]])
STEP 12: tensor([[ 20, 118, 426,  48,   8,   4, 326,   6,   5,   3,   6,   5,   3]])
STEP 13: tensor([[ 20, 118, 426,  48,   8,   4, 326,   6,   5,   3,   6,   5,   3,   6]])
STEP 14: tensor([[ 20, 118, 426,  48,   8,   4, 326,   6,   5,   3,   6,   5,   3,   6,
           5]])
STEP 15: tensor([[ 20, 118, 426,  48,   8,   4, 326,   6,   5,   3,   6,   5,   3,   6,
           5, 

In [68]:
[en_vocab.lookup_token(idx) for idx in sentence]

['<bos>',
 'Two',
 'dogs',
 'drink',
 'water',
 'in',
 'a',
 'lake',
 '.',
 '\n',
 '<eos>',
 '.',
 '\n',
 '<eos>',
 '.',
 '\n',
 '<eos>',
 '.',
 '\n',
 '<eos>',
 '.',
 '\n',
 '<eos>',
 '.',
 '\n',
 '<eos>',
 '.',
 '\n',
 '<eos>',
 '.',
 '\n']