In [14]:
# Use below line for demo in external colabs
!pip install -q torchdata torchtext spacy==3.2 portalocker altair GPUtil
!python -m spacy download de_core_news_sm
!python -m spacy download en_core_web_sm
!pip install -q git+https://github.com/nikitakapitan/transflate.git

In [1]:
import warnings
warnings.filterwarnings('ignore')

import torch
import transflate

from transflate.data.token import load_tokenizers
from transflate.data.vocab import load_vocab

from transflate.data.dataloader import create_dataloaders

from transflate.main import make_model
from transflate.output import check_outputs

from torch.utils.data import DataLoader


%load_ext autoreload
%autoreload 2

In [2]:
spacy_de, spacy_en = load_tokenizers()
vocab_src, vocab_tgt = load_vocab(spacy_de=spacy_de, spacy_en=spacy_en)

Finished.
Vocabulary sizes:
len: SRC=8315 TGT=6384


In [3]:
data_setup = {
    'max_padding' : 128,
}

architecture = {
        'src_vocab_len' : len(vocab_src),
        'tgt_vocab_len' : len(vocab_tgt),
        'N' : 6, # loop
        'd_model' : 512, # emb
        'd_ff' : 2048,
        'h' : 8,
        'p_dropout' : 0.1
    }

model = make_model(
    src_vocab_len=architecture['src_vocab_len'],
    tgt_vocab_len=architecture['tgt_vocab_len'],
    N=architecture['N'],
    d_model=architecture['d_model'],
    d_ff=architecture['d_ff'],
    h=architecture['h'],
    dropout=architecture['p_dropout'],
    )

model.load_state_dict(
    torch.load("../../multi30k_model_final.pt", map_location=torch.device("cpu"))
)



<All keys matched successfully>

In [4]:
# input text
text = "Vier Jungen spielen mit einem großen Hund im Hof"
print('Step.0 Raw text: ', text)
text = [(text, "")]

tokenize_de = lambda text : [tok.text for tok in spacy_de.tokenizer(text)]
tokenize_en = lambda text : [tok.text for tok in spacy_en.tokenizer(text)]

collate_fn = lambda x:  transflate.data.Batch.collate_batch(
            batch=x,
            src_pipeline=tokenize_de,
            tgt_pipeline=tokenize_en,
            src_vocab=vocab_src,
            tgt_vocab=vocab_tgt,
            device=torch.device("cpu"),
            max_padding=data_setup['max_padding'],
            pad_id=vocab_src.get_stoi()["<blank>"],
        )

text_dataloader = DataLoader(text, collate_fn = collate_fn)
print('Step.1 Processed text: \n', list(text_dataloader)[0][0]) 

Step.0 Raw text:  Vier Jungen spielen mit einem großen Hund im Hof
Step.1 Processed text: 
 tensor([[  0, 128,  92,  58,  10,   6,  80,  33,  22, 433,   1,   2,   2,   2,
           2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
           2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
           2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
           2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
           2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
           2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
           2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
           2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
           2,   2]])


In [59]:
# check outputs
pad_idx = 2
eos_string="</s>"

b = next(iter(text_dataloader))
rb = transflate.data.Batch.Batch(src=b[0], tgt=b[1], pad=pad_idx)

model_out = transflate.output.greedy_decode(model, rb.src, rb.src_mask, max_len=72, start_symbol=0)[0]
model_txt = (" ".join([vocab_tgt.get_itos()[x] for x in model_out if x!= pad_idx]).split(eos_string, 1)[0])

print('Model output: ', model_txt) # '<s> Four boys are playing with a large dog in the yard . </s>'


Model output:  <s> Four boys playing with a large brown dog in the yard . 


### Break-Down : Input Preprocessing

In [6]:
prompt = "Vier Jungen spielen mit einem großen Hund im Hof"
print(f"Step.0 {prompt=}")
print('*-*-* Start DATA part *-*-*')
prompt_tokens = [token.text for token in spacy_de.tokenizer(prompt)]
print(f"Step.1 {prompt_tokens=}")
prompt_stoi = vocab_src(prompt_tokens)
print(f"Step.2 {prompt_stoi=}")

bs_id = torch.tensor([0])  #0 index for <s>  
eos_id = torch.tensor([1]) #1 index for </s>

prompt = torch.cat([bs_id, torch.tensor(prompt_stoi), eos_id])
print(f"Step.3 {prompt=}")

prompt = [torch.nn.functional.pad(
    input=prompt, 
    pad=(0, data_setup['max_padding'] - len(prompt) ),
    value=vocab_src.get_stoi()["<blank>"])]
print(f"Step.4 {prompt[0][:42]=}")


prompt = torch.stack(prompt) # add batch dimension
print(prompt.shape)
dl = DataLoader(prompt)
print('*-*-* End DATA part *-*-*')

Step.0 text='Vier Jungen spielen mit einem großen Hund im Hof'
*-*-* Start DATA part *-*-*
Step.1 text=['Vier', 'Jungen', 'spielen', 'mit', 'einem', 'großen', 'Hund', 'im', 'Hof']
Step.2 text=[128, 92, 58, 10, 6, 80, 33, 22, 433]
Step.3 text=tensor([  0, 128,  92,  58,  10,   6,  80,  33,  22, 433,   1])
Step.4 text[0][:42]=tensor([  0, 128,  92,  58,  10,   6,  80,  33,  22, 433,   1,   2,   2,   2,
          2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
          2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2])
torch.Size([1, 128])
*-*-* End DATA part *-*-*


### Break-Down : Encoder

In [14]:
src = next(iter(dl))
src_mask = torch.ones_like(text).unsqueeze(0)
print(f"Step.0 Input Tensor {src.shape=}")

src_emb = torch.nn.Embedding(architecture['src_vocab_len'], architecture['d_model'])
src = src_emb(src)
print(f"Step.1 Embedding {src.shape=}")

src_pos_enc = transflate.PositionalEncoding.PositionalEncoding(d_model=architecture['d_model'], dropout=0.1)
src = src_pos_enc(src)
print(f"Step.2 Positional Encoder {src.shape=}")

residual_src1 = src.clone()
print(f"Step.3 Residual before src-self-attn {residual_src1.shape=}")

########## Source Multi-Headed SELF Attention #######

# Query-Key-Value
n_heads = 8
d_head = architecture['d_model'] // n_heads

q_fc = torch.nn.Linear(architecture['d_model'], architecture['d_model'])
k_fc = torch.nn.Linear(architecture['d_model'], architecture['d_model'])
v_fc = torch.nn.Linear(architecture['d_model'], architecture['d_model'])
final_fc = torch.nn.Linear(architecture['d_model'], architecture['d_model'])
attn_dropuot = torch.nn.Dropout(p=0.1)

attn_from = src
attn_to = src # src self-attn
value = src
mask = src_mask

query = q_fc(attn_from)
key = k_fc(attn_to)
value = v_fc(value)
print(f"Step.4.1 Query-Key-Value {query.shape=} {key.shape=} {value.shape=}")

n_batches = src.size(0)
n_tokens = src.size(1)

# split to n_heads 
query = query.view(n_batches, n_tokens, n_heads, d_head) .transpose(1, 2)
key = key.view(n_batches, n_tokens, n_heads, d_head).transpose(1, 2)
value = value.view(n_batches, n_tokens, n_heads, d_head).transpose(1, 2)
print(f"Step.4.2 Split to {n_heads} heads\n \t{query.shape=}\n \t{key.shape=}\n \t{value.shape=}")

# Attention
key_transpose = key.transpose(-2, -1)
scores = torch.matmul(query, key_transpose) / (d_head**0.5)
scores = scores.masked_fill(mask, -1e9)
p_attn = scores.softmax(dim=-1)
# p_attn = attn_dropuot(p_attn)

headed_context = torch.matmul(p_attn, value)

context = headed_context.transpose(1,2).contiguous().view(n_batches, n_tokens, n_heads * d_head)

src = final_fc(context)

print(f"Step.4.3 Attention \n \t{key_transpose.shape=}\n \t{scores.shape=}\n \t{p_attn.shape=}\n \
    \t{headed_context.shape=}\n \t{context.shape=}\n \t{src.shape=}")


norm1 = transflate.LayerNorm.LayerNorm(architecture['d_model'])
src = norm1(src)
print(f"Step.5 Layer Norm {src.shape=}")

src = residual_src1 + src    # end residual 1
residual_src2 = src.clone()  # start residual 2
print(f"Step.6 Residual before FeedFwd {residual_src2.shape=}")


# Feed Forward
w_1 = torch.nn.Linear(architecture['d_model'], architecture['d_ff'])
w_2 = torch.nn.Linear(architecture['d_ff'], architecture['d_model'])
fc_dropuot = torch.nn.Dropout(p=0.1)

fc1 = w_1(src).relu() 
# src = fc_dropuot(src)
src = w_2(fc1)
print(f"Step.7 Feed Forward {src.shape=}")

norm2 = transflate.LayerNorm.LayerNorm(architecture['d_model'])
src = norm2(src)
print(f"Step.8 Layer Norm {src.shape=}")

src = residual_src2 + src    # end residual 1

for _ in range(6): # N layers
    src = src

memory = src.clone()
print(f"Step.9 Memory output: {memory.shape=}")



Step.0 Input Tensor src.shape=torch.Size([1, 128])
Step.1 Embedding src.shape=torch.Size([1, 128, 512])
Step.2 Positional Encoder src.shape=torch.Size([1, 128, 512])
Step.3 Layer Norm residual_src1.shape=torch.Size([1, 128, 512])
Step.4.1 Query-Key-Value query.shape=torch.Size([1, 128, 512]) key.shape=torch.Size([1, 128, 512]) value.shape=torch.Size([1, 128, 512])
Step.4.2 Split to 8 heads
 	query.shape=torch.Size([1, 8, 128, 64])
 	key.shape=torch.Size([1, 8, 128, 64])
 	value.shape=torch.Size([1, 8, 128, 64])
Step.4.3 Attention 
 	key_transpose.shape=torch.Size([1, 8, 64, 128])
 	scores.shape=torch.Size([1, 8, 128, 128])
 	p_attn.shape=torch.Size([1, 8, 128, 128])
     	headed_context.shape=torch.Size([1, 8, 128, 64])
 	context.shape=torch.Size([1, 128, 512])
 	src.shape=torch.Size([1, 128, 512])
Step.5 Layer Norm src.shape=torch.Size([1, 128, 512])
Step.6 Layer Norm residual_src2.shape=torch.Size([1, 128, 512])
Step.7 Layer Norm src.shape=torch.Size([1, 128, 512])
Step.8 Layer Norm 

### Break-Down : Decoder

In [62]:
from transflate.helper import following_mask

tgt_seq = torch.zeros(1, 1).type(torch.LongTensor) # if not type - nn.Embedding error

tgt = tgt_seq.clone()
tgt_mask = following_mask(tgt.size(1)).type_as(src.data)
print(f"Step.10 Initialize target seq: {tgt.shape=}")

tgt_emb = torch.nn.Embedding(architecture['tgt_vocab_len'], architecture['d_model'])
tgt = tgt_emb(tgt)
print(f"Step.11 Tatget Embedding: {tgt.shape=}")

tgt_pos_enc = transflate.PositionalEncoding.PositionalEncoding(d_model=architecture['d_model'], dropout=0.1)
tgt = tgt_pos_enc(tgt)
print(f"Step.12 Positional Encoder {tgt.shape=}")

residual_tgt1 = tgt.clone()
print(f"Step.13 Residual before tgt-self-attn {residual_tgt1.shape=}")

######## Target Multi-Headed SELF Attention ##########

# Query-Key-Value
n_heads = 8
d_head = architecture['d_model'] // n_heads

q_fc = torch.nn.Linear(architecture['d_model'], architecture['d_model'])
k_fc = torch.nn.Linear(architecture['d_model'], architecture['d_model'])
v_fc = torch.nn.Linear(architecture['d_model'], architecture['d_model'])
final_fc = torch.nn.Linear(architecture['d_model'], architecture['d_model'])
attn_dropuot = torch.nn.Dropout(p=0.1)

attn_from = tgt
attn_to = tgt # tgt self-attn
value = tgt
mask = tgt_mask

query = q_fc(attn_from)
key = k_fc(attn_to)
value = v_fc(value)
print(f"Step.14.1 Query-Key-Value {query.shape=} {key.shape=} {value.shape=}")

n_batches = tgt.size(0)
n_tokens = tgt.size(1)

# split to n_heads 
query = query.view(n_batches, n_tokens, n_heads, d_head) .transpose(1, 2)
key = key.view(n_batches, n_tokens, n_heads, d_head).transpose(1, 2)
value = value.view(n_batches, n_tokens, n_heads, d_head).transpose(1, 2)
print(f"Step.14.2 Split to {n_heads} heads\n \t{query.shape=}\n \t{key.shape=}\n \t{value.shape=}")

# Attention
key_transpose = key.transpose(-2, -1)
scores = torch.matmul(query, key_transpose) / (d_head**0.5)
scores = scores.masked_fill(mask, -1e9)
p_attn = scores.softmax(dim=-1)
# p_attn = attn_dropuot(p_attn)

headed_context = torch.matmul(p_attn, value)

context = headed_context.transpose(1,2).contiguous().view(n_batches, n_tokens, n_heads * d_head)

tgt = final_fc(context)

print(f"Step.14.3 Attention \n \t{key_transpose.shape=}\n \t{scores.shape=}\n \t{p_attn.shape=}\n \
    \t{headed_context.shape=}\n \t{context.shape=}\n \t{tgt.shape=}")

norm1 = transflate.LayerNorm.LayerNorm(architecture['d_model'])
tgt = norm1(tgt)
print(f"Step.15 Layer Norm {tgt.shape=}")

tgt = residual_tgt1 + tgt    # end residual 1
residual_tgt2 = tgt.clone()  # start residual 2
print(f"Step.16 Residual before Source-attn {residual_tgt2.shape=}")

######## Target Multi-Headed SOURCE Attention ##########

# Query-Key-Value
n_heads = 8
d_head = architecture['d_model'] // n_heads

q_fc = torch.nn.Linear(architecture['d_model'], architecture['d_model'])
k_fc = torch.nn.Linear(architecture['d_model'], architecture['d_model'])
v_fc = torch.nn.Linear(architecture['d_model'], architecture['d_model'])
final_fc = torch.nn.Linear(architecture['d_model'], architecture['d_model'])
attn_dropuot = torch.nn.Dropout(p=0.1)

attn_from = tgt
attn_to = memory # tgt self-attn
value = memory   # use source information to decode
mask = tgt_mask

query = q_fc(attn_from)
key = k_fc(attn_to)
value = v_fc(value)
print(f"Step.17.1 Query-Key-Value {query.shape=} {key.shape=} {value.shape=}")

n_batches = tgt.size(0)
n_tgt_tokens = tgt.size(1)
n_src_tokens = memory.size(1)

# split to n_heads 
query = query.view(n_batches, n_tgt_tokens, n_heads, d_head) .transpose(1, 2)
key = key.view(n_batches, n_src_tokens, n_heads, d_head).transpose(1, 2)
value = value.view(n_batches, n_src_tokens, n_heads, d_head).transpose(1, 2)
print(f"Step.17.2 Split to {n_heads} heads\n \t{query.shape=}\n \t{key.shape=}\n \t{value.shape=}")

# Attention
key_transpose = key.transpose(-2, -1)
scores = torch.matmul(query, key_transpose) / (d_head**0.5)
scores = scores.masked_fill(mask, -1e9)
p_attn = scores.softmax(dim=-1)
# p_attn = attn_dropuot(p_attn)

headed_context = torch.matmul(p_attn, value)

context = headed_context.transpose(1,2).contiguous().view(n_batches, n_tokens, n_heads * d_head)

tgt = final_fc(context)

print(f"Step.17.3 Attention \n \t{key_transpose.shape=}\n \t{scores.shape=}\n \t{p_attn.shape=}\n \
    \t{headed_context.shape=}\n \t{context.shape=}\n \t{tgt.shape=}")

norm2 = transflate.LayerNorm.LayerNorm(architecture['d_model'])
tgt = norm2(tgt)
print(f"Step.18 Layer Norm {tgt.shape=}")

tgt = residual_tgt2 + tgt    # end residual 2
residual_tgt3 = tgt.clone()  # start residual 3
print(f"Step.19 Residual before FeedFwd {residual_tgt3.shape=}")

# Feed Forward
w_1 = torch.nn.Linear(architecture['d_model'], architecture['d_ff'])
w_2 = torch.nn.Linear(architecture['d_ff'], architecture['d_model'])
fc_dropuot = torch.nn.Dropout(p=0.1)

fc1 = w_1(tgt).relu() 
# src = fc_dropuot(src)
tgt = w_2(fc1)
print(f"Step.20 Feed Forward {tgt.shape=}")

norm3 = transflate.LayerNorm.LayerNorm(architecture['d_model'])
tgt = norm3(tgt)
print(f"Step.21 Layer Norm {tgt.shape=}")

tgt = residual_tgt3 + tgt    # end residual 3

print(f"Step.22 First Decoder Output: {tgt.shape=}")

Step.10 Initialize target seq: tgt.shape=torch.Size([1, 1])
Step.11 Tatget Embedding: tgt.shape=torch.Size([1, 1, 512])
Step.12 Positional Encoder tgt.shape=torch.Size([1, 1, 512])
Step.13 Residual before tgt-self-attn residual_tgt1.shape=torch.Size([1, 1, 512])
Step.14.1 Query-Key-Value query.shape=torch.Size([1, 1, 512]) key.shape=torch.Size([1, 1, 512]) value.shape=torch.Size([1, 1, 512])
Step.14.2 Split to 8 heads
 	query.shape=torch.Size([1, 8, 1, 64])
 	key.shape=torch.Size([1, 8, 1, 64])
 	value.shape=torch.Size([1, 8, 1, 64])
Step.14.3 Attention 
 	key_transpose.shape=torch.Size([1, 8, 64, 1])
 	scores.shape=torch.Size([1, 8, 1, 1])
 	p_attn.shape=torch.Size([1, 8, 1, 1])
     	headed_context.shape=torch.Size([1, 8, 1, 64])
 	context.shape=torch.Size([1, 1, 512])
 	tgt.shape=torch.Size([1, 1, 512])
Step.15 Layer Norm tgt.shape=torch.Size([1, 1, 512])
Step.16 Residual before Source-attn residual_tgt2.shape=torch.Size([1, 1, 512])
Step.17.1 Query-Key-Value query.shape=torch.Size(

### Break-Down: Text Generation

In [63]:
out = tgt

# Generator
projection = torch.nn.Linear(architecture['d_model'], architecture['tgt_vocab_len'])
distrib = projection(out[:, -1])
prob = torch.nn.functional.log_softmax(distrib, dim=-1)

next_word = torch.argmax(prob, dim=1).unsqueeze(0)
tgt_seq=torch.cat([tgt_seq, next_word], dim=1)

print(f"GENERATION.\n \t Input tensor shape: {out.shape=}\n \t Projection: {distrib.shape=}\n \
\t Probabilities: {prob.shape=}\n \t Next word: {next_word.shape=}\n ")

MAX_LEN = 10
print(f'Generation loop:')
for _ in range(2, MAX_LEN): # Decoder Loop
    tgt_mask = following_mask(tgt.size(1)).type_as(tgt.data)
    tgt = tgt

    projection = torch.nn.Linear(architecture['d_model'], architecture['tgt_vocab_len'])
    prob = torch.nn.functional.log_softmax(projection(tgt[:, -1]), dim=-1)
    next_word = torch.argmax(prob, dim=1).unsqueeze(0)

    tgt_seq=torch.cat([tgt_seq, next_word], dim=1)
    print(f'\t \t Iter {_} : {tgt_seq.shape=}')

tgt_seq = tgt_seq[0] # un-batch result
model_txt = (" ".join([vocab_tgt.get_itos()[x] for x in tgt_seq if x!= pad_idx]).split(eos_string, 1)[0])

print(f'\nFINAL TGT OUTPUT: {tgt_seq}')
print(f'GENERATED TEXT: {model_txt}')


GENERATION.
 	 Input tensor shape: out.shape=torch.Size([1, 1, 512])
 	 Projection: distrib.shape=torch.Size([1, 6384])
 	 Probabilities: prob.shape=torch.Size([1, 6384])
 	 Next word: next_word.shape=torch.Size([1, 1])
 
Generation loop:
	 	 Iter 2 : tgt_seq.shape=torch.Size([1, 3])
	 	 Iter 3 : tgt_seq.shape=torch.Size([1, 4])
	 	 Iter 4 : tgt_seq.shape=torch.Size([1, 5])
	 	 Iter 5 : tgt_seq.shape=torch.Size([1, 6])
	 	 Iter 6 : tgt_seq.shape=torch.Size([1, 7])
	 	 Iter 7 : tgt_seq.shape=torch.Size([1, 8])
	 	 Iter 8 : tgt_seq.shape=torch.Size([1, 9])
	 	 Iter 9 : tgt_seq.shape=torch.Size([1, 10])

FINAL TGT OUTPUT: tensor([   0, 1968, 2321, 1992, 4355, 2078,  104, 5347, 2222,  914])
GENERATED TEXT: <s> backdrop power design expensive Band 's coated we formal
