In [14]:
# Use below line for demo in external colabs
!pip install -q torchdata==0.3.0 torchtext==0.12 spacy==3.2 GPUtil
!python -m spacy download de_core_news_sm
!python -m spacy download en_core_web_sm
!pip install -q git+https://github.com/nikitakapitan/transformers.git

In [1]:
import warnings
# Blanket filter: every warning category is silenced for the rest of the session.
warnings.filterwarnings('ignore')

import torch

# Project helpers (package `transformers`, presumably the GitHub install from the setup cell).
from transformers.data.token import load_tokenizers
from transformers.data.vocab import load_vocab

from transformers.data.dataloader import create_dataloaders
from transformers.main import make_model
from transformers.output import check_outputs

# Colab only: mount Google Drive at /content/drive; the checkpoint is copied from there below.
from google.colab import drive
drive.mount('/content/drive')

# Reload edited project modules automatically so library changes are picked up live.
%load_ext autoreload
%autoreload 2

In [None]:
!cp -r drive/MyDrive/multi30k_model_final.pt multi30k_model_final.pt

In [2]:
# Load the two tokenizers (presumably the spaCy German/English models downloaded in the
# setup cell), then obtain the source/target vocabularies; load_vocab receives both tokenizers.
spacy_de, spacy_en = load_tokenizers()
vocab_src, vocab_tgt = load_vocab(spacy_de=spacy_de, spacy_en=spacy_en)

Finished.
Vocabulary sizes:
len: SRC=8315 TGT=6384


In [3]:
# aka run_model_example
# End-to-end demo: build the validation dataloader, rebuild the network, load the
# trained weights from the local checkpoint and run check_outputs on two examples.

# Data settings; `max_padding` is forwarded to create_dataloaders below.
data_setup = {
    'max_padding' : 128,
}

# Model hyper-parameters, forwarded one by one to make_model below.
architecture = {
        'src_vocab_len' : len(vocab_src),
        'tgt_vocab_len' : len(vocab_tgt),
        'N' : 6, # loop (presumably the number of stacked layers — confirm in make_model)
        'd_model' : 512, # emb
        'd_ff' : 2048,
        'h' : 8,
        'p_dropout' : 0.1
    }

# Only the second dataloader (validation) is used; the first return value is discarded.
_, valid_dataloader = create_dataloaders(
        torch.device("cpu"),
        vocab_src,
        vocab_tgt,
        spacy_de,
        spacy_en,
        batch_size=1,
        # Previously `data_setup` was defined but never used; pass it explicitly so this
        # cell and the step-by-step breakdown build the dataloader with the same padding.
        max_padding=data_setup['max_padding'],
        is_distributed=False,
    )

model = make_model(
    src_vocab_len=architecture['src_vocab_len'],
    tgt_vocab_len=architecture['tgt_vocab_len'],
    N=architecture['N'],
    d_model=architecture['d_model'],
    d_ff=architecture['d_ff'],
    h=architecture['h'],
    dropout=architecture['p_dropout'],
    )

# map_location keeps the load CPU-only regardless of where the checkpoint was saved.
model.load_state_dict(
    torch.load("multi30k_model_final.pt", map_location=torch.device("cpu"))
)
# Inference only: make_model returns a module in training mode, which would keep the
# Dropout layers active and make the outputs non-deterministic.
model.eval()

# No gradients are needed for decoding; skipping autograd bookkeeping saves memory.
with torch.no_grad():
    example_data = check_outputs(
            valid_dataloader, model, vocab_src, vocab_tgt, n_examples=2
        )
print('Done)')

In [5]:
# Display the module tree (repr) of the loaded model.
model

EncoderDecoder(
  (encoder): Encoder(
    (layers): ModuleList(
      (0): EncoderLayer(
        (self_attn): MultiHeadedAttention(
          (q_fc): Linear(in_features=512, out_features=512, bias=True)
          (k_fc): Linear(in_features=512, out_features=512, bias=True)
          (v_fc): Linear(in_features=512, out_features=512, bias=True)
          (final_fc): Linear(in_features=512, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (feed_fwd): PositionWiseFeedForward(
          (w_1): Linear(in_features=512, out_features=2048, bias=True)
          (w_2): Linear(in_features=2048, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (resconnect): ModuleList(
          (0): ResidualConnection(
            (norm): LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): ResidualConnection(
            (norm): LayerNorm()
            (dropout): Dropout(p=0.1, 

# Break-Down : run_model_example
## Step 1/3 : create validation dataloader

In [21]:
# Lookup used by the cells below when printing tensor shapes: maps a dimension size to
# the name of the setting it comes from. The sizes are read from `data_setup` and
# `architecture` (defined in the run_model_example cell) instead of being repeated as
# literals, so the labels stay correct if those settings change.
mapa = {
    1 : 1,  # size-1 dimensions are printed unchanged
    data_setup['max_padding'] : 'max_padding',
    architecture['d_model'] : 'd_model',
}

In [12]:
from transformers.data.dataloader import create_dataloaders

# Collect the dataloader settings first, then unpack them into the call.
# The padding length comes from the shared `data_setup` config.
dataloader_kwargs = dict(
    device=torch.device("cpu"),
    vocab_src=vocab_src,
    vocab_tgt=vocab_tgt,
    spacy_de=spacy_de,
    spacy_en=spacy_en,
    batch_size=1,
    max_padding=data_setup['max_padding'],
    is_distributed=False,
)

# create_dataloaders returns a pair; only the second (validation) loader is kept.
_, valid_dataloader = create_dataloaders(**dataloader_kwargs)

## Step 2/3 : create the model and load its saved state

In [13]:
# outputs.run_model_example step 2/3 : create and load model state

from transformers.main import make_model


# Build the network from the shared `architecture` settings (defined in the
# run_model_example cell) rather than a hard-coded N=6 plus implicit defaults, so this
# breakdown constructs the same model as the end-to-end cell.
model = make_model(
    src_vocab_len=architecture['src_vocab_len'],
    tgt_vocab_len=architecture['tgt_vocab_len'],
    N=architecture['N'],
    d_model=architecture['d_model'],
    d_ff=architecture['d_ff'],
    h=architecture['h'],
    dropout=architecture['p_dropout'],
)
# Inference only: switch the Dropout layers off. Called before load_state_dict so the
# cell still ends with (and displays) the key-matching report.
model.eval()
model.load_state_dict(
        torch.load("multi30k_model_final.pt", map_location=torch.device("cpu"))
    )

<All keys matched successfully>

## Step 3/3 : break-down of check_outputs

In [22]:
# outputs.run_model_example step 3/3 : check_outputs
# Walks through what check_outputs does for a single example (idx = 0).

from transformers.data.Batch import Batch

n_examples=5
pad_idx = 2  # vocabulary index treated as padding when building the batch and filtering tokens
eos_string = "</s>"

results = [()] * n_examples

idx = 0 # example 0 in range(n_examples)
b = next(iter(valid_dataloader))
# Use pad_idx (not a repeated literal) so the batch masks and the token filter below agree.
rb = Batch(src=b[0], tgt=b[1], pad=pad_idx)

# Fetch each index-to-token table once; calling get_itos() inside the comprehension
# would rebuild the whole table for every token.
src_itos = vocab_src.get_itos()
tgt_itos = vocab_tgt.get_itos()

# First (and, with batch_size=1, only) sentence of the batch, padding positions dropped.
src_tokens = [src_itos[x] for x in rb.src[0] if x!=pad_idx]
tgt_tokens = [tgt_itos[x] for x in rb.tgt[0] if x!=pad_idx]

print(f"Source text (Input) {src_tokens}")
print(f"Target Text (Ground Truth) {tgt_tokens}")

Source text (Input) ['<s>', 'Ein', 'Mann', 'im', 'mittleren', 'Alter', 'legt', 'am', 'Knie', 'eines', 'jüngeren', '<unk>', ',', 'der', 'auf', 'einem', '<unk>', 'sitzt', ',', 'einen', '<unk>', 'an', '.', '</s>']
Target Text (Ground Truth) ['<s>', 'A', 'middle', '-', 'aged', 'man', 'is', 'taping', 'up', 'the', 'knee', 'of', 'a', 'younger', 'football', 'player', 'who', 'is', 'sitting', 'on', 'a', '<unk>', 'table', '.', '</s>']


## Step 3, substep 1/3 : output.greedy_decode

In [23]:
from transformers.helper import following_mask


def named_shape(tensor):
    """Return tensor.shape as a list, with sizes found in `mapa` replaced by their labels.

    Sizes that are not keys of `mapa` are kept as plain numbers instead of raising KeyError.
    """
    return [mapa.get(dim, dim) for dim in tensor.shape]


# Inference only: disable the Dropout layers so the printed tensors are deterministic.
model.eval()

src = rb.src
print('src.shape=', named_shape(src))
src_mask = rb.src_mask
print('src_mask.shape=', named_shape(src_mask))
max_len = 72
start_symbol = 0  # presumably the vocabulary index of '<s>' — confirm against vocab_tgt

# No gradients are needed when decoding.
with torch.no_grad():
    memory = model.encode(src, src_mask)
print('memory.shape=', named_shape(memory))

# Decoder input starts as a single position holding the start symbol, with src's dtype.
tgt=torch.zeros(1, 1).fill_(start_symbol).type_as(src.data)

i = 0 # in range(72 - 1)
with torch.no_grad():
    out = model.decode(memory, src_mask, tgt, following_mask(tgt.size(1)).type_as(src.data))
print('out.shape=', named_shape(out))

src.shape= [1, 'max_padding']
src_mask.shape= [1, 1, 'max_padding']
memory.shape= [1, 'max_padding', 'd_model']
out.shape= [1, 1, 'd_model']


In [24]:
# Display the module tree (repr) of the model rebuilt in step 2/3.
model

EncoderDecoder(
  (encoder): Encoder(
    (layers): ModuleList(
      (0): EncoderLayer(
        (self_attn): MultiHeadedAttention(
          (q_fc): Linear(in_features=512, out_features=512, bias=True)
          (k_fc): Linear(in_features=512, out_features=512, bias=True)
          (v_fc): Linear(in_features=512, out_features=512, bias=True)
          (final_fc): Linear(in_features=512, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (feed_fwd): PositionWiseFeedForward(
          (w_1): Linear(in_features=512, out_features=2048, bias=True)
          (w_2): Linear(in_features=2048, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (resconnect): ModuleList(
          (0): ResidualConnection(
            (norm): LayerNorm()
          )
          (1): ResidualConnection(
            (norm): LayerNorm()
          )
        )
        (norm): LayerNorm()
        (droput): Dropout(p=0.1, inplace=