In [1]:
from itertools import islice

import torch
from tqdm import tqdm

from data_utils import generate_mask_src, generate_mask_trg
from model import Transformer
from train import create_dataloader

In [2]:
# Hyperparameters shared by the data pipeline, the model, the loss and the decoder below.
EPOCHS = 5        # number of full passes over `dataloader` in the training loop
PAD_TOKEN_ID = 1  # padding index: ignored by the loss and handed to the mask helpers / decoder
SEQ_LEN = 32      # sequence length given to create_dataloader, Transformer and the decoder
BATCH_SIZE = 48   # batch size requested from create_dataloader

In [3]:
# Build the training dataloader, the validation iterator, and the vocabularies/tokenizers
# for both languages in one call.
(
    dataloader,
    valid_iter,
    en_vocab,
    de_vocab,
    en_tokenizer,
    de_tokenizer,
) = create_dataloader(batch_size=BATCH_SIZE, seq_len=SEQ_LEN)

# The model reads the `de_*` side as source and produces the `en_*` side as target.
model = Transformer(
    seq_len=SEQ_LEN,
    src_vocab_size=len(de_vocab),
    trg_vocab_size=len(en_vocab),
    dropout_p=0.1,
)

Filtering the dataset. Initial size: 1000000
Removing special characters...
Filtered successfully. Final size: 938171


In [4]:
# Put the model on the GPU when one is available and fall back to the CPU otherwise,
# so this cell no longer crashes on a machine without CUDA.
# Assigning the result (the module itself) keeps the cell from printing the model repr
# without overwriting IPython's `_` last-output variable.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [5]:
# Cross-entropy summed over all non-padding target tokens, with label smoothing.
# Because the reduction is a sum, the training loop rescales the value itself.
loss_obj = torch.nn.CrossEntropyLoss(
    ignore_index=PAD_TOKEN_ID,
    label_smoothing=0.1,
    reduction='sum',
)

# Adam over every model parameter with a fixed learning rate.
optim = torch.optim.Adam(
    model.parameters(),
    lr=1e-4,
    betas=[0.9, 0.98],
    eps=1e-9,
)

In [None]:
# Train the model for EPOCHS passes over `dataloader` with teacher forcing.
# Batches are moved to whichever device the model's parameters already live on,
# so the loop works on both GPU and CPU.
train_device = next(model.parameters()).device

for epoch in range(EPOCHS):
    model.train()
    # Wrap the dataloader itself (not enumerate(...)) so tqdm can pick up its length
    # and show a total / ETA when the dataloader defines one.
    progress = tqdm(dataloader)
    for i, (trg_seq, src_seq) in enumerate(progress):
        optim.zero_grad()

        # Teacher forcing: the decoder sees tokens [0, T-1) and is trained to predict
        # tokens [1, T). `.contiguous()` guarantees the slices stay view()-able even
        # when `.to()` is a no-op (tensors already on the target device).
        trg_input_seq = trg_seq[:, :-1].contiguous().to(train_device)
        trg_label_seq = trg_seq[:, 1:].contiguous().to(train_device)
        src_seq = src_seq.to(train_device)

        src_mask = generate_mask_src(src_seq, PAD_TOKEN_ID)
        trg_mask = generate_mask_trg(trg_input_seq, PAD_TOKEN_ID)

        enc_out, logits = model(src_seq, trg_input_seq, src_mask, trg_mask)

        # The loss is summed over tokens; divide by the number of sequences actually
        # in this batch (dim 0) rather than the nominal BATCH_SIZE, so a smaller final
        # batch is not under-weighted.
        batch_size = trg_label_seq.shape[0]
        loss = loss_obj(logits.view(-1, logits.shape[-1]), trg_label_seq.view(-1)) / float(batch_size)

        loss.backward()
        optim.step()
        # .item() logs a plain float instead of the tensor repr (device, grad_fn, ...).
        progress.set_postfix_str(f"Epoch/Iteration {epoch}/{i}. Loss: {loss.item()}")

In [7]:
from datasets import YandexDataset

In [9]:
# Load the corpus stored under datasets/Yandex and obtain its three iterators.
# Note: this rebinds `valid_iter`, which was first returned by create_dataloader.
yandex_dataset = YandexDataset('datasets/Yandex')
train_iter, valid_iter, test_iter = yandex_dataset.get_iters()

Filtering the dataset. Initial size: 1000000
Removing special characters...
Filtered successfully. Final size: 938171


In [10]:

# NOTE(review): despite the `train_iter_` name, this iterator walks `valid_iter`
# (the validation split), not the training data.
train_iter_ = iter(valid_iter)

In [11]:
# Pull the next example from the iterator and display it as a (trg, src) tuple.
# In the recorded output the first element is the English sentence and the second
# the Russian one.
trg, src = next(train_iter_)
trg, src

('a players absence does not always mean his complete inactivity.',
 'отсутствие игрока не всегда означает его полнейшую бездеятельность.')

In [12]:
from decoding import GreedyDecoder

In [13]:
# The training loop leaves the model in train() mode, so dropout would still be
# active during decoding. Switch to eval mode before any inference; eval() is
# idempotent, so this is harmless if GreedyDecoder already does it internally.
model.eval()

# Greedy decoder over the trained model. The second argument turns a raw source
# sentence into a list of token strings using the source-side tokenizer.
# NOTE(review): sos/eos ids 2 and 3 are hard-coded; presumably they match the special
# token indices of the vocabularies built by create_dataloader -- confirm.
decoder = GreedyDecoder(
    model,
    lambda sent: [token.text for token in de_tokenizer(sent)],
    src_vocab=de_vocab, trg_vocab=en_vocab,
    eos_token_id=3, sos_token_id=2, pad_token_id=PAD_TOKEN_ID,
    max_seq_length=SEQ_LEN
)

In [14]:
# Smoke-test the decoder on one hand-written Russian sentence.
sample_sentence = 'Я языковая модель, которая была обучена переводить русский на английский.'
decoder.decode(sample_sentence)

'the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the'

In [12]:
from train import calculate_bleu_score

In [14]:
# Score the decoder with corpus-level BLEU on the first 1000 test examples.
# islice stops after 1000 items instead of materialising the whole test iterator
# just to throw most of it away.
bleu_samples = list(islice(test_iter, 1000))

# NOTE(review): max_len=64 differs from the SEQ_LEN (32) used for the model and the
# decoder; confirm this is the intended limit for calculate_bleu_score.
calculate_bleu_score(
    decoder,
    bleu_samples,
    lambda x: [token.text for token in de_tokenizer(x)],  # source-side tokenizer
    lambda x: [token.text for token in en_tokenizer(x)],  # target-side tokenizer
    max_len=64,
)

100%|██████████| 1000/1000 [08:08<00:00,  2.05it/s]

BLEU-4 corpus score = 0.23002674009458673, corpus length = 1000.





0.23002674009458673