In [25]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [44]:
%%time
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.vec

--2022-06-18 16:41:54--  https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.vec
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.74.142, 172.67.9.4, 104.22.75.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.74.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6597238061 (6.1G) [binary/octet-stream]
Saving to: ‘wiki.en.vec’


2022-06-18 16:44:03 (48.8 MB/s) - ‘wiki.en.vec’ saved [6597238061/6597238061]

CPU times: user 1.45 s, sys: 291 ms, total: 1.75 s
Wall time: 2min 9s


In [45]:
%%time
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ru.vec

--2022-06-18 16:44:10--  https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ru.vec
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.74.142, 104.22.75.142, 172.67.9.4, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.74.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4954492872 (4.6G) [binary/octet-stream]
Saving to: ‘wiki.ru.vec’


2022-06-18 16:45:53 (46.6 MB/s) - ‘wiki.ru.vec’ saved [4954492872/4954492872]

CPU times: user 1.17 s, sys: 222 ms, total: 1.39 s
Wall time: 1min 42s


In [29]:
!pip install torch==1.10.2
!pip install torchtext==0.11.2
!python3 -m spacy download ru_core_news_sm
!python3 -m spacy download en_core_web_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ru-core-news-sm==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.3.0/ru_core_news_sm-3.3.0-py3-none-any.whl (15.3 MB)
[K     |████████████████████████████████| 15.3 MB 5.0 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('ru_core_news_sm')
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 5.0 MB/s 
[38;5;2m✔ D

In [67]:
import os
import math
import time

import spacy
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm.notebook import tqdm
from torchtext.legacy.data import Field, BucketIterator, TabularDataset
from gensim.models import KeyedVectors

from gru_utils import (
    set_seed,
    train_gru,
    evaluate_gru,
    epoch_time,
    calculate_bleu,
    gru_translate_sentence
)
from gru_net import (
    Attention,
    Encoder,
    Decoder,
    Seq2Seq
)

In [32]:
DATA_PATH = 'data'
MODELS_PATH = 'models'
INPUT_DATA = os.path.join(DATA_PATH, 'data.csv')

BATCH_SIZE = 128

In [33]:
set_seed()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(device)

cuda


In [34]:
spacy_ru = spacy.load('ru_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

def tokenize_ru(text):
    return [tok.text for tok in spacy_ru.tokenizer(text)]

def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [35]:
SRC = Field(
    tokenize = tokenize_en,
    init_token = '<sos>',
    eos_token = '<eos>',
    lower = True,
    include_lengths = True
)

TRG = Field(
    tokenize = tokenize_ru,
    init_token = '<sos>',
    eos_token = '<eos>',
    lower = True
)

In [36]:
dataset = TabularDataset(
    path=INPUT_DATA,
    format='tsv',
    fields=[('src', SRC), ('trg', TRG)]
)

In [37]:
train_data, valid_data, test_data = dataset.split(split_ratio=[0.8, 0.15, 0.05])

In [38]:
SRC.build_vocab(train_data, min_freq = 3)
TRG.build_vocab(train_data, min_freq = 3)

In [39]:
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_within_batch=True,
    sort_key=(lambda x : len(x.src)),
    device=device
)

In [46]:
%%time
en_emb = KeyedVectors.load_word2vec_format("wiki.en.vec")
ru_emb = KeyedVectors.load_word2vec_format("wiki.ru.vec")

CPU times: user 1min 30s, sys: 7.44 s, total: 1min 38s
Wall time: 1min 38s


In [47]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 300
DEC_EMB_DIM = 300
ENC_HID_DIM = 512
DEC_HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]

In [48]:
attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

In [49]:
model = Seq2Seq(enc, dec, SRC_PAD_IDX, device).to(device)

In [50]:
def init_weights(m):
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)
            
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(6803, 300)
    (rnn): GRU(300, 512, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=1536, out_features=512, bias=True)
      (v): Linear(in_features=512, out_features=1, bias=False)
    )
    (embedding): Embedding(9324, 300)
    (rnn): GRU(1324, 512)
    (fc_out): Linear(in_features=1836, out_features=9324, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [57]:
en_emb_ordered = []
for token in tqdm(SRC.vocab.itos[4:]):
    try:
        en_emb_ordered.append(en_emb.get_vector(token))
    except KeyError:
        en_emb_ordered.append(np.random.normal(size=ENC_EMB_DIM))

  0%|          | 0/6799 [00:00<?, ?it/s]

In [58]:
ru_emb_ordered = []
for token in tqdm(TRG.vocab.itos[4:]):
    try:
        ru_emb_ordered.append(ru_emb.get_vector(token))
    except KeyError:
        ru_emb_ordered.append(np.random.normal(size=DEC_EMB_DIM))

  0%|          | 0/9320 [00:00<?, ?it/s]

In [59]:
en_emb_ordered = np.array(en_emb_ordered)
ru_emb_ordered = np.array(ru_emb_ordered)

In [60]:
for name, mod in enc.named_modules():
    if name == "embedding":
        mod = nn.Embedding.from_pretrained(torch.FloatTensor(en_emb_ordered))
        
for name, mod in dec.named_modules():
    if name == "embedding":
        mod = nn.Embedding.from_pretrained(torch.FloatTensor(ru_emb_ordered))

In [61]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 28,602,320 trainable parameters


In [62]:
optimizer = optim.Adam(model.parameters())

In [63]:
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [66]:
N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss = train_gru(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate_gru(model, valid_iterator, criterion)
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "./models/pretrained_embeddings.pt")
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 2m 19s
	Train Loss: 4.163 | Train PPL:  64.296
	 Val. Loss: 4.787 |  Val. PPL: 119.940
Epoch: 02 | Time: 2m 18s
	Train Loss: 3.369 | Train PPL:  29.062
	 Val. Loss: 4.534 |  Val. PPL:  93.107
Epoch: 03 | Time: 2m 18s
	Train Loss: 2.897 | Train PPL:  18.127
	 Val. Loss: 4.385 |  Val. PPL:  80.204
Epoch: 04 | Time: 2m 17s
	Train Loss: 2.552 | Train PPL:  12.827
	 Val. Loss: 4.346 |  Val. PPL:  77.171
Epoch: 05 | Time: 2m 18s
	Train Loss: 2.282 | Train PPL:   9.792
	 Val. Loss: 4.339 |  Val. PPL:  76.649
Epoch: 06 | Time: 2m 18s
	Train Loss: 2.066 | Train PPL:   7.890
	 Val. Loss: 4.339 |  Val. PPL:  76.656
Epoch: 07 | Time: 2m 18s
	Train Loss: 1.904 | Train PPL:   6.715
	 Val. Loss: 4.353 |  Val. PPL:  77.702
Epoch: 08 | Time: 2m 18s
	Train Loss: 1.774 | Train PPL:   5.894
	 Val. Loss: 4.424 |  Val. PPL:  83.452
Epoch: 09 | Time: 2m 18s
	Train Loss: 1.659 | Train PPL:   5.255
	 Val. Loss: 4.487 |  Val. PPL:  88.893
Epoch: 10 | Time: 2m 19s
	Train Loss: 1.554 | Train PPL

In [68]:
torch.cuda.empty_cache()

In [69]:
model.load_state_dict(torch.load("./models/pretrained_embeddings.pt"))

<All keys matched successfully>

In [70]:
num_batches = len(test_data.examples) // test_iterator.batch_size
start_time = time.time()
test_loss = evaluate_gru(model, test_iterator, criterion)
elapsed_time = (time.time() - start_time) 
time_per_32_batch = elapsed_time / num_batches / 4

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} | Inference time if batch_size=32: {time_per_32_batch:.3f} sec')

| Test Loss: 4.312 | Test PPL:  74.565 | Inference time if batch_size=32: 0.042 sec


In [73]:
example_idx = 3

src = vars(train_data.examples[example_idx])['src']
trg = vars(train_data.examples[example_idx])['trg']

print(f'src = {" ".join(src)}')
print(f'trg = {" ".join(trg)}')

src = guests can prepare their meals in the kitchen with a stove , microwave and fridge .
trg = гости могут приготовить себе еду на кухне с плитой , микроволновой печью и холодильником .


In [74]:
translation, attention = gru_translate_sentence(src, SRC, TRG, model, device)

print(f'predicted trg = {" ".join(translation)}')

predicted trg = гости могут сами кухне можно пользоваться плитой , микроволновой печью и холодильником . <eos>


In [78]:
example_idx = 6

src = vars(valid_data.examples[example_idx])['src']
trg = vars(valid_data.examples[example_idx])['trg']

print(f'src = {" ".join(src)}')
print(f'trg = {" ".join(trg)}')

src = it offers a conservatory , special leisure discounts , and direct access to the chiemsee area 's hiking and bicycle routes .
trg = к услугам гостей зимний сад и скидки на различные варианты проведения досуга . рядом с отелем пролегают пешеходные и велосипедные маршруты по району кимзее .


In [79]:
translation, attention = gru_translate_sentence(src, SRC, TRG, model, device)

print(f'predicted trg = {" ".join(translation)}')

predicted trg = в территории отеля <unk> , гостям предложат множество <unk> , , а также <unk> , <unk> и <unk> . <eos>


In [80]:
example_idx = 21

src = vars(test_data.examples[example_idx])['src']
trg = vars(test_data.examples[example_idx])['trg']

print(f'src = {" ".join(src)}')
print(f'trg = {" ".join(trg)}')

src = free private parking is possible on site .
trg = на территории имеется бесплатная частная парковка .


In [81]:
translation, attention = gru_translate_sentence(src, SRC, TRG, model, device)

print(f'predicted trg = {" ".join(translation)}')

predicted trg = на территории обустроена бесплатная частная парковка . <eos>


In [82]:
bleu_score = calculate_bleu(test_data, SRC, TRG, model, device)

print(f'BLEU score = {bleu_score*100:.2f}')

BLEU score = 22.13
