In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
!pip install torch==1.10.2
!pip install torchtext==0.11.2
!python3 -m spacy download ru_core_news_sm
!python3 -m spacy download en_core_web_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torch==1.10.2
  Downloading torch-1.10.2-cp37-cp37m-manylinux1_x86_64.whl (881.9 MB)
[K     |██████████████████████████████▎ | 834.1 MB 1.2 MB/s eta 0:00:39tcmalloc: large alloc 1147494400 bytes == 0x3a2ba000 @  0x7eff612ca615 0x592b76 0x4df71e 0x59afff 0x515655 0x549576 0x593fce 0x548ae9 0x51566f 0x549576 0x593fce 0x548ae9 0x5127f1 0x598e3b 0x511f68 0x598e3b 0x511f68 0x598e3b 0x511f68 0x4bc98a 0x532e76 0x594b72 0x515600 0x549576 0x593fce 0x548ae9 0x5127f1 0x549576 0x593fce 0x5118f8 0x593dd7
[K     |████████████████████████████████| 881.9 MB 1.8 kB/s 
Installing collected packages: torch
  Attempting uninstall: torch
    Found existing installation: torch 1.11.0+cu113
    Uninstalling torch-1.11.0+cu113:
      Successfully uninstalled torch-1.11.0+cu113
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behavio

In [3]:
import os
import random
import math
import time

import spacy
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from torchtext.legacy.data import Field, BucketIterator, TabularDataset
from tqdm.notebook import tqdm

from cnn_utils import (
    set_seed,
    train_cnn,
    evaluate_cnn,
    epoch_time,
    cnn_translate_sentence,
    calculate_bleu
)
from cnn import (
    Encoder,
    Decoder,
    Seq2Seq
)

In [4]:
DATA_PATH = 'data'
MODELS_PATH = 'models'
INPUT_DATA = os.path.join(DATA_PATH, 'data.csv')

BATCH_SIZE = 128

In [5]:
set_seed()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(device)

cuda


In [6]:
spacy_ru = spacy.load('ru_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

In [7]:
def tokenize_ru(text):
    return [tok.text for tok in spacy_ru.tokenizer(text)]

def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [8]:
SRC = Field(
    tokenize = tokenize_en,
    init_token = '<sos>',
    eos_token = '<eos>',
    lower = True,
    include_lengths = False,
    batch_first=True
)

TRG = Field(
    tokenize = tokenize_ru,
    init_token = '<sos>',
    eos_token = '<eos>',
    lower = True,
    batch_first=True
)

In [9]:
dataset = TabularDataset(
    path=INPUT_DATA,
    format='tsv',
    fields=[('src', SRC), ('trg', TRG)]
)

In [10]:
train_data, valid_data, test_data = dataset.split(split_ratio=[0.8, 0.15, 0.05])

In [11]:
SRC.build_vocab(train_data, min_freq = 3)
TRG.build_vocab(train_data, min_freq = 3)

In [12]:
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    sort=False,
    device = device
)

In [13]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
EMB_DIM = 256
HID_DIM = 512
ENC_LAYERS = 10
DEC_LAYERS = 10
ENC_KERNEL_SIZE = 3
DEC_KERNEL_SIZE = 3
ENC_DROPOUT = 0.25
DEC_DROPOUT = 0.25
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
    
enc = Encoder(INPUT_DIM, EMB_DIM, HID_DIM, ENC_LAYERS, ENC_KERNEL_SIZE, ENC_DROPOUT, device)
dec = Decoder(OUTPUT_DIM, EMB_DIM, HID_DIM, DEC_LAYERS, DEC_KERNEL_SIZE, DEC_DROPOUT, TRG_PAD_IDX, device)

model = Seq2Seq(enc, dec).to(device)

In [14]:
optimizer = optim.Adam(model.parameters())

In [15]:
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [16]:
%%time
N_EPOCHS = 10
CLIP = 0.1

best_valid_loss = float('inf')
checkpoint_path = os.path.join(MODELS_PATH, 'cnn_checkpoint.pt')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss = train_cnn(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate_cnn(model, valid_iterator, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), checkpoint_path)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 2m 11s
	Train Loss: 6.168 | Train PPL: 477.157
	 Val. Loss: 3.247 |  Val. PPL:  25.721
Epoch: 02 | Time: 2m 22s
	Train Loss: 3.104 | Train PPL:  22.295
	 Val. Loss: 2.412 |  Val. PPL:  11.151
Epoch: 03 | Time: 2m 23s
	Train Loss: 2.531 | Train PPL:  12.562
	 Val. Loss: 2.066 |  Val. PPL:   7.893
Epoch: 04 | Time: 2m 24s
	Train Loss: 2.225 | Train PPL:   9.257
	 Val. Loss: 1.864 |  Val. PPL:   6.447
Epoch: 05 | Time: 2m 23s
	Train Loss: 2.017 | Train PPL:   7.518
	 Val. Loss: 1.719 |  Val. PPL:   5.581
Epoch: 06 | Time: 2m 24s
	Train Loss: 1.861 | Train PPL:   6.430
	 Val. Loss: 1.639 |  Val. PPL:   5.148
Epoch: 07 | Time: 2m 24s
	Train Loss: 1.747 | Train PPL:   5.737
	 Val. Loss: 1.577 |  Val. PPL:   4.840
Epoch: 08 | Time: 2m 24s
	Train Loss: 1.658 | Train PPL:   5.246
	 Val. Loss: 1.535 |  Val. PPL:   4.639
Epoch: 09 | Time: 2m 24s
	Train Loss: 1.586 | Train PPL:   4.883
	 Val. Loss: 1.503 |  Val. PPL:   4.497
Epoch: 10 | Time: 2m 23s
	Train Loss: 1.524 | Train PPL

In [17]:
model.load_state_dict(torch.load(checkpoint_path))

<All keys matched successfully>

In [18]:
num_batches = len(test_data.examples) // test_iterator.batch_size
start_time = time.time()
test_loss = evaluate_cnn(model, test_iterator, criterion)
elapsed_time = (time.time() - start_time) 
time_per_32_batch = elapsed_time / num_batches / 4

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} | Inference time if batch_size=32: {time_per_32_batch:.3f} sec')

| Test Loss: 1.484 | Test PPL:   4.410 | Inference time if batch_size=32: 0.042 sec


In [20]:
example_idx = 3

src = vars(train_data.examples[example_idx])['src']
trg = vars(train_data.examples[example_idx])['trg']

print(f'src = {" ".join(src)}')
print(f'trg = {" ".join(trg)}')

src = guests can prepare their meals in the kitchen with a stove , microwave and fridge .
trg = гости могут приготовить себе еду на кухне с плитой , микроволновой печью и холодильником .


In [21]:
translation, attention = cnn_translate_sentence(src, SRC, TRG, model, device, max_len=100)

print(f'predicted trg = {" ".join(translation)}')

predicted trg = гости могут готовить еду еду на кухне с плитой , микроволновой печью и холодильником . <eos>


In [26]:
example_idx = 9

src = vars(valid_data.examples[example_idx])['src']
trg = vars(valid_data.examples[example_idx])['trg']

print(f'src = {" ".join(src)}')
print(f'trg = {" ".join(trg)}')

src = staff can arrange transfers as well as bus and boat tours .
trg = персонал отеля может организовать трансфер , а также автобусные и лодочные туры .


In [27]:
translation, attention = cnn_translate_sentence(src, SRC, TRG, model, device, max_len=100)

print(f'predicted trg = {" ".join(translation)}')

predicted trg = персонал отеля организуют услуги трансфера , а также организуют экскурсии и экскурсии туры . <eos>


In [28]:
example_idx = 21

src = vars(test_data.examples[example_idx])['src']
trg = vars(test_data.examples[example_idx])['trg']

print(f'src = {" ".join(src)}')
print(f'trg = {" ".join(trg)}')

src = free private parking is possible on site .
trg = на территории имеется бесплатная частная парковка .


In [29]:
translation, attention = cnn_translate_sentence(src, SRC, TRG, model, device, max_len=100)

print(f'predicted trg = {" ".join(translation)}')

predicted trg = на территории обустроена бесплатная частная парковка . <eos>


In [30]:
model = model.to(device)

In [32]:
bleu_score = calculate_bleu(test_data, SRC, TRG, model, device, cnn_translate_sentence, max_len=100)

print(f'BLEU score = {bleu_score*100:.2f}')

BLEU score = 27.36
