In [None]:
# Install the libraries this notebook depends on. torchtext is pinned to 0.4.0 (the pip log
# below shows it replacing a preinstalled 0.10.0); the cells that follow import Field,
# BucketIterator and TabularDataset from torchtext.data.
!pip install torchtext==0.4.0
!pip install spacy==3.1.0
# NOTE(review): nltk is the only unpinned dependency here; pin it once the working version is known.
!pip install nltk

Collecting torchtext==0.4.0
  Downloading torchtext-0.4.0-py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 1.8 MB/s  eta 0:00:01
Installing collected packages: torchtext
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.10.0
    Uninstalling torchtext-0.10.0:
      Successfully uninstalled torchtext-0.10.0
Successfully installed torchtext-0.4.0
Collecting spacy==3.1.0
  Downloading spacy-3.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.4 MB)
[K     |████████████████████████████████| 6.4 MB 11.6 MB/s 
[?25hCollecting thinc<8.1.0,>=8.0.7
  Downloading thinc-8.0.8-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (621 kB)
[K     |████████████████████████████████| 621 kB 67.1 MB/s 
Collecting srsly<3.0.0,>=2.4.1
  Downloading srsly-2.4.1-cp37-cp37m-manylinux2014_x86_64.whl (456 kB)
[K     |████████████████████████████████| 456 kB 69.5 MB/s 
[?25hCollecting typer<0.4.0,>=0.3.0
  Downloading typer-0.3.2-py3-

In [18]:
import spacy.cli
# Download the large English and Russian spaCy pipelines, in that order; both are loaded
# with spacy.load() further down.
for package_name in ("en_core_web_lg", "ru_core_news_lg"):
    spacy.cli.download(package_name)

Collecting en-core-web-lg==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.1.0/en_core_web_lg-3.1.0-py3-none-any.whl (777.1 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
Collecting ru-core-news-lg==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_lg-3.1.0/ru_core_news_lg-3.1.0-py3-none-any.whl (514.2 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('ru_core_news_lg')


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset is later read from a path under
# drive/MyDrive, so this cell has to run before the dataset cell.
drive.mount('/content/drive')

Mounted at /content/drive


In [189]:
import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import random_split
from torchtext.data import Field, BucketIterator, TabularDataset, interleave_keys

import spacy
import random
import time
import math
import csv

# Mini-batch size passed to BucketIterator.splits for every split.
BATCH_SIZE = 8

# spaCy pipelines for English and Russian; in this notebook only their `.tokenizer`
# attribute is used (see tokenizer()).
nlp_en = spacy.load('en_core_web_lg')
nlp_ru = spacy.load('ru_core_news_lg')


def tokenizer(text, nlp):
    """Split ``text`` into token strings using the tokenizer of ``nlp``.

    Args:
        text: Raw sentence.
        nlp: Object exposing ``tokenizer(text)`` that yields items with a ``.text``
            attribute (a loaded spaCy pipeline in this notebook).

    Returns:
        list[str]: Token texts in reading order.
    """
    return [token.text for token in nlp.tokenizer(text)]


def tokenizer_ru(text):
    """Tokenize a Russian sentence (the model's source side) in reversed order.

    The reversal trick from Sutskever et al. (2014) applies to the *source* sequence:
    it shortens the distance between the first source words and the first target
    words. Previously the reversal was applied to the English target instead, which
    made the decoder learn to produce sentences backwards.
    """
    return tokenizer(text, nlp_ru)[::-1]


def tokenizer_en(text):
    """Tokenize an English sentence (the model's target side) in natural reading order."""
    return tokenizer(text, nlp_en)

# Settings common to both language fields: every sequence is lower-cased and wrapped in
# '<sos>' ... '<eos>' markers.
_shared_field_options = dict(init_token='<sos>', eos_token='<eos>', lower=True)

# `src` is bound to the 'ru' column and `trg` to the 'en' column when the dataset is built.
src = Field(tokenize=tokenizer_ru, **_shared_field_options)
trg = Field(tokenize=tokenizer_en, **_shared_field_options)

In [190]:
# Parallel corpus read from a '|'-delimited file. The `fields` list binds the name 'ru' to
# the `src` field and the name 'en' to the `trg` field; later cells access examples through
# these names (`x.ru`, `batch.en`, ...).
# NOTE(review): no `skip_header` is passed — confirm the file has no header row.
# NOTE(review): the path is relative to the working directory (it resolves under the
# /content/drive mount only when the notebook runs from /content) — confirm.
dataset = TabularDataset(
    path='./drive/MyDrive/data/translation.csv', 
    format='csv', 
    fields=[('ru', src),('en', trg)],
    csv_reader_params={'delimiter': '|'})

In [191]:
# Build the source and target vocabularies; tokens seen fewer than 2 times are left out.
# NOTE(review): the vocabularies are built from the complete `dataset`, i.e. before the
# train/test/valid split performed in the next cell, so tokens that occur only in held-out
# examples still contribute to the vocabulary. Consider building from the training split only.
src.build_vocab(dataset, min_freq = 2)
trg.build_vocab(dataset, min_freq = 2)

In [192]:
# Split sizes are derived from the dataset length instead of being hard-coded, so the cell
# keeps working when the corpus changes: 1/5 test, 1/10 validation, a fixed 64-example
# "check" subset for smoke tests, and everything else for training. For the 315,142-example
# corpus this reproduces the previous literal sizes 220536 / 63028 / 31514 / 64.
CHECK_SIZE = 64
SPLIT_SEED = 42

n_total = len(dataset)
n_test = n_total // 5
n_valid = n_total // 10
n_train = n_total - n_test - n_valid - CHECK_SIZE
if n_train <= 0:
    raise ValueError(
        f'dataset has only {n_total} examples; too few to carve out test, valid and '
        f'{CHECK_SIZE} check examples')

train_dataset, test_dataset, valid_dataset, check_dataset = random_split(
    dataset, lengths=[n_train, n_test, n_valid, CHECK_SIZE],
    generator=torch.Generator().manual_seed(SPLIT_SEED))

# random_split returns plain subsets that do not carry the `fields` attribute of the
# original dataset; copy it over because the iterators built later read it.
for subset in (train_dataset, test_dataset, valid_dataset, check_dataset):
    subset.fields = dataset.fields

In [193]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [194]:
def _example_length_key(example):
    """Sort key combining the source ('ru') and target ('en') token counts of an example."""
    return interleave_keys(len(example.ru), len(example.en))


# One iterator per split, all with the same batch size, sort key and device.
train_iterator, test_iterator, valid_iterator, check_iterator = BucketIterator.splits(
    datasets=(train_dataset, test_dataset, valid_dataset, check_dataset),
    batch_size=BATCH_SIZE,
    sort_key=_example_length_key,
    shuffle=True,
    device=device,
)

In [195]:
class Encoder(nn.Module):
    """Embeds a source sequence and summarises it with a multi-layer LSTM.

    Args:
        input_dim: Number of rows in the embedding table (source vocabulary size).
        emb_dim: Embedding width.
        hid_dim: LSTM hidden size.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability passed to ``nn.LSTM`` (applied between layers).
        pad_idx: Index of the padding token in the source vocabulary. When omitted, it
            is looked up as ``src.vocab['<pad>']`` on the notebook-level ``src`` field,
            which is what the constructor previously always did. Passing it explicitly
            removes the hidden dependency on that global, whose name is also shadowed
            by the ``src`` argument of ``forward``.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout, pad_idx=None):
        super().__init__()

        if pad_idx is None:
            # Backward-compatible default: `src` here is the module-level source Field.
            pad_idx = src.vocab['<pad>']

        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=pad_idx)

        self.rnn = nn.LSTM(emb_dim, hid_dim, num_layers=n_layers, dropout=dropout)

    def forward(self, src):
        """Encode a batch of token indices.

        Args:
            src: Integer tensor laid out as (sequence length, batch), the layout
                ``nn.LSTM`` expects when ``batch_first`` is left at its default.

        Returns:
            tuple: ``(hidden, cell)`` final LSTM states; the per-step outputs are discarded.
        """
        embedded = self.embedding(src)
        _, (hidden, cell) = self.rnn(embedded)
        return hidden, cell

In [196]:
class Decoder(nn.Module):
    """One-step LSTM decoder: embeds the current tokens, advances the LSTM state and
    projects the LSTM output onto the output vocabulary."""

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        # Hyper-parameters are kept as attributes; Seq2Seq reads `output_dim`.
        self.output_dim = output_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, num_layers=n_layers, dropout=dropout)
        self.out = nn.Linear(hid_dim, output_dim)

    def forward(self, input, hidden, cell):
        """Run a single decoding step.

        Args:
            input: Token indices for the current step, one per batch element.
            hidden: LSTM hidden state from the previous step (or from the encoder).
            cell: LSTM cell state from the previous step (or from the encoder).

        Returns:
            tuple: ``(logits, hidden, cell)`` with the vocabulary scores for this step
            and the updated LSTM states.
        """
        # Add a leading length-1 time axis so the step can be fed to the LSTM.
        step_tokens = input.unsqueeze(0)
        step_embedded = self.embedding(step_tokens)
        rnn_output, (next_hidden, next_cell) = self.rnn(step_embedded, (hidden, cell))
        logits = self.out(rnn_output.squeeze(0))
        return logits, next_hidden, next_cell

In [197]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with optional teacher forcing.

    Args:
        encoder: Module whose call returns ``(hidden, cell)`` for a source batch.
        decoder: Module with an ``output_dim`` attribute whose call takes
            ``(input, hidden, cell)`` and returns ``(logits, hidden, cell)``.
        device: Optional device, stored as ``self.device`` for reference only. The
            forward pass allocates its buffer on the device of ``src``, so the class
            no longer depends on a notebook-level ``device`` variable.
    """

    def __init__(self, encoder, decoder, device=None):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src = None, trg = None, teacher_forcing_ratio = 0.5):
        """Produce per-step vocabulary scores.

        Args:
            src: Source token indices, (source length, batch).
            trg: Target token indices, (target length, batch), or ``None`` for
                free-running (greedy) decoding.
            teacher_forcing_ratio: Probability of feeding the ground-truth token
                rather than the model's own prediction at each step. Ignored when
                ``trg`` is ``None``.

        Returns:
            Tensor of shape (max_len, batch, decoder.output_dim). Row 0 is left at
            zero because decoding starts from the second position.

        Raises:
            ValueError: If ``src`` is ``None``.
        """
        if src is None:
            raise ValueError('Seq2Seq.forward requires `src`')

        if trg is None:
            # Inference: previously this path crashed on `trg[0, :]`. Decode for as many
            # steps as the source is long and never use teacher forcing.
            max_len, batch_size = src.shape[0], src.shape[1]
            # The first source row is the '<sos>' token. This assumes the source and
            # target vocabularies give '<sos>' the same index — both fields in this
            # notebook declare the same special tokens; TODO confirm if that changes.
            input = src[0, :]
        else:
            max_len, batch_size = trg.shape[0], trg.shape[1]
            input = trg[0, :]  # '<sos>' row of the target batch

        trg_vocab_size = self.decoder.output_dim

        outputs = torch.zeros(max_len, batch_size, trg_vocab_size, device=src.device)

        hidden, cell = self.encoder(src)

        for t in range(1, max_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[t] = output
            top1 = output.max(1)[1]
            # Strict '<' guarantees a ratio of 0 never teacher-forces and a ratio of 1
            # always does (random.random() lies in [0, 1)).
            use_teacher = trg is not None and random.random() < teacher_forcing_ratio
            input = trg[t] if use_teacher else top1

        return outputs

In [198]:
# Vocabulary sizes determine the embedding tables and the decoder's output layer.
INPUT_DIM = len(src.vocab)
OUTPUT_DIM = len(trg.vocab)
# Embedding widths for encoder and decoder.
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
# LSTM hidden size and depth; the same values are passed to both networks, and the
# encoder's final states are fed directly into the decoder's LSTM.
HID_DIM = 512
N_LAYERS = 2
# Dropout probabilities handed to each network's nn.LSTM.
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

# Wrap both networks and move all parameters to the selected device.
model = Seq2Seq(enc, dec).to(device)

In [199]:
def init_weights(m):
    """Overwrite every parameter reachable from ``m`` with draws from U(-0.08, 0.08).

    Parameters of sub-modules are included. When this function is passed to
    ``Module.apply`` it is called for each sub-module as well, so nested parameters
    are simply drawn again; every draw comes from the same distribution.
    """
    lower, upper = -0.08, 0.08
    for tensor in m.parameters():
        torch.nn.init.uniform_(tensor, lower, upper)
        
# Re-initialise all model parameters. As the last expression of the cell, the module
# returned by apply() is displayed, which is where the architecture printout below comes from.
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(94691, 256, padding_idx=1)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
  )
  (decoder): Decoder(
    (embedding): Embedding(76275, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (out): Linear(in_features=512, out_features=76275, bias=True)
  )
)

In [200]:
def count_parameters(model):
    """Return the number of scalar entries across all parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable-parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 90,252,787 trainable parameters


In [201]:
# Adam over all model parameters; no learning rate is passed, so the optimizer's defaults apply.
optimizer = optim.Adam(model.parameters())
# Token-level cross entropy. Positions whose target is the '<pad>' index are ignored, so
# padding does not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index = trg.vocab['<pad>'])

In [202]:
from nltk.translate.bleu_score import sentence_bleu

def to_trg_sentence(t):
    """Convert rows of target-vocabulary indices into rows of token strings.

    Args:
        t: 2-D iterable (e.g. an integer tensor) where each row holds the indices of
            one sentence.

    Returns:
        list[list[str]]: One token list per row, special tokens included.
    """
    return [[trg.vocab.itos[int(i)] for i in row] for row in t]


def word_index_from_logits(logits):
    """Return the index of the highest score along the last axis (that axis is kept with size 1)."""
    return logits.topk(k=1).indices


def reshape_word_index(target, batch_size):
    """Regroup a flattened, time-major index tensor into one row per sentence.

    ``evaluate`` flattens (time, batch) tensors with ``view(-1)``, so element
    ``t * batch_size + b`` belongs to sentence ``b``. The data therefore has to be
    reshaped to (time, batch) first and transposed afterwards. The previous code
    reshaped straight to (batch, time) and then transposed, which produced rows that
    mixed tokens from different sentences (and one row per time step instead of one
    per sentence).

    Args:
        target: Tensor with ``sent_len * batch_size`` elements (a trailing size-1
            axis, as produced by ``word_index_from_logits``, is accepted).
        batch_size: Number of sentences interleaved in ``target``.

    Returns:
        Tensor of shape (batch_size, sent_len).
    """
    sent_len = target.shape[0] // batch_size
    return target.reshape(sent_len, batch_size).transpose(0, 1)


def _strip_special(tokens):
    """Cut a token list at the first '<eos>' and drop '<pad>' so they are not scored as words."""
    kept = []
    for token in tokens:
        if token == '<eos>':
            break
        if token != '<pad>':
            kept.append(token)
    return kept


def bleu_score(logits, targets, batch_size):
    """Compute one sentence-level BLEU score per sentence of a batch.

    Args:
        logits: Flattened (time * batch, vocab) scores, time-major.
        targets: Flattened (time * batch,) reference indices, time-major.
        batch_size: Number of sentences in the batch.

    Returns:
        list[float]: ``batch_size`` BLEU scores, candidate ``i`` scored against
        reference ``i``.
    """
    candidate_sentences = to_trg_sentence(
        reshape_word_index(word_index_from_logits(logits), batch_size))
    reference_sentences = to_trg_sentence(
        reshape_word_index(targets, batch_size))
    # NOTE(review): the three n-gram weights sum to 0.99 rather than 1; kept unchanged so
    # scores stay comparable with earlier runs.
    return [
        sentence_bleu(
            [_strip_special(reference)],
            _strip_special(candidate),
            weights=(0.33, 0.33, 0.33))
        for candidate, reference in zip(candidate_sentences, reference_sentences)
    ]

In [205]:
def evaluate(model, iterator, criterion):
    """Evaluate ``model`` on every batch of ``iterator`` without teacher forcing.

    Args:
        model: Callable as ``model(src, trg, teacher_forcing_ratio)`` returning scores
            laid out as (time, batch, vocab).
        iterator: Sized iterable of batches exposing ``.ru`` (source) and ``.en`` (target).
        criterion: Loss taking flattened (N, vocab) scores and (N,) targets.

    Returns:
        tuple[float, float]: ``(mean loss per batch, mean BLEU per scored sentence)``.
        Both are ``nan`` when the iterator has no batches.

    The BLEU value used to be the sum of all sentence scores divided by the number of
    *batches*, i.e. it grew with the batch size; it is now a true per-sentence mean.
    """
    model.eval()

    n_batches = len(iterator)
    if n_batches == 0:
        # Nothing to evaluate: report "no value" instead of dividing by zero.
        return float('nan'), float('nan')

    epoch_loss = 0.0
    bleu_total = 0.0
    n_scored = 0

    with torch.no_grad():

        for batch in iterator:

            src = batch.ru
            trg = batch.en

            # Ratio 0: the decoder is fed its own predictions.
            output = model(src, trg, 0)
            batch_size = output.shape[1]

            # Drop the first time step (the '<sos>' position, never predicted) and
            # flatten time and batch into a single axis for the loss.
            output = output[1:].view(-1, output.shape[-1])
            trg = trg[1:].view(-1)

            epoch_loss += criterion(output, trg).item()

            sentence_scores = bleu_score(output, trg, batch_size)
            bleu_total += sum(sentence_scores)
            n_scored += len(sentence_scores)

    return epoch_loss / n_batches, bleu_total / max(n_scored, 1)

In [206]:
from tqdm import tqdm

# Kept as a module-level name in case other cells read it; train() never used it, so
# the function no longer declares it global.
PLOT_STEP = 0


def train(model, iterator, optimizer, criterion, clip):
    """Run one training epoch.

    Args:
        model: Callable as ``model(src, trg)`` returning scores laid out as
            (time, batch, vocab).
        iterator: Sized iterable of batches exposing ``.ru`` (source) and ``.en`` (target).
        optimizer: Optimizer over ``model.parameters()``.
        criterion: Loss taking flattened (N, vocab) scores and (N,) targets.
        clip: Maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        tuple: ``(mean loss per batch, per-batch loss history)``. The history holds one
        0-d NumPy array per batch; the mean is ``nan`` when the iterator has no batches.
    """
    model.train()

    epoch_loss = 0.0
    epoch_history = []
    for batch in tqdm(iterator):

        src = batch.ru
        trg = batch.en

        optimizer.zero_grad()

        output = model(src, trg)

        # Drop the first time step (the '<sos>' position, never predicted) and flatten
        # time and batch into a single axis for the loss.
        output = output[1:].view(-1, output.shape[-1])
        trg = trg[1:].view(-1)

        loss = criterion(output, trg)

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()

        # detach() is the supported replacement for the legacy `.data` attribute.
        epoch_history.append(loss.detach().cpu().numpy())

    n_batches = len(iterator)
    if n_batches == 0:
        # No batches: report "no value" instead of dividing by zero.
        return float('nan'), epoch_history
    return epoch_loss / n_batches, epoch_history

In [209]:
import time
import math

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        tuple[int, int]: ``(minutes, seconds)``, both truncated toward zero.
    """
    total = end_time - start_time
    minutes = int(total / 60)
    seconds = int(total - minutes * 60)
    return minutes, seconds

def _perplexity(loss):
    """Return exp(loss), or infinity when the exponential overflows a float."""
    try:
        return math.exp(loss)
    except OverflowError:
        return float('inf')


def train_with_evaluation(model, epoch_num, optimizer, criterion, clip, train_iterator, test_iterator, writer,
                          checkpoint_path='tut1-model.pt'):
    """Train for ``epoch_num`` epochs, evaluating and logging after each one.

    Args:
        model: Model handed to ``train`` and ``evaluate``.
        epoch_num: Number of epochs to run.
        optimizer: Optimizer handed to ``train``.
        criterion: Loss handed to ``train`` and ``evaluate``.
        clip: Gradient-norm limit handed to ``train``.
        train_iterator: Batches used for training.
        test_iterator: Batches used for the per-epoch evaluation; its loss decides
            which checkpoint is kept.
        writer: Object with a ``writerow(row)`` method (e.g. ``csv.writer``) that
            receives one metrics row per epoch.
        checkpoint_path: Where the best ``state_dict`` is saved. Defaults to the
            previously hard-coded file name.

    Returns:
        tuple: ``(train_history, test_history, train_epoch_history, test_bleu)`` —
        per-epoch training losses, per-epoch evaluation losses, per-epoch lists of
        per-batch training losses, and per-epoch BLEU values.
    """
    train_history = []
    test_history = []
    train_epoch_history = []
    test_bleu = []
    # Infinity instead of a finite sentinel, so the first finite loss always counts as
    # an improvement regardless of its magnitude.
    best_valid_loss = float('inf')

    for epoch in range(epoch_num):

        start_time = time.time()

        train_loss, epoch_history = train(model, train_iterator, optimizer, criterion, clip)
        valid_loss, epoch_bleu = evaluate(model, test_iterator, criterion)

        end_time = time.time()

        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), checkpoint_path)

        train_history.append(train_loss)
        test_history.append(valid_loss)
        train_epoch_history.append(epoch_history)
        test_bleu.append(epoch_bleu)

        train_ppl = _perplexity(train_loss)
        valid_ppl = _perplexity(valid_loss)

        writer.writerow(
            (epoch+1,                           # epoch num
             f'{epoch_mins}:{epoch_secs:02d}',  # epoch time, seconds zero-padded (0:05, not 0:5)
             f'{train_loss:.3f}',               # train loss
             f'{train_ppl:7.3f}',               # ppl train
             f'{valid_loss:.3f}',               # valid loss
             f'{valid_ppl:7.3f}',               # ppl valid
             f'{epoch_bleu:.3f}'))              # valid bleu

        print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {train_ppl:7.3f}')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {valid_ppl:7.3f}')
        print(f'\t Val. Avg. BLEU: {epoch_bleu:.3f}')
    return train_history, test_history, train_epoch_history, test_bleu

In [210]:
CLIP = 1.0                    # maximum gradient norm
EPOCH_N = 10                  # number of epochs
METRICS_FILE = 'metrics.csv'  # per-epoch metrics, ';'-delimited

# The csv module requires files to be opened with newline=''; without it, platforms that
# translate line endings write an extra blank line after every row.
with open(METRICS_FILE, 'w', newline='') as file:
    csvwriter = csv.writer(file, delimiter=';')
    csvwriter.writerow(('epoch', 'epoch time', 'train loss', 'train ppl', 'val loss', 'val ppl', 'val bleu'))
    # NOTE(review): check_iterator (the 64-example subset) is passed for both training and
    # evaluation, so this run is a smoke test; switch to train_iterator / valid_iterator
    # for a real training run.
    train_history, test_history, train_epochs_history, test_bleu = train_with_evaluation(
        model, EPOCH_N, optimizer, criterion, CLIP,  check_iterator, check_iterator, csvwriter)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:48<00:00,  6.10s/it]


Epoch: 01 | Time: 0m 53s
	Train Loss: 6.774 | Train PPL: 874.791
	 Val. Loss: 5.813 |  Val. PPL: 334.580
	 Val. Avg. BLEU: 0.000
