In [1]:
import torchtext
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab
from torchtext.utils import download_from_url, extract_archive
import io
import numpy as np

import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

import nltk
from nltk.tokenize import word_tokenize

from tqdm.notebook import tqdm

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Download the NLTK 'punkt' tokenizer data (presumably what word_tokenize needs — TODO confirm for the installed NLTK version).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Show which device was selected.
print(device)

cuda


# Dataset
We used the Yandex English–Russian parallel corpus (the `corpus.en_ru.1m` files) to train the model.

In [4]:
def build_vocab(filepath, language, max_lines=20001):
    """Build a torchtext ``Vocab`` from the first lines of a text file.

    Each line is tokenized with NLTK's ``word_tokenize`` and the token
    frequencies are accumulated in a ``Counter`` that is handed to ``Vocab``.

    Args:
        filepath: Path of a UTF-8 encoded text file, one sentence per line.
        language: Language name forwarded to ``word_tokenize``.
        max_lines: Maximum number of lines to read, or ``None`` to read the
            whole file. The default (20001) keeps the original cutoff, which
            processed lines 0 through 20000 inclusive; the cap exists because
            a larger vocabulary ran out of resources.

    Returns:
        A ``Vocab`` built from the token counts, with the special tokens
        ``<unk>``, ``<pad>``, ``<sos>`` and ``<eos>``.
    """
    counter = Counter()
    with io.open(filepath, encoding="utf8") as f:
        for line_no, line in enumerate(f):
            # Stop before tokenizing anything past the cap.
            if max_lines is not None and line_no >= max_lines:
                break
            counter.update(word_tokenize(line, language=language))
    return Vocab(counter, specials=['<unk>', '<pad>', '<sos>', '<eos>'])

# One vocabulary per language, each built from its side of the parallel corpus.
ru_vocab = build_vocab('corp/corpus.en_ru.1m.ru', 'russian')
en_vocab = build_vocab('corp/corpus.en_ru.1m.en', 'english')

In [6]:
def make_pairs(en_file, ru_file):
    """Read a line-aligned parallel corpus into (Russian, English) index tensors.

    Args:
        en_file: Path of the UTF-8 English file, one sentence per line.
        ru_file: Path of the UTF-8 Russian file, one sentence per line.

    Returns:
        A list of ``(ru_tensor, en_tensor)`` tuples of dtype ``torch.long``.
        Each tensor holds the vocabulary indices (from the module-level
        ``ru_vocab`` / ``en_vocab``) of the tokens of one line. Pairing stops
        at the end of the shorter file.
    """
    data = []
    # Open both files in a context manager so the handles are closed even if
    # tokenization raises part-way through.
    with io.open(en_file, encoding="utf8") as en_lines, \
            io.open(ru_file, encoding="utf8") as ru_lines:
        for raw_ru, raw_en in zip(ru_lines, en_lines):
            ru_tensor = torch.tensor(
                [ru_vocab[token] for token in word_tokenize(raw_ru, language='russian')],
                dtype=torch.long)
            en_tensor = torch.tensor(
                [en_vocab[token] for token in word_tokenize(raw_en, language='english')],
                dtype=torch.long)
            data.append((ru_tensor, en_tensor))

    return data

# Convert the whole parallel corpus into (Russian, English) index-tensor pairs.
pairs = make_pairs('corp/corpus.en_ru.1m.en', 'corp/corpus.en_ru.1m.ru')

In [7]:
# Indices of the special tokens, looked up in the Russian vocabulary.
# generate_batch applies these same indices to the English tensors as well.
# NOTE(review): this assumes both vocabularies map the special tokens to the
# same indices (both are built with the same `specials` list) — confirm.
PAD_IDX = ru_vocab['<pad>']
SOS_IDX = ru_vocab['<sos>']
EOS_IDX = ru_vocab['<eos>']

def generate_batch(data_batch):
    """Collate (Russian, English) index tensors into two padded batches.

    Every sequence is wrapped as ``[SOS_IDX] + tokens + [EOS_IDX]``, then the
    sequences of each language are padded with ``PAD_IDX`` to a common length
    by ``pad_sequence``.

    Args:
        data_batch: Iterable of ``(ru_item, en_item)`` 1-D index tensors.

    Returns:
        A ``(ru_batch, en_batch)`` tuple of padded tensors.
    """
    def wrap(sequence):
        # Fresh marker tensors per sequence, exactly as many as needed.
        return torch.cat([torch.tensor([SOS_IDX]), sequence, torch.tensor([EOS_IDX])], dim=0)

    wrapped = [(wrap(ru_item), wrap(en_item)) for ru_item, en_item in data_batch]
    ru_sequences = [ru_seq for ru_seq, _ in wrapped]
    en_sequences = [en_seq for _, en_seq in wrapped]

    return (pad_sequence(ru_sequences, padding_value=PAD_IDX),
            pad_sequence(en_sequences, padding_value=PAD_IDX))

In [8]:
from sklearn.model_selection import train_test_split

BATCH_SIZE = 64        # sentence pairs per batch
VALID_FRACTION = 0.1   # share of the pairs held out for validation
SPLIT_SEED = 42        # fixed so the train/validation split is the same on every run

# Split the pair indices (not the pairs themselves) so both loaders can share
# the single `pairs` list and select their subset through a sampler.
train_idx, valid_idx = train_test_split(
    np.arange(len(pairs)),
    test_size=VALID_FRACTION,
    shuffle=True,
    random_state=SPLIT_SEED,
)

train_sampler = torch.utils.data.SubsetRandomSampler(train_idx)
valid_sampler = torch.utils.data.SubsetRandomSampler(valid_idx)

train_loader = DataLoader(pairs, batch_size=BATCH_SIZE,
                          sampler=train_sampler, collate_fn=generate_batch)
valid_loader = DataLoader(pairs, batch_size=BATCH_SIZE,
                          sampler=valid_sampler, collate_fn=generate_batch)

# Model

In [9]:
import random
from typing import Tuple

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch import Tensor

In [None]:
# Vocabulary sizes: the encoder embeds Russian tokens, the decoder predicts English ones.
INPUT_DIM = len(ru_vocab)
OUTPUT_DIM = len(en_vocab)

# Embedding widths of the encoder and decoder.
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
# GRU hidden sizes (the encoder GRU is bidirectional, so its outputs are twice this wide).
ENC_HID_DIM = 512
DEC_HID_DIM = 512
# Output width of the linear layer inside the attention module.
ATTN_DIM = 64
# Dropout probabilities applied to the encoder / decoder embeddings.
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

In [None]:
class Encoder(nn.Module):
    """Bidirectional single-layer GRU encoder.

    Embeds the source tokens, runs them through the GRU and projects the
    concatenated final forward/backward states to the decoder's hidden size.
    """

    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout=0.5):
        """
        Args:
            input_dim: Size of the source vocabulary.
            emb_dim: Width of the token embeddings.
            enc_hid_dim: Hidden size of the GRU, per direction.
            dec_hid_dim: Hidden size expected by the decoder.
            dropout: Dropout probability applied to the embeddings.
        """
        super().__init__()

        self.input_dim, self.emb_dim = input_dim, emb_dim
        self.enc_hid_dim, self.dec_hid_dim = enc_hid_dim, dec_hid_dim

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional=True)
        # Both directions are concatenated, hence the factor of two.
        self.fc = nn.Linear(2 * enc_hid_dim, dec_hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode a batch of source sequences.

        Args:
            src: Index tensor fed to the embedding, laid out (src_len, batch)
                as the default (non batch-first) GRU expects.

        Returns:
            ``(outputs, hidden)``: the per-step GRU outputs and the initial
            decoder state computed as ``tanh(fc([h_forward; h_backward]))``.
        """
        token_vectors = self.dropout(self.embedding(src))
        outputs, final_states = self.rnn(token_vectors)

        # The last two entries are the final forward and backward states.
        both_directions = torch.cat((final_states[-2], final_states[-1]), dim=1)
        hidden = torch.tanh(self.fc(both_directions))

        return outputs, hidden

In [None]:
class Attention(nn.Module):
    """Attention weights over the encoder outputs for one decoder state.

    The decoder state is concatenated with every encoder output, passed
    through a linear layer and ``tanh``, summed over the feature axis and
    normalised with a softmax over the source positions.
    """

    def __init__(self, enc_hid_dim, dec_hid_dim, attn_dim):
        """
        Args:
            enc_hid_dim: Encoder GRU hidden size per direction.
            dec_hid_dim: Decoder hidden size.
            attn_dim: Output width of the scoring linear layer.
        """
        super().__init__()

        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim

        # Width of [decoder state ; bidirectional encoder output].
        self.attn_in = (enc_hid_dim * 2) + dec_hid_dim
        self.attn = nn.Linear(self.attn_in, attn_dim)

    def forward(self, decoder_hidden, encoder_outputs):
        """Return softmax-normalised weights of shape (batch, src_len)."""
        src_len = encoder_outputs.shape[0]

        # Copy the decoder state once per source position: (batch, src_len, dec_hid).
        tiled_hidden = decoder_hidden.unsqueeze(1).repeat(1, src_len, 1)
        # Put the batch axis first so it lines up with the tiled state.
        batch_first_outputs = encoder_outputs.permute(1, 0, 2)

        features = torch.cat((tiled_hidden, batch_first_outputs), dim=2)
        energy = torch.tanh(self.attn(features))
        scores = torch.sum(energy, dim=2)

        return F.softmax(scores, dim=1)


class Decoder(nn.Module):
    """Single-step GRU decoder that attends over the encoder outputs."""

    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        """
        Args:
            output_dim: Size of the target vocabulary.
            emb_dim: Width of the target token embeddings.
            enc_hid_dim: Encoder GRU hidden size per direction.
            dec_hid_dim: Hidden size of the decoder GRU.
            dropout: Dropout probability applied to the embeddings.
            attention: Callable module returning attention weights; it must
                expose an ``attn_in`` attribute.
        """
        super().__init__()

        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.output_dim = output_dim
        self.attention = attention

        self.embedding = nn.Embedding(output_dim, emb_dim)
        # The GRU consumes [embedding ; attention-weighted encoder summary].
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)
        # The classifier sees [GRU output ; encoder summary ; embedding].
        self.out = nn.Linear(self.attention.attn_in + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def _weighted_encoder_rep(self, decoder_hidden, encoder_outputs):
        """Return the attention-weighted sum of encoder outputs, shaped (1, batch, feat)."""
        # (batch, src_len) -> (batch, 1, src_len) so bmm sums over source positions.
        weights = self.attention(decoder_hidden, encoder_outputs).unsqueeze(1)
        summary = torch.bmm(weights, encoder_outputs.permute(1, 0, 2))
        # Back to the sequence-first layout used by the GRU.
        return summary.permute(1, 0, 2)

    def forward(self, input, decoder_hidden, encoder_outputs):
        """Run one decoding step.

        Args:
            input: Token indices of the previous step, one per batch element.
            decoder_hidden: Current decoder state.
            encoder_outputs: Per-step outputs of the encoder.

        Returns:
            ``(output, decoder_hidden)``: the scores over the target
            vocabulary and the updated decoder state.
        """
        # Add a length-1 sequence axis for the embedding/GRU.
        embedded = self.dropout(self.embedding(input.unsqueeze(0)))
        summary = self._weighted_encoder_rep(decoder_hidden, encoder_outputs)

        rnn_out, new_hidden = self.rnn(torch.cat((embedded, summary), dim=2),
                                       decoder_hidden.unsqueeze(0))

        features = torch.cat((rnn_out.squeeze(0),
                              summary.squeeze(0),
                              embedded.squeeze(0)), dim=1)

        return self.out(features), new_hidden.squeeze(0)

In [None]:
class Seq2Seq(nn.Module):
    """Couples an encoder and a step-wise decoder into a translation model."""

    def __init__(self, encoder, decoder, device):
        """
        Args:
            encoder: Module returning ``(encoder_outputs, hidden)`` for a source batch.
            decoder: Module with an ``output_dim`` attribute that maps
                ``(tokens, hidden, encoder_outputs)`` to ``(scores, hidden)``.
            device: Device on which the output buffer is allocated.
        """
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode ``trg.shape[0] - 1`` steps and return the per-step scores.

        Args:
            src: Source batch, indexed (src_len, batch).
            trg: Target batch, indexed (trg_len, batch); row 0 seeds the decoder.
            teacher_forcing_ratio: Probability, drawn once per step, of feeding
                the ground-truth token instead of the decoder's own prediction.

        Returns:
            Tensor of shape (trg_len, batch, decoder.output_dim); row 0 is
            left as zeros because decoding starts at step 1.
        """
        max_len, batch_size = trg.shape[0], src.shape[1]
        outputs = torch.zeros(max_len, batch_size, self.decoder.output_dim).to(self.device)

        encoder_outputs, hidden = self.encoder(src)

        # The first decoder input is the first target row.
        step_input = trg[0, :]

        for t in range(1, max_len):
            scores, hidden = self.decoder(step_input, hidden, encoder_outputs)
            outputs[t] = scores
            use_ground_truth = random.random() < teacher_forcing_ratio
            step_input = trg[t] if use_ground_truth else scores.max(1)[1]

        return outputs

In [None]:
# Assemble the model from the hyperparameters defined earlier and move it to the selected device.
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
attn = Attention(ENC_HID_DIM, DEC_HID_DIM, ATTN_DIM)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)
model = Seq2Seq(enc, dec, device).to(device)


def init_weights(model):
    """Re-initialise every parameter of ``model`` in place.

    Parameters whose name contains ``'weight'`` are drawn from a normal
    distribution with mean 0 and standard deviation 0.01; all other
    parameters are set to 0.
    """
    for param_name, tensor in model.named_parameters():
        if 'weight' not in param_name:
            nn.init.constant_(tensor.data, 0)
            continue
        nn.init.normal_(tensor.data, mean=0, std=0.01)


# Module.apply calls init_weights on the model and on each of its submodules.
model.apply(init_weights)

# Training

In [None]:
def train(model, loader, optimizer, criterion, clip):
    """Train ``model`` for one pass over ``loader``.

    Args:
        model: Model called as ``model(src, trg)``.
        loader: Iterable of ``(src, trg)`` batches; must support ``len``.
        optimizer: Optimizer stepping the model's parameters.
        criterion: Loss taking flattened scores and flattened target indices.
        clip: Maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        The mean per-batch loss over the pass.
    """
    model.train()

    running_loss = 0

    for src, trg in tqdm(loader):
        src = src.to(device)
        trg = trg.to(device)

        optimizer.zero_grad()

        scores = model(src, trg)
        vocab_size = scores.shape[-1]

        # Skip row 0 (the model leaves it at zero; the target row holds the
        # start token) and flatten time and batch into one axis for the loss.
        flat_scores = scores[1:].view(-1, vocab_size)
        flat_targets = trg[1:].view(-1)

        loss = criterion(flat_scores, flat_targets)
        loss.backward()

        # Clip before the update to keep the step size bounded.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(loader)

def evaluate(model, loader, criterion):
    """Compute the mean per-batch loss of ``model`` over ``loader``.

    The model is put in eval mode, gradients are disabled, and the teacher
    forcing ratio is 0 so the decoder is always fed its own predictions.

    Args:
        model: Model called as ``model(src, trg, teacher_forcing_ratio)``.
        loader: Iterable of ``(src, trg)`` batches; must support ``len``.
        criterion: Loss taking flattened scores and flattened target indices.

    Returns:
        The mean per-batch loss.
    """
    model.eval()

    running_loss = 0

    with torch.no_grad():
        for src, trg in tqdm(loader):
            src = src.to(device)
            trg = trg.to(device)

            scores = model(src, trg, 0)  # turn off teacher forcing
            vocab_size = scores.shape[-1]

            # Skip row 0 and flatten time and batch into one axis for the loss.
            flat_scores = scores[1:].view(-1, vocab_size)
            flat_targets = trg[1:].view(-1)

            running_loss += criterion(flat_scores, flat_targets).item()

    return running_loss / len(loader)

In [None]:
optimizer = optim.Adam(model.parameters())
# The loss is computed on English targets, so padding is looked up in the English vocabulary.
PAD_IDX = en_vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

N_EPOCHS = 3
CLIP = 1  # maximum gradient norm

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    print(f'Epoch: {epoch+1}/{N_EPOCHS}')
    # The loaders are named train_loader / valid_loader where they are built;
    # the previous train_iter / valid_iter names were never defined.
    train_loss = train(model, train_loader, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_loader, criterion)

    # Keep only the best checkpoint, at the relative path the load cell reads.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'seq2seq.h5')

    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f}')

In [11]:
#model.load_state_dict(torch.load('seq2seq.h5'))

<All keys matched successfully>

# Text translation

In [16]:
results = []
model.eval()
PAD_IDX = ru_vocab['<pad>']
SOS_IDX = ru_vocab['<sos>']
EOS_IDX = ru_vocab['<eos>']

EVAL_PATH = '/content/eval-ru-100.txt'
MAX_TRANSLATION_LEN = 100  # decoding runs for at most MAX_TRANSLATION_LEN - 1 steps


def greedy_translate(sentence, max_len=MAX_TRANSLATION_LEN):
    """Translate one Russian sentence to English with greedy decoding.

    Args:
        sentence: Raw Russian text (a single line).
        max_len: Decoding budget; at most ``max_len - 1`` tokens are generated.

    Returns:
        The generated English tokens joined by single spaces, without the
        ``<eos>`` token and without leading or trailing whitespace.
    """
    token_ids = torch.tensor(
        [ru_vocab[token] for token in word_tokenize(sentence, language='russian')],
        dtype=torch.long)
    wrapped = torch.cat([torch.tensor([SOS_IDX]), token_ids, torch.tensor([EOS_IDX])], dim=0)
    # pad_sequence on a one-element list only adds the batch axis: (src_len, 1).
    src = pad_sequence([wrapped], padding_value=PAD_IDX).to(device)

    encoder_outputs, hidden = model.encoder(src)

    step_input = torch.tensor([en_vocab['<sos>']], dtype=torch.long).to(device)
    words = []
    for _ in range(1, max_len):
        scores, hidden = model.decoder(step_input, hidden, encoder_outputs)
        # Greedy choice: feed the highest-scoring token back in.
        step_input = scores.max(1)[1]
        word = en_vocab.itos[step_input.item()]
        # Stop at the first <eos>; later steps could not affect the result.
        if word == '<eos>':
            break
        words.append(word)
    return ' '.join(words)


with torch.no_grad():
    with io.open(EVAL_PATH, encoding="utf8") as f:
        for string in f:
            results.append(greedy_translate(string))

In [17]:
# Write one translation per line with no newline after the last one.
# The previous `i == 100` check never matched the last index (len(results) - 1),
# so every line, including the last, was followed by a newline.
with open('answer.txt', 'w', encoding='utf8') as f:
    f.write('\n'.join(results))