In [1]:
from io import open
import unicodedata
import string
import re
import random
import time
import math

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
import pandas as pd

import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

# 1. Загрузка и подготовка данных

In [3]:
!tail rus-eng/rus.txt

We need to uphold laws against discrimination — in hiring, and in housing, and in education, and in the criminal justice system. That is what our Constitution and our highest ideals require.	Нам нужно отстаивать законы против дискриминации при найме на работу, в жилищной сфере, в сфере образования и правоохранительной системе. Этого требуют наша Конституция и высшие идеалы.	CC-BY 2.0 (France) Attribution: tatoeba.org #5762728 (BHO) & #6390439 (odexed)
I've heard that you should never date anyone who is less than half your age plus seven. Tom is now 30 years old and Mary is 17. How many years will Tom need to wait until he can start dating Mary?	Я слышал, что никогда не следует встречаться с кем-то вдвое младше вас плюс семь лет. Тому 30 лет, a Мэри 17. Сколько лет Тому нужно ждать до тех пор, пока он сможет начать встречаться с Мэри?	CC-BY 2.0 (France) Attribution: tatoeba.org #10068197 (CK) & #10644473 (notenoughsun)
I do have one final ask of you as your president, the same thing I a

In [4]:
SOS_token = 0
EOS_token = 1

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2 

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [5]:
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )


def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Zа-яА-ЯёЁ.!?]+", r" ", s)
    s = re.sub(r"\s+", r" ", s)
    s = s.strip()
    return s

In [25]:
def readLangs(lang1, lang2, file_name, reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    lines = open(file_name, encoding='utf-8').\
        read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalizeString(l.split('\t')[0]), normalizeString(l.split('\t')[1])] for l in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [7]:
MAX_LENGTH = 10

eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)

def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH and \
        p[1].startswith(eng_prefixes)


def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [24]:
def prepareData(lang1, lang2, file_name, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, file_name, reverse)
    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs

file_name = 'rus-eng/rus.txt'
input_lang, output_lang, pairs = prepareData('eng', 'rus', file_name, True)
print(random.choice(pairs))

Reading lines...
Read 496059 sentence pairs
Trimmed to 28336 sentence pairs
Counting words...
Counted words:
rus 9895
eng 4171
['он играет у себя в комнате .', 'he is playing in his room .']


# 2. Код обучения модели

In [26]:
class EncoderRNN(nn.Module):
    def __init__(self, rnnClass, input_size, hidden_size, num_layers=1, dropout_p=0.1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnnClass = rnnClass

        self.embedding = nn.Embedding(input_size, self.hidden_size)
        self.rnn = rnnClass(self.hidden_size, self.hidden_size, num_layers=num_layers, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        embedded = self.dropout(self.embedding(input))
        output, hidden = self.rnn(embedded)
        return output, hidden

In [27]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(1, -1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)


def get_dataloader(batch_size, input_lang, output_lang, pairs):
    n = len(pairs)
    input_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32)
    target_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32)

    for idx, (inp, tgt) in enumerate(pairs):
        inp_ids = indexesFromSentence(input_lang, inp)
        tgt_ids = indexesFromSentence(output_lang, tgt)
        inp_ids.append(EOS_token)
        tgt_ids.append(EOS_token)
        input_ids[idx, :len(inp_ids)] = inp_ids
        target_ids[idx, :len(tgt_ids)] = tgt_ids

    train_data = TensorDataset(torch.LongTensor(input_ids).to(device),
                               torch.LongTensor(target_ids).to(device))

    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
    return train_dataloader

In [28]:
def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [29]:
def train(dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    target_length = len(dataloader)

    total_loss = 0
    for data in dataloader:
        input_tensor, target_tensor = data

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        encoder_outputs, encoder_hidden = encoder(input_tensor)

        decoder_outputs, decoder_hidden, decoder_attention = decoder(
            encoder_outputs, encoder_hidden, target_tensor)

        loss = criterion(
            decoder_outputs.view(-1, decoder_outputs.size(-1)),
            target_tensor.view(-1))

        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item()
    return total_loss / target_length

In [30]:
def trainIters(train_dataloader, encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01, model_name=''):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  
    plot_loss_total = 0 

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):

        loss = train(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

    return model_name, n_iters + 1, time.time() - start, print_loss_avg

In [31]:
def evaluate(encoder, decoder, sentence, input_lang, output_lang):
    with torch.no_grad(): 
        input_tensor = tensorFromSentence(input_lang, sentence)  
        encoder_outputs, encoder_hidden = encoder(input_tensor) 
        decoder_outputs, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden) 
         
        _, topi = decoder_outputs.topk(1) 
        decoded_ids = topi.squeeze()  

        decoded_words = [] 
        for idx in decoded_ids: 
            if idx.item() == EOS_token: 
                decoded_words.append('<EOS>') 
                break 
            decoded_words.append(output_lang.index2word[idx.item()]) 
    
    return decoded_words, decoder_attn

In [32]:
def evaluateRandomly(encoder, decoder, input_lang, output_lang, n=10): 
    for i in range(n): 
        pair = random.choice(pairs) 
        print('>', pair[0]) 
        print('=', pair[1]) 
        output_words, _ = evaluate(encoder, decoder, pair[0], input_lang, output_lang) 
        output_sentence = ' '.join(output_words) 
        print('<', output_sentence) 
        print('')

In [33]:
df_result = pd.DataFrame(columns=['model', 'epochs', 'time', 'loss'])

In [34]:
hidden_size = 128
batch_size = 16

train_dataloader = get_dataloader(batch_size, input_lang, output_lang, pairs)

# 3. Sequence2Sequence Attention (на основе скалярного произведения)

In [35]:
class DotAttention(nn.Module):
    def __init__(self, hidden_size, dropout=0.1):
        super(DotAttention, self).__init__()
        self.dropout = nn.Dropout(dropout)

    def forward(self, query, keys, valid_lens=None):
        scores = torch.bmm(query, keys.transpose(1, 2)) / math.sqrt(query.shape[-1])

        weights = F.softmax(scores, dim=-1)
        context = torch.bmm(self.dropout(weights), keys)
        return context, weights

class AttnDot_DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1):
        super(AttnDot_DecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attention = DotAttention(hidden_size)
        self.gru = nn.GRU(2 * hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input, hidden, target_tensor=None):
        decoder_input = torch.empty(input.size(0), 1, dtype=torch.long, device=device).fill_(SOS_token)
        outputs = []
        attentions = []

        for i in range(MAX_LENGTH):

            embedding = self.dropout(self.embedding(decoder_input))
            context, attn_weights = self.attention(hidden.permute(1, 0, 2), input)
            output, hidden = self.gru(torch.cat((embedding, context), dim=2), hidden)

            outputs.append(self.out(output))
            attentions.append(attn_weights)

            if target_tensor is not None:
                decoder_input = target_tensor[:, i].unsqueeze(1)
            else:
                _, topi = output.topk(1)
                decoder_input = topi.squeeze(-1).detach()

        outputs = F.log_softmax(torch.cat(outputs, dim=1), dim=-1)
        attentions = torch.cat(attentions, dim=1)

        return outputs, hidden, attentions

In [36]:
encoder1 = EncoderRNN(nn.GRU, input_lang.n_words, hidden_size).to(device)
decoder1 = AttnDot_DecoderRNN(hidden_size, output_lang.n_words).to(device)

df_result.loc[len(df_result)] = trainIters(train_dataloader, encoder1, decoder1, 500, print_every=50, model_name='Scaled dot product attention')

30m 40s (- 276m 4s) (50 10%) 2.0279
58m 17s (- 233m 9s) (100 20%) 1.5455
87m 20s (- 203m 48s) (150 30%) 1.3327
117m 3s (- 175m 35s) (200 40%) 1.1809
147m 9s (- 147m 9s) (250 50%) 1.0636
177m 37s (- 118m 24s) (300 60%) 0.9695
208m 31s (- 89m 22s) (350 70%) 0.8915
239m 15s (- 59m 48s) (400 80%) 0.8256
269m 8s (- 29m 54s) (450 90%) 0.7696
298m 49s (- 0m 0s) (500 100%) 0.7205


In [37]:
encoder1.eval() 
decoder1.eval()

evaluateRandomly(encoder1, decoder1, input_lang, output_lang)

> меня беспокоит здоровье моеи матери .
= i m worried about my mother s health .
< i am concerned for to me me me me .

> ты шутишь !
= you re kidding !
< you re you you you you you you . .

> он быстрыи .
= he s fast .
< he am as i . . . . . .

> я жду когда откроется магазин .
= i m waiting for the store to open .
< i re waiting on on her . . . .

> меня интересует изучение немецкои культуры .
= i am interested in studying german culture .
< i m my my my my . . job job

> вы еще молоды и неопытны .
= you re still young and inexperienced .
< you re is you you you you <EOS>

> вы еще здесь .
= you re still here .
< you re here here here you here . . .

> я танцовщица .
= i m a dancer .
< i aren i i as . . . child .

> мы с томом хорошие друзья .
= i m good friends with tom .
< i m about with to to for . . .

> он ответственен за это .
= he is responsible for it .
< he is all for for for that . . .



# 4. Sequence2Sequence Attention (на основе MLP)

In [38]:
class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        self.Wa = nn.Linear(hidden_size, hidden_size)
        self.Ua = nn.Linear(hidden_size, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)

    def forward(self, query, keys):
        scores = self.Va(torch.tanh(self.Wa(query) + self.Ua(keys)))
        scores = scores.squeeze(2).unsqueeze(1)

        weights = F.softmax(scores, dim=-1)
        context = torch.bmm(weights, keys)

        return context, weights

class AttnMLP_DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1):
        super(AttnMLP_DecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attention = Attention(hidden_size)
        self.gru = nn.GRU(2 * hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input, hidden, target_tensor=None):
        decoder_input = torch.empty(input.size(0), 1, dtype=torch.long, device=device).fill_(SOS_token)
        outputs = []
        attentions = []

        for i in range(MAX_LENGTH):

            embedding = self.dropout(self.embedding(decoder_input))
            context, attn_weights = self.attention(hidden.permute(1, 0, 2), input)
            output, hidden = self.gru(torch.cat((embedding, context), dim=2), hidden)
            output = self.out(output)
            outputs.append(output)
            attentions.append(attn_weights)

            if target_tensor is not None:
                decoder_input = target_tensor[:, i].unsqueeze(1)
            else:
                _, topi = output.topk(1)
                decoder_input = topi.squeeze(-1).detach()

        outputs = F.log_softmax(torch.cat(outputs, dim=1), dim=-1)
        attentions = torch.cat(attentions, dim=1)

        return outputs, hidden, attentions

In [40]:
encoder2 = EncoderRNN(nn.GRU, input_lang.n_words, hidden_size).to(device)
decoder2 = AttnMLP_DecoderRNN(hidden_size, output_lang.n_words).to(device)

df_result.loc[len(df_result)] = trainIters(train_dataloader, encoder2, decoder2, 500, print_every=50, model_name='MLP attention')

30m 40s (- 276m 8s) (50 10%) 1.9864
61m 22s (- 245m 29s) (100 20%) 1.4813
93m 12s (- 217m 30s) (150 30%) 1.2704
123m 35s (- 185m 23s) (200 40%) 1.1173
153m 59s (- 153m 59s) (250 50%) 1.0012
185m 11s (- 123m 27s) (300 60%) 0.9088
215m 12s (- 92m 13s) (350 70%) 0.8330
245m 0s (- 61m 15s) (400 80%) 0.7694
274m 33s (- 30m 30s) (450 90%) 0.7156
303m 59s (- 0m 0s) (500 100%) 0.6692


In [41]:
encoder2.eval() 
decoder2.eval()

evaluateRandomly(encoder2, decoder2, input_lang, output_lang)

> мне правда страшно .
= i m really scared .
< i m really tired . <EOS>

> это человек на которого ты можешь положиться .
= he s a man you can rely on .
< he is an expert man you can do . <EOS>

> ты успешен .
= you re successful .
< you re free at the right eye . <EOS>

> мы почти все .
= we re almost done .
< we re almost there all . <EOS>

> ты ужасно поешь .
= you re a horrible singer .
< you re a lot like the world . <EOS>

> я рад принять ваше приглашение .
= i m glad to accept your invitation .
< i m glad to hear your voice . <EOS>

> я устала этим заниматься .
= i m tired of doing this .
< i m tired of doing this too . <EOS>

> он славныи .
= he is nice .
< he is a safe taller than his brother . <EOS>

> я рад что приехал в австралию .
= i m glad that i came to australia .
< i m glad i m back to australia . <EOS>

> ты очень открытая .
= you re very open .
< you re very very your brother to drive . <EOS>



# 5. Сводная таблица

In [42]:
df_result.head()

Unnamed: 0,model,epochs,time,loss
0,Scaled dot product attention,501,17930.004089,0.720537
1,MLP attention,501,18239.724421,0.669153
