In [1]:
from io import open
import unicodedata
import string
import re
import random
import time
import math

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import pandas as pd

import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

# 1. Загрузка и подготовка данных

In [9]:
!tail rus.txt

We need to uphold laws against discrimination — in hiring, and in housing, and in education, and in the criminal justice system. That is what our Constitution and our highest ideals require.	Нам нужно отстаивать законы против дискриминации при найме на работу, в жилищной сфере, в сфере образования и правоохранительной системе. Этого требуют наша Конституция и высшие идеалы.	CC-BY 2.0 (France) Attribution: tatoeba.org #5762728 (BHO) & #6390439 (odexed)
I've heard that you should never date anyone who is less than half your age plus seven. Tom is now 30 years old and Mary is 17. How many years will Tom need to wait until he can start dating Mary?	Я слышал, что никогда не следует встречаться с кем-то вдвое младше вас плюс семь лет. Тому 30 лет, a Мэри 17. Сколько лет Тому нужно ждать до тех пор, пока он сможет начать встречаться с Мэри?	CC-BY 2.0 (France) Attribution: tatoeba.org #10068197 (CK) & #10644473 (notenoughsun)
I do have one final ask of you as your president, the same thing I a

In [10]:
SOS_token = 0
EOS_token = 1

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [11]:
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Zа-яА-ЯёЁ.!?]+", r" ", s)
    s = re.sub(r"\s+", r" ", s)
    return s

In [12]:
def readLangs(lang1, lang2, file_name, reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    lines = open(file_name, encoding='utf-8').\
        read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalizeString(l.split('\t')[0]), normalizeString(l.split('\t')[1])] for l in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [13]:
MAX_LENGTH = 10

eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)

def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH and \
        p[1].startswith(eng_prefixes)

def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [14]:
def prepareData(lang1, lang2, file_name, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, file_name, reverse)
    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs

file_name = 'rus.txt'
input_lang, output_lang, pairs = prepareData('eng', 'rus', file_name, True)
print(random.choice(pairs))

Reading lines...
Read 496059 sentence pairs
Trimmed to 28719 sentence pairs
Counting words...
Counted words:
rus 10177
eng 4303
['ты наша заложница .', 'you re our hostage .']


# 2. Код обучения модели

In [15]:
class EncoderRNN(nn.Module):
    def __init__(self, rnnClass, input_size, hidden_size, num_layers=1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnnClass = rnnClass

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.rnn = rnnClass(hidden_size, hidden_size, num_layers=num_layers)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.rnn(output, hidden)
        return output, hidden

    def initHidden(self):
        hidden = torch.zeros(self.num_layers, 1, self.hidden_size, device=device)
        if 'LSTM' in str(self.rnnClass): hidden = tuple([hidden, hidden])
        return hidden

In [16]:
class DecoderRNN(nn.Module):
    def __init__(self, rnnClass, hidden_size, output_size, num_layers=1):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnnClass = rnnClass

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.rnn = rnnClass(hidden_size, hidden_size, num_layers=num_layers)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.rnn(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        hidden = torch.zeros(self.num_layers, 1, self.hidden_size, device=device)
        if 'LSTM' in str(self.rnnClass): hidden = tuple([hidden, hidden])
        return hidden

In [17]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

In [18]:
teacher_forcing_ratio = 0.5

def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [19]:
def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [20]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01, model_name=''):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

    return model_name, n_iters + 1, time.time() - start, print_loss_avg

In [21]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []

        for di in range(max_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words

In [22]:
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [23]:
df_result = pd.DataFrame(columns=['model', 'epochs', 'time', 'loss'])

# 3. GRU

In [77]:
hidden_size = 256
encoder1 = EncoderRNN(nn.GRU, input_lang.n_words, hidden_size).to(device)
decoder1 = DecoderRNN(nn.GRU, hidden_size, output_lang.n_words).to(device)

df_result.loc[len(df_result)] = trainIters(encoder1, decoder1, 50000, print_every=5000, model_name='GRU')

1m 12s (- 10m 52s) (5000 10%) 3.1544
2m 22s (- 9m 31s) (10000 20%) 2.6198
3m 33s (- 8m 18s) (15000 30%) 2.3808
4m 45s (- 7m 7s) (20000 40%) 2.1705
5m 56s (- 5m 56s) (25000 50%) 2.0014
7m 7s (- 4m 45s) (30000 60%) 1.8736
8m 18s (- 3m 33s) (35000 70%) 1.7440
9m 30s (- 2m 22s) (40000 80%) 1.6691
10m 41s (- 1m 11s) (45000 90%) 1.5840
11m 53s (- 0m 0s) (50000 100%) 1.4906


In [78]:
evaluateRandomly(encoder1, decoder1)

> я нахожусь в тои же лодке .
= i m in the same boat .
< i m the the in the . . <EOS>

> они в шкафу висят .
= they re hanging up in the closet .
< they are out out of town . <EOS>

> он сильныи .
= he is powerful .
< he s stupid . <EOS>

> тому помогал я .
= i m the one who helped tom .
< i m the one who tom tom . <EOS>

> мы сеичас все вместе .
= we re all together right now .
< we re all together together . <EOS>

> ему гораздо лучше .
= he s feeling much better .
< he s much better . . . <EOS>

> я вам ничего не говорю .
= i m not telling you anything .
< i m not telling anything anything . <EOS>

> он боится плавать .
= he is afraid of swimming .
< he s afraid to death . <EOS>

> ты не смотришь .
= you aren t looking .
< you re not a . <EOS>

> удивительно что ты так наивна .
= i m surprised you re so naive .
< i m surprised that you re so naive . <EOS>



# 4. GRU (num_layers=2)

In [79]:
hidden_size = 256
num_layers = 2

encoder1 = EncoderRNN(nn.GRU, input_lang.n_words, hidden_size, num_layers).to(device)
decoder1 = DecoderRNN(nn.GRU, hidden_size, output_lang.n_words, num_layers).to(device)

df_result.loc[len(df_result)] = trainIters(encoder1, decoder1, 50000, print_every=5000, model_name='GRU (num_layers=2)')

1m 30s (- 13m 37s) (5000 10%) 3.1207
3m 1s (- 12m 7s) (10000 20%) 2.6742
4m 32s (- 10m 36s) (15000 30%) 2.4334
6m 4s (- 9m 7s) (20000 40%) 2.2438
7m 35s (- 7m 35s) (25000 50%) 2.0753
9m 7s (- 6m 5s) (30000 60%) 1.9629
10m 39s (- 4m 34s) (35000 70%) 1.8178
12m 11s (- 3m 2s) (40000 80%) 1.7035
13m 42s (- 1m 31s) (45000 90%) 1.5651
15m 14s (- 0m 0s) (50000 100%) 1.5563


In [80]:
evaluateRandomly(encoder1, decoder1)

> я рад видеть что ты вернулся .
= i m glad to see you back .
< i m glad to see you re . <EOS>

> я очень рад что вы здесь .
= i m very glad that you re here .
< i m very glad you re re here . <EOS>

> я отдаю тебе все что могу .
= i m giving you all i can .
< i m not telling you what i can . <EOS>

> я не силен в самопродвижении .
= i m not good at self promotion .
< i m not in good room . <EOS>

> я спрашиваю твое мнение .
= i m asking your opinion .
< i m asking your help . <EOS>

> нам понадобится твоя помощь .
= we re going to need your help .
< we re going to need help help . <EOS>

> ты очень высокии .
= you re very tall .
< you re very tall . <EOS>

> вы не канадка .
= you re not canadian .
< you re not canadian . <EOS>

> я так счастлив это слышать .
= i m so happy to hear that .
< i m so happy that hear it . <EOS>

> они намного младше тома .
= they are much younger than tom .
< they re much than than tom . <EOS>



# 5. LSTM

In [81]:
hidden_size = 256
num_layers = 2

encoder1 = EncoderRNN(nn.LSTM, input_lang.n_words, hidden_size, num_layers).to(device)
decoder1 = DecoderRNN(nn.LSTM, hidden_size, output_lang.n_words, num_layers).to(device)

df_result.loc[len(df_result)] = trainIters(encoder1, decoder1, 50000, print_every=5000, model_name='LSTM')

1m 40s (- 15m 3s) (5000 10%) 3.3594
3m 20s (- 13m 23s) (10000 20%) 2.8512
5m 0s (- 11m 42s) (15000 30%) 2.6673
6m 42s (- 10m 3s) (20000 40%) 2.5197
8m 22s (- 8m 22s) (25000 50%) 2.3702
10m 5s (- 6m 43s) (30000 60%) 2.2436
11m 46s (- 5m 2s) (35000 70%) 2.1217
13m 28s (- 3m 22s) (40000 80%) 2.0194
15m 9s (- 1m 41s) (45000 90%) 1.9102
16m 52s (- 0m 0s) (50000 100%) 1.8241


In [83]:
evaluateRandomly(encoder1, decoder1)

> мы так за тебя рады .
= we re so happy for you .
< we re so happy for you . <EOS>

> я ничуть не умнее тома .
= i m not any smarter than tom .
< i m not younger than tom . <EOS>

> я в этом весьма заинтересован .
= i m pretty interested in it .
< i m sorry of this with . <EOS>

> она очаровательная женщина .
= she is a charming woman .
< she is a good . . <EOS>

> я все еще одинок .
= i am still alone .
< i m still angry . <EOS>

> она настоящая сплетница .
= she s a real gossip .
< she is a . . . <EOS>

> вы точны .
= you re precise .
< you re the . <EOS>

> мы идеально подходим друг другу .
= we re a perfect match .
< we re the best friend . <EOS>

> на неи синее платье .
= she is wearing a blue dress .
< she is two years older . <EOS>

> я не отец тома .
= i m not tom s father .
< i m not tom s s . <EOS>



# 6. Сводная таблица

In [82]:
df_result.head()

Unnamed: 0,model,epochs,time,loss
0,GRU,50001,713.082669,1.490574
1,GRU (num_layers=2),50001,914.910802,1.55632
2,LSTM,50001,1012.040815,1.824142
