In [143]:
%matplotlib inline

In [144]:
from io import open
import unicodedata
import string
import re
import random
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# Tensors and models created in the cells below are placed on this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [145]:
# Mount Google Drive at /content/drive so the corpus file can be read.
# NOTE(review): the loading cell opens the relative path
# "drive/MyDrive/rus-eng/rus.txt", which presumably resolves against a working
# directory of /content - confirm when running outside the default Colab setup.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [146]:
# Strip combining marks (accents) from a Unicode string, based on
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return `s` with every combining mark removed.

    The string is decomposed with NFD, which splits a composed character
    into its base character followed by combining marks (Unicode category
    'Mn'); those marks are then dropped. Characters without a decomposition
    are kept unchanged, so despite the name the result is not guaranteed to
    be ASCII.

    NOTE(review): Cyrillic letters that decompose under NFD are folded too,
    e.g. the sample output of this notebook shows 'здравствуите' and
    'прыгаи' - confirm this is acceptable for the Russian side.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) == 'Mn':
            continue
        kept.append(ch)
    return ''.join(kept)

# Lowercase, trim, strip accents and remove basic punctuation (. , ! ?)


def normalizeString(s):
    """Normalise one sentence before it is added to a vocabulary.

    The text is lower-cased, stripped of leading/trailing whitespace and
    passed through `unicodeToAscii` (which removes combining marks). The
    characters '.', ',', '!' and '?' are then deleted; an exclamation mark
    preceded by a space is deleted together with that space.

    Args:
        s: raw sentence text.

    Returns:
        The normalised sentence.
    """
    s = unicodeToAscii(s.lower().strip())
    # " !" has to be handled before "!": once every "!" is gone the
    # two-character pattern can never match and its space would be left
    # behind (e.g. "va !" -> "va ").
    for mark in (".", ",", " !", "!", "?"):
        s = s.replace(mark, "")
    return s

In [147]:
# Read the whole corpus; the context manager closes the file handle as soon as
# the text has been read.
with open("drive/MyDrive/rus-eng/rus.txt", encoding='utf-8') as corpus_file:
    lines = corpus_file.read().strip().split('\n')

# Keep only the first two tab-separated fields of every line (judging by the
# sample below: the English sentence and its Russian translation) and
# normalise just those, instead of normalising fields that are then dropped.
loaded_pairs = [[normalizeString(field) for field in line.split('\t')[:2]] for line in lines]

# Display one of the longest pairs as a sanity check.
loaded_pairs[-30][0], loaded_pairs[-30][1]

('if you take a child outside and point at the moon the child will look at the moon if you do the same with a dog it will look at your finger',
 'если вы выведете ребенка и покажете ему пальцем на луну ребенок посмотрит на луну если вы то же самое проделаете с собакои она будет смотреть на ваш палец')

In [148]:
class Lang:
    """Word-level vocabulary of a single language.

    Attributes are kept in sync by `addWord`:
      * word2index - word -> integer id
      * index2word - integer id -> word (ids 0 and 1 are pre-filled with
        "SOS" and "EOS")
      * word2count - word -> number of times it was added
      * n_words    - number of ids handed out so far, including SOS/EOS
    """

    def __init__(self, name):
        self.name = name
        self.n_words = 2  # ids 0 and 1 are taken by SOS and EOS
        self.index2word = {0: "SOS", 1: "EOS"}
        self.word2index = {}
        self.word2count = {}

    def addSentence(self, sentence):
        """Add every space-separated token of `sentence` to the vocabulary."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Register `word`, assigning the next free id on first sight."""
        if word in self.word2index:
            # Already known: only the frequency changes.
            self.word2count[word] += 1
            return

        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1

In [149]:
# Reserved vocabulary ids; they match the entries Lang pre-fills in index2word.
SOS_token = 0  # start-of-sequence: fed to the decoder as its first input
EOS_token = 1  # end-of-sequence: appended to every sentence tensor

In [150]:
# Sentence-length cap: the filtering cell keeps only pairs in which both
# sentences have fewer than MAX_LENGTH space-separated words. It is also the
# default `max_length` of train() and evaluate().
MAX_LENGTH = 4

In [151]:
# Build the training set from the loaded corpus:
#   * a pair is skipped when its first sentence equals the first sentence of
#     the last pair that had a different first sentence (i.e. only the first
#     entry of each run of identical source sentences is considered);
#   * a pair is kept only if both sentences have fewer than MAX_LENGTH words.
pairs = []

prev_pair = ['', '']
for pair in loaded_pairs:
    if pair[0] != prev_pair[0]:
        # Remember this pair for duplicate detection even if the length
        # filter below rejects it.
        prev_pair = pair
        source_is_short = len(pair[0].split(' ')) < MAX_LENGTH
        if source_is_short and len(pair[1].split(' ')) < MAX_LENGTH:
            pairs.append(pair)

pairs[:10]

[['go', 'марш'],
 ['hi', 'здравствуите'],
 ['run', 'беги'],
 ['who', 'кто'],
 ['wow', 'вот это да'],
 ['duck', 'пригнись'],
 ['fire', 'огонь'],
 ['help', 'помогите'],
 ['hide', 'прячься'],
 ['jump', 'прыгаи']]

In [152]:
# Vocabularies: input_lang indexes the first sentence of each pair (English),
# output_lang the second one (Russian).
input_lang = Lang("en")
output_lang = Lang("ru")

In [153]:
# Register every word of the filtered pairs in the matching vocabulary.
# Re-running this cell without recreating the Lang objects counts every word
# again (word2count keeps growing).
for pair in pairs:
    input_lang.addSentence(pair[0])
    output_lang.addSentence(pair[1])

# Vocabulary sizes, including the SOS and EOS entries.
print("Counted words:")
for lang in (input_lang, output_lang):
    print(lang.name, lang.n_words)

Counted words:
en 5186
ru 9012


The Encoder
-----------





In [154]:
class EncoderRNN(nn.Module):
    """Encoder that consumes one token per call.

    A token id is embedded into a `hidden_size` vector and fed through a
    single-layer LSTM as a sequence of length 1 with batch size 1.

    NOTE(review): `forward` passes the same tensor as both the hidden and the
    cell state and returns only the new hidden state, so the LSTM cell state
    is discarded between steps - confirm this is intended.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Encode one token.

        Args:
            input: tensor holding a single token id.
            hidden: previous hidden state, reshaped nowhere here, so it must
                already be what the LSTM accepts (initHidden gives (1, 1, hidden_size)).

        Returns:
            (output, hidden): the LSTM output for this step and the new
            hidden state.
        """
        # Reshape the embedding to (seq_len=1, batch=1, hidden_size).
        step_input = self.embedding(input).view(1, 1, -1)
        output, state = self.lstm(step_input, (hidden, hidden))
        # state is (h_n, c_n); only h_n is propagated to the caller.
        return output, state[0]

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size)."""
        shape = (1, 1, self.hidden_size)
        return torch.zeros(*shape, device=device)

The Decoder
-----------




In [155]:
class DecoderRNN(nn.Module):
    """Decoder that emits one token distribution per call.

    The previous token id is embedded, passed through ReLU and a
    single-layer LSTM (sequence length 1, batch size 1), then projected to
    `output_size` log-probabilities.

    NOTE(review): as in the encoder, the hidden state is reused as the cell
    state and the new cell state is dropped - confirm this is intended.
    """

    def __init__(self, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Decode one step.

        Args:
            input: tensor holding the previous token id.
            hidden: previous hidden state.

        Returns:
            (log_probs, hidden): log-probabilities of shape (1, output_size)
            and the new hidden state.
        """
        embedded = F.relu(self.embedding(input).view(1, 1, -1))
        lstm_out, state = self.lstm(embedded, (hidden, hidden))
        # lstm_out[0] drops the length-1 sequence axis before the projection.
        log_probs = self.softmax(self.out(lstm_out[0]))
        return log_probs, state[0]

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size)."""
        shape = (1, 1, self.hidden_size)
        return torch.zeros(*shape, device=device)

In [156]:
def indexesFromSentence(lang, sentence):
    """Map each space-separated word of `sentence` to its id in `lang`.

    Raises:
        KeyError: if a word is missing from `lang.word2index`.
    """
    indexes = []
    for word in sentence.split(' '):
        indexes.append(lang.word2index[word])
    return indexes


def tensorFromSentence(lang, sentence):
    """Encode `sentence` as a column tensor of word ids ending with EOS.

    Returns:
        A long tensor of shape (number_of_words + 1, 1) on `device`.
    """
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    flat = torch.tensor(token_ids, dtype=torch.long, device=device)
    return flat.view(-1, 1)


def tensorsFromPair(pair):
    """Convert a [source, target] sentence pair into (input, target) tensors.

    The first sentence is encoded with `input_lang`, the second with
    `output_lang`.
    """
    return (
        tensorFromSentence(input_lang, pair[0]),
        tensorFromSentence(output_lang, pair[1]),
    )

In [157]:
# Probability that a training step feeds the ground-truth token (rather than
# the decoder's own prediction) as the next decoder input.
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    """Run one optimisation step on a single (input, target) sentence pair.

    The input is fed to the encoder token by token; the final encoder hidden
    state initialises the decoder, which is then unrolled over the target.
    With probability `teacher_forcing_ratio` the target tokens are used as
    decoder inputs; otherwise the decoder's own top-1 prediction is fed back
    and decoding stops as soon as it predicts EOS.

    Args:
        input_tensor: source token ids, indexed along dimension 0.
        target_tensor: target token ids, indexed along dimension 0.
        encoder, decoder: the two models; both are updated.
        encoder_optimizer, decoder_optimizer: their optimizers.
        criterion: loss applied to (decoder_output, target token).
        max_length: unused; kept so that existing callers passing it keep
            working. The former `encoder_outputs` buffer it sized was never
            read and made inputs longer than `max_length` raise IndexError.

    Returns:
        The summed loss divided by the target length (also when decoding
        stopped early on a predicted EOS).
    """
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    loss = 0

    # Only the final hidden state is needed; per-step outputs are not used.
    for ei in range(input_length):
        _, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)

    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden

    use_teacher_forcing = random.random() < teacher_forcing_ratio

    for di in range(target_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        loss += criterion(decoder_output, target_tensor[di])

        if use_teacher_forcing:
            # Teacher forcing: feed the target as the next input
            decoder_input = target_tensor[di]
        else:
            # Feed back the model's own prediction, detached from history
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [158]:
import time
import math


def asMinutes(s):
    """Format a duration given in seconds as '<minutes>m <seconds>s'."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)


def timeSince(since, percent):
    """Return 'elapsed (- remaining)' for a job started at `since`.

    Args:
        since: start timestamp as returned by time.time().
        percent: fraction of the work already done; the total time is
            estimated as elapsed / percent, so it must be non-zero.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

In [159]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    """Train encoder and decoder with SGD on `n_iters` randomly drawn pairs.

    All training pairs are sampled (with replacement) from `pairs` and
    converted to tensors up front, then `train` is called once per pair.

    Args:
        encoder, decoder: models to train.
        n_iters: number of single-pair training steps.
        print_every: print the average loss every this many steps.
        plot_every: record the average loss every this many steps.
        learning_rate: SGD learning rate for both optimizers.

    The recorded averages are passed to `showPlot` at the end.
    """
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for _ in range(n_iters)]
    criterion = nn.NLLLoss()

    for step, (input_tensor, target_tensor) in enumerate(training_pairs, start=1):
        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if step % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            progress = step / n_iters
            print('%s (%d %d%%) %.4f' % (timeSince(start, progress),
                                         step, progress * 100, print_loss_avg))

        if step % plot_every == 0:
            plot_losses.append(plot_loss_total / plot_every)
            plot_loss_total = 0

    showPlot(plot_losses)

In [160]:
import matplotlib.pyplot as plt
# NOTE(review): this switches matplotlib to the 'agg' backend after the first
# cell requested `%matplotlib inline`; figures produced by showPlot may
# therefore not be rendered in the notebook - confirm this is intended.
plt.switch_backend('agg')
import matplotlib.ticker as ticker
# numpy is already imported in the first import cell; this repeat is harmless.
import numpy as np


def showPlot(points):
    """Plot `points` as a line chart with y-axis ticks every 0.2.

    Args:
        points: sequence of values to plot against their index.

    A single figure is created via plt.subplots(); the extra plt.figure()
    call that used to precede it only produced an additional empty figure.
    """
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    # Draw on the axes created above rather than on the implicit current axes.
    ax.plot(points)

In [161]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    """Translate `sentence` greedily and return the decoded words.

    The sentence is converted with `tensorFromSentence(input_lang, ...)`,
    encoded token by token, and the decoder is unrolled from SOS feeding
    back its top-1 prediction at every step. Gradients are disabled.

    Args:
        encoder, decoder: trained models.
        sentence: source sentence; it is used as given (no normalisation is
            applied here), and a word missing from `input_lang` makes the
            id lookup raise KeyError.
        max_length: maximum number of decoding steps. It no longer limits
            the input length: the `encoder_outputs` buffer that was sized by
            it was never read and raised IndexError for longer inputs.

    Returns:
        List of output words; it ends with '<EOS>' when the decoder
        predicted EOS within `max_length` steps.
    """
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        # Only the final hidden state is needed to start the decoder.
        for ei in range(input_length):
            _, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []

        for di in range(max_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            # Inside no_grad() the legacy `.data` access is unnecessary.
            topv, topi = decoder_output.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words

In [162]:
def evaluateRandomly(encoder, decoder, n=10):
    """Print `n` randomly chosen pairs together with the model's translation.

    For each sample the source is printed after '>', the reference
    translation after '=', and the model output after '<'. Samples are drawn
    from `pairs`, the same list used for training.
    """
    for _ in range(n):
        sample = random.choice(pairs)
        print('>', sample[0])
        print('=', sample[1])
        prediction = ' '.join(evaluate(encoder, decoder, sample[0]))
        print('<', prediction)
        print('')

In [163]:
# Size of the embeddings and of the LSTM hidden state in both models.
hidden_size = 256
# One embedding row / output unit per vocabulary entry of each language.
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder1 = DecoderRNN(hidden_size, output_lang.n_words).to(device)

# 150000 single-pair SGD steps, reporting the average loss every 5000 steps.
# NOTE(review): no random seed is set in the visible cells, so the sampled
# pairs, the initial weights and hence the losses differ between runs.
trainIters(encoder1, decoder1, 150000, print_every=5000)

0m 34s (- 16m 52s) (5000 3%) 5.0351
1m 1s (- 14m 18s) (10000 6%) 4.9574
1m 27s (- 13m 8s) (15000 10%) 4.7601
1m 53s (- 12m 20s) (20000 13%) 4.4711
2m 20s (- 11m 40s) (25000 16%) 4.2417
2m 46s (- 11m 5s) (30000 20%) 4.0680
3m 13s (- 10m 35s) (35000 23%) 3.8825
3m 40s (- 10m 5s) (40000 26%) 3.7422
4m 6s (- 9m 35s) (45000 30%) 3.6018
4m 33s (- 9m 6s) (50000 33%) 3.4724
4m 59s (- 8m 37s) (55000 36%) 3.3294
5m 26s (- 8m 9s) (60000 40%) 3.2352
5m 53s (- 7m 42s) (65000 43%) 3.0793
6m 20s (- 7m 14s) (70000 46%) 3.0016
6m 47s (- 6m 47s) (75000 50%) 2.8895
7m 14s (- 6m 20s) (80000 53%) 2.7964
7m 41s (- 5m 53s) (85000 56%) 2.6961
8m 8s (- 5m 25s) (90000 60%) 2.6007
8m 36s (- 4m 59s) (95000 63%) 2.4987
9m 4s (- 4m 32s) (100000 66%) 2.3825
9m 31s (- 4m 4s) (105000 70%) 2.2888
9m 58s (- 3m 37s) (110000 73%) 2.2301
10m 25s (- 3m 10s) (115000 76%) 2.1227
10m 53s (- 2m 43s) (120000 80%) 2.0644
11m 20s (- 2m 16s) (125000 83%) 1.9531
11m 48s (- 1m 48s) (130000 86%) 1.8723
12m 15s (- 1m 21s) (135000 90%) 

In [164]:
# Print 10 random pairs ('>' source, '=' reference, '<' model output).
# The samples come from `pairs`, i.e. the training data, so this is not a
# held-out evaluation.
evaluateRandomly(encoder1, decoder1)

> you must go
= ты должен поити
< ты должен помочь <EOS>

> you didn't look
= ты не посмотрел
< ты не посмотрел <EOS>

> we're deluding ourselves
= мы обманываем себя
< мы будем себя <EOS>

> we're tired
= мы устали
< мы устали <EOS>

> i felt cold
= мне было холодно
< мне холодно <EOS>

> everyone noticed that
= все это заметили
< все это заметили <EOS>

> he's the scapegoat
= он козел отпущения
< он козел <EOS>

> we're too weak
= мы слишком слабы
< мы слишком закончили <EOS>

> i'll arrange that
= я это улажу
< я это <EOS>

> it's really clean
= оно очень чистое
< это очень тяжело <EOS>

