In [1]:
%matplotlib inline

In [2]:
from io import open
import unicodedata
import string
import re
import random
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# Mount Google Drive at /content/drive (Google Colab only); the corpus is read
# from a path under drive/MyDrive further down.
# NOTE(review): the later relative path presumably relies on the working
# directory being /content — confirm when running outside the default Colab setup.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Strip combining marks (accents) from a Unicode string, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return ``s`` with all combining marks (Unicode category ``Mn``) removed.

    The string is decomposed with NFD first, so accented Latin letters lose
    their accents ("é" -> "e"). Despite the name, non-Latin letters are kept:
    the result is not guaranteed to be ASCII.

    Note: Cyrillic letters whose decomposition uses a combining mark are folded
    too ("й" -> "и", "ё" -> "е").
    """
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove sentence punctuation


def normalizeString(s):
    """Lowercase ``s``, strip accents and drop the punctuation ``. , ! ?``.

    Whitespace is collapsed to single spaces and trimmed *after* punctuation is
    removed, so inputs such as ``"go !"`` or ``"stop , now"`` do not leave
    trailing/double spaces behind (which would turn into empty-string tokens
    when the sentence is later split on ``' '``).

    Args:
        s: raw sentence.

    Returns:
        The normalised sentence with words separated by exactly one space.
    """
    s = unicodeToAscii(s.lower().strip())
    # Delete the four punctuation characters in a single pass.
    s = s.translate(str.maketrans("", "", ".,!?"))
    # Removing punctuation can expose extra spaces; normalise them.
    return " ".join(s.split())

In [5]:
# Read the tab-separated corpus; the context manager closes the file handle
# as soon as the text has been read.
with open("drive/MyDrive/rus-eng/rus.txt", encoding='utf-8') as corpus_file:
    lines = corpus_file.read().strip().split('\n')

# Keep only the first two tab-separated fields of every line and normalise them
# (slicing before normalising avoids processing fields that are discarded anyway).
loaded_pairs = [[normalizeString(s) for s in l.split('\t')[:2]] for l in lines]

loaded_pairs[-30][0], loaded_pairs[-30][1]

('if you take a child outside and point at the moon the child will look at the moon if you do the same with a dog it will look at your finger',
 'если вы выведете ребенка и покажете ему пальцем на луну ребенок посмотрит на луну если вы то же самое проделаете с собакои она будет смотреть на ваш палец')

In [6]:
class Lang:
    """Vocabulary of one language: word <-> index maps plus word frequencies.

    Indices 0 and 1 are pre-assigned to the "SOS" and "EOS" markers in
    ``index2word``; real words receive indices starting at 2.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        # SOS and EOS are already counted.
        self.n_words = 2

    def addSentence(self, sentence):
        """Register every space-separated token of ``sentence``."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Add ``word`` to the vocabulary, or bump its count if already known."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1

In [7]:
# Special token ids; they match the entries pre-seeded in Lang.index2word
# (0 -> "SOS", 1 -> "EOS").
SOS_token = 0  # fed to the decoder as its first input
EOS_token = 1  # appended to every sentence tensor; stops decoding when predicted

In [8]:
# Length cutoff: a sentence pair is kept only if both sides split into fewer
# than MAX_LENGTH words; also the default cap on decoding steps in evaluate().
MAX_LENGTH = 4

In [9]:
# Build the training set: skip a pair when its source sentence equals the
# source sentence of the immediately preceding pair, and keep only pairs
# whose two sides both have fewer than MAX_LENGTH words.
pairs = []

prev_pair = ['', '']
for pair in loaded_pairs:
    is_repeat = pair[0] == prev_pair[0]
    if not is_repeat:
        prev_pair = pair
        source_is_short = len(pair[0].split(' ')) < MAX_LENGTH
        if source_is_short and len(pair[1].split(' ')) < MAX_LENGTH:
            pairs.append(pair)

pairs[:10]

[['go', 'марш'],
 ['hi', 'здравствуите'],
 ['run', 'беги'],
 ['who', 'кто'],
 ['wow', 'вот это да'],
 ['duck', 'пригнись'],
 ['fire', 'огонь'],
 ['help', 'помогите'],
 ['hide', 'прячься'],
 ['jump', 'прыгаи']]

In [10]:
# Empty vocabularies: pair[0] sentences go into input_lang, pair[1] sentences
# into output_lang (filled in the next cell).
input_lang = Lang("en")
output_lang = Lang("ru")

In [11]:
# Populate both vocabularies from the filtered sentence pairs.
for source_sentence, target_sentence in pairs:
    input_lang.addSentence(source_sentence)
    output_lang.addSentence(target_sentence)

# Report vocabulary sizes (the counts include the SOS and EOS markers).
print("Counted words:")
for lang in (input_lang, output_lang):
    print(lang.name, lang.n_words)

Counted words:
en 5186
ru 9012


The Encoder
-----------





In [23]:
class EncoderRNN(nn.Module):
    """Token-at-a-time encoder: embedding -> nn.RNN -> nn.GRU.

    All three layers are ``hidden_size`` wide. ``forward`` reshapes its input
    to ``(1, 1, -1)``, i.e. it handles a single token per call.

    NOTE(review): the input goes through a vanilla RNN *and then* a GRU, with
    the RNN's hidden state handed to the GRU — confirm this stacking is
    intended rather than a leftover from switching cell types.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.rnn = nn.RNN(hidden_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Encode one token and return ``(output, hidden)`` from the GRU."""
        token_vec = self.embedding(input).view(1, 1, -1)
        rnn_out, rnn_hidden = self.rnn(token_vec, hidden)
        gru_out, gru_hidden = self.gru(rnn_out, rnn_hidden)
        return gru_out, gru_hidden

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size)."""
        return torch.zeros((1, 1, self.hidden_size), device=device)

The Decoder
-----------




In [34]:
class DecoderRNN(nn.Module):
    """Token-at-a-time decoder: embedding -> ReLU -> nn.RNN -> nn.GRU -> linear -> log-softmax.

    ``forward`` reshapes its input to ``(1, 1, -1)``, i.e. it handles a single
    token per call, and returns log-probabilities over ``output_size`` classes.

    NOTE(review): as in the encoder, a vanilla RNN is followed by a GRU —
    confirm this stacking is intended.
    """

    def __init__(self, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.rnn = nn.RNN(hidden_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Decode one step; return ``(log_probs, hidden)``.

        ``log_probs`` has shape (1, output_size); ``hidden`` is the GRU's new
        hidden state.
        """
        embedded = F.relu(self.embedding(input).view(1, 1, -1))
        rnn_out, hidden = self.rnn(embedded, hidden)
        gru_out, hidden = self.gru(rnn_out, hidden)
        # gru_out[0] drops the length-1 sequence axis before the projection.
        log_probs = self.softmax(self.out(gru_out[0]))
        return log_probs, hidden

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size)."""
        return torch.zeros((1, 1, self.hidden_size), device=device)

In [35]:
def indexesFromSentence(lang, sentence):
    """Map each space-separated word of ``sentence`` to its index in ``lang``.

    Raises:
        KeyError: if a word is missing from ``lang.word2index``.
    """
    lookup = lang.word2index
    token_ids = []
    for token in sentence.split(' '):
        token_ids.append(lookup[token])
    return token_ids


def tensorFromSentence(lang, sentence):
    """Encode ``sentence`` as a (length + 1, 1) long tensor ending in EOS_token."""
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    column = torch.tensor(token_ids, dtype=torch.long, device=device)
    return column.view(-1, 1)


def tensorsFromPair(pair):
    """Return ``(input_tensor, target_tensor)`` for a sentence pair.

    ``pair[0]`` is encoded with ``input_lang`` and ``pair[1]`` with ``output_lang``.
    """
    source_tensor = tensorFromSentence(input_lang, pair[0])
    return (source_tensor, tensorFromSentence(output_lang, pair[1]))

In [36]:
# Probability that train() feeds the ground-truth target token as the next
# decoder input instead of the decoder's own prediction.
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    """Run one optimisation step on a single (input, target) sentence pair.

    The encoder consumes ``input_tensor`` row by row; its final hidden state
    initialises the decoder, which is then unrolled over the target. With
    probability ``teacher_forcing_ratio`` the ground-truth token is fed as the
    next decoder input, otherwise the decoder's own top-1 prediction is used
    (stopping early if it predicts ``EOS_token``).

    Args:
        input_tensor: source token ids, indexed along dim 0 (one token per row,
            as produced by ``tensorFromSentence``).
        target_tensor: target token ids, same layout.
        encoder, decoder: the models to train.
        encoder_optimizer, decoder_optimizer: their optimizers.
        criterion: loss applied to (decoder_output, target_tensor[di]).
        max_length: unused; kept so existing callers keep working. Per-step
            encoder outputs are no longer buffered (nothing read them), so
            inputs longer than ``max_length`` no longer raise IndexError.

    Returns:
        The summed per-step loss divided by the full target length (also when
        decoding stopped early at EOS).
    """
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    loss = 0

    # Only the final hidden state is needed; per-step outputs are discarded.
    for ei in range(input_length):
        _, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = random.random() < teacher_forcing_ratio

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            _, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [37]:
import time
import math


def asMinutes(s):
    """Format a duration of ``s`` seconds as ``'<minutes>m <seconds>s'``."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Return ``'<elapsed> (- <remaining>)'`` for a job started at ``since``.

    ``percent`` is the completed fraction of the job; the remaining time is
    extrapolated linearly from the elapsed time.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

In [38]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    """Train on ``n_iters`` randomly drawn pairs with SGD and NLL loss.

    Prints elapsed/remaining time and the mean loss every ``print_every``
    steps, records the mean loss every ``plot_every`` steps, and finally passes
    the recorded losses to ``showPlot``.
    """
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # running sum, cleared every print_every steps
    plot_loss_total = 0  # running sum, cleared every plot_every steps

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    # All samples are drawn (with replacement) before training starts.
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for step, (input_tensor, target_tensor) in enumerate(training_pairs, start=1):
        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if step % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            progress = step / n_iters
            print('%s (%d %d%%) %.4f' % (timeSince(start, progress),
                                         step, progress * 100, print_loss_avg))

        if step % plot_every == 0:
            plot_losses.append(plot_loss_total / plot_every)
            plot_loss_total = 0

    showPlot(plot_losses)

In [39]:
import matplotlib.pyplot as plt
# NOTE(review): this selects the non-interactive 'agg' backend even though the
# notebook starts with `%matplotlib inline`; this may prevent figures from
# rendering in the notebook — confirm it is intended.
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    """Plot ``points`` as a line chart with y-axis ticks every 0.2.

    A single figure is created via ``plt.subplots()``; the previous extra
    ``plt.figure()`` call opened a second, empty figure on every invocation
    that was never used or closed.
    """
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    # Draw on the axes created above rather than on the implicit "current" axes.
    ax.plot(points)

In [40]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    """Greedily translate ``sentence`` and return the decoded words.

    The sentence is encoded token by token; the encoder's final hidden state
    initialises the decoder, which is unrolled for at most ``max_length`` steps
    taking the top-1 prediction each time.

    Args:
        encoder, decoder: trained models.
        sentence: normalised source sentence; every word must be present in
            ``input_lang`` (``tensorFromSentence`` raises KeyError otherwise).
        max_length: maximum number of decoding steps.

    Returns:
        List of predicted words; ends with ``'<EOS>'`` if the decoder predicted
        EOS within ``max_length`` steps.

    Per-step encoder outputs are not buffered (nothing read them), so source
    sentences with ``max_length`` or more words no longer raise IndexError.
    """
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        # Only the final hidden state is needed by the decoder.
        for ei in range(input_length):
            _, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []

        for _ in range(max_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            _, topi = decoder_output.topk(1)
            token_id = topi.item()
            if token_id == EOS_token:
                decoded_words.append('<EOS>')
                break
            decoded_words.append(output_lang.index2word[token_id])

            # Feed the prediction back in as the next decoder input.
            decoder_input = topi.squeeze().detach()

        return decoded_words

In [41]:
def evaluateRandomly(encoder, decoder, n=10):
    """Print source (>), reference (=) and model output (<) for ``n`` random pairs."""
    for _ in range(n):
        sample = random.choice(pairs)
        print('>', sample[0])
        print('=', sample[1])
        predicted_words = evaluate(encoder, decoder, sample[0])
        print('<', ' '.join(predicted_words))
        print('')

In [42]:
# Width shared by the embedding and recurrent layers of both models.
hidden_size = 256
encoder1 = EncoderRNN(input_size=input_lang.n_words, hidden_size=hidden_size).to(device)
decoder1 = DecoderRNN(hidden_size=hidden_size, output_size=output_lang.n_words).to(device)

# Train on 150000 sampled pairs, reporting progress every 5000 steps.
trainIters(encoder1, decoder1, n_iters=150000, print_every=5000)

0m 46s (- 22m 14s) (5000 3%) 5.0632
1m 21s (- 19m 1s) (10000 6%) 4.5664
1m 57s (- 17m 33s) (15000 10%) 4.3022
2m 34s (- 16m 44s) (20000 13%) 4.0995
3m 11s (- 15m 56s) (25000 16%) 3.9519
3m 47s (- 15m 11s) (30000 20%) 3.8637
4m 24s (- 14m 30s) (35000 23%) 3.7491
5m 1s (- 13m 48s) (40000 26%) 3.6567
5m 36s (- 13m 5s) (45000 30%) 3.5563
6m 11s (- 12m 23s) (50000 33%) 3.4822
6m 47s (- 11m 43s) (55000 36%) 3.3997
7m 22s (- 11m 4s) (60000 40%) 3.3171
7m 58s (- 10m 25s) (65000 43%) 3.2776
8m 34s (- 9m 47s) (70000 46%) 3.2370
9m 10s (- 9m 10s) (75000 50%) 3.1374
9m 48s (- 8m 34s) (80000 53%) 3.0814
10m 25s (- 7m 58s) (85000 56%) 3.0232
11m 1s (- 7m 20s) (90000 60%) 2.9799
11m 37s (- 6m 43s) (95000 63%) 2.9361
12m 12s (- 6m 6s) (100000 66%) 2.9049
12m 48s (- 5m 29s) (105000 70%) 2.8499
13m 25s (- 4m 52s) (110000 73%) 2.8175
14m 0s (- 4m 15s) (115000 76%) 2.7768
14m 36s (- 3m 39s) (120000 80%) 2.7409
15m 12s (- 3m 2s) (125000 83%) 2.6982
15m 48s (- 2m 25s) (130000 86%) 2.6952
16m 23s (- 1m 49s) 

In [43]:
# Spot-check the trained models on random pairs drawn from the training set
# (`pairs`); each block prints source (>), reference (=) and model output (<).
evaluateRandomly(encoder1, decoder1)

> tom fixed everything
= том все починил
< том все слышал <EOS>

> tom is falling
= том падает
< том разведен <EOS>

> we're both grandfathers
= мы оба дедушки
< мы оба друзья <EOS>

> we're fasting
= мы постимся
< мы чувствительные <EOS>

> tom don't die
= не умираи том
< том не жадныи <EOS>

> that's a wolf
= это волк
< это самое <EOS>

> it's so weird
= это так странно
< это так странно <EOS>

> i was poor
= я был беден
< я был спокоен <EOS>

> hold this
= подержи это
< хватит <EOS>

> you're early
= ты рано
< ты права <EOS>

