In [6]:
%matplotlib inline

In [7]:
from io import open
import unicodedata
import string
import re
import random
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

# Use the CUDA device when torch reports one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
# Mount Google Drive at /content/drive (requires the Google Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
# Strip combining marks (accents, breves, ...) from a Unicode string, after
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return `s` in NFD form with every nonspacing mark (category 'Mn') removed.

    Base characters are kept as they are, so non-Latin letters survive; only
    the combining marks produced by the decomposition are dropped.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

# Lowercase, trim, strip accents, and drop basic punctuation (. , ! ?)
def normalizeString(s):
    """Normalize a raw sentence before it is split into words.

    The text is lowercased, stripped of surrounding whitespace and passed
    through unicodeToAscii; the characters '.', ',', '!' and '?' are then
    removed. An exclamation mark written with a leading space (" !") is
    removed together with that space.

    Args:
        s: the raw sentence.

    Returns:
        The normalized sentence as a str.
    """
    s = unicodeToAscii(s.lower().strip())
    s = s.replace(".", "")
    s = s.replace(",", "")
    # Must run before the bare "!" removal: in the opposite order no "!" is
    # left to match, and the orphaned space later yields an empty-string word
    # when the sentence is split on ' '.
    s = s.replace(" !", "")
    s = s.replace("!", "")
    s = s.replace("?", "")
    return s

In [10]:
# Read the whole tab-separated corpus; the context manager closes the file
# handle as soon as the text has been read.
with open("drive/MyDrive/rus-eng/rus.txt", encoding='utf-8') as corpus_file:
    lines = corpus_file.read().strip().split('\n')

# Only the first two tab-separated fields of each line are kept, so slice
# before normalizing instead of normalizing fields that are then discarded.
loaded_pairs = [[normalizeString(s) for s in l.split('\t')[:2]] for l in lines]

# Show one of the last (longest-looking) pairs as a sanity check.
loaded_pairs[-30][0], loaded_pairs[-30][1]

('if you take a child outside and point at the moon the child will look at the moon if you do the same with a dog it will look at your finger',
 'если вы выведете ребенка и покажете ему пальцем на луну ребенок посмотрит на луну если вы то же самое проделаете с собакои она будет смотреть на ваш палец')

In [11]:
class Lang:
    """Vocabulary of one language: word <-> index maps plus word frequencies.

    Indices 0 and 1 are reserved for the "SOS" and "EOS" markers, so the first
    real word receives index 2.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        # Next free index; starts after the two reserved markers.
        self.n_words = 2

    def addSentence(self, sentence):
        """Register every word of `sentence`, splitting on single spaces."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Count `word`, assigning it the next free index on first sight."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_index = self.n_words
        self.word2index[word] = new_index
        self.index2word[new_index] = word
        self.word2count[word] = 1
        self.n_words = new_index + 1

In [12]:
# Indices of the start-of-sentence and end-of-sentence markers; they match the
# two entries Lang pre-populates in index2word ({0: "SOS", 1: "EOS"}).
SOS_token = 0
EOS_token = 1

In [13]:
# Length cutoff: the filtering cell keeps a pair only when both sentences have
# fewer than MAX_LENGTH space-separated words. It is also the default
# `max_length` argument of train() and evaluate().
MAX_LENGTH = 4

In [14]:
# Keep a pair only if its first sentence differs from the one on the
# immediately preceding corpus line and both sentences are shorter than
# MAX_LENGTH words.
pairs = []

prev_pair = ['', '']
for pair in loaded_pairs:
    is_repeat = pair[0] == prev_pair[0]
    if not is_repeat:
        # Remember this line even if the length filter rejects it below.
        prev_pair = pair
        too_long = (len(pair[0].split(' ')) >= MAX_LENGTH
                    or len(pair[1].split(' ')) >= MAX_LENGTH)
        if not too_long:
            pairs.append(pair)

pairs[:10]

[['go', 'марш'],
 ['hi', 'здравствуите'],
 ['run', 'беги'],
 ['who', 'кто'],
 ['wow', 'вот это да'],
 ['duck', 'пригнись'],
 ['fire', 'огонь'],
 ['help', 'помогите'],
 ['hide', 'прячься'],
 ['jump', 'прыгаи']]

In [15]:
# Empty vocabularies: `input_lang` is filled from the first sentence of each
# pair and `output_lang` from the second (see the counting cell).
input_lang = Lang("en")
output_lang = Lang("ru")

In [16]:
# Feed every kept pair into the two vocabularies, then report their sizes
# (n_words includes the reserved SOS/EOS entries).
for pair in pairs:
    source_sentence, target_sentence = pair[0], pair[1]
    input_lang.addSentence(source_sentence)
    output_lang.addSentence(target_sentence)

print("Counted words:")
for lang in (input_lang, output_lang):
    print(lang.name, lang.n_words)

Counted words:
en 5186
ru 9012


The Encoder
-----------





In [17]:
class EncoderRNN(nn.Module):
    """Embedding followed by a GRU, applied to one token per call."""

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        # The embedding width equals the GRU width, so no projection is needed.
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Embed `input`, run one GRU step, and return (output, hidden)."""
        # Reshape the embedding to (1, 1, -1) before handing it to the GRU.
        step = self.embedding(input).view(1, 1, -1)
        return self.gru(step, hidden)

    def initHidden(self):
        """Return an all-zero (1, 1, hidden_size) hidden state on `device`."""
        return torch.zeros((1, 1, self.hidden_size), device=device)

The Decoder
-----------




In [18]:
class DecoderRNN(nn.Module):
    """Embedding -> ReLU -> GRU -> linear -> log-softmax, one token per call."""

    def __init__(self, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one decoding step.

        Returns a tuple of the log-softmax scores over the output vocabulary
        and the updated GRU hidden state.
        """
        embedded = self.embedding(input).view(1, 1, -1)
        gru_out, hidden = self.gru(F.relu(embedded), hidden)
        # gru_out[0] drops the leading dimension before the projection.
        scores = self.out(gru_out[0])
        return self.softmax(scores), hidden

    def initHidden(self):
        """Return an all-zero (1, 1, hidden_size) hidden state on `device`."""
        return torch.zeros((1, 1, self.hidden_size), device=device)

In [19]:
def indexesFromSentence(lang, sentence):
    """Look up each space-separated word of `sentence` in `lang.word2index`.

    Returns the indices as a list, in sentence order. A word missing from the
    vocabulary raises KeyError.
    """
    vocab = lang.word2index
    indexes = []
    for word in sentence.split(' '):
        indexes.append(vocab[word])
    return indexes


def tensorFromSentence(lang, sentence):
    """Encode `sentence` as a column of word indices terminated by EOS_token.

    Returns a torch.long tensor on `device`, reshaped with view(-1, 1).
    """
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    column = torch.tensor(token_ids, dtype=torch.long, device=device)
    return column.view(-1, 1)


def tensorsFromPair(pair):
    """Return (input_tensor, target_tensor) for one sentence pair.

    pair[0] is encoded with `input_lang` and pair[1] with `output_lang`.
    """
    source_sentence, target_sentence = pair[0], pair[1]
    return (
        tensorFromSentence(input_lang, source_sentence),
        tensorFromSentence(output_lang, target_sentence),
    )

In [20]:
# Probability that a training step feeds the ground-truth token, rather than
# the decoder's own prediction, as the next decoder input.
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    """Run one optimisation step on a single sentence pair.

    The encoder consumes `input_tensor` one row at a time; its final hidden
    state seeds the decoder, which is then unrolled over the rows of
    `target_tensor`. With probability `teacher_forcing_ratio` the decoder is
    fed the target tokens; otherwise it is fed its own top-1 predictions and
    unrolling stops once it predicts EOS_token.

    Args:
        input_tensor: source token indices, indexed along dimension 0.
        target_tensor: target token indices, indexed along dimension 0.
        encoder, decoder: the models to train.
        encoder_optimizer, decoder_optimizer: their optimizers.
        criterion: loss applied to (decoder_output, target_tensor[di]).
        max_length: unused; kept so existing callers keep working. The encoder
            outputs it used to size are never read by this decoder, so the
            buffer (and the input-length limit it imposed) has been dropped.

    Returns:
        The summed loss divided by the full target length, as a float; the
        divisor is the full length even when decoding stopped early.
    """
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    target_length = target_tensor.size(0)

    # Only the final hidden state is passed on, so per-step outputs are discarded.
    for ei in range(input_tensor.size(0)):
        _, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)

    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden

    use_teacher_forcing = random.random() < teacher_forcing_ratio

    loss = 0
    for di in range(target_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        loss += criterion(decoder_output, target_tensor[di])

        if use_teacher_forcing:
            # Teacher forcing: feed the target as the next input
            decoder_input = target_tensor[di]
        else:
            # Feed back the model's own best guess, detached from the graph
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [21]:
import time
import math


def asMinutes(s):
    """Format a duration of `s` seconds as '<minutes>m <seconds>s'."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Return '<elapsed> (- <remaining>)' for a job started at `since`.

    `percent` is the completed fraction; the remaining time is extrapolated
    linearly as elapsed / percent - elapsed.
    """
    elapsed = time.time() - since
    remaining = elapsed / (percent) - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

In [22]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    """Train on `n_iters` randomly drawn pairs, logging and plotting the loss.

    Every `print_every` steps the mean loss of that window is printed with
    timing information; every `plot_every` steps the window mean is recorded,
    and the recorded means are passed to showPlot() at the end.
    """
    start = time.time()

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    # Draw (with replacement) and tensorise every training example up front.
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for _ in range(n_iters)]

    plot_losses = []
    print_loss_total = 0  # running sum, cleared every print_every steps
    plot_loss_total = 0  # running sum, cleared every plot_every steps

    for step, (input_tensor, target_tensor) in enumerate(training_pairs, start=1):
        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if step % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            progress = step / n_iters
            print('%s (%d %d%%) %.4f' % (timeSince(start, progress),
                                         step, progress * 100, print_loss_avg))

        if step % plot_every == 0:
            plot_losses.append(plot_loss_total / plot_every)
            plot_loss_total = 0

    showPlot(plot_losses)

In [23]:
import matplotlib.pyplot as plt
# NOTE(review): 'agg' is a non-interactive backend; switching to it after the
# `%matplotlib inline` cell at the top presumably keeps showPlot() figures from
# being displayed in the notebook -- confirm whether that is intended.
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    """Plot `points` as a line chart with y-axis major ticks every 0.2.

    Args:
        points: sequence of values to plot against their index.
    """
    # plt.subplots() already creates the figure; a preceding plt.figure()
    # only left an extra, empty figure open on every call.
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    ax.yaxis.set_major_locator(ticker.MultipleLocator(base=0.2))
    ax.plot(points)

In [24]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    """Greedily translate `sentence` with the given encoder/decoder pair.

    The sentence is encoded with `input_lang`, run through the encoder, and
    the decoder is then unrolled from SOS_token, always taking its top-1
    prediction, until it emits EOS_token or `max_length` tokens were produced.

    Args:
        encoder, decoder: the trained models.
        sentence: normalized source sentence; every word must be present in
            `input_lang` (an unknown word raises KeyError during encoding).
        max_length: maximum number of decoding steps.

    Returns:
        The decoded words as a list of str; it ends with '<EOS>' only when the
        decoder emitted EOS_token within `max_length` steps.
    """
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        encoder_hidden = encoder.initHidden()

        # Only the final hidden state is used by the decoder. The per-step
        # encoder outputs were never read, and buffering them made sentences
        # longer than max_length fail with IndexError, so they are discarded.
        for ei in range(input_tensor.size(0)):
            _, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS
        decoder_hidden = encoder_hidden

        decoded_words = []

        for _ in range(max_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            # Inside no_grad() the output carries no graph, so `.data` is not needed.
            topv, topi = decoder_output.topk(1)
            token = topi.item()
            if token == EOS_token:
                decoded_words.append('<EOS>')
                break
            decoded_words.append(output_lang.index2word[token])

            decoder_input = topi.squeeze().detach()

        return decoded_words

In [25]:
def evaluateRandomly(encoder, decoder, n=10):
    """Print `n` random pairs from `pairs` with the model's translation.

    For each sample three lines are printed: '>' the source sentence, '=' the
    reference translation and '<' the output of evaluate(), followed by an
    empty line.
    """
    for _ in range(n):
        sample = random.choice(pairs)
        print('>', sample[0])
        print('=', sample[1])
        print('<', ' '.join(evaluate(encoder, decoder, sample[0])))
        print('')

In [26]:
# Seed both RNGs used by the run so that weight initialisation (torch), the
# sampling of training pairs and the teacher-forcing coin flips (random) repeat
# after a kernel restart. NOTE(review): some CUDA kernels may still be
# non-deterministic -- confirm if bit-exact GPU runs are required.
SEED = 0
random.seed(SEED)
torch.manual_seed(SEED)

hidden_size = 256  # width of the embeddings and of the GRU hidden state
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder1 = DecoderRNN(hidden_size, output_lang.n_words).to(device)

trainIters(encoder1, decoder1, 150000, print_every=5000)

0m 36s (- 17m 44s) (5000 3%) 4.6872
1m 3s (- 14m 48s) (10000 6%) 4.0164
1m 30s (- 13m 33s) (15000 10%) 3.6546
1m 58s (- 12m 48s) (20000 13%) 3.4084
2m 25s (- 12m 7s) (25000 16%) 3.1292
2m 53s (- 11m 32s) (30000 20%) 2.9098
3m 20s (- 10m 59s) (35000 23%) 2.7115
3m 48s (- 10m 27s) (40000 26%) 2.4983
4m 16s (- 9m 57s) (45000 30%) 2.3400
4m 43s (- 9m 27s) (50000 33%) 2.1770
5m 11s (- 8m 58s) (55000 36%) 2.0568
5m 39s (- 8m 29s) (60000 40%) 1.8836
6m 7s (- 8m 0s) (65000 43%) 1.7590
6m 35s (- 7m 31s) (70000 46%) 1.6282
7m 3s (- 7m 3s) (75000 50%) 1.5144
7m 31s (- 6m 35s) (80000 53%) 1.4275
7m 59s (- 6m 7s) (85000 56%) 1.3097
8m 27s (- 5m 38s) (90000 60%) 1.2271
8m 55s (- 5m 10s) (95000 63%) 1.1289
9m 23s (- 4m 41s) (100000 66%) 1.0688
9m 51s (- 4m 13s) (105000 70%) 1.0267
10m 19s (- 3m 45s) (110000 73%) 0.9374
10m 47s (- 3m 16s) (115000 76%) 0.8608
11m 15s (- 2m 48s) (120000 80%) 0.8244
11m 43s (- 2m 20s) (125000 83%) 0.7501
12m 11s (- 1m 52s) (130000 86%) 0.6855
12m 38s (- 1m 24s) (135000 9

In [27]:
# Print randomly chosen pairs (10 by default) next to the model's translation.
evaluateRandomly(encoder1, decoder1)

> we know enough
= мы знаем достаточно
< мы знаем достаточно <EOS>

> she is attractive
= она привлекательна
< она привлекательна <EOS>

> open the window
= открои окно
< открои окно <EOS>

> it's very dark
= очень темно
< очень темно <EOS>

> i hated boston
= я ненавидел бостон
< я ненавидел бостон <EOS>

> tom was thirsty
= том хотел пить
< том хотел пить <EOS>

> have some tea
= выпеите чаю
< выпеите чаю <EOS>

> tom spoke first
= том заговорил первым
< том заговорил первым <EOS>

> he isn't tom
= это не том
< это не том <EOS>

> i'm a bookworm
= я книжныи червь
< я пишу <EOS>

