1. Import all requirements

In [None]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import string

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

2. Language class to help us with handling data

In [None]:
# Reserved vocabulary indices; Lang.index2word pre-populates the same three slots.
SOS_token = 0  # start-of-sentence marker, fed to the decoder as its first input
EOS_token = 1  # end-of-sentence marker, appended to every sentence tensor
CAP_token = 2 # token for capitals (reserved; the code that would emit it is commented out in normalizeString)


class Lang:
    """Vocabulary of a single language.

    Keeps a word -> index map, the inverse index -> word map and a per-word
    occurrence count. Indices 0, 1 and 2 are pre-assigned to the "SOS", "EOS"
    and "CAP" entries, so the first word added receives index 3.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        # Special entries occupy the first slots of the index -> word table.
        self.index2word = dict(enumerate(("SOS", "EOS", "CAP")))
        # Next free index; starts right after the special entries.
        self.n_words = len(self.index2word)

    def addSentence(self, sentence):
        """Register every single-space-separated token of `sentence`."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Count one occurrence of `word`, assigning it a new index if unseen."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1

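Helper functions to strip diacritics from Unicode text and to normalize a string: lowercase it and split it into space-separated tokens, where words (letters, digits, apostrophes) and the punctuation marks `. , ! ? ; - %` each become their own token and all other characters are dropped.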

In [None]:
import re

def unicodeToAscii(s):
    """Return `s` with all combining marks removed.

    The string is decomposed with NFD and every character of Unicode category
    'Mn' (nonspacing mark) is dropped. Base characters are kept as they are,
    so non-Latin letters survive; letters that decompose into base + mark lose
    the mark (e.g. 'é' -> 'e', Cyrillic 'й' -> 'и').
    """
    decomposed = unicodedata.normalize('NFD', s)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(base_chars)

def normalizeString(s):
    """Tokenize and lowercase a sentence.

    Surrounding whitespace and combining marks are removed first. The text is
    then split into tokens: runs of word characters/apostrophes, and each of
    the punctuation marks . , ! ? ; - % as a separate token. Anything else is
    discarded. Tokens are lowercased and joined with single spaces; no
    capitalization marker is emitted.
    """
    stripped = unicodeToAscii(s.strip())
    tokens = re.findall(r"[\w']+|[.,!?;\-%]", stripped)
    return ' '.join(token.lower() for token in tokens)

In [None]:
# Quick check of the tokenizer: punctuation becomes its own space-separated token.
s = normalizeString("Hello-world.")
print(s)
print(len(s.split(" ")))  # number of tokens

hello - world .
4


Read sentences from files with input and output languages

In [None]:
def readLangs(en_path='corpora/corpus.en_ru.1m.en',
              ru_path='corpora/corpus.en_ru.1m.ru'):
    """Load the parallel corpus and build normalized (Russian, English) pairs.

    Args:
        en_path: Path to the UTF-8 English file, one sentence per line.
        ru_path: Path to the UTF-8 Russian file; line i is paired with line i
            of the English file.

    Returns:
        A tuple (input_lang, output_lang, pairs): an empty Lang('rus'), an
        empty Lang('eng') and a list of (normalized_ru, normalized_en) tuples.
        The vocabularies are not filled here.

    Raises:
        IndexError: if the English file has fewer lines than the Russian one.
    """
    print("Reading lines...")

    # Context managers close both files as soon as they are read; the bare
    # open(...).read() calls used previously left the handles open.
    with open(en_path, encoding='utf-8') as en_file:
        en_lines = en_file.read().strip().split('\n')
    with open(ru_path, encoding='utf-8') as ru_file:
        ru_lines = ru_file.read().strip().split('\n')

    # Make sentence pairs; the Russian file drives the pairing.
    pairs = [(normalizeString(ru_line), normalizeString(en_lines[i]))
             for i, ru_line in enumerate(ru_lines)]
    input_lang = Lang('rus')
    output_lang = Lang('eng')

    return input_lang, output_lang, pairs

3. Data preparation step

We read the sentence pairs, randomly sample a fixed number of them (a smaller dataset speeds up training and evaluation), and build the vocabulary of each language from the sampled pairs.

In [None]:
def prepareData(num):
    """Read the corpus, keep `num` random pairs and build both vocabularies.

    Returns (input_lang, output_lang, pairs), where `pairs` is the random
    sample and the two Lang objects have been filled from that sample only.
    Progress and vocabulary sizes are printed along the way.
    """
    input_lang, output_lang, all_pairs = readLangs()
    print("Read %s sentence pairs" % len(all_pairs))

    sampled_pairs = random.sample(all_pairs, num)
    print("Trimmed to %s sentence pairs" % len(sampled_pairs))

    print("Counting words...")
    for source_sentence, target_sentence in sampled_pairs:
        input_lang.addSentence(source_sentence)
        output_lang.addSentence(target_sentence)

    print("Counted words:")
    for lang in (input_lang, output_lang):
        print(lang.name, lang.n_words)

    return input_lang, output_lang, sampled_pairs


# Build the vocabularies from a random sample of 1000 sentence pairs. These three
# module-level names are read by the tensor, training and evaluation helpers.
input_lang, output_lang, pairs = prepareData(1000)
print(random.choice(pairs))

Reading lines...
Read 1000000 sentence pairs
Trimmed to 1000 sentence pairs
Counting words...
Counted words:
rus 8935
eng 5811
('кнопка scan all forms of active project сканирует все формы приложения и выделяет из них те , которые содержат компоненты fibplus для работы с sql tpfibdataset , tpfibquery , tpfibupdateobject и tpfibstoredproc .', 'the button tscan all forms of active projectt scans all application forms and selects those which contain fibplus components for work with sql tpfibdataset , tpfibquery , tpfibupdateobject and tpfibstoredproc .')


Sanity check

In [None]:
# Eyeball one more random (Russian, English) pair.
print(random.choice(pairs))

('но они расчистят путь к нему .', 'but they will prepare the way to it .')


In [None]:
# Sentence lengths in tokens, for each side of every sampled pair.
ru_lens = [len(pair[0].split(' ')) for pair in pairs]
en_lens = [len(pair[1].split(' ')) for pair in pairs]
# +1 for the EOS token: tensorFromSentence appends EOS_token to every sentence,
# so the longest sentence produces (longest token count + 1) encoder steps.
# Without the extra slot, that sentence overruns the max_length-row
# encoder_outputs buffer in train()/evaluate() and raises IndexError.
MAX_LENGTH = max(max(ru_lens), max(en_lens)) + 1

In [None]:
# Show the value used as the default max_length by AttnDecoderRNN, train and evaluate.
print(MAX_LENGTH)

346


In [None]:
class EncoderRNN(nn.Module):
    """Single-step GRU encoder: embeds one token index and advances the GRU."""

    def __init__(self, input_size, hidden_size):
        """
        Args:
            input_size: number of rows in the embedding table (vocabulary size).
            hidden_size: width of both the embedding and the GRU hidden state.
        """
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Run one GRU step on `input`; returns (output, new_hidden)."""
        # Reshape the embedding to (seq_len=1, batch=1, features) for the GRU.
        step_input = self.embedding(input).view(1, 1, -1)
        return self.gru(step_input, hidden)

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size)."""
        shape = (1, 1, self.hidden_size)
        return torch.zeros(*shape, device=device)

In [None]:
class DecoderRNN(nn.Module):
    """Single-step GRU decoder without attention."""

    def __init__(self, hidden_size, output_size):
        """
        Args:
            hidden_size: width of the embedding and the GRU hidden state.
            output_size: number of output classes (target vocabulary size).
        """
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one decoding step.

        Returns (log_probs, new_hidden), where log_probs is the log-softmax
        over the output classes with shape (1, output_size).
        """
        # Embed the token, reshape to (1, 1, features) and apply ReLU.
        step_input = F.relu(self.embedding(input).view(1, 1, -1))
        gru_output, hidden = self.gru(step_input, hidden)
        log_probs = self.softmax(self.out(gru_output[0]))
        return log_probs, hidden

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size)."""
        shape = (1, 1, self.hidden_size)
        return torch.zeros(*shape, device=device)

In [None]:
class AttnDecoderRNN(nn.Module):
    """Single-step GRU decoder with attention over a fixed number of encoder slots.

    The attention layer outputs exactly `max_length` weights, so the
    `encoder_outputs` passed to forward() must have `max_length` rows.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        """
        Args:
            hidden_size: width of the embedding and the GRU hidden state.
            output_size: number of output classes (target vocabulary size).
            dropout_p: dropout probability applied to the embedded input.
            max_length: number of encoder positions the attention spans.
        """
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(output_size, hidden_size)
        # Attention scores are computed from [embedded token ; previous hidden].
        self.attn = nn.Linear(hidden_size * 2, max_length)
        # Mixes [embedded token ; attention context] back down to hidden_size.
        self.attn_combine = nn.Linear(hidden_size * 2, hidden_size)
        self.dropout = nn.Dropout(dropout_p)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Run one attention-decoding step.

        Returns (log_probs, new_hidden, attn_weights): log-softmax scores over
        the output classes, the updated GRU hidden state, and the softmax
        attention weights over the encoder positions.
        """
        token_vec = self.dropout(self.embedding(input).view(1, 1, -1))

        attn_input = torch.cat((token_vec[0], hidden[0]), 1)
        attn_weights = F.softmax(self.attn(attn_input), dim=1)
        # Weighted sum of the encoder outputs (batched matmul with batch size 1).
        context = torch.bmm(attn_weights.unsqueeze(0),
                            encoder_outputs.unsqueeze(0))

        combined = torch.cat((token_vec[0], context[0]), 1)
        gru_input = F.relu(self.attn_combine(combined).unsqueeze(0))
        gru_output, hidden = self.gru(gru_input, hidden)

        log_probs = F.log_softmax(self.out(gru_output[0]), dim=1)
        return log_probs, hidden, attn_weights

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size)."""
        shape = (1, 1, self.hidden_size)
        return torch.zeros(*shape, device=device)

In [None]:
def indexesFromSentence(lang, sentence):
    """Map each single-space-separated token of `sentence` to its index in `lang`.

    Raises KeyError for a token that is missing from `lang.word2index`.
    """
    vocab = lang.word2index
    token_ids = []
    for token in sentence.split(' '):
        token_ids.append(vocab[token])
    return token_ids


def tensorFromSentence(lang, sentence):
    """Encode `sentence` as a column tensor of token indices ending with EOS.

    The result is a torch.long tensor of shape (num_tokens + 1, 1) on `device`.
    """
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    column = torch.tensor(token_ids, dtype=torch.long, device=device)
    return column.view(-1, 1)


def tensorsFromPair(pair):
    """Encode a (source, target) sentence pair as (input_tensor, target_tensor).

    pair[0] is encoded with the module-level `input_lang`, pair[1] with
    `output_lang`.
    """
    source_sentence, target_sentence = pair[0], pair[1]
    return (tensorFromSentence(input_lang, source_sentence),
            tensorFromSentence(output_lang, target_sentence))

In [None]:
# Probability, per training example, that train() feeds the ground-truth target
# token (rather than the decoder's own prediction) as the next decoder input.
teacher_forcing_ratio = 0.3


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    """Run one optimization step on a single (input, target) pair.

    The input is encoded token by token, then the target is decoded token by
    token while accumulating `criterion` over the steps. With probability
    `teacher_forcing_ratio` the whole example is decoded with teacher forcing;
    otherwise the decoder consumes its own top prediction and decoding stops
    early once it predicts EOS.

    Returns:
        The summed loss divided by the full target length (also when decoding
        stopped early).

    Raises:
        IndexError: if `input_tensor` has more rows than `max_length`.
    """
    encoder_hidden = encoder.initHidden()

    for optimizer in (encoder_optimizer, decoder_optimizer):
        optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    # One row per encoder step; rows past the input length stay zero.
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    for ei in range(input_length):
        step_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = step_output[0, 0]

    # Decoding starts from SOS and the encoder's final hidden state.
    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden

    use_teacher_forcing = random.random() < teacher_forcing_ratio

    loss = 0
    for di in range(target_length):
        decoder_output, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        loss += criterion(decoder_output, target_tensor[di])

        if use_teacher_forcing:
            # Teacher forcing: feed the target as the next input.
            decoder_input = target_tensor[di]
            continue

        # Otherwise feed back the top prediction, detached from the graph.
        best_index = decoder_output.topk(1)[1]
        decoder_input = best_index.squeeze().detach()
        if decoder_input.item() == EOS_token:
            break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [None]:
import time
import math


def asMinutes(s):
    """Format a duration in seconds as '<minutes>m <seconds>s'."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Return 'elapsed (- remaining)' for a task started at `since`.

    `percent` is the completed fraction of the task (callers here pass
    iter / n_iters). The remaining time is extrapolated linearly from the
    elapsed time, so `percent` must be non-zero.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

In [None]:
def trainIters(encoder, decoder, n_iters, print_every=1000, learning_rate=0.01):
    """Train on `n_iters` randomly drawn pairs, one pair per SGD step.

    All training pairs are drawn (with replacement) from the module-level
    `pairs` and converted to tensors up front. Every `print_every` steps a
    line with elapsed/remaining time, step number, percent done and the
    average loss since the last report is printed.
    """
    started_at = time.time()
    running_loss = 0  # Reset every print_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    samples = [tensorsFromPair(random.choice(pairs))
               for _ in range(n_iters)]
    criterion = nn.NLLLoss()

    for step, (input_tensor, target_tensor) in enumerate(samples, start=1):
        running_loss += train(input_tensor, target_tensor, encoder,
                              decoder, encoder_optimizer, decoder_optimizer, criterion)

        if step % print_every != 0:
            continue

        average_loss = running_loss / print_every
        running_loss = 0
        fraction_done = step / n_iters
        print('%s (%d %d%%) %.4f' % (timeSince(started_at, fraction_done),
                                     step, fraction_done * 100, average_loss))

In [None]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    """Greedily translate `sentence` with the given encoder and decoder.

    Runs under torch.no_grad(). Decoding starts from SOS and stops at the
    first predicted EOS or after `max_length` steps.

    Returns:
        (decoded_words, attentions): the predicted words ('<EOS>' is appended
        when EOS was predicted) and the attention weights of each executed
        decoding step, one row per step.

    Raises:
        KeyError: if `sentence` contains a token unknown to `input_lang`.
        IndexError: if the encoded sentence has more rows than `max_length`.
    """
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        encoder_hidden = encoder.initHidden()

        # One row per encoder step; rows past the input length stay zero.
        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
        for ei in range(input_tensor.size()[0]):
            step_output, encoder_hidden = encoder(input_tensor[ei],
                                                  encoder_hidden)
            encoder_outputs[ei] += step_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS
        decoder_hidden = encoder_hidden

        decoded_words = []
        # Allocated without a device argument, i.e. on torch's default device.
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, step_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = step_attention.data

            best_index = decoder_output.data.topk(1)[1]
            token_id = best_index.item()
            if token_id == EOS_token:
                decoded_words.append('<EOS>')
                break
            decoded_words.append(output_lang.index2word[token_id])

            # Feed the prediction back in as the next decoder input.
            decoder_input = best_index.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]

In [None]:
def evaluateRandomly(encoder, decoder, n=10):
    """Print `n` random pairs from `pairs` together with the model's output.

    For each pair, '>' marks the source sentence, '=' the reference
    translation and '<' the decoded sentence; a blank line follows.
    """
    for _ in range(n):
        sample = random.choice(pairs)
        source_sentence, reference = sample[0], sample[1]
        print('>', source_sentence)
        print('=', reference)
        predicted_words, _attentions = evaluate(encoder, decoder, source_sentence)
        print('<', ' '.join(predicted_words))
        print('')

In [None]:
hidden_size = 512  # width of the embeddings and GRU hidden state in both models
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

# 10000 single-pair SGD steps, reporting the average loss every 100 steps.
trainIters(encoder1, attn_decoder1, 10000, print_every=100)

# Decode 10 random pairs (evaluateRandomly's default n) to inspect the result.
evaluateRandomly(encoder1, attn_decoder1)

2m 12s (- 217m 55s) (100 1%) 5.2910
4m 42s (- 230m 50s) (200 2%) 5.8530
7m 25s (- 240m 13s) (300 3%) 5.7353
9m 52s (- 236m 51s) (400 4%) 5.3150
12m 29s (- 237m 11s) (500 5%) 5.5364
15m 12s (- 238m 18s) (600 6%) 5.4340
17m 52s (- 237m 31s) (700 7%) 5.7021
20m 20s (- 233m 55s) (800 8%) 4.8859
22m 35s (- 228m 30s) (900 9%) 4.7367
25m 29s (- 229m 28s) (1000 10%) 5.8485
27m 50s (- 225m 18s) (1100 11%) 5.1094
30m 4s (- 220m 31s) (1200 12%) 4.4120
32m 36s (- 218m 11s) (1300 13%) 5.5062
35m 19s (- 217m 0s) (1400 14%) 5.3163
37m 41s (- 213m 35s) (1500 15%) 5.0222
40m 3s (- 210m 20s) (1600 16%) 5.0461
42m 31s (- 207m 36s) (1700 17%) 4.9758
45m 3s (- 205m 17s) (1800 18%) 5.4723
47m 40s (- 203m 15s) (1900 19%) 5.1186
50m 11s (- 200m 44s) (2000 20%) 5.1126
52m 35s (- 197m 51s) (2100 21%) 5.3526
55m 14s (- 195m 50s) (2200 22%) 5.4337
57m 42s (- 193m 11s) (2300 23%) 5.3959
60m 22s (- 191m 11s) (2400 24%) 5.2241
63m 21s (- 190m 3s) (2500 25%) 5.5609
65m 53s (- 187m 32s) (2600 26%) 5.0974
68m 32s (- 18

IndexError: ignored

In [None]:
# Decode 10 random pairs (evaluateRandomly's default n) with the current weights.
evaluateRandomly(encoder1, attn_decoder1)

> с целью пополнения ресурснои базы в 2006 году в казахстане были объявлены открытые конкурсы на получение права недропользования по 111 объектам в том числе 23  по углеводородному сырью
= in 2006 to extend the resource base kazakhstan held open tenders for 111 mining fields including 23 hydrocarbon fields
< in to to to to to the other countries notably italy bodes well for russia in his own to and <EOS>

> глобальныи бизнес
= above the clouds flight takeoff
< the the and the the the <EOS>

> окно справа станет чуть более темным с наложеннои на него стрелкои указывающеи направо
= the upper window will get a little darker overlaid with an arrow pointing up
< the with a with the will a the the the the the with the the a the the with a <EOS>

> shooting часа в настоящее время на веблучшии дешевои насколько это возможно пожалуиста принимающеи все функции необходимые для спасения жизни много головнои боли в будущем
= taking time now to find the best possible cheap web hosting with all the f