# "Sequence to Sequence Learning with Neural Networks" paper implementation - https://arxiv.org/pdf/1409.3215.pdf

In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import os

import torch
import torch.utils.data
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from data_loader import Dataset
import argparse

In [2]:
SOS_token = 1 
EOS_token = 2 

args = {
    'lr': 0.01,
    'momentum': 0.9,
    'weight_decay': 5e-4,
    'gamma': 0.1,
    'epochs_per_lr_drop': 450,
    'num_epochs': 10,
    'batch_size': 32,
    'num_workers': 8,
    'num_epoch': 600,
    'cuda': True,
    'save_folder': os.path.expanduser('~/weights'),
    'epochs_per_save': 10,
    'batch_per_log': 10,
    'auto_encoder': True,
    'MAX_LENGTH': 10,
    'bidirectional': False,
    'hidden_size_decoder': 256,
    'num_layer_decoder': 1,
    'hidden_size_encoder': 256,
    'num_layer_encoder': 1,
    'teacher_forcing': False
}

if args['cuda']:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
else:
    device = torch.device("cpu")

trainset = Dataset(phase='train', max_input_length=10, auto_encoder=args['auto_encoder'])

input_lang, output_lang = trainset.langs()

trainloader = torch.utils.data.DataLoader(trainset, batch_size=args['batch_size'],
                                          shuffle=True, num_workers=args['num_workers'], pin_memory=False, drop_last=True)
dataiter = iter(trainloader)

testset = Dataset(phase='test', max_input_length=10, auto_encoder=args['auto_encoder'])
testloader = torch.utils.data.DataLoader(testset, batch_size=1,
                                          shuffle=True, num_workers=1, pin_memory=False, drop_last=True)

Reading lines...
Read 135842 sentence pairs
Trimmed to 11793 sentence pairs
Counting words...
Counted words:
eng 3117
eng 3117
['you re disobeying orders ', 'you re disobeying orders ']
Reading lines...
Read 135842 sentence pairs
Trimmed to 11793 sentence pairs
Counting words...
Counted words:
eng 3117
eng 3117
['he s an aristocrat ', 'he s an aristocrat ']


In [4]:
class EncoderRNN(nn.Module):
    def __init__(self, hidden_size, input_size, batch_size, num_layers=1, bidirectional=False):
        super(EncoderRNN, self).__init__()
        self.batch_size = batch_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, embedding_dim=hidden_size)

        if args['bidirectional']:
            self.lstm_forward = nn.LSTM(input_size=hidden_size, hidden_size=hidden_size, num_layers=num_layers)
            self.lstm_backward = nn.LSTM(input_size=hidden_size, hidden_size=hidden_size, num_layers=num_layers)
        else:
            self.lstm = nn.LSTM(input_size=hidden_size, hidden_size=hidden_size, num_layers=num_layers)


    def forward(self, input, hidden):

        if args['bidirectional']:
            input_forward, input_backward = input
            hidden_forward, hidden_backward = hidden
            input_forward = self.embedding(input_forward).view(1, 1, -1)
            input_backward = self.embedding(input_backward).view(1, 1, -1)

            out_forward, (h_n_forward, c_n_forward) = self.lstm_forward(input_forward, hidden_forward)
            out_backward, (h_n_backward, c_n_backward) = self.lstm_backward(input_backward, hidden_backward)

            forward_state = (h_n_forward, c_n_forward)
            backward_state = (h_n_backward, c_n_backward)
            output_state = (forward_state, backward_state)

            return output_state
        else:
            embedded = self.embedding(input).view(1, 1, -1)
            rnn_input = embedded
            output, (h_n, c_n) = self.lstm(rnn_input, hidden)
            return output, (h_n, c_n)

    def init_hidden(self):

        if self.bidirectional:
            encoder_state = [torch.zeros(self.num_layers, 1, self.hidden_size, device=device),
                                      torch.zeros(self.num_layers, 1, self.hidden_size, device=device)]
            encoder_state = {"forward": encoder_state, "backward": encoder_state}
            return encoder_state
        else:
            encoder_state = [torch.zeros(self.num_layers, 1, self.hidden_size, device=device),
                              torch.zeros(self.num_layers, 1, self.hidden_size, device=device)]
            return encoder_state

In [5]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, batch_size, num_layers=1):
        super(DecoderRNN, self).__init__()
        self.batch_size = batch_size
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.lstm = nn.LSTM(input_size=hidden_size,
                            hidden_size=hidden_size, num_layers=1)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output, (h_n, c_n) = self.lstm(output, hidden)
        output = self.out(output[0])
        return output, (h_n, c_n)

    def initHidden(self):
        return [torch.zeros(self.num_layers, 1, self.hidden_size, device=device),
                torch.zeros(self.num_layers, 1, self.hidden_size, device=device)]


class Linear(nn.Module):
    def __init__(self, bidirectional, hidden_size_encoder, hidden_size_decoder):
        super(Linear, self).__init__()
        self.bidirectional = bidirectional
        num_directions = int(bidirectional) + 1
        self.linear_connection_op = nn.Linear(
            num_directions * hidden_size_encoder, hidden_size_decoder)
        self.connection_possibility_status = num_directions * \
            hidden_size_encoder == hidden_size_decoder

    def forward(self, input):

        if self.connection_possibility_status:
            return input
        else:
            return self.linear_connection_op(input)


In [11]:
import time
import math
from transformations import reformat_tensor_mask


def train(input_tensor, target_tensor, mask_input, mask_target, encoder, decoder, bridge, encoder_optimizer, decoder_optimizer, bridge_optimizer, criterion, max_length=args['MAX_LENGTH']):
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    bridge_optimizer.zero_grad()

    encoder_hiddens_last = []
    loss = 0
    for step_idx in range(args['batch_size']):
        encoder_hidden = encoder.init_hidden()
        input_tensor_step = input_tensor[:,
                                         step_idx][input_tensor[:, step_idx] != 0]
        input_length = input_tensor_step.size(0)

        if args['bidirectional']:
            encoder_outputs = torch.zeros(
                args['batch_size'], max_length, 2 * encoder.hidden_size, device=device)
            encoder_hidden_forward = encoder_hidden['forward']
            encoder_hidden_backward = encoder_hidden['backward']
            for ei in range(input_length):
                (encoder_hidden_forward, encoder_hidden_backward) = encoder(
                    (input_tensor_step[ei], input_tensor_step[input_length - 1 - ei]), (encoder_hidden_forward, encoder_hidden_backward))

            hn_forward, cn_forward = encoder_hidden_forward
            hn_backward, cn_backward = encoder_hidden_backward

            encoder_hn = torch.cat((hn_forward, hn_backward), 2)
            encoder_cn = torch.cat((cn_forward, cn_backward), 2)
            encoder_hn_last_layer = encoder_hn[-1].view(1, 1, -1)
            encoder_cn_last_layer = encoder_cn[-1].view(1, 1, -1)

            encoder_hidden = [encoder_hn_last_layer, encoder_cn_last_layer]

        else:
            encoder_outputs = torch.zeros(
                args['batch_size'], max_length, encoder.hidden_size, device=device)
            for ei in range(input_length):
                encoder_output, encoder_hidden = encoder(
                    input_tensor_step[ei], encoder_hidden)
                encoder_outputs[step_idx, ei, :] = encoder_output[0, 0]

            hn, cn = encoder_hidden
            encoder_hn_last_layer = hn[-1].view(1, 1, -1)
            encoder_cn_last_layer = cn[-1].view(1, 1, -1)
            encoder_hidden = [encoder_hn_last_layer, encoder_cn_last_layer]

        encoder_hidden = [bridge(item) for item in encoder_hidden]
        encoder_hiddens_last.append(encoder_hidden)

    decoder_input = torch.tensor([SOS_token], device=device)
    decoder_hiddens = encoder_hiddens_last

    if args['teacher_forcing']:

        for step_idx in range(args['batch_size']):
            target_tensor_step = target_tensor[:,
                                               step_idx][target_tensor[:, step_idx] != 0]
            target_length = target_tensor_step.size(0)
            decoder_hidden = decoder_hiddens[step_idx]
            for di in range(target_length):
                decoder_output, decoder_hidden = decoder(
                    decoder_input, decoder_hidden)

                loss += criterion(decoder_output,
                                  target_tensor_step[di].view(1))
                decoder_input = target_tensor_step[di]

        loss = loss / args['batch_size']

    else:
        for step_idx in range(args['batch_size']):

            target_tensor_step = target_tensor[:,
                                               step_idx][target_tensor[:, step_idx] != 0]
            target_length = target_tensor_step.size(0)
            decoder_hidden = decoder_hiddens[step_idx]

            for di in range(target_length):
                decoder_output, decoder_hidden = decoder(
                    decoder_input, decoder_hidden)
                topv, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze().detach()

                loss += criterion(decoder_output,
                                  target_tensor_step[di].view(1))
                if decoder_input.item() == EOS_token:
                    break
        loss = loss / args['batch_size']

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def time_since(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))


def train_iters(encoder, decoder, bridge, print_every=1000, plot_every=100, learning_rate=0.1):

    start = time.time()
    plot_losses = []
    print_loss_total = 0
    plot_loss_total = 0

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    bridge_optimizer = optim.SGD(bridge.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    n_iters_per_epoch = int(len(trainset) / args['batch_size'])
    for i in range(args['num_epochs']):

        for iteration, data in enumerate(trainloader, 1):

            training_pair = data

            input_tensor = training_pair['sentence'][:, :, 0, :]
            input_tensor, mask_input = reformat_tensor_mask(input_tensor)

            target_tensor = training_pair['sentence'][:, :, 1, :]
            target_tensor, mask_target = reformat_tensor_mask(target_tensor)

            if device == torch.device("cuda"):
                input_tensor = input_tensor.cuda()
                target_tensor = target_tensor.cuda()

            loss = train(input_tensor, target_tensor, mask_input, mask_target, encoder,
                         decoder, bridge, encoder_optimizer, decoder_optimizer, bridge_optimizer, criterion)
            print_loss_total += loss
            plot_loss_total += loss

            if iteration % print_every == 0:
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                print('%s (%d %d%%) %.4f' % (time_since(start, iteration / n_iters_per_epoch),
                                             iteration, iteration / n_iters_per_epoch * 100, print_loss_avg))

            if iteration % plot_every == 0:
                plot_loss_avg = plot_loss_total / plot_every
                plot_losses.append(plot_loss_avg)
                plot_loss_total = 0

        print('####### Finished epoch %d of %d ########' %
              (i+1, args['num_epochs']))


In [12]:
encoder1 = EncoderRNN(args['hidden_size_encoder'], input_lang.n_words, args['batch_size'], num_layers=args['num_layer_encoder'], bidirectional=args['bidirectional']).to(device)
bridge = Linear(args['bidirectional'], args['hidden_size_encoder'], args['hidden_size_decoder']).to(device)
decoder1 = DecoderRNN(args['hidden_size_decoder'], output_lang.n_words, args['batch_size'], num_layers=args['num_layer_decoder']).to(device)

train_iters(encoder1, decoder1, bridge, print_every=10)

0m 7s (- 3m 33s) (10 3%) 5.9281
0m 14s (- 3m 20s) (20 6%) 5.1554
0m 21s (- 3m 9s) (30 10%) 3.4639
0m 34s (- 3m 39s) (40 13%) 3.7949
0m 46s (- 3m 46s) (50 17%) 3.7609
0m 56s (- 3m 39s) (60 20%) 3.3689
1m 4s (- 3m 27s) (70 23%) 3.8071
1m 13s (- 3m 15s) (80 27%) 3.9820
1m 20s (- 3m 2s) (90 30%) 3.4321
1m 27s (- 2m 49s) (100 34%) 3.4039
1m 34s (- 2m 38s) (110 37%) 3.8280
1m 41s (- 2m 27s) (120 40%) 3.4719
1m 49s (- 2m 17s) (130 44%) 3.0237
1m 56s (- 2m 7s) (140 47%) 3.4051
2m 2s (- 1m 57s) (150 51%) 3.2150
2m 9s (- 1m 48s) (160 54%) 3.6497
2m 17s (- 1m 40s) (170 57%) 3.2089
2m 24s (- 1m 31s) (180 61%) 3.4956
2m 30s (- 1m 22s) (190 64%) 2.8890
2m 38s (- 1m 14s) (200 68%) 3.1881
2m 47s (- 1m 7s) (210 71%) 3.2242
2m 55s (- 0m 59s) (220 74%) 3.3413
3m 2s (- 0m 50s) (230 78%) 3.0550
3m 14s (- 0m 43s) (240 81%) 3.3743
3m 23s (- 0m 35s) (250 85%) 3.0892
3m 31s (- 0m 27s) (260 88%) 3.3803
3m 40s (- 0m 19s) (270 91%) 2.9289
3m 47s (- 0m 11s) (280 95%) 2.6650
3m 55s (- 0m 3s) (290 98%) 3.0407
######

In [14]:
from transformations import sentence_from_tensor

def evaluate(encoder, decoder, bridge, input_tensor, max_length=args['MAX_LENGTH']):
    with torch.no_grad():
        input_length = input_tensor.size(0)
        encoder_hidden = encoder.init_hidden()

        if args['bidirectional']:
            encoder_outputs = torch.zeros(args['batch_size'], max_length, 2 * encoder['hidden_size'], device=device)
            encoder_hidden_forward = encoder_hidden['forward']
            encoder_hidden_backward = encoder_hidden['backward']

            for ei in range(input_length):
                (encoder_hidden_forward, encoder_hidden_backward) = encoder(
                    (input_tensor[ei],input_tensor[input_length - 1 - ei]), (encoder_hidden_forward,encoder_hidden_backward))

            hn_forward, cn_forward = encoder_hidden_forward
            hn_backward, cn_backward = encoder_hidden_backward

            encoder_hn = torch.cat((hn_forward, hn_backward), 2)
            encoder_cn = torch.cat((cn_forward, cn_backward), 2)

            encoder_hn_last_layer = encoder_hn[-1].view(1, 1, -1)
            encoder_cn_last_layer = encoder_cn[-1].view(1,1,-1)

            encoder_hidden_last = [encoder_hn_last_layer, encoder_cn_last_layer]

        else:
            for ei in range(input_length):
                encoder_output, encoder_hidden = encoder(
                    input_tensor[ei], encoder_hidden)

            hn, cn = encoder_hidden
            encoder_hn_last_layer = hn[-1].view(1,1,-1)
            encoder_cn_last_layer = cn[-1].view(1,1,-1)
            encoder_hidden_last = [encoder_hn_last_layer, encoder_cn_last_layer]

        decoder_input = torch.tensor([SOS_token], device=device)
        encoder_hidden_last = [bridge(item) for item in encoder_hidden_last]
        decoder_hidden = encoder_hidden_last

        decoded_words = []

        for di in range(max_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words

def evaluate_randomly(encoder, decoder, bridge, n=10):
    for i in range(n):
        pair = testset[i]['sentence']
        input_tensor, mask_input = reformat_tensor_mask(pair[:,0,:].view(1,1,-1))
        input_tensor = input_tensor[input_tensor != 0]
        output_tensor, mask_output = reformat_tensor_mask(pair[:,1,:].view(1,1,-1))
        output_tensor = output_tensor[output_tensor != 0]
        if device == torch.device("cuda"):
            input_tensor = input_tensor.cuda()
            output_tensor = output_tensor.cuda()

        input_sentence = ' '.join(sentence_from_tensor(input_lang, input_tensor))
        output_sentence = ' '.join(sentence_from_tensor(output_lang, output_tensor))
        print('Input: ', input_sentence)
        print('Output: ', output_sentence)
        output_words = evaluate(encoder, decoder, bridge, input_tensor)
        output_sentence = ' '.join(output_words)
        print('Predicted Output: ', output_sentence)
        print('')

evaluate_randomly(encoder1, decoder1, bridge)

Input:  i m strong  EOS
Output:  i m strong  EOS
Predicted Output:  i m strong  <EOS>

Input:  i m a patient  EOS
Output:  i m a patient  EOS
Predicted Output:  i m a patient  <EOS>

Input:  you re a wonderful guy  EOS
Output:  you re a wonderful guy  EOS
Predicted Output:  you re a wonderful guy  <EOS>

Input:  she is a doctor  EOS
Output:  she is a doctor  EOS
Predicted Output:  she is a doctor  <EOS>

Input:  i m not eating this  EOS
Output:  i m not eating this  EOS
Predicted Output:  i m not eating this  <EOS>

Input:  he is totally dependent on his parents  EOS
Output:  he is totally dependent on his parents  EOS
Predicted Output:  he is always on on his parents  <EOS>

Input:  you re lucky that you have a job  EOS
Output:  you re lucky that you have a job  EOS
Predicted Output:  you re lucky that you have a job  <EOS>

Input:  you re very lonely  EOS
Output:  you re very lonely  EOS
Predicted Output:  you re very lonely  <EOS>

Input:  i m about ready  EOS
Output:  i m about rea