# Purpose and dataset
This page is mainly for model development. I use a dataset created by Udacity for translating EN -> FR. It has 120k samples altogether, with merely 300-400 unique vocabulary words per language. A working model should be able to reach at least 95% accuracy very quickly. With the current minimal configuration, this model achieves around 97.5% validation accuracy.

The following code is a basic implementation of https://arxiv.org/abs/1610.03017. It works at the character level, so we will not be using any tokenizer.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.utils.data
import torch.nn.functional as F
import time
from pad1d import pad1d
import math
import numpy as np
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
import pickle

cudafloat = torch.cuda.FloatTensor
cudalong = torch.cuda.LongTensor

import os

In [None]:
def load_data(path):
    """Read a UTF-8 text file and return its content split on newlines.

    Args:
        path: location of the text file to read.

    Returns:
        list of str, one entry per line. A file that ends with a newline
        yields a trailing empty string as its last entry.
    """
    with open(os.path.join(path), mode="r", encoding="utf-8") as handle:
        return handle.read().split('\n')

In [None]:
def build_vocab2(file1, src):
    """Build a character-level vocabulary mapping symbols to integer ids.

    Tweaked version of:
    https://github.com/nyu-dl/dl4mt-c2c/blob/master/preprocess/build_dictionary_char.py
    TODO: figure out why the original function omits characters like Ã.

    Args:
        file1: iterable of sentences (strings); every distinct character
            found in them gets its own id.
        src: if truthy, reserve the source-side special tokens
            (ZERO=0, UNK=1, SOS=2, EOS=3); otherwise reserve the
            target-side ones (EOS=0, UNK=1).

    Returns:
        dict mapping each special token and each character to a unique id.
        Special tokens take the lowest ids; characters follow in sorted
        order, so the mapping is identical on every run.
    """
    # '0' is always part of the character set, even if it never occurs in the data.
    charset = {'0'}
    for sample in file1:
        charset.update(sample)

    tokens = "ZERO UNK SOS EOS".split() if src else "EOS UNK".split()
    word_dict = {token: idx for idx, token in enumerate(tokens)}

    # Sort before numbering: plain set iteration order depends on string hash
    # randomisation and changes between Python processes, which would make
    # saved model weights unusable with a vocabulary rebuilt in a new session.
    for idx, char in enumerate(sorted(charset), start=len(tokens)):
        word_dict[char] = idx

    return word_dict

In [None]:
# Parallel corpora: line i of the English file pairs with line i of the French file.
english_sentences = load_data('data/small_vocab_en')
french_sentences = load_data('data/small_vocab_fr')

# Character vocabularies (source side uses the 4-token header, target side the 2-token one).
inp_dict = build_vocab2(english_sentences, True)
tgt_dict = build_vocab2(french_sentences, False)

# First 90% of the pairs for training, the remainder for validation.
train_sz = int(0.9 * len(english_sentences))

train_data, val_data = english_sentences[:train_sz], english_sentences[train_sz:]
train_target, val_target = french_sentences[:train_sz], french_sentences[train_sz:]

# Vocabulary sizes, used as embedding / output dimensions by the models.
inp_sz, out_sz = len(inp_dict), len(tgt_dict)

In [None]:
# Number of training sentence pairs (the first 90% of the corpus).
print(len(train_target))

In [None]:
def seq_len_finder(data1, data2):
    """Return the length of the longest sequence found in either collection.

    Args:
        data1: iterable of sized items (e.g. sentences).
        data2: second iterable of sized items.

    Returns:
        int, the maximum ``len(item)`` over both inputs; 0 when both are empty.
    """
    longest_first = max(map(len, data1), default=0)
    longest_second = max(map(len, data2), default=0)
    return max(longest_first, longest_second)

In [None]:
# Ordering data by length decreases training time (since we vectorize only to max_len of the batch), but really hurts performance.
# There seems to be a serious penalty for not shuffling the data.
# This is in contrast to how humans learn, though: we start from something easier and shorter first, then build on it.


# def sort_data_len(data, target): 
#     len_order = []
#     for i in range(len(data)):
#         len_data = len(data[i])
#         len_target = len(target[i])
#         max_len = max((len_data, len_target))
#         len_order.append(max_len)

#     # simple version
#     three_list = sorted(zip(len_order, data, target))

#     train_data = [x for l,x,y in three_list]
#     train_target = [y for l,x,y in three_list]
    
#     return train_data, train_target

In [None]:
# Spot-check one aligned training pair (source sentence, then its target).
z = 2000
print(train_data[z])
print(train_target[z])

In [None]:
# from keras.preprocessing.sequence import pad_sequences
# def preprocess(x, y, length):
#     x_padded = pad_sequences(x, maxlen=length, padding='post')
#     y_padded = pad_sequences(y, maxlen=length, padding='post')
#     return x_padded, y_padded

In [None]:
def pad1d(tensor, pad, permute_dims=True):
    """Zero-pad a 3-D tensor along its time axis.

    Source: https://github.com/pytorch/pytorch/issues/2637
    Note: this definition replaces the ``pad1d`` name imported from the
    ``pad1d`` module at the top of the file.

    Args:
        tensor: 3-D tensor laid out as (batch, time, feat) when
            ``permute_dims`` is True, or (batch, feat, time) otherwise.
        pad: pair ``(left, right)`` giving the padding added before and after
            the time axis. Any two-element sequence is accepted.
        permute_dims: whether the time axis is dim 1 (True) or dim 2 (False).

    Returns:
        Tensor in the same layout as the input, with the time axis grown by
        ``left + right``.
    """
    if permute_dims:
        tensor = tensor.permute(0, 2, 1).contiguous()  # get features on first dim since we are padding time
    else:
        # Assign the result: contiguous() returns a copy and does not modify in place.
        tensor = tensor.contiguous()
    original_size = tuple(tensor.size())  # (batch, feat, time)
    final_new_size = (original_size[0], original_size[1], original_size[2] + pad[0] + pad[1])
    # Insert a dummy dim so the tensor is 4-D and F.pad can take a
    # (left, right, top, bottom) spec; only the last (time) dim is padded.
    temp_new_size = original_size[:2] + (1,) + original_size[2:]
    assert len(temp_new_size) == 4
    tensor = tensor.view(*temp_new_size)
    tensor = F.pad(tensor, tuple(pad) + (0, 0))
    tensor = tensor.view(*final_new_size)
    if permute_dims:
        tensor = tensor.permute(0, 2, 1)
    return tensor

def train_vectorize(x, y):
    """Turn a batch of (source, target) sentences into padded id tensors.

    Both tensors have ``len(x)`` rows and ``seq_len + 1`` columns, where
    ``seq_len`` is the longest sentence on either side of the batch.
    Unused positions keep the value 0, which is ``ZERO`` in the source
    vocabulary and ``EOS`` in the target vocabulary as built by
    ``build_vocab2`` in this file. An explicit ``EOS`` id is written after
    the last character of every source sentence.

    Characters missing from a vocabulary are mapped to its ``UNK`` id instead
    of raising ``KeyError``.

    Args:
        x: sequence of source sentences (strings).
        y: sequence of target sentences (strings), aligned with ``x``.

    Returns:
        (Xtensor, ytensor, seq_len): two LongTensors of shape
        ``(len(x), seq_len + 1)`` and the longest sentence length.
    """
    seq_len = seq_len_finder(x, y)
    src_unk, tgt_unk = inp_dict['UNK'], tgt_dict['UNK']
    src_eos = inp_dict['EOS']
    Xtensor = torch.zeros(len(x), seq_len + 1).long()
    ytensor = torch.zeros(len(x), seq_len + 1).long()
    for i, seq in enumerate(x):
        for t, char in enumerate(seq):
            Xtensor[i, t] = inp_dict.get(char, src_unk)
        Xtensor[i, len(seq)] = src_eos
    for i_y, seq_y in enumerate(y):
        for t_y, char_y in enumerate(seq_y):
            ytensor[i_y, t_y] = tgt_dict.get(char_y, tgt_unk)
    return Xtensor, ytensor, seq_len

def repackage_hidden(h):
    """Detach hidden state(s) from the graph that produced them.

    Wrapping the raw data in a fresh ``Variable`` drops the autograd history,
    so back-propagation does not reach into earlier batches.

    Args:
        h: a ``Variable`` or an arbitrarily nested tuple/iterable of them
            (e.g. the ``(h, c)`` pair of an LSTM).

    Returns:
        A history-free ``Variable`` for a single input, or a tuple mirroring
        the structure of ``h``.
    """
    # isinstance rather than an exact type comparison: an exact check fails
    # whenever the hidden state is not literally of type Variable, and the
    # function would then recurse into the tensor's own elements.
    if isinstance(h, Variable):
        return Variable(h.data)
    return tuple(repackage_hidden(v) for v in h)

### Model

In [None]:

class Encoder(nn.Module):
    """Character-level convolutional + recurrent encoder.

    Pipeline (see ``forward``): embedding -> bank of 'same'-padded 1-D
    convolutions with ReLU -> max-pooling over time -> channel-wise
    concatenation -> one highway layer -> LSTM.

    Args:
        embed_dim: size of the character embeddings (conv input channels).
        N: batch size used when allocating the initial hidden state.
        dropout: dropout probability passed to the LSTM and ``nn.Dropout``.
        k_num: list with the number of filters of each convolution.
        k_size: list with the kernel width of each convolution, aligned
            with ``k_num``.
        poolstride: kernel size of the max-pooling over time.
        en_bi: whether the LSTM is bidirectional.
        en_layers: number of LSTM layers.
        en_H: LSTM hidden size per direction.
        num_embed: number of rows of the embedding table.
    """

    def __init__(self, embed_dim, N, dropout, k_num, k_size, poolstride,
                 en_bi, en_layers, en_H,
                 num_embed=inp_sz):
        super(Encoder, self).__init__()
        self.Ci = embed_dim
        self.k_num = k_num  # channel-out (number of filters)
        # Materialise the pairs: a bare zip() is exhausted after one pass.
        self.ks = list(zip(k_num, k_size))
        self.k_sum = sum(k_num)
        self.poolstride = poolstride
        self.bi = 2 if en_bi else 1
        self.en_H = en_H
        self.en_layers = en_layers
        self.N = N

        self.embed = nn.Embedding(num_embed, embed_dim)

        self.convks = nn.ModuleList()
        for (num, size) in self.ks:
            # 'same' padding is applied manually in pad_conv_and_pool, so the
            # convolution itself is unpadded.
            self.convks.append(nn.Conv1d(in_channels=self.Ci,
                                         out_channels=num,
                                         kernel_size=size,
                                         stride=1))
        # NOTE: the attribute is called biGRU but holds an LSTM; the name is
        # kept because it is part of the saved state_dict keys.
        self.biGRU = nn.LSTM(input_size=self.k_sum,
                             hidden_size=en_H,
                             num_layers=en_layers,
                             dropout=dropout,
                             bidirectional=en_bi)
        self.gate = nn.Linear(self.k_sum, self.k_sum)
        self.highway1 = nn.Linear(self.k_sum, self.k_sum)
        self.dropout = nn.Dropout(dropout)  # defined but not applied in forward
        self.logsoftmax = nn.LogSoftmax()  # defined but not applied in forward

    def init_hidden(self):
        """Return zero-filled ``(h0, c0)`` for the LSTM.

        Each has shape ``(en_layers * directions, N, en_H)`` and is allocated
        with the same tensor type as the module's parameters, so it follows
        the module to the GPU after ``.cuda()``.
        """
        reference = next(self.parameters()).data
        shape = (self.en_layers * self.bi, self.N, self.en_H)
        h0 = Variable(reference.new(*shape).zero_())
        c0 = Variable(reference.new(*shape).zero_())
        return h0, c0

    def pad_conv_and_pool(self, x, conv):
        """Apply one 'same'-padded convolution, ReLU and max-pooling.

        Args:
            x: embedded input, (N, W, Ci).
            conv: one of the ``nn.Conv1d`` modules in ``self.convks``.

        Returns:
            Tensor of shape (N, Co, W / poolstride).
        """
        # padding for half convolution (aka 'same' padding), which needs asymmetric padding
        # asymmetric padding assumes front pad is longer. k_size=4 would have 2 zeros padded front and 1 zero padded back
        # pad1d takes (N,W,D)
        k_size = conv.kernel_size[0]
        if k_size > 1:
            total_pad = k_size - 1
            pad_front = math.ceil(total_pad / 2)
            pad_back = total_pad - pad_front
            x = pad1d(x, (pad_front, pad_back))
        x = F.relu(conv(x.transpose(1, 2)))  # (N,W,Ci) => (N,Co,W)
        result = F.max_pool1d(x, kernel_size=self.poolstride)  # (N, Co, W/s)
        return result

    def highway(self, x):
        """Apply a single highway layer over the last (feature) dimension.

        Args:
            x: tensor whose last dimension is ``k_sum``; it may be
                non-contiguous.

        Returns:
            Tensor of shape ``(-1, k_sum)``: the leading dimensions of ``x``
            flattened in row-major order.
        """
        # Flatten once and use the same view for the gate, the transform and
        # the carry path, so that all three operands have identical shapes.
        flat = x.contiguous().view(-1, self.k_sum)
        gate = F.sigmoid(self.gate(flat))
        transformed = gate * F.relu(self.highway1(flat))
        carried = (1 - gate) * flat
        return transformed + carried

    def forward(self, input, hidden):
        """Encode a batch of character-id sequences.

        Args:
            input: LongTensor of ids, (N, W).
            hidden: initial ``(h0, c0)`` for the LSTM.

        Returns:
            (output, hidden): LSTM outputs of shape (W/s, N, H*directions)
            and the final ``(h, c)`` state.
        """
        x = self.embed(input)  # (N,W,D)
        x = [self.pad_conv_and_pool(x, convk) for convk in self.convks]  # (N,Ci,W) => (N,Co,W/s)
        x = torch.cat(x, dim=1)  # (N, sum(Co) for all k_width, W/s)
        x = x.permute(2, 0, 1)  # (W/s,N,D=k_sum) prep for rnn
        steps, batch = x.size(0), x.size(1)
        x = self.highway(x)  # (W/s * N, k_sum)
        output, hidden = self.biGRU(x.view(steps, batch, self.k_sum), hidden)  # (W/s,N,H*bi)

        return output, hidden

### Luong Attention (process new input as well)

In [None]:

class LuongDecoder(nn.Module):
    """LSTM decoder with Luong-style ("general" score) attention.

    Args:
        de_H: decoder LSTM hidden size per direction.
        en_Hbi: feature size of the encoder outputs (hidden size times
            number of directions).
        de_layers: number of decoder LSTM layers.
        dropout: dropout probability passed to the LSTM and ``nn.Dropout``.
        de_bi: whether the decoder LSTM is bidirectional.
        N: batch size used when allocating the initial hidden state.
        de_embed: size of the target character embeddings.
        out_sz: size of the target vocabulary (embedding rows and output
            classes).
    """

    def __init__(self, de_H, en_Hbi, de_layers, dropout, de_bi, N, de_embed, out_sz=out_sz):
        super(LuongDecoder, self).__init__()
        self.bi = 2 if de_bi else 1
        self.de_H = de_H
        self.en_Hbi = en_Hbi
        self.de_Hbi = self.bi * self.de_H
        self.de_layers = de_layers
        self.embed_sz = de_embed
        self.N = N
        self.out_sz = out_sz

        self.embedding = nn.Embedding(out_sz, de_embed)
        # NOTE: the attribute is called gru but holds an LSTM; the name is
        # kept because it is part of the saved state_dict keys.
        self.gru = nn.LSTM(input_size=de_embed,
                           hidden_size=de_H,
                           num_layers=de_layers,
                           dropout=dropout,
                           bidirectional=de_bi)

        self.dropout = nn.Dropout(dropout)  # defined but not applied in forward
        self.logsoftmax = nn.LogSoftmax()

        # attention (Luong)
        # TODO: revise, won't work for all cases. forward() reshapes the
        # score_lin output to (..., de_Hbi) and multiplies it with the encoder
        # outputs, so it only lines up when de_layers == 1 and de_Hbi == en_Hbi.
        self.score_lin = nn.Linear(self.de_Hbi, self.de_Hbi * self.de_layers)
        self.lin_comb = nn.Linear(self.de_Hbi + self.en_Hbi, self.de_Hbi)
        self.lin_out = nn.Linear(self.de_Hbi, self.out_sz)

    def init_hidden(self):
        """Return zero-filled ``(h0, c0)`` for the LSTM.

        Each has shape ``(de_layers * directions, N, de_H)``, using this
        instance's own batch size, and is allocated with the same tensor type
        as the module's parameters.
        """
        reference = next(self.parameters()).data
        shape = (self.de_layers * self.bi, self.N, self.de_H)
        h0 = Variable(reference.new(*shape).zero_())
        c0 = Variable(reference.new(*shape).zero_())
        return h0, c0

    def forward(self, inputs, encoder_out, hidden):
        """Run the decoder over ``inputs`` while attending to ``encoder_out``.

        Args:
            inputs: LongTensor of target ids, (N, W_t).
            encoder_out: encoder outputs, (W_s, N, en_Hbi).
            hidden: ``(h, c)`` state for the decoder LSTM.

        Returns:
            (output, hidden, attention): log-probabilities of shape
            (W_t, N, out_sz), the updated LSTM state, and the attention
            weights transposed to (W_t, N, W_s).
        """
        W_s = encoder_out.size(0)

        # decoder RNN's output
        embed = self.embedding(inputs.transpose(0, 1))  # (N,W) => (W,N,D)
        W_t, batch = embed.size(0), embed.size(1)

        rnn_output, hidden = self.gru(embed, hidden)  # (W,N,D) => output (W,N,H*bi), hidden (layer*bi, N, H)

        # contiguous() returns a copy, so its result must be kept for the
        # .view() calls below to be valid.
        rnn_output = rnn_output.transpose(0, 1).contiguous()  # (N,W_t,H)

        # source hidden state
        # tensor containing the output features (h_s) from the last layer of the encoder RNN
        encoder_out = encoder_out.transpose(0, 1).contiguous()  # (W_s,N,H) => (N,W_s,H)

        ### Luong's attn output & score

        # linear on RNN output
        h_t = self.score_lin(rnn_output.view(-1, self.de_Hbi))  # (N*W,H) dot (H,H)
        h_t = h_t.view(batch, -1, self.de_Hbi)  # (N*W,H) => (N,W,H)
        h_s = encoder_out.permute(0, 2, 1)  # (N,W,H) => (N,H,W)

        # Matrix multiply between RNN output and Encoder's output
        scores = torch.bmm(h_t, h_s)  # (N,W_t,H) dot (N,H,W_s) => (N, W_t, W_s)

        # Normalize with softmax over the source positions
        align_vec = F.softmax(scores.view(-1, W_s))  # softmax(N*W_t, Ws)
        align_vec = align_vec.view(batch, -1, W_s)  # (N,W_t,W_s)

        # context_vec as weighted avg. of source states, based on attn weights
        context_vec = torch.bmm(align_vec, encoder_out)  # (N,W_t,W_s) dot (N,W_s,H_en) => (N,W_t,H_en)
        concat_vec = torch.cat((context_vec, rnn_output), dim=2)  # (N,W_t,H_cat)
        concat_vec = concat_vec.view(-1, concat_vec.size(2))  # (N,W_t,H_cat) => (N*W_t, H_cat)

        # linear, tanh
        attn_h = self.lin_comb(concat_vec)  # (N*W_t, H*2) => (N*W_t,H)
        attn_h = attn_h.view(batch, W_t, self.de_Hbi)  # (N,W_t,H)
        attn_h = F.tanh(attn_h.transpose(0, 1)).contiguous()  # (W_t,N,H)

        # Linear, log-softmax (dropout on attn_h is intentionally left disabled)
        output = self.lin_out(attn_h.view(-1, self.de_Hbi))  # (W,N,H) => (W*N,Out)
        output = self.logsoftmax(output)  # (W*N,Out)
        output = output.view(W_t, batch, self.out_sz)  # (W*N,Out) => (W,N,Out)

        return output, hidden, align_vec.transpose(0, 1)

In [None]:
# ---- Optimisation / regularisation ----
learning_rate = 1e-4
dropout = 0.2

# ---- Encoder config ----
embed_dim = 256  # character embedding size
N = 128  # batch size
poolstride = 5  # max-pooling kernel over time
en_bi = True
en_layers = 1
en_H = 256
# One convolution per (filter count, kernel width) pair.
k_num = [200, 200, 250, 250, 300, 300, 300, 300]
k_size = [1, 2, 3, 4, 5, 6, 7, 8]

# ---- Decoder config ----
de_embed = 256
de_H = 256
de_layers = 1
de_bi = True
# Feature size of the encoder outputs: hidden size times number of directions.
en_Hbi = en_H * (2 if en_bi else 1)


In [None]:
# Build both halves of the model from the hyper-parameters defined earlier.
encoder = Encoder(
    embed_dim=embed_dim, N=N, dropout=dropout,
    k_num=k_num, k_size=k_size, poolstride=poolstride,
    en_bi=en_bi, en_layers=en_layers, en_H=en_H,
)
decoder = LuongDecoder(
    de_embed=de_embed, de_H=de_H, en_Hbi=en_Hbi,
    de_layers=de_layers, dropout=dropout, de_bi=de_bi, N=N,
)

# Move the parameters to the GPU before the optimizers capture them.
decoder.cuda()
encoder.cuda()

# One Adam optimizer per module, sharing the same learning rate.
en_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
de_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)

# The decoder emits log-probabilities, hence negative log-likelihood (batch mean).
criterion = nn.NLLLoss(size_average=True)

In [None]:
# Training / validation driver.
# Per epoch: one teacher-forced pass over the training set, then a greedy
# decoding pass over the validation set. The weights with the lowest
# validation loss are saved to disk.
graph_train, graph_val = [], []
best_val_loss = 100.0
best_val_acc = 0.0
n_epochs = 40
# Trailing partial batches are dropped so that every batch has exactly N rows.
train_remainder = len(train_data) % N
val_remainder = len(val_data) % N
# Number of batches actually processed; max(..., 1) avoids dividing by zero
# when a split holds fewer than N samples.
n_train_batches = max((len(train_data) - train_remainder) // N, 1)
n_val_batches = max((len(val_data) - val_remainder) // N, 1)

for epoch in range(n_epochs):
    start_time = time.time()
    train_loss, val_loss = 0.0, 0.0
    correct = 0
    total_val_len = 0
    en_hidden = encoder.init_hidden()

    encoder.train()
    decoder.train()
    for batch in range(0, len(train_data) - train_remainder, N):
        data_raw = train_data[batch:batch + N]
        target_raw = train_target[batch:batch + N]
        data, target, max_len = train_vectorize(data_raw, target_raw)
        data, target = data.cuda(), target.cuda()
        data, target = Variable(data), Variable(target)
        # The encoder state is carried over from the previous batch but cut
        # from its graph, so gradients never flow across batches.
        en_hidden = repackage_hidden(en_hidden)

        # forward, backward, optimize
        en_out, en_hidden = encoder(data, en_hidden)
        de_hidden = en_hidden  # the decoder starts from the encoder's final state
        de_in = Variable(torch.zeros(decoder.N, 1).type(cudalong))  # id 0 as first input
        target_T = target.transpose(0, 1)  # (N,W) => (W, N)

        loss = 0
        batch_loss = 0.0
        for di in range(max_len):
            de_out, de_hidden, attn = decoder(de_in, en_out, de_hidden)  # (W=1,N,Out)
            de_out = de_out.squeeze(0)  # (N, Out)
            step_loss = criterion(de_out, target_T[di])  # (N,Out) and (N)
            loss += step_loss

            # Log the loss of this step only; `loss` is the running sum and
            # must not be accumulated into the statistics again at each step.
            batch_loss += step_loss.data[0]
            de_in = target_T[di].unsqueeze(1)  # teacher forcing

        train_loss += batch_loss / max_len

        en_optimizer.zero_grad()
        de_optimizer.zero_grad()
        # The graph is rebuilt for every batch, so there is nothing to retain.
        loss.backward()
        torch.nn.utils.clip_grad_norm(encoder.parameters(), 5)
        torch.nn.utils.clip_grad_norm(decoder.parameters(), 5)
        en_optimizer.step()
        de_optimizer.step()

        if batch % (N * 100) == 0:
            print ("update", batch/N, time.time() - start_time)

    # evaluate with validation set
    encoder.eval()
    decoder.eval()
    for batch in range(0, len(val_data) - val_remainder, N):
        data_raw = val_data[batch:batch + N]
        target_raw = val_target[batch:batch + N]
        data, target, max_len = train_vectorize(data_raw, target_raw)
        data, target = data.cuda(), target.cuda()
        data, target = Variable(data, volatile=True), Variable(target, volatile=True)
        en_hidden = repackage_hidden(en_hidden)

        en_out, en_hidden = encoder(data, en_hidden)
        de_hidden = en_hidden
        de_in = Variable(torch.zeros(decoder.N, 1).type(cudalong))
        target_T = target.transpose(0, 1)  # (N,W) => (W, N)

        batch_loss = 0.0
        for di in range(max_len):
            de_out, de_hidden, attn = decoder(de_in, en_out, de_hidden)  # (W=1,N,Out)
            de_out = de_out.squeeze(0)  # (N, Out)
            loss = criterion(de_out, target_T[di])  # (N,Out) and (N)
            batch_loss += loss.data[0]

            # Greedy decoding: feed the most likely id back in as an (N, 1) column.
            top_idx = de_out.data.max(1)[1]  # index of the max log-probability
            de_in = Variable(top_idx.view(-1, 1).type(cudalong), volatile=True)

            pred = top_idx.contiguous().view(-1)
            target_pred = target_T[di].contiguous()
            correct += pred.eq(target_pred.data.view_as(pred)).cpu().sum()

        val_loss += batch_loss / max_len
        total_val_len += max_len

    # Each batch contributed one per-step mean loss, so average over batches.
    train_loss /= n_train_batches
    val_loss /= n_val_batches
    # Accuracy is counted over every decoded position, padding included.
    val_acc = correct / (total_val_len * N) if total_val_len else 0.0
    if val_acc > best_val_acc:
        best_val_acc = val_acc

    graph_train.append(train_loss)
    graph_val.append(val_loss)

    print('[%d] train loss: %.3f val loss: %.4f acc: %.3f time: %.3f' % \
          (epoch + 1, train_loss, val_loss, val_acc,
           time.time() - start_time))

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(encoder.state_dict(), 'best_encoder_weight_en-fr')
        torch.save(decoder.state_dict(), 'best_decoder_weight_en-fr')

        print('saving least val loss model from epoch [%d]'% (epoch+1))
        print(val_data[0])

In [None]:
# Per-epoch loss curves: training first, validation second.
for curve in (graph_train, graph_val):
    plt.plot(curve)
plt.xlabel('Epoch')
plt.ylabel('Avg. loss')
plt.grid(True)
plt.show()

### Trying out translation
Here, I reinitialize the model with a batch size of 1 and load the saved weights. You can try different English inputs to see how the model translates them.

In [None]:
import sys
def toy_vectorize2(x):
    """Encode a single source sentence as a ``(1, len(x) + 1)`` id tensor.

    Every character is looked up in ``inp_dict`` and an ``EOS`` id is
    appended. Characters that are not in the vocabulary are mapped to
    ``UNK`` instead of raising ``KeyError``, so free-form input can be
    translated.

    Args:
        x: the sentence to encode (string).

    Returns:
        LongTensor of shape ``(1, len(x) + 1)``.
    """
    seq_len = len(x)
    unk = inp_dict['UNK']
    Xtensor = torch.zeros(1, seq_len + 1).long()
    for t, char in enumerate(x):
        Xtensor[0, t] = inp_dict.get(char, unk)
    Xtensor[0, seq_len] = inp_dict['EOS']
    return Xtensor

# Translate one validation sentence with the best saved weights, using
# batch size 1 and greedy decoding.
tgt_dict_i2c = {v: k for k, v in tgt_dict.items()}  # id -> symbol
N_toy = 1
toy_data = val_data[3]
print("input :", toy_data)
print("output :")

encoder_toy = Encoder(embed_dim=embed_dim, N=N_toy, dropout=dropout, k_num=k_num, k_size=k_size, poolstride=poolstride,
                      en_bi=en_bi, en_layers=en_layers, en_H=en_H)
decoder_toy = LuongDecoder(de_embed=de_embed, de_H=de_H, en_Hbi=en_Hbi, de_layers=de_layers, dropout=dropout, de_bi=de_bi, N=N_toy)

encoder_toy.load_state_dict(torch.load('best_encoder_weight_en-fr'))
decoder_toy.load_state_dict(torch.load('best_decoder_weight_en-fr'))

encoder_toy.cuda()
decoder_toy.cuda()
encoder_toy.eval()
decoder_toy.eval()

en_hidden_toy = encoder_toy.init_hidden()
toy_data = toy_vectorize2(toy_data)
toy_data = Variable(toy_data.cuda())

toy_out, en_hidden_toy = encoder_toy(toy_data, en_hidden_toy)
# The decoder starts from the encoder's final state.
de_hidden_toy = en_hidden_toy

max_decode_steps = 100  # hard cap in case EOS is never produced
eos_idx = tgt_dict['EOS']
de_in_toy = Variable(torch.zeros(decoder_toy.N, 1).type(cudalong))
for di in range(max_decode_steps):
    de_out, de_hidden_toy, attn = decoder_toy(de_in_toy, toy_out, de_hidden_toy)  # (W=1,N,Out)
    de_out = de_out.squeeze(0)  # (N, Out)

    # Greedy choice. The same id is printed and fed back, so the text shown
    # is exactly the sequence the decoder was conditioned on.
    top_idx = de_out.data.max(1)[1].view(-1, 1)  # (N, 1)
    pred_idx = int(top_idx.view(-1)[0])
    if pred_idx == eos_idx:
        break  # end of sentence: stop instead of printing padding

    sys.stdout.write(tgt_dict_i2c[pred_idx])
    de_in_toy = Variable(top_idx.type(cudalong), volatile=True)
sys.stdout.write('\n')