# English to Indonesian attention based translation on simple dataset

References: 
1. https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html       

In [84]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.autograd import Variable

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
use_cuda = torch.cuda.is_available()

In [85]:
import pandas as pd
from gensim.corpora.dictionary import Dictionary
from nltk import word_tokenize

fp = open('./corpus/eng-indo.txt', 'r')
text = fp.read()
text = text.splitlines()
fp.close()
text[:5]

['Run!\tLari!',
 'Who?\tSiapa?',
 'Wow!\tWow!',
 'Help!\tTolong!',
 'Jump!\tLompat!']

In [86]:
# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters


def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s


text_dict = {"English": [], "Indonesian": []}
for l in text:
    split_text = l.split("\t")
    text_dict["English"].append(normalizeString(split_text[0]))
    text_dict["Indonesian"].append(normalizeString(split_text[1]))
    
df = pd.DataFrame.from_dict(text_dict)
print(df.shape)
df.head()

(6752, 2)


Unnamed: 0,English,Indonesian
0,run !,lari !
1,who ?,siapa ?
2,wow !,wow !
3,help !,tolong !
4,jump !,lompat !


In [87]:
import pandas as pd
from gensim.corpora.dictionary import Dictionary
from nltk import word_tokenize

In [91]:
MAX_LENGTH = 15

eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re ",
    "tom is", "tom s",
    "what s", "what a",
   "are you", "do you",
   "what is", "tom was",
   "don t", "it s", "where s",
   "where did", "where is",
)

def should_keep_row(row):
    """ Should the current row be kept as training set"""
    indo_num_words = len(word_tokenize(row["Indonesian"]))
    eng_num_words = len(word_tokenize(row["English"]))
    num_words_required = MAX_LENGTH - 2
    
    does_start_with_prefix = False
    for prefix in eng_prefixes:
        if row["English"].startswith(prefix):
            does_start_with_prefix = True
            break
    
    return (indo_num_words <= num_words_required) and (eng_num_words <= num_words_required) and does_start_with_prefix

df["keep_row"] = df.apply(should_keep_row, axis=1)
print(df.shape)
df.head()

(6563, 4)


Unnamed: 0,index,English,Indonesian,keep_row
0,0,run !,lari !,False
1,1,who ?,siapa ?,False
2,2,wow !,wow !,False
3,3,help !,tolong !,False
4,4,jump !,lompat !,False


In [92]:
print("Current shape: " + str(df.shape))
df = df[df["keep_row"]]
print("New shape: " + str(df.shape))
df.head()
df = df.reset_index().drop(columns=["keep_row"])
df.head()

Current shape: (6563, 4)
New shape: (1319, 4)


Unnamed: 0,level_0,index,English,Indonesian
0,34,34,i m sad .,saya sedih .
1,35,35,it s me !,ini aku !
2,57,57,i m okay .,aku baik baik saja .
3,58,58,it s odd .,itu aneh .
4,59,59,it s odd .,ini aneh .


In [93]:
df.to_csv("./corpus/eng-indo-filtered.txt", index=False)

In [60]:
# Use a unique string to indicate START and END of a sentence.
# Assign a unique index to them.
START, START_IDX = '<s>',  0
END, END_IDX = '</s>', 1
UNK, UNK_IDX = 'UNK', 2

# We use this idiom to tokenize our sentences in the dataframe column:
# >>> DataFrame['column'].apply(str.lower).apply(word_tokenize)

# Also we added the START and the END symbol to the sentences. 
english_sents = [START] + df['English'].apply(str.lower).apply(word_tokenize) + [END]
indo_sents = [START] + df['Indonesian'].apply(str.lower).apply(word_tokenize) + [END]

# We're sort of getting into the data into the shape we want. 
# But now it's still too humanly readable and redundant.
## Cut-away: Computers like it to be simpler, more concise. -_-|||
print('First English sentence:', english_sents[0])
print('First Indo sentence:', indo_sents[0])

First English sentence: ['<s>', 'run', '!', '</s>']
First Indo sentence: ['<s>', 'lari', '!', '</s>']


In [61]:
english_vocab = Dictionary([['<s>'], ['</s>'],['UNK']])
english_vocab.add_documents(english_sents)

indo_vocab = Dictionary([['<s>'], ['</s>'], ['UNK']])
indo_vocab.add_documents(indo_sents)

# First ten words in the vocabulary.
print('First 10 Indonesian words in Dictionary:\n', sorted(indo_vocab.items())[:10])
print()
print('First 10 English words in Dictionary:\n', sorted(english_vocab.items())[:10])

import pickle
# Lets save our dictionaries.
#with open('./vocabs/simple_indo_vocab.Dictionary.pkl', 'wb') as fout:
#    pickle.dump(indo_vocab, fout)
    
#with open('./vocabs/simple_english_vocab.Dictionary.pkl', 'wb') as fout:
#    pickle.dump(english_vocab, fout)

First 10 Indonesian words in Dictionary:
 [(0, '<s>'), (1, '</s>'), (2, 'UNK'), (3, '!'), (4, 'lari'), (5, '?'), (6, 'siapa'), (7, 'wow'), (8, 'tolong'), (9, 'lompat')]

First 10 English words in Dictionary:
 [(0, '<s>'), (1, '</s>'), (2, 'UNK'), (3, '!'), (4, 'run'), (5, '?'), (6, 'who'), (7, 'wow'), (8, 'help'), (9, 'jump')]


In [62]:
# Vectorizes a sentence with a given vocab
def vectorize_sent(sent, vocab):
    return vocab.doc2idx([START] + word_tokenize(sent.lower()) + [END], unknown_word_index=2)

# Creates a PyTorch variable from a sentence against a given vocab
def variable_from_sent(sent, vocab):
    vsent = vectorize_sent(sent, vocab)
    #print(vsent)
    result = Variable(torch.LongTensor(vsent).view(-1, 1))
    #print(result)
    return result.cuda() if use_cuda else result

# Test
new_kopi = "Is it love?"
variable_from_sent(new_kopi, english_vocab)

tensor([[  0],
        [111],
        [ 23],
        [130],
        [  5],
        [  1]])

In [63]:
from sklearn.model_selection import train_test_split
df_train, df_val = train_test_split(df, test_size=0.15)
print(df_train.shape)
print(df_val.shape)

(5554, 3)
(981, 3)


In [64]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_train.head()

Unnamed: 0,index,English,Indonesian
0,5869,Tom retired from the army three years ago.,Tom pensiun dari dinas tentara tiga tahun yang...
1,5817,He was spreading mulch in the flower beds.,Dia menyebar mulsa di ladang bunga.
2,6333,What was your mother doing when you returned h...,Apa yang dilakukan ibumu saat kamu kembali ke ...
3,2673,Tom came by this morning.,Tom datang pagi ini.
4,3393,How much did you guys spend?,Berapa banyak yang kalian habiskan?


# Prepare the training and the validation datasets

In [65]:
indo_tensors = df_train['Indonesian'].apply(lambda s: variable_from_sent(s, indo_vocab))
print(df_train.iloc[0]['Indonesian'])


Tom pensiun dari dinas tentara tiga tahun yang lalu.


In [66]:
english_tensors = df_train['English'].apply(lambda s: variable_from_sent(s, english_vocab))
print(df_train.iloc[0]['English'])
print(english_tensors[0])
# Now, each item in `sent_pairs` is our data point. 
sent_pairs = list(zip(english_tensors, indo_tensors))

Tom retired from the army three years ago.
tensor([[   0],
        [  50],
        [ 971],
        [ 413],
        [ 269],
        [2586],
        [ 742],
        [ 537],
        [1064],
        [  10],
        [   1]])


## Define the model

In [67]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [68]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)