# English to Indonesian attention based translation on simple dataset

References: 
1. https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html       

In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.autograd import Variable

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
use_cuda = torch.cuda.is_available()

In [2]:
import pandas as pd
from gensim.corpora.dictionary import Dictionary
from nltk import word_tokenize

fp = open('./corpus/eng-indo.txt', 'r')
text = fp.read()
text = text.splitlines()
fp.close()
text[:5]

['Run!\tLari!',
 'Who?\tSiapa?',
 'Wow!\tWow!',
 'Help!\tTolong!',
 'Jump!\tLompat!']

In [3]:
text_dict = {"English": [], "Indonesian": []}
for l in text:
    split_text = l.split("\t")
    text_dict["English"].append(split_text[0])
    text_dict["Indonesian"].append(split_text[1])
    
df = pd.DataFrame.from_dict(text_dict)
print(df.shape)
df.head()

(6752, 2)


Unnamed: 0,English,Indonesian
0,Run!,Lari!
1,Who?,Siapa?
2,Wow!,Wow!
3,Help!,Tolong!
4,Jump!,Lompat!


In [4]:
import pandas as pd
from gensim.corpora.dictionary import Dictionary
from nltk import word_tokenize

In [5]:
# keeping to 20 to heavily restrict the scope for now to be improved as we progress
MAX_LEN = 15

def check_sentence_len(row):
    indo_num_words = len(word_tokenize(row["Indonesian"]))
    eng_num_words = len(word_tokenize(row["English"]))
    num_words_required = MAX_LEN - 2
    return (indo_num_words <= num_words_required) and (eng_num_words <= num_words_required)

#df["Indo_num_words"] = df["Indonesian"].apply(str.lower).apply(word_tokenize).apply(len)
#df["Eng_num_words"] = df["English"].apply(str.lower).apply(word_tokenize).apply(len)
df["keep_column"] = df.apply(check_sentence_len, axis=1)
df.head()

Unnamed: 0,English,Indonesian,keep_column
0,Run!,Lari!,True
1,Who?,Siapa?,True
2,Wow!,Wow!,True
3,Help!,Tolong!,True
4,Jump!,Lompat!,True


In [6]:
print("Current shape: " + str(df.shape))
df = df[df["keep_column"]]
print("New shape: " + str(df.shape))
df.head()
df = df.reset_index().drop(columns=["keep_column"])
df.head()

Current shape: (6752, 3)
New shape: (6535, 3)


Unnamed: 0,index,English,Indonesian
0,0,Run!,Lari!
1,1,Who?,Siapa?
2,2,Wow!,Wow!
3,3,Help!,Tolong!
4,4,Jump!,Lompat!


In [7]:
# Use a unique string to indicate START and END of a sentence.
# Assign a unique index to them.
START, START_IDX = '<s>',  0
END, END_IDX = '</s>', 1
UNK, UNK_IDX = 'UNK', 2

# We use this idiom to tokenize our sentences in the dataframe column:
# >>> DataFrame['column'].apply(str.lower).apply(word_tokenize)

# Also we added the START and the END symbol to the sentences. 
english_sents = [START] + df['English'].apply(str.lower).apply(word_tokenize) + [END]
indo_sents = [START] + df['Indonesian'].apply(str.lower).apply(word_tokenize) + [END]

# We're sort of getting into the data into the shape we want. 
# But now it's still too humanly readable and redundant.
## Cut-away: Computers like it to be simpler, more concise. -_-|||
print('First English sentence:', english_sents[0])
print('First Indo sentence:', indo_sents[0])

First English sentence: ['<s>', 'run', '!', '</s>']
First Indo sentence: ['<s>', 'lari', '!', '</s>']


In [8]:
english_vocab = Dictionary([['<s>'], ['</s>'],['UNK']])
english_vocab.add_documents(english_sents)

indo_vocab = Dictionary([['<s>'], ['</s>'], ['UNK']])
indo_vocab.add_documents(indo_sents)

# First ten words in the vocabulary.
print('First 10 Indonesian words in Dictionary:\n', sorted(indo_vocab.items())[:10])
print()
print('First 10 English words in Dictionary:\n', sorted(english_vocab.items())[:10])

import pickle
# Lets save our dictionaries.
with open('./vocabs/simple_indo_vocab.Dictionary.pkl', 'wb') as fout:
    pickle.dump(indo_vocab, fout)
    
with open('./vocabs/simple_english_vocab.Dictionary.pkl', 'wb') as fout:
    pickle.dump(english_vocab, fout)

First 10 Indonesian words in Dictionary:
 [(0, '<s>'), (1, '</s>'), (2, 'UNK'), (3, '!'), (4, 'lari'), (5, '?'), (6, 'siapa'), (7, 'wow'), (8, 'tolong'), (9, 'lompat')]

First 10 English words in Dictionary:
 [(0, '<s>'), (1, '</s>'), (2, 'UNK'), (3, '!'), (4, 'run'), (5, '?'), (6, 'who'), (7, 'wow'), (8, 'help'), (9, 'jump')]


In [9]:
# Vectorizes a sentence with a given vocab
def vectorize_sent(sent, vocab):
    return vocab.doc2idx([START] + word_tokenize(sent.lower()) + [END], unknown_word_index=2)

# Creates a PyTorch variable from a sentence against a given vocab
def variable_from_sent(sent, vocab):
    vsent = vectorize_sent(sent, vocab)
    #print(vsent)
    result = Variable(torch.LongTensor(vsent).view(-1, 1))
    #print(result)
    return result.cuda() if use_cuda else result

# Test
new_kopi = "Is it love?"
variable_from_sent(new_kopi, english_vocab)

tensor([[  0],
        [111],
        [ 23],
        [130],
        [  5],
        [  1]])

In [10]:
from sklearn.model_selection import train_test_split
df_train, df_val = train_test_split(df, test_size=0.15)
print(df_train.shape)
print(df_val.shape)

(5554, 3)
(981, 3)


In [14]:
df_train.head()

Unnamed: 0,index,English,Indonesian
2653,2653,That wasn't there before.,Itu tidak ada di sana sebelumnya.
4128,4128,Do you like playing volleyball?,Apa kau suka bermain voli?
2881,2881,I'd rather die than leave.,Lebih baik aku mati daripada aku pergi.
4070,4070,Tom never brought anyone here.,Tom tidak pernah membawa siapa-siapa kemari.
3739,3739,It was dark under the bridge.,Di bawah jembatan gelap.


# Prepare the training and the validation datasets

In [11]:
# Prepare the whole training corpus.
indo_tensors = df_train['Indonesian'].apply(lambda s: variable_from_sent(s, indo_vocab))
indo_tensors.head()

2653    [[tensor(0)], [tensor(75)], [tensor(31)], [ten...
4128    [[tensor(0)], [tensor(83)], [tensor(158)], [te...
2881    [[tensor(0)], [tensor(496)], [tensor(153)], [t...
4070    [[tensor(0)], [tensor(57)], [tensor(31)], [ten...
3739    [[tensor(0)], [tensor(111)], [tensor(110)], [t...
Name: Indonesian, dtype: object

In [13]:
print(df_train.iloc[0]['Indonesian'])
print(indo_tensors[0])
english_tensors = df_train['English'].apply(lambda s: variable_from_sent(s, english_vocab))
print(df_train.iloc[0]['English'])
print(english_tensors[0])
# Now, each item in `sent_pairs` is our data point. 
sent_pairs = list(zip(english_tensors, indo_tensors))

Itu tidak ada di sana sebelumnya.
tensor([[0],
        [4],
        [3],
        [1]])
That wasn't there before.
tensor([[0],
        [4],
        [3],
        [1]])
