# English to Indonesian attention based translation

References: 
1. https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html       

In [16]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.autograd import Variable

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
use_cuda = torch.cuda.is_available()

In [35]:
import pandas as pd
from gensim.corpora.dictionary import Dictionary
from nltk import word_tokenize

fp = open('./corpus/SMERU-26870.en', 'r')
#fp = open('./corpus/SMERU-26870.en', 'r')
eng_text = fp.read()
eng_text = eng_text.splitlines()
fp.close()

fp2 = open('./corpus/SMERU-26870.id', 'r')
#fp2 = open('./corpus/SMERU-26870.id', 'r')
id_text = fp2.read()
id_text = id_text.splitlines()
fp2.close()

In [36]:
df_eng_text = pd.DataFrame(eng_text)
df_eng_text = df_eng_text.rename(columns={0:'English'})

df_id_text = pd.DataFrame(id_text)
df_id_text = df_id_text.rename(columns={0:'Indonesian'})

In [37]:
df_eng_text['English'] = df_eng_text['English'].apply(lambda x : x.strip())
df_id_text['Indonesian'] = df_id_text['Indonesian'].apply(lambda x : x.strip())

import pandas as pd
from gensim.corpora.dictionary import Dictionary
from nltk import word_tokenize

df = pd.concat([df_eng_text, df_id_text], axis=1)
df.head()

Unnamed: 0,English,Indonesian
0,ACKNOWLEDGEMENTS This report of Access and Equ...,UCAPAN TERIMA KASIH Laporan mengenai Akses ter...
1,We would like to express our genuine appreciat...,Kami ingin menyampaikan apresiasi yang sebesar...
2,We are truly grateful to the Family Court of A...,Kami sangat berterima kasih kepada Family Cour...
3,We would also like to express our sincere grat...,Kami juga ingin menyampaikan rasa terima kasih...
4,We especially appreciate the support and accep...,Kami berterima kasih secara khusus atas dukung...


In [38]:
# keeping to 20 to heavily restrict the scope for now to be improved as we progress
MAX_LEN = 25

def check_sentence_len(row):
    indo_num_words = len(word_tokenize(row["Indonesian"]))
    eng_num_words = len(word_tokenize(row["English"]))
    num_words_required = MAX_LEN - 2
    return (indo_num_words <= num_words_required) and (eng_num_words <= num_words_required)

#df["Indo_num_words"] = df["Indonesian"].apply(str.lower).apply(word_tokenize).apply(len)
#df["Eng_num_words"] = df["English"].apply(str.lower).apply(word_tokenize).apply(len)
df["keep_column"] = df.apply(check_sentence_len, axis=1)
df.head()

Unnamed: 0,English,Indonesian,keep_column
0,ACKNOWLEDGEMENTS This report of Access and Equ...,UCAPAN TERIMA KASIH Laporan mengenai Akses ter...,False
1,We would like to express our genuine appreciat...,Kami ingin menyampaikan apresiasi yang sebesar...,False
2,We are truly grateful to the Family Court of A...,Kami sangat berterima kasih kepada Family Cour...,True
3,We would also like to express our sincere grat...,Kami juga ingin menyampaikan rasa terima kasih...,False
4,We especially appreciate the support and accep...,Kami berterima kasih secara khusus atas dukung...,True


In [39]:
print("Current shape: " + str(df.shape))
df = df[df["keep_column"]]
print("New shape: " + str(df.shape))
df.head()
df = df.reset_index().drop(columns=["keep_column"])
df.head()

Current shape: (26966, 3)
New shape: (14003, 3)


Unnamed: 0,index,English,Indonesian
0,2,We are truly grateful to the Family Court of A...,Kami sangat berterima kasih kepada Family Cour...
1,4,We especially appreciate the support and accep...,Kami berterima kasih secara khusus atas dukung...
2,6,SMERU would also like to thank the Australian ...,SMERU juga berterima kasih kepada Pemerintah A...
3,7,ABSTRACT,ABSTRAK
4,8,ACCESS TO JUSTICE,AKSES TERHADAP KEADILAN


In [11]:
# Use a unique string to indicate START and END of a sentence.
# Assign a unique index to them.
START, START_IDX = '<s>',  0
END, END_IDX = '</s>', 1
UNK, UNK_IDX = 'UNK', 2

# We use this idiom to tokenize our sentences in the dataframe column:
# >>> DataFrame['column'].apply(str.lower).apply(word_tokenize)

# Also we added the START and the END symbol to the sentences. 
english_sents = [START] + df['English'].apply(str.lower).apply(word_tokenize) + [END]
indo_sents = [START] + df['Indonesian'].apply(str.lower).apply(word_tokenize) + [END]

# We're sort of getting into the data into the shape we want. 
# But now it's still too humanly readable and redundant.
## Cut-away: Computers like it to be simpler, more concise. -_-|||
print('First English sentence:', english_sents[0])
print('First Indo sentence:', indo_sents[0])

First English sentence: ['<s>', 'acknowledgements', 'this', 'report', 'of', 'access', 'and', 'equity', 'survey', 'in', 'family', 'law', 'and', 'civil', 'status', 'issues', 'for', 'the', 'courts', 'in', 'indonesia', 'could', 'only', 'be', 'finished', 'with', 'the', 'support', 'and', 'cooperation', 'of', 'a', 'number', 'of', 'people', '</s>']
First Indo sentence: ['<s>', 'ucapan', 'terima', 'kasih', 'laporan', 'mengenai', 'akses', 'terhadap', 'keadilan', 'pemberdayaan', 'perempuan', 'kepala', 'keluarga', 'di', 'indonesia', 'ini', 'hanya', 'dapat', 'terselesaikan', 'berkat', 'dukungan', 'dan', 'kerja', 'sama', 'dari', 'seluruh', 'pihak', 'yang', 'terlibat', '</s>']


In [12]:
english_vocab = Dictionary([['<s>'], ['</s>'],['UNK']])
english_vocab.add_documents(english_sents)

indo_vocab = Dictionary([['<s>'], ['</s>'], ['UNK']])
indo_vocab.add_documents(indo_sents)

# First ten words in the vocabulary.
print('First 10 Indonesian words in Dictionary:\n', sorted(indo_vocab.items())[:10])
print()
print('First 10 English words in Dictionary:\n', sorted(english_vocab.items())[:10])

import pickle
# Lets save our dictionaries.
with open('./vocabs/smeru_indo_vocab.Dictionary.pkl', 'wb') as fout:
    pickle.dump(indo_vocab, fout)
    
with open('./vocabs/smeru_english_vocab.Dictionary.pkl', 'wb') as fout:
    pickle.dump(english_vocab, fout)

First 10 Indonesian words in Dictionary:
 [(0, '<s>'), (1, '</s>'), (2, 'UNK'), (3, 'akses'), (4, 'berkat'), (5, 'dan'), (6, 'dapat'), (7, 'dari'), (8, 'di'), (9, 'dukungan')]

First 10 English words in Dictionary:
 [(0, '<s>'), (1, '</s>'), (2, 'UNK'), (3, 'a'), (4, 'access'), (5, 'acknowledgements'), (6, 'and'), (7, 'be'), (8, 'civil'), (9, 'cooperation')]


In [17]:
# Vectorizes a sentence with a given vocab
def vectorize_sent(sent, vocab):
    return vocab.doc2idx([START] + word_tokenize(sent.lower()) + [END], unknown_word_index=2)

# Creates a PyTorch variable from a sentence against a given vocab
def variable_from_sent(sent, vocab):
    vsent = vectorize_sent(sent, vocab)
    #print(vsent)
    result = Variable(torch.LongTensor(vsent).view(-1, 1))
    #print(result)
    return result.cuda() if use_cuda else result

# Test
new_kopi = "French Muslims fined for face veils"
variable_from_sent(new_kopi, english_vocab)

tensor([[   0],
        [   2],
        [1971],
        [7058],
        [  15],
        [3522],
        [   2],
        [   1]])

In [18]:
from sklearn.model_selection import train_test_split
df_train, df_val = train_test_split(df, test_size=0.15)
print(df_train.shape)
print(df_val.shape)

(22921, 2)
(4045, 2)


# Prepare the training and the validation datasets

In [19]:
# Prepare the whole training corpus.
indo_tensors = df_train['Indonesian'].apply(lambda s: variable_from_sent(s, indo_vocab))
print(df_train.iloc[0]['Indonesian'])
print(indo_tensors[0])
english_tensors = df_train['English'].apply(lambda s: variable_from_sent(s, english_vocab))
print(df_train.iloc[0]['English'])
print(english_tensors[0])
# Now, each item in `sent_pairs` is our data point. 
sent_pairs = list(zip(english_tensors, indo_tensors))

Adanya kesalahan sasaran (mistargeting) juga diungkapkan oleh responden rumah tangga karena terdapat penerima yang berasal dari keluarga relatif mampu dengan indikasi memiliki kendaraan roda dua
tensor([[ 0],
        [29],
        [26],
        [13],
        [18],
        [19],
        [ 3],
        [25],
        [14],
        [20],
        [21],
        [16],
        [15],
        [ 8],
        [11],
        [12],
        [10],
        [ 6],
        [28],
        [ 4],
        [ 9],
        [ 5],
        [17],
        [23],
        [ 7],
        [24],
        [22],
        [30],
        [27],
        [ 1]])
Household respondents also said that there was mistargeting because there have been instances of recipients coming from relatively well-off families who already own motorcycles
tensor([[ 0],
        [ 5],
        [29],
        [24],
        [21],
        [ 4],
        [ 6],
        [12],
        [27],
        [16],
        [13],
        [19],
        [ 6],
        [ 8],
        [25