In [1]:
# We need to bootstrap the dataset with a simple rules:
#
# * Since we know that Tom and Mary is a very prominent words, 
#   we can reduce the importance of those words by replicating 
#   the sentence with different subject (Tom -> Harry, John, etc)

In [2]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import os

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.autograd import Variable

import pandas as pd
from gensim.corpora.dictionary import Dictionary
from nltk import word_tokenize

In [3]:
fp = open('./corpus/eng-indo.txt', 'r')

text = fp.read()
text = text.splitlines()
fp.close()

# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters


def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

text_dict = {"English": [], "Indonesian": []}
for l in text:
    split_text = l.split("\t")
    text_dict["English"].append(normalizeString(split_text[0]))
    text_dict["Indonesian"].append(normalizeString(split_text[1]))

df = pd.DataFrame.from_dict(text_dict)
print(df.shape)
df.head()

(6752, 2)


Unnamed: 0,English,Indonesian
0,run !,lari !
1,who ?,siapa ?
2,wow !,wow !
3,help !,tolong !
4,jump !,lompat !


In [4]:
names_dict = {"john":"john", "johnny":"johnny", "darren": "darren", "mommy":"ibu", "daddy": "ayah", "gary": "gary", "jack": "jack", "ken": "ken", "michael": "michael", "sarah": "sarah"}

def duplicate_sentences(dataframe, subject_name, names_dict):
    # replace tom with other subjects
    generated_dict = {"English": [], "Indonesian": []}
    filtered_df = dataframe[dataframe['English'].str.contains(subject_name)]

    for idx, row in filtered_df.iterrows():
        for key, value in names_dict.items():
            modified_english = row["English"].replace(subject_name, key)
            modified_indonesian = row["Indonesian"].replace(subject_name, value)
            generated_dict["English"].append(modified_english)
            generated_dict["Indonesian"].append(modified_indonesian)
    return generated_dict
    
generated_tom_dict = duplicate_sentences(df, "tom", names_dict)
tom_df = pd.DataFrame.from_dict(generated_tom_dict)

generated_mary_dict = duplicate_sentences(df, "mary", names_dict)
mary_df = pd.DataFrame.from_dict(generated_mary_dict)

In [5]:
tom_df.head()

Unnamed: 0,English,Indonesian
0,john won .,john menang .
1,johnny won .,johnny menang .
2,darren won .,darren menang .
3,mommy won .,ibu menang .
4,daddy won .,ayah menang .


In [6]:
mary_df.head()

Unnamed: 0,English,Indonesian
0,john is graceful .,john adalah seorang yang anggun .
1,johnny is graceful .,johnny adalah seorang yang anggun .
2,darren is graceful .,darren adalah seorang yang anggun .
3,mommy is graceful .,ibu adalah seorang yang anggun .
4,daddy is graceful .,ayah adalah seorang yang anggun .


In [7]:
merged_df = pd.concat([df, tom_df, mary_df])
merged_df.head(5)

Unnamed: 0,English,Indonesian
0,run !,lari !
1,who ?,siapa ?
2,wow !,wow !
3,help !,tolong !
4,jump !,lompat !


In [8]:
import random

names_dict = {"john":"john", "johnny":"johnny", "darren": "darren", "mommy":"ibu", "daddy": "ayah", 
              "gary": "gary", "jack": "jack", "ken": "ken", "michael": "michael", "sarah": "sarah", 
              "billy": "billy", "maria": "maria", "lim": "lim", "david": "david", "mia": "mia",
              "nancy": "nancy", "vijay": "vijay", "carrie": "carrie", "lee": "lee", "kim": "kim"}

target_names = ["tom", "mary", "john", "johnny", "darren", "mommy", "daddy", "gary", 
                "jack", "ken", "michael", "sarah"]

def replace_subject(dataframe, subject_name, names_dict):
    # replace tom with other subjects
    generated_dict = {"English": [], "Indonesian": []}
    filtered_df = dataframe[dataframe['English'].str.contains(subject_name)]

    for idx, row in filtered_df.iterrows():
        new_subject = random.choice(list(names_dict.keys()))
        new_subject_translated = names_dict[new_subject]
        modified_english = row["English"].replace(subject_name, new_subject)
        modified_indonesian = row["Indonesian"].replace(subject_name, new_subject_translated)
        generated_dict["English"].append(modified_english)
        generated_dict["Indonesian"].append(modified_indonesian)
    return generated_dict

generated_dict = {"English": [], "Indonesian": []}
for name in target_names:
    new_dict = replace_subject(merged_df, name, names_dict)
    generated_dict = {**generated_dict, **new_dict}

final_df = pd.DataFrame.from_dict(generated_dict)
print(final_df.shape)
final_df.head()

(1532, 2)


Unnamed: 0,English,Indonesian
0,sarah won .,sarah menang .
1,grab gary .,tangkap gary .
2,i hit johnny .,aku memukul johnny .
3,kim cried .,kim menangis .
4,who s lee ?,lee itu siapa ?


In [9]:
fp = open('./corpus/eng-indo-augmented.txt', 'w')

for idx, row in final_df.iterrows():
    fp.write(row["English"] + "\t" + row["Indonesian"] + "\n")