In [1]:
# We need to bootstrap the dataset with a simple rules:
#
# * Since we know that Tom and Mary is a very prominent words, 
#   we can reduce the importance of those words by replicating 
#   the sentence with different subject (Tom -> Harry, John, etc)

In [2]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import os

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.autograd import Variable

import pandas as pd
from gensim.corpora.dictionary import Dictionary
from nltk import word_tokenize

In [3]:
fp = open('./corpus/eng-indo.txt', 'r')

text = fp.read()
text = text.splitlines()
fp.close()

# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters


def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

text_dict = {"English": [], "Indonesian": []}
for l in text:
    split_text = l.split("\t")
    text_dict["English"].append(normalizeString(split_text[0]))
    text_dict["Indonesian"].append(normalizeString(split_text[1]))

df = pd.DataFrame.from_dict(text_dict)
print(df.shape)
df.head()

(6752, 2)


Unnamed: 0,English,Indonesian
0,run !,lari !
1,who ?,siapa ?
2,wow !,wow !
3,help !,tolong !
4,jump !,lompat !


In [4]:
import random
import re

names_dict = {"john":"john", "johnny":"johnny", "darren": "darren", "mommy":"ibu", "daddy": "ayah", 
              "gary": "gary", "jack": "jack", "ken": "ken", "michael": "michael", "sarah": "sarah", 
              "billy": "billy", "maria": "maria", "lim": "lim", "david": "david", "mia": "mia",
              "nancy": "nancy", "vijay": "vijay", "carrie": "carrie", "lee": "lee", "kim": "kim"}

def duplicate_sentences(dataframe, subject_name, names_dict):
    # replace tom with other subjects
    generated_dict = {"English": [], "Indonesian": []}
    filtered_df = dataframe[dataframe['English'].str.contains(subject_name)]
    
    for idx, row in filtered_df.iterrows():
        for idx in range(6):
            new_subject = random.choice(list(names_dict.keys()))
            new_subject_translated = names_dict[new_subject]
            
            eng = row["English"]
            ind = row["Indonesian"]

            match_eng = re.search(r'\b{}\b'.format(subject_name), eng)
            match_ind = re.search(r'\b{}\b'.format(subject_name), ind)

            if match_eng and match_ind:
                modified_eng = re.sub(r'\b{}\b'.format(subject_name), new_subject, eng)
                modified_ind = re.sub(r'\b{}\b'.format(subject_name), new_subject_translated, ind)
                
                generated_dict["English"].append(modified_eng)
                generated_dict["Indonesian"].append(modified_ind)

    return generated_dict
    
generated_tom_dict = duplicate_sentences(df, "tom", names_dict)
tom_df = pd.DataFrame.from_dict(generated_tom_dict)

generated_mary_dict = duplicate_sentences(df, "mary", names_dict)
mary_df = pd.DataFrame.from_dict(generated_mary_dict)

In [5]:
tom_df.head()

Unnamed: 0,English,Indonesian
0,lim won .,lim menang .
1,nancy won .,nancy menang .
2,lee won .,lee menang .
3,lim won .,lim menang .
4,david won .,david menang .


In [6]:
mary_df.head()

Unnamed: 0,English,Indonesian
0,vijay is graceful .,vijay adalah seorang yang anggun .
1,david is graceful .,david adalah seorang yang anggun .
2,mia is graceful .,mia adalah seorang yang anggun .
3,darren is graceful .,darren adalah seorang yang anggun .
4,ken is graceful .,ken adalah seorang yang anggun .


In [7]:
final_df = pd.concat([df, tom_df, mary_df])
final_df.head(5)

Unnamed: 0,English,Indonesian
0,run !,lari !
1,who ?,siapa ?
2,wow !,wow !
3,help !,tolong !
4,jump !,lompat !


In [8]:
fp = open('./corpus/eng-indo-augmented.txt', 'w')

for idx, row in final_df.iterrows():
    fp.write(row["English"] + "\t" + row["Indonesian"] + "\n")