In [1]:
# Import

import tensorflow as tf
import numpy as np
import os
import re
import io
import pathlib
import unicodedata
import pickle

from sklearn.model_selection import train_test_split

In [2]:
# Notebook-wide configuration

# Upper bound on the number of sentence pairs read from the dataset file.
# Use None to read the whole file.
NUM_EXAMPLES = 120_000
# Fraction of the sentence pairs held out for validation
# (passed as `test_size` to train_test_split).
TRAIN_TEST_SPLIT = 0.2
# Absolute path of the directory the notebook is executed from;
# dataset input and output paths are built relative to it.
current_path = pathlib.Path.cwd()

In [3]:
# Location of the raw sentence-pair file, resolved relative to the
# directory the notebook runs from.
path_to_file = os.path.join(
    current_path,
    "dataset",
    "raw",
    "por-eng",
    "por.txt",
)

In [4]:
# Convert unicode to ascii character
# This removes accents
def unicode_to_ascii(s):
    """Strip accents from ``s``.

    The string is decomposed with Unicode NFD normalization so that each
    accented letter becomes a base character followed by combining marks;
    every character in category ``Mn`` (non-spacing mark) is then dropped.

    Args:
        s: The text to process.

    Returns:
        The NFD-normalized text without its non-spacing marks.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept_chars = []
    for char in decomposed:
        if unicodedata.category(char) == 'Mn':
            # Combining accent mark: skip it, keep only the base characters.
            continue
        kept_chars.append(char)
    return ''.join(kept_chars)

def preprocess_sentence(w):
    """Normalize one sentence and wrap it in start/end markers.

    Steps, in order:
      1. Lower-case, strip surrounding whitespace and remove accents
         (via ``unicode_to_ascii``).
      2. Put a space on both sides of each of ``? . ! , ¿`` so that
         punctuation becomes its own token,
         e.g. "he is a boy." => "he is a boy ."
         (see https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation).
      3. Collapse every run of spaces and double quotes into one space.
      4. Replace every run of characters other than a-z, A-Z and
         ``? . ! , ¿`` with one space.
      5. Strip again and surround the result with ``<start>`` / ``<end>``
         so that the model knows when to start and stop predicting.

    Args:
        w: The raw sentence.

    Returns:
        The cleaned sentence in the form ``'<start> ... <end>'``.
    """
    cleaned = unicode_to_ascii(w.lower().strip())

    # (pattern, replacement) pairs, applied in this exact order.
    substitutions = (
        (r"([?.!,¿])", r" \1 "),
        (r'[" "]+', " "),
        (r"[^a-zA-Z?.!,¿]+", " "),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return ''.join(('<start> ', cleaned.strip(), ' <end>'))

In [5]:
# 1. Remove the accents
# 2. Clean the sentences
# 3. Return word pairs in the format: [ENGLISH, PORTUGUESE]
def create_dataset(path, num_examples):
    """Read a tab-separated sentence-pair file and return cleaned columns.

    Each line is split on tab characters. The first two fields are cleaned
    with ``preprocess_sentence``; any further field on the line (the
    trailing attribution column) is ignored.

    Args:
        path: Path to the UTF-8 encoded dataset file.
        num_examples: Maximum number of lines to use, counted from the top
            of the file. ``None`` uses every line.

    Returns:
        An iterator (``zip``) that yields one tuple per column: first all
        cleaned first-field sentences, then all cleaned second-field
        sentences (word pairs are in the format [ENGLISH, PORTUGUESE]).

    Raises:
        IndexError: If a used line has fewer than two tab-separated fields.
    """
    # Use a context manager so the file handle is closed deterministically.
    with io.open(path, encoding='UTF-8') as dataset_file:
        lines = dataset_file.read().strip().split('\n')

    word_pairs = []
    for line in lines[:num_examples]:
        fields = line.split('\t')
        # Only the two sentence columns are needed; skipping the remaining
        # column avoids preprocessing text that is discarded anyway.
        word_pairs.append([preprocess_sentence(fields[0]),
                           preprocess_sentence(fields[1])])

    return zip(*word_pairs)

# Tokenizer
def tokenize(lang):
    """Fit a Keras tokenizer on ``lang`` and encode it as a padded array.

    The tokenizer is created with ``filters=''``, so no characters are
    filtered out of the (already preprocessed) sentences.

    Args:
        lang: Sequence of sentences. It is iterated twice (once to fit the
            tokenizer, once to encode), so it must not be a one-shot
            iterator.

    Returns:
        A ``(tensor, tokenizer)`` pair: the integer-encoded sentences,
        padded at the end (``padding='post'``), and the fitted tokenizer.
    """
    keras_preprocessing = tf.keras.preprocessing

    lang_tokenizer = keras_preprocessing.text.Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(lang)

    sequences = lang_tokenizer.texts_to_sequences(lang)
    padded = keras_preprocessing.sequence.pad_sequences(sequences, padding='post')

    return padded, lang_tokenizer

# Load dataset
def load_dataset(path, num_examples=None):
    """Load the sentence pairs from ``path`` and tokenize both languages.

    Args:
        path: Path to the dataset file, forwarded to ``create_dataset``.
        num_examples: Maximum number of lines to use; ``None`` (the
            default) uses the whole file.

    Returns:
        ``(input_tensor, target_tensor, inp_lang_tokenizer,
        targ_lang_tokenizer)``: the encoded input and target sentences
        followed by the tokenizer fitted on each of them.
    """
    # create_dataset returns the first file column first; that column is
    # used as the target language and the second column as the input.
    target_sentences, input_sentences = create_dataset(path, num_examples)

    input_tensor, inp_lang_tokenizer = tokenize(input_sentences)
    target_tensor, targ_lang_tokenizer = tokenize(target_sentences)

    return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

In [6]:
# Try experimenting with the size of that dataset
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(path_to_file, NUM_EXAMPLES)

# Fixed seed for the shuffled split: without it every run of the notebook
# writes a different train/validation partition to disk, so results of the
# training script could not be reproduced or compared between runs.
RANDOM_SEED = 42

# Hold out a TRAIN_TEST_SPLIT fraction of the pairs for validation and keep
# the rest for training; inputs and targets are split with the same indices.
(input_tensor_train, input_tensor_val,
 target_tensor_train, target_tensor_val) = train_test_split(
    input_tensor,
    target_tensor,
    test_size=TRAIN_TEST_SPLIT,
    random_state=RANDOM_SEED,
)

In [7]:
# Show how many examples ended up in each split
print("input_train", "\ttarget_train", "\tinput_val", "\ttarget_val")
split_sizes = [
    str(len(split))
    for split in (input_tensor_train, target_tensor_train, input_tensor_val, target_tensor_val)
]
# First size is printed as-is; the others are prefixed with two tabs so the
# numbers line up under the header row above.
print(split_sizes[0], *("\t\t" + size for size in split_sizes[1:]))

input_train 	target_train 	input_val 	target_val
96000 		96000 		24000 		24000


In [8]:
# Function that converts the tokenized vector into words
def convert(lang, tensor):
    """Print the word behind every non-zero token id in ``tensor``.

    Args:
        lang: Object exposing an ``index_word`` mapping from token id to
            word (here: a fitted tokenizer).
        tensor: Iterable of integer token ids; ids equal to 0 are skipped.

    Returns:
        None. One ``<id> ----> <word>`` line is printed per non-zero id.
    """
    # Zero entries are skipped: they have no word to show.
    nonzero_ids = (t for t in tensor if t != 0)
    for t in nonzero_ids:
        print(f'{t} ----> {lang.index_word[t]}')

In [9]:
# Decode the first training pair back into words as a sanity check of the
# tokenizers, then show the types of the objects saved further below.
first_input_example = input_tensor_train[0]
first_target_example = target_tensor_train[0]

print("Demonstration of index to word mapping")

print("Input Language")
convert(inp_lang, first_input_example)
print()
print("Target Language")
convert(targ_lang, first_target_example)

for inspected in (input_tensor_train, inp_lang):
    print(type(inspected))

Demonstration of index to word mapping
Input Language
1 ----> <start>
4 ----> tom
96 ----> parece
937 ----> diferente
12 ----> de
309 ----> algum
2629 ----> modo
3 ----> .
2 ----> <end>

Target Language
1 ----> <start>
5 ----> tom
274 ----> seems
630 ----> different
2856 ----> somehow
3 ----> .
2 ----> <end>
<class 'numpy.ndarray'>
<class 'keras_preprocessing.text.Tokenizer'>


In [10]:
# Save tokenized dataset for training script

save_path = os.path.join(current_path,"dataset","por-eng")
# exist_ok=True creates the directory only when it is missing, without a
# separate existence check beforehand.
os.makedirs(save_path, exist_ok=True)

# File name -> array to store. Each file holds the split it is named after;
# in particular "target_tensor_val.npy" must contain target_tensor_val, not
# the training targets.
arrays_to_save = {
    "input_tensor_train.npy": input_tensor_train,
    "input_tensor_val.npy": input_tensor_val,
    "target_tensor_train.npy": target_tensor_train,
    "target_tensor_val.npy": target_tensor_val,
}
for file_name, array in arrays_to_save.items():
    np.save(os.path.join(save_path, file_name), array)

# The fitted tokenizers are pickled next to the arrays so that the same
# word <-> index mappings are available when the arrays are loaded again.
tokenizers_to_save = {
    "inp_lang.tokenizer": inp_lang,
    "targ_lang.tokenizer": targ_lang,
}
for file_name, tokenizer in tokenizers_to_save.items():
    with open(os.path.join(save_path, file_name), 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)