# Imports

In [11]:
import numpy as np
import math
import re
import time
import zipfile
import random
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_datasets as tfds
from tqdm import tqdm

# Loading database

In [12]:
def load_db(file_path):
    with open(file_path) as f:
        return f.read()

euro_en = load_db('pt-en/europarl-v7.pt-en.en')
euro_pt = load_db('pt-en/europarl-v7.pt-en.pt')

In [13]:
print('en sample example: ', euro_en.split('\n')[0])
print('pt sample example: ', euro_pt.split('\n')[0])

en sample example:  Resumption of the session
pt sample example:  Reinício da sessão


# Data cleaning

In [14]:
def data_cleaning(data):
    data = re.sub(r'\.(?=[0-9]|[a-z]|[A-Z])', '$$', data)
    data = re.sub(r'\$\$', '', data)
    data = re.sub(r' +', ' ', data)
    return data.split('\n')

data_en = data_cleaning(data=euro_en)
data_pt = data_cleaning(data=euro_pt)

print('Data en: ', data_en[10])
print('Data pt: ', data_pt[10])

Data en:  Would it be appropriate for you, Madam President, to write a letter to the Sri Lankan President expressing Parliament's regret at his and the other violent deaths in Sri Lanka and urging her to do everything she possibly can to seek a peaceful reconciliation to a very difficult situation?
Data pt:  Será que a senhora Presidente poderia enviar uma carta à Presidente do Sri Lanka manifestando o pesar do Parlamento por esta e outras mortes violentas perpetradas no seu país, e instando­a a envidar todos os esforços ao seu alcance para procurar obter uma reconciliação pacífica na situação extremamente difícil que ali se vive?


In [15]:
print('en data size: {} | pt data size: {}'.format(len(data_en), len(data_pt)))

en data size: 1960408 | pt data size: 1960408


# Tokenization

In [16]:
def tokenizer_data(data, vocab_size=2**13):
    return tfds.features.text.SubwordTextEncoder.build_from_corpus(data, target_vocab_size=vocab_size)

tokenizer_en = tokenizer_data(data=data_en)
tokenizer_pt = tokenizer_data(data=data_pt)

print('En vocab size: ', tokenizer_en.vocab_size)
print('Pt vocab size: ', tokenizer_pt.vocab_size)

En vocab size:  8191
Pt vocab size:  8116


In [17]:
def token_start_end(data, tokenizer):
    vocab_size = tokenizer.vocab_size + 2
    # adding start and end token in each setense
    return [[vocab_size - 2] + tokenizer.encode(sentense) + [vocab_size - 1] for sentense in data]

inputs = token_start_end(data=data_en, tokenizer=tokenizer_en)
outputs = token_start_end(data=data_pt, tokenizer=tokenizer_pt)

print('Input example: ', inputs[0])
print('Output example: ', outputs[0])

Input example:  [8191, 2458, 972, 2108, 3, 1, 2571, 8192]
Output example:  [8116, 834, 705, 7, 3561, 8117]


removing setense longer than 15 

In [18]:
def remove_longer_sentense(data, max_length=15):
    idx_to_remove = [idx for idx, sentense in enumerate(data) if len(sentense) > max_length]

    for idx in tqdm(reversed(idx_to_remove)):
        # remove the same setense from the data
        del inputs[idx]
        del outputs[idx]

remove_longer_sentense(data=inputs)
remove_longer_sentense(data=outputs)

print('len inputs: {} | len outputs {}'.format(len(inputs), len(outputs)))

1685300it [06:45, 4152.61it/s] 
66118it [00:09, 6720.51it/s]  

len inputs: 208990 | len outputs 208990





padding sentenses 

In [19]:
def padding_sequences(data, max_length):
    return tf.keras.preprocessing.sequence.pad_sequences(sequences=data, value=0, padding='post', maxlen=max_length)

inputs = padding_sequences(data=inputs, max_length=15)
outputs = padding_sequences(data=outputs, max_length=15)

print('Input padded sequences example: ', inputs[0])
print('Output padded sequences example: ', outputs[0])

Input padded sequences example:  [8191 2458  972 2108    3    1 2571 8192    0    0    0    0    0    0
    0]
Output padded sequences example:  [8116  834  705    7 3561 8117    0    0    0    0    0    0    0    0
    0]


creating final dataset with tf optimization

In [20]:
batch_size = 64
buffer_size = 20000

dataset = tf.data.Dataset.from_tensor_slices(tensors=(inputs, outputs))
dataset = dataset.cache()
dataset = dataset.shuffle(buffer_size=buffer_size).batch(batch_size=batch_size)
dataset = dataset.prefetch(tf.data.AUTOTUNE)