# Imports

In [37]:
import numpy as np
import math
import re
import time
import zipfile
import random
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_datasets as tfds
from tqdm import tqdm

# Loading database

In [2]:
def load_db(file_path, encoding='utf-8'):
    """Read a whole text file into memory and return it as a single string.

    Args:
        file_path: path of the text file to read.
        encoding: text encoding used to decode the file. Defaults to
            ``'utf-8'`` so the result does not depend on the platform's
            locale default (the Portuguese corpus contains accented
            characters that a non-UTF-8 default would mis-decode or reject).

    Returns:
        str: the complete, decoded file content.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the content is not valid in ``encoding``.
    """
    with open(file_path, encoding=encoding) as f:
        return f.read()

# Load the English (.en) and Portuguese (.pt) files of the Europarl pt-en corpus
# as two raw strings; paths are relative to the notebook's working directory.
euro_en = load_db('pt-en/europarl-v7.pt-en.en')
euro_pt = load_db('pt-en/europarl-v7.pt-en.pt')

In [8]:
# Quick sanity check: show the first line of each corpus side by side.
print('en sample example: ', euro_en.split('\n')[0])
print('pt sample example: ', euro_pt.split('\n')[0])

en sample example:  Resumption of the session
pt sample example:  Reinício da sessão


# Data cleaning

In [9]:
def data_cleaning(data):
    """Normalise a raw corpus string and split it into lines.

    Two clean-ups are applied to the whole text before splitting:

    * a period that is immediately followed by an ASCII letter or digit is
      dropped (``a.m`` -> ``am``, ``3.5`` -> ``35``); a period followed by
      anything else (space, newline, end of text) is kept;
    * every run of spaces is collapsed to a single space.

    Args:
        data: the full corpus as one newline-separated string.

    Returns:
        list of str: one entry per line of ``data``. A trailing newline in
        ``data`` yields a final empty string.
    """
    # Remove the period directly instead of going through a '$$' placeholder:
    # the placeholder round-trip also deleted any literal '$$' already present
    # in the text.
    data = re.sub(r'\.(?=[0-9a-zA-Z])', '', data)
    data = re.sub(r' +', ' ', data)
    return data.split('\n')

# Clean both corpora; each result is a list with one entry per line of the raw text.
data_en = data_cleaning(data=euro_en)
data_pt = data_cleaning(data=euro_pt)

# Spot-check the entry at index 10 of each language.
print('Data en: ', data_en[10])
print('Data pt: ', data_pt[10])

Data en:  Would it be appropriate for you, Madam President, to write a letter to the Sri Lankan President expressing Parliament's regret at his and the other violent deaths in Sri Lanka and urging her to do everything she possibly can to seek a peaceful reconciliation to a very difficult situation?
Data pt:  Será que a senhora Presidente poderia enviar uma carta à Presidente do Sri Lanka manifestando o pesar do Parlamento por esta e outras mortes violentas perpetradas no seu país, e instando­a a envidar todos os esforços ao seu alcance para procurar obter uma reconciliação pacífica na situação extremamente difícil que ali se vive?


In [10]:
# Compare the number of lines on each side.
# NOTE(review): equal lengths are only printed here, not asserted.
print('en data size: {} | pt data size: {}'.format(len(data_en), len(data_pt)))

en data size: 1960408 | pt data size: 1960408


# Tokenization

In [11]:
def tokenizer_data(data, vocab_size=2**13):
    """Build a subword tokenizer from a corpus of sentences.

    Args:
        data: iterable of sentences (strings) used as the training corpus.
        vocab_size: target vocabulary size passed to the encoder builder as
            ``target_vocab_size``. The resulting ``vocab_size`` can differ
            from this target.

    Returns:
        The ``SubwordTextEncoder`` built from ``data``.
    """
    # Newer TensorFlow Datasets releases expose the text encoders under
    # `tfds.deprecated.text`; older ones only have `tfds.features.text`.
    # Try the new location first so the notebook runs on both.
    try:
        text_api = tfds.deprecated.text
    except AttributeError:
        text_api = tfds.features.text
    return text_api.SubwordTextEncoder.build_from_corpus(data, target_vocab_size=vocab_size)

# Build one subword tokenizer per language, each from its own cleaned corpus
# (default target vocabulary size).
tokenizer_en = tokenizer_data(data=data_en)
tokenizer_pt = tokenizer_data(data=data_pt)

# Report the vocabulary size each tokenizer actually ended up with.
print('En vocab size: ', tokenizer_en.vocab_size)
print('Pt vocab size: ', tokenizer_pt.vocab_size)

En vocab size:  8191
Pt vocab size:  8116


In [14]:
def token_start_end(data, tokenizer):
    """Encode every sentence and wrap it in start/end marker ids.

    The two ids just past the tokenizer's own vocabulary serve as markers:
    ``tokenizer.vocab_size`` marks the start of a sentence and
    ``tokenizer.vocab_size + 1`` marks its end.

    Args:
        data: iterable of sentences (strings).
        tokenizer: object exposing ``vocab_size`` and ``encode(sentence)``,
            where ``encode`` returns a list of token ids.

    Returns:
        list of list of int: one ``[start, *ids, end]`` list per sentence.
    """
    start_id = tokenizer.vocab_size
    end_id = start_id + 1

    wrapped = []
    for sentence in data:
        wrapped.append([start_id] + tokenizer.encode(sentence) + [end_id])
    return wrapped

# Encode both sides: English sentences are the model inputs, Portuguese the outputs.
inputs = token_start_end(data=data_en, tokenizer=tokenizer_en)
outputs = token_start_end(data=data_pt, tokenizer=tokenizer_pt)

# Preview the first five encoded sentences of each side, one per line.
# A plain print is used instead of a list comprehension so the cell does not
# end with a stray `[None, None, ...]` output value.
print(*inputs[:5], sep='\n')
print(*outputs[:5], sep='\n')

[8191, 2458, 972, 2108, 3, 1, 2571, 8192]
[8191, 11, 5645, 7093, 1, 2634, 3, 1, 23, 67, 2144, 4821, 50, 12, 6727, 7967, 3951, 1446, 1974, 2, 5, 11, 33, 57, 414, 397, 4, 311, 55, 7, 1684, 79, 323, 6, 1, 210, 8, 55, 4288, 43, 7, 5242, 1654, 7697, 742, 1765, 7981, 8192]
[8191, 944, 2008, 2, 21, 55, 26, 18, 1436, 8045, 2, 1, 3263, 3127, 8035, 186, 1756, 2997, 7472, 493, 1691, 8038, 90, 2222, 4, 4498, 3815, 2, 175, 1, 102, 6, 7, 214, 3, 103, 4142, 7, 2174, 3, 1035, 4956, 8, 1995, 117, 3263, 2144, 2230, 7981, 8192]
[8191, 554, 18, 3508, 7, 179, 12, 16, 435, 6, 1, 454, 3, 1, 274, 319, 3313, 2, 353, 16, 679, 7980, 2571, 7981, 8192]
[8191, 62, 1, 7496, 2, 11, 35, 57, 4, 7157, 7, 4732, 90, 13, 7835, 8036, 2, 21, 7, 214, 3, 321, 18, 2628, 93, 2, 12, 264, 3, 40, 1, 1275, 587, 2, 235, 99, 3, 1, 5218, 2261, 3500, 2, 6, 1, 417, 103, 3, 1, 23, 113, 7981, 8192]
[8116, 834, 705, 7, 3561, 8117]
[8116, 4808, 1981, 6, 7004, 2487, 3, 3, 1192, 8, 50, 236, 1, 4, 1109, 218, 5803, 3997, 1152, 22, 7016, 7957, 79

[None, None, None, None, None]

Removing sentences longer than 15 tokens

In [38]:
def remove_longer_sentense(data, max_length=15, parallel=None):
    """Delete, in place, every position whose sentence in ``data`` is too long.

    A position is removed when ``len(data[idx]) > max_length``. The same
    position is removed from every affected list so that aligned lists
    (source/target pairs) stay aligned.

    Args:
        data: list of encoded sentences that decides which positions to drop.
        max_length: longest sentence length (in items) that is kept.
        parallel: optional iterable of lists aligned with ``data``. When
            given, positions are deleted from ``data`` and from each of these
            lists. When omitted, the legacy behaviour is kept: positions are
            deleted from the notebook-level ``inputs`` and ``outputs`` lists.

    Returns:
        None. The lists are modified in place.
    """
    if parallel is None:
        # Backward-compatible default: existing callers pass only `data` and
        # rely on the global `inputs`/`outputs` pair being pruned together.
        targets = [inputs, outputs]
    else:
        targets = [data, *parallel]

    # De-duplicate by identity so a list that appears twice (for example
    # `data` also listed in `parallel`) is pruned only once.
    unique_targets = []
    for seq in targets:
        if not any(seq is seen for seen in unique_targets):
            unique_targets.append(seq)

    idx_to_remove = [idx for idx, sentense in enumerate(data) if len(sentense) > max_length]

    # Delete from the highest index down so earlier deletions do not shift the
    # positions still waiting to be removed. `total` lets tqdm show progress
    # as a fraction, which it cannot infer from a `reversed` iterator.
    for idx in tqdm(reversed(idx_to_remove), total=len(idx_to_remove)):
        for seq in unique_targets:
            del seq[idx]

# Drop pairs whose English side is too long, then pairs whose Portuguese side
# is too long. Each call deletes the offending positions from BOTH `inputs`
# and `outputs`, so the two lists stay aligned.
remove_longer_sentense(data=inputs)
remove_longer_sentense(data=outputs)

# Both lengths should match after pruning.
print('len inputs: {} | len outputs {}'.format(len(inputs), len(outputs)))

0it [00:00, ?it/s]
66118it [00:09, 6859.60it/s]  

len inputs: 208990 | len outputs 208990



