In [1]:
import transformer

In [13]:
import io
import os
import re
import time
import numpy
import tensorflow
import unicodedata
from sklearn import model_selection
from tensorflow import losses, optimizers, initializers
from tensorflow.keras import layers, models, preprocessing, utils

In [3]:
# Enable memory growth on every visible GPU (see the TensorFlow
# `set_memory_growth` documentation for the exact allocation semantics).
try:
    for device in tensorflow.config.experimental.list_physical_devices("GPU"):
        tensorflow.config.experimental.set_memory_growth(device, True)
except Exception as error:
    # Best effort: keep the notebook running, but report why the setting
    # could not be applied. Catching Exception (not a bare except) lets
    # KeyboardInterrupt / SystemExit propagate.
    print("Failed on enabling dynamic memory allocation on GPU devices!", error)

In [4]:
def unicode_to_ascii(s):
    """Return ``s`` with combining accent marks removed.

    The text is decomposed with Unicode NFD normalisation and every character
    whose Unicode category is "Mn" (nonspacing mark) is dropped. All other
    characters are kept as they are, so the result is not guaranteed to be
    pure ASCII despite the function name.
    """
    decomposed = unicodedata.normalize("NFD", s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != "Mn"]
    return "".join(kept)

In [5]:
def preprocess_sentence(w):
    """Normalise a raw sentence and wrap it in ``<start>`` / ``<end>`` markers.

    The sentence is lower-cased, stripped and passed through
    ``unicode_to_ascii``; punctuation is split off into separate tokens;
    everything that is not an ASCII letter or one of ``? . ! ,`` / the
    inverted question mark (U+00BF) is replaced by a space.

    Args:
        w: The raw sentence as a ``str``.

    Returns:
        The cleaned sentence, prefixed with ``"<start> "`` and suffixed with
        ``" <end>"``.
    """
    w = unicode_to_ascii(w.lower().strip())
    # Put a space on both sides of each punctuation mark so it becomes its own
    # token, e.g. "he is a boy." => "he is a boy . ".
    # \u00bf is the inverted question mark; it is written as an escape so the
    # pattern cannot be corrupted by a wrong file encoding again.
    w = re.sub(r"([?.!,\u00bf])", r" \1 ", w)
    # Collapse runs of spaces (and double quotes) into a single space.
    w = re.sub(r'[" "]+', " ", w)
    # Replace everything except letters and the punctuation above with a space.
    w = re.sub(r"[^a-zA-Z?.!,\u00bf]+", " ", w)
    w = w.strip()
    return "<start> " + w + " <end>"

In [6]:
# Sanity check of preprocess_sentence on an English and a Portuguese sample.
en_sentence = "Excuse me, may I borrow this book of Willian Shakespeare?"
pt_sentence = "Olá, posso pegar emprestado esse livro de Willian Shakespeare?"
# The Portuguese result is shown as UTF-8 bytes, the English one as text.
print(
    preprocess_sentence(en_sentence),
    preprocess_sentence(pt_sentence).encode("utf-8"),
    sep="\n",
)

<start> excuse me , may i borrow this book of willian shakespeare ? <end>
b'<start> ola , posso pegar emprestado esse livro de willian shakespeare ? <end>'


In [7]:
def create_dataset(path, num_examples):
    """Read a tab-separated parallel corpus and return its cleaned columns.

    Each line of the file is split on tabs and every field is passed through
    ``preprocess_sentence``.

    Args:
        path: Path of the UTF-8 text file to read.
        num_examples: Maximum number of leading lines to use; ``None`` uses
            every line (it is applied as a slice bound).

    Returns:
        A ``zip`` iterator that yields one tuple per tab-separated column,
        each holding the cleaned sentences of that column in file order.
    """
    # Use a context manager so the file handle is closed deterministically.
    with io.open(path, encoding="UTF-8") as corpus:
        lines = corpus.read().strip().split('\n')
    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')] for l in lines[:num_examples]]
    return zip(*word_pairs)

In [8]:
# Download (or reuse the cached copy of) the Spanish-English corpus over HTTPS.
path_to_zip = utils.get_file(
    "spa-eng.zip",
    origin="https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip",
    extract=True,
)
# Build the path with os.path.join instead of a hard-coded "/" separator.
path_to_file = os.path.join(os.path.dirname(path_to_zip), "spa-eng", "spa.txt")
# num_examples=None loads every line; show the last pair as a sanity check.
en, sp = create_dataset(path_to_file, None)
print(en[-1])
print(sp[-1])

<start> if you want to sound like a native speaker , you must be willing to practice saying the same sentence over and over in the same way that banjo players practice the same phrase over and over until they can play it correctly and at the desired tempo . <end>
<start> si quieres sonar como un hablante nativo , debes estar dispuesto a practicar diciendo la misma frase una y otra vez de la misma manera en que un musico de banjo practica el mismo fraseo una y otra vez hasta que lo puedan tocar correctamente y en el tiempo esperado . <end>


In [9]:
def max_length(tensor):
    """Return the length of the longest element of ``tensor``.

    ``tensor`` is any iterable of sized items (e.g. a list of token-id
    sequences). An empty iterable raises ``ValueError`` from ``max``.
    """
    return max(map(len, tensor))

In [10]:
def tokenize(lang):
    """Fit a Keras tokenizer on ``lang`` and return the padded id sequences.

    Args:
        lang: A re-iterable collection of sentences; it is traversed twice,
            once to fit the tokenizer and once to convert to sequences.

    Returns:
        A ``(tensor, tokenizer)`` pair: the output of ``pad_sequences`` with
        ``padding="post"`` and the fitted ``Tokenizer``.
    """
    # filters='' disables the tokenizer's default character filtering.
    tokenizer = preprocessing.text.Tokenizer(filters='')
    tokenizer.fit_on_texts(lang)
    padded = preprocessing.sequence.pad_sequences(
        tokenizer.texts_to_sequences(lang),
        padding="post",
    )
    return padded, tokenizer

In [11]:
def load_dataset(path, num_examples=None):
    """Load the corpus at ``path`` and tokenize both of its columns.

    Args:
        path: Path of the tab-separated corpus file.
        num_examples: Optional cap on the number of lines read; ``None``
            reads them all.

    Returns:
        ``(input_tensor, target_tensor, input_tokenizer, target_tokenizer)``.
        The first column of the file is used as the target side and the
        second column as the input side.
    """
    # Column order in the file is (target, input).
    target_sentences, source_sentences = create_dataset(path, num_examples)
    source_ids, source_tokenizer = tokenize(source_sentences)
    target_ids, target_tokenizer = tokenize(target_sentences)
    return source_ids, target_ids, source_tokenizer, target_tokenizer

In [14]:
# Number of sentence pairs to use
num_examples = 30000
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(path_to_file, num_examples)
# Longest sequence length on the target side and on the input side
max_length_targ = max_length(target_tensor)
max_length_inp = max_length(input_tensor)
# Hold out 33% of the pairs for validation (test_size=0.33). No random_state
# is passed, so the split differs from run to run.
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = model_selection.train_test_split(
    input_tensor,
    target_tensor,
    test_size=0.33,
)
# Show the size of each split
print(
    len(input_tensor_train),
    len(target_tensor_train),
    len(input_tensor_val),
    len(target_tensor_val),
)

20100 20100 9900 9900


In [15]:
def convert(lang, tensor):
    """Print an ``id ----> word`` line for every non-zero id in ``tensor``.

    Args:
        lang: Object exposing an ``index_word`` mapping from id to word.
        tensor: Iterable of integer token ids; zeros are skipped.
    """
    for token_id in (t for t in tensor if t != 0):
        print("%d ----> %s" % (token_id, lang.index_word[token_id]))

In [16]:
# Decode the first training pair back to words, one "id ----> word" line per
# non-zero token: the input sentence first, then the target sentence.
print("Input Language; index to word mapping")
convert(inp_lang, input_tensor_train[0])
# Blank line between the two listings.
print()
print("Target Language; index to word mapping")
convert(targ_lang, target_tensor_train[0])

Input Language; index to word mapping
1 ----> <start>
12 ----> me
2736 ----> despidieron
3 ----> .
2 ----> <end>

Target Language; index to word mapping
1 ----> <start>
4 ----> i
26 ----> was
502 ----> fired
3 ----> .
2 ----> <end>


In [19]:
# Model / training hyper-parameters.
batch_size = 64
embedding_dim = 256
units = 1024
# Vocabulary sizes: one more than the number of words each tokenizer knows
# (presumably to leave room for the padding id 0 - confirm).
vocab_inp_size = 1 + len(inp_lang.word_index)
vocab_tar_size = 1 + len(targ_lang.word_index)

In [18]:
def Transformer(vocabulary_size, embedding_dimensions, encoding_units, decoding_units, recurrent_layer):
    """Assemble an encoder-decoder ``Sequential`` model and return it.

    Args:
        vocabulary_size: Forwarded to ``transformer.Encoder``.
        embedding_dimensions: Forwarded to ``transformer.Encoder``.
        encoding_units: Forwarded to ``transformer.Encoder``.
        decoding_units: Currently unused by this function.
        recurrent_layer: Forwarded to ``transformer.Encoder``.

    Returns:
        The ``models.Sequential`` instance holding the encoder followed by
        the decoder.

    Note:
        The encoder also receives the notebook-level ``batch_size`` variable,
        which must be defined before this function is called.
    """
    T = models.Sequential()
    T.add(transformer.Encoder(vocabulary_size, embedding_dimensions, encoding_units, batch_size, recurrent_layer))
    # NOTE(review): the decoder is built without arguments and decoding_units
    # is never used - confirm against transformer.Decoder's signature.
    T.add(transformer.Decoder())
    # Return the assembled model; without this the caller receives None.
    return T

array([[   1,   12, 2736, ...,    0,    0,    0],
       [   1,    6,    7, ...,    0,    0,    0],
       [   1,   25,   12, ...,    0,    0,    0],
       ...,
       [   1,   25,   16, ...,    0,    0,    0],
       [   1,   90,   88, ...,    0,    0,    0],
       [   1,   88,  154, ...,    0,    0,    0]])