In [47]:
import tensorflow as tf
from tensorflow.keras.layers import Dot, Activation, Reshape, Dense, Embedding, LSTM, GRU, Dropout, BatchNormalization, RepeatVector, TimeDistributed, Bidirectional
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model, Sequential
from tensorflow.keras import Model, Input
from unicodedata import normalize
from random import shuffle
import numpy as np
import string
import re

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [3]:
file = open('spa.txt', mode='rt', encoding='utf-8')
text = file.read()
file.close()

REFERENCE

Par de senteças - http://www.manythings.org/anki/

# Spliting text

In [4]:
lines = text.strip().split('\n')
text_split = [line.split('\t') for line in  lines]
text_split[:10]

[['Go.', 'Ve.'],
 ['Go.', 'Vete.'],
 ['Go.', 'Vaya.'],
 ['Go.', 'Váyase.'],
 ['Hi.', 'Hola.'],
 ['Run!', '¡Corre!'],
 ['Run.', 'Corred.'],
 ['Who?', '¿Quién?'],
 ['Wow!', '¡Órale!'],
 ['Fire!', '¡Fuego!']]

In [5]:
def clean_text(lines):   
    text_cleaned = []
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    table = str.maketrans('', '', string.punctuation)
    
    for pair in lines:
        clean_pair = list()
        for line in pair:
            # normalize unicode characters
            line = normalize('NFD', line).encode('ascii', 'ignore')
            line = line.decode('UTF-8')
            # tokenize on white space
            line = line.split()
            # convert to lowercase
            line = [word.lower() for word in line]
            # remove punctuation from each token
            line = [word.translate(table) for word in line]
            # remove non-printable chars form each token
            line = [re_print.sub('', w) for w in line]
            # remove tokens with numbers in them
            line = [word for word in line if word.isalpha()]
            clean_pair.append(' '.join(line))
        text_cleaned.append(clean_pair)
    
    return np.array(text_cleaned)

In [6]:
text_cleaned = clean_text(text_split)
text_cleaned[:30]

array([['go', 've'],
       ['go', 'vete'],
       ['go', 'vaya'],
       ['go', 'vayase'],
       ['hi', 'hola'],
       ['run', 'corre'],
       ['run', 'corred'],
       ['who', 'quien'],
       ['wow', 'orale'],
       ['fire', 'fuego'],
       ['fire', 'incendio'],
       ['fire', 'disparad'],
       ['help', 'ayuda'],
       ['help', 'socorro auxilio'],
       ['help', 'auxilio'],
       ['jump', 'salta'],
       ['jump', 'salte'],
       ['stop', 'parad'],
       ['stop', 'para'],
       ['stop', 'pare'],
       ['wait', 'espera'],
       ['wait', 'esperen'],
       ['go on', 'continua'],
       ['go on', 'continue'],
       ['hello', 'hola'],
       ['i ran', 'corri'],
       ['i ran', 'corria'],
       ['i try', 'lo intento'],
       ['i won', 'he ganado'],
       ['oh no', 'oh no']], dtype='<U328')

In [7]:
text_cleaned[:-10]

array([['go', 've'],
       ['go', 'vete'],
       ['go', 'vaya'],
       ...,
       ['you cant easily put photos on an ipad from more than one computer however you can email photos to yourself from various computers and download these photos to your ipad',
        'usted no puede poner facilmente fotografias en un ipad de mas de una computadora sin embargo puede enviarse fotografias de varias computadoras y descargarlas a su ipad'],
       ['you cant view flash content on an ipad however you can easily email yourself the urls of these web pages and view that content on your regular computer when you get home',
        'usted no puede ver contenido flash en un ipad sin embargo puede enviarse el url de esas paginas web y ver el contenido en su computadora ordinaria al llegar a su hogar'],
       ['you cant view flash content on an ipad however you can easily email yourself the urls of these web pages and view that content on your regular computer when you get home',
        'no puedes 

In [8]:
text_cleaned.shape

(119936, 2)

# Creating Training/Testing datasets:

In [9]:
n_setences = 20000
dataset = text_cleaned[:n_setences, :]
shuffle(dataset)
train, test = dataset[:18000], dataset[18000:]

In [10]:
dataset[0]

array(['go', 've'], dtype='<U328')

# Preprocessing the data:

In [11]:
def create_tokenizer(lines):
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(lines)
  return tokenizer

In [12]:
def max_length(lines):
  return max(len(line.split()) for line in lines)

# English Tokenizer

In [13]:
eng_tokenizer = create_tokenizer(dataset[:,0])
eng_vocab = len(eng_tokenizer.word_index) + 1
eng_len = max_length(dataset[:, 0])
print('English Vocabulary Size:',eng_vocab)
print('English Max Length',eng_len)

English Vocabulary Size: 2609
English Max Length 6


# Spanish Tokenizer

In [14]:
spa_tokenizer = create_tokenizer(dataset[:,1])
spa_vocab = len(spa_tokenizer.word_index) + 1
spa_len = max_length(dataset[:,1])
print('Spanish Vocabulary Size:',spa_vocab)
print('Spanish Max Length',spa_len)

Spanish Vocabulary Size: 4763
Spanish Max Length 10


# Creating Datasets:

In [15]:
def encode_sequences(tokenizer, length, lines):
    X = tokenizer.texts_to_sequences(lines)
    X = pad_sequences(X, maxlen=length, padding='post')
    return X

In [16]:
def encode_output(sequences, vocab_size):
    y_list = []
    for sequence in sequences:
        encoded = to_categorical(sequence, num_classes=vocab_size)
        y_list.append(encoded)
    y = np.array(y_list)
    y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
    return y

In [17]:
# Training data
train_X = encode_sequences(spa_tokenizer, spa_len, train[:, 1])
train_Y = encode_sequences(eng_tokenizer, eng_len, train[:, 0])
train_Y = encode_output(train_Y, eng_vocab)

In [29]:
# Test data
test_X = encode_sequences(spa_tokenizer, spa_len, test[:, 1])
test_Y = encode_sequences(eng_tokenizer, eng_len, test[:, 0])
test_Y = encode_output(test_Y, eng_vocab)

In [19]:
test_X[0]

array([  5,   4,  52, 145,   0,   0,   0,   0,   0,   0], dtype=int32)

In [20]:
test_Y[0]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [21]:
test_Y[0].shape

(6, 2609)

# Creating the Model

In [22]:
embeed_size = 150

In [27]:
model = Sequential()
model.add(Embedding(spa_vocab, embeed_size, input_length=spa_len))
model.add(BatchNormalization())
model.add(Bidirectional(LSTM(300)))
model.add(BatchNormalization())
model.add(RepeatVector(eng_len)) # meio que aqui eu vou repetir pelo vetor máximo de palavras.
model.add(Bidirectional(LSTM(300, return_sequences=True)))
model.add(BatchNormalization())
model.add(TimeDistributed(Dense(eng_vocab, activation='softmax')))

In [28]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 10, 150)           714450    
_________________________________________________________________
batch_normalization (BatchNo (None, 10, 150)           600       
_________________________________________________________________
bidirectional (Bidirectional (None, 600)               1082400   
_________________________________________________________________
batch_normalization_1 (Batch (None, 600)               2400      
_________________________________________________________________
repeat_vector (RepeatVector) (None, 6, 600)            0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 6, 600)            2162400   
_________________________________________________________________
batch_normalization_2 (Batch (None, 6, 600)           

In [32]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), loss='categorical_crossentropy',metrics=['accuracy'])

In [34]:
filename = 'model_translation.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True,mode='min')

In [35]:
%%time
model.fit(train_X, train_Y, epochs=200, batch_size=128, validation_data=(test_X, test_Y), callbacks=[checkpoint])

Epoch 1/200

Epoch 00001: val_loss improved from inf to 11.25198, saving model to model_translation.h5
Epoch 2/200

Epoch 00002: val_loss did not improve from 11.25198
Epoch 3/200

Epoch 00003: val_loss did not improve from 11.25198
Epoch 4/200

Epoch 00004: val_loss did not improve from 11.25198
Epoch 5/200

Epoch 00005: val_loss did not improve from 11.25198
Epoch 6/200

Epoch 00006: val_loss improved from 11.25198 to 5.98366, saving model to model_translation.h5
Epoch 7/200

Epoch 00007: val_loss improved from 5.98366 to 1.56813, saving model to model_translation.h5
Epoch 8/200

Epoch 00008: val_loss did not improve from 1.56813
Epoch 9/200

Epoch 00009: val_loss did not improve from 1.56813
Epoch 10/200

Epoch 00010: val_loss did not improve from 1.56813
Epoch 11/200

Epoch 00011: val_loss did not improve from 1.56813
Epoch 12/200

Epoch 00012: val_loss did not improve from 1.56813
Epoch 13/200

Epoch 00013: val_loss did not improve from 1.56813
Epoch 14/200

Epoch 00014: val_loss 

KeyboardInterrupt: ignored

# Predicting Text with Test

In [36]:
# Duas funções para transformar categoricas em palavras
def word_int(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

def predict_sequence(model, tokenizer, value):
    prediction = model.predict(value, verbose=0)[0]
    integers = [np.argmax(vector) for vector in prediction]
    target = []
    for i in integers:
        word = word_int(i, tokenizer)
        if word is None:
            break
        target.append(word)
    return ' '.join(target)

In [40]:
actual, predicted = [], []
for i, value in enumerate(test_X[:50]):
        value = value.reshape((1, value.shape[0]))
        translation = predict_sequence(model, eng_tokenizer, value)
        _target, _src = test[i]
        #if i < 20:
        print('src=[%s], target=[%s], prediction=[%s]' % (_src, _target, translation))
        actual.append(_target.split())
        predicted.append(translation.split())

src=[el no fue alli], target=[he didnt go there], prediction=[he wasnt there]
src=[estas borracho], target=[youre drunk], prediction=[youre drunk]
src=[como va todo], target=[how is everything], prediction=[how is]
src=[dios existe], target=[god exists], prediction=[god exists]
src=[hagamoslo], target=[lets do it], prediction=[lets do it]
src=[quiero comermelo], target=[i want to eat it], prediction=[i want to]
src=[dejanos], target=[leave us], prediction=[leave us]
src=[ya has votado], target=[did you vote yet], prediction=[you already]
src=[cualquier libro servira], target=[any book will do], prediction=[any book will do]
src=[sabiamos eso], target=[we knew that], prediction=[we knew that]
src=[cierra la caja fuerte], target=[close the safe], prediction=[close the breath]
src=[esta en mi bolsillo], target=[its in my pocket], prediction=[its in my pocket]
src=[que significa], target=[what does it mean], prediction=[what about]
src=[aquel hombre es tomas], target=[that man is tom], pre

# Testando com Attention

In [None]:
seq_input = Input(shape=(max_len,), dtype='int32')
embedded = Embedding(vocab_size,
                     embedding_dim,
                     input_length=max_len)(seq_input)
embedded = Dropout(0.2)(embedded)
lstm = Bidirectional(LSTM(embedding_dim, return_sequences=True))(embedded)
lstm = Dropout(0.2)(lstm)
# Attention Mechanism
att_vector = TimeDistributed(Dense(1))(lstm)
att_vector = Reshape((max_len,))(att_vector)
att_vector = Activation('softmax', name='attention_vec')(att_vector)
att_output = Dot(axes=1)([lstm, att_vector])
# Final Layers
fc = Dense(embedding_dim, activation='relu')(att_output)
output = Dense(len(label2id), activation='softmax')(fc)

model = Model(inputs=[seq_input], outputs=output)

In [69]:
spa_len, spa_vocab, embeed_size, eng_len, eng_vocab

(10, 4763, 150, 6, 2609)

In [72]:
seq_input = Input(shape=(spa_len,), dtype='int32')
embedded = Embedding(spa_vocab, embeed_size)(seq_input)
lstm = Bidirectional(LSTM(embeed_size, return_sequences=True))(embedded)
batchNorm = BatchNormalization()(lstm)
lstm_ = Dropout(0.2)(batchNorm)
# Attention Mechanism
att_vector = TimeDistributed(Dense(1))(lstm_)
att_vector2 = Reshape((spa_len,))(att_vector)
att_vector3 = Activation('softmax', name='attention_vec')(att_vector2)
att_output = Dot(axes=1)([lstm_, att_vector3])
# Final Layers
repeat_vector = RepeatVector(eng_len)(att_output)
lstm2 = Bidirectional(LSTM(embeed_size, return_sequences=True))(repeat_vector)
batchNorm2 = BatchNormalization()(lstm2)
lstm2_ = Dropout(0.2)(batchNorm2)
output = TimeDistributed(Dense(eng_vocab, activation='softmax'))(lstm2_)

modelAtt = Model(inputs=[seq_input], outputs=output)

In [73]:
modelAtt.summary()

Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_12 (InputLayer)           [(None, 10)]         0                                            
__________________________________________________________________________________________________
embedding_12 (Embedding)        (None, 10, 150)      714450      input_12[0][0]                   
__________________________________________________________________________________________________
bidirectional_19 (Bidirectional (None, 10, 300)      361200      embedding_12[0][0]               
__________________________________________________________________________________________________
batch_normalization_20 (BatchNo (None, 10, 300)      1200        bidirectional_19[0][0]           
____________________________________________________________________________________________

In [74]:
modelAtt.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), loss='categorical_crossentropy',metrics=['accuracy'])

In [75]:
filename = 'model_translation_Attention.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True,mode='min')

In [76]:
%%time
modelAtt.fit(train_X, train_Y, epochs=50, batch_size=128, validation_data=(test_X, test_Y), callbacks=[checkpoint])

Epoch 1/50

Epoch 00001: val_loss improved from inf to 7.02173, saving model to model_translation_Attention.h5
Epoch 2/50

Epoch 00002: val_loss improved from 7.02173 to 6.52215, saving model to model_translation_Attention.h5
Epoch 3/50

Epoch 00003: val_loss improved from 6.52215 to 3.57109, saving model to model_translation_Attention.h5
Epoch 4/50

Epoch 00004: val_loss improved from 3.57109 to 2.58781, saving model to model_translation_Attention.h5
Epoch 5/50

Epoch 00005: val_loss improved from 2.58781 to 1.70605, saving model to model_translation_Attention.h5
Epoch 6/50

Epoch 00006: val_loss did not improve from 1.70605
Epoch 7/50

Epoch 00007: val_loss improved from 1.70605 to 1.61411, saving model to model_translation_Attention.h5
Epoch 8/50

Epoch 00008: val_loss improved from 1.61411 to 1.60553, saving model to model_translation_Attention.h5
Epoch 9/50

Epoch 00009: val_loss did not improve from 1.60553
Epoch 10/50

Epoch 00010: val_loss did not improve from 1.60553
Epoch 11/

<keras.callbacks.History at 0x7fd0c456ec10>

In [77]:
actual, predicted = [], []
for i, value in enumerate(test_X[:50]):
        value = value.reshape((1, value.shape[0]))
        translation = predict_sequence(modelAtt, eng_tokenizer, value)
        _target, _src = test[i]
        #if i < 20:
        print('src=[%s], target=[%s], prediction=[%s]' % (_src, _target, translation))
        actual.append(_target.split())
        predicted.append(translation.split())

src=[el no fue alli], target=[he didnt go there], prediction=[he wasnt that there]
src=[estas borracho], target=[youre drunk], prediction=[youre drunk]
src=[como va todo], target=[how is everything], prediction=[how eat it]
src=[dios existe], target=[god exists], prediction=[god exists]
src=[hagamoslo], target=[lets do it], prediction=[lets do it]
src=[quiero comermelo], target=[i want to eat it], prediction=[i want the]
src=[dejanos], target=[leave us], prediction=[leave us]
src=[ya has votado], target=[did you vote yet], prediction=[you lost now]
src=[cualquier libro servira], target=[any book will do], prediction=[any book will do]
src=[sabiamos eso], target=[we knew that], prediction=[we knew that]
src=[cierra la caja fuerte], target=[close the safe], prediction=[close the safe]
src=[esta en mi bolsillo], target=[its in my pocket], prediction=[its in my pocket]
src=[que significa], target=[what does it mean], prediction=[no no]
src=[aquel hombre es tomas], target=[that man is tom],

In [78]:
model_ = load_model(filename)

In [79]:
actual, predicted = [], []
for i, value in enumerate(test_X[:50]):
        value = value.reshape((1, value.shape[0]))
        translation = predict_sequence(model_, eng_tokenizer, value)
        _target, _src = test[i]
        #if i < 20:
        print('src=[%s], target=[%s], prediction=[%s]' % (_src, _target, translation))
        actual.append(_target.split())
        predicted.append(translation.split())

src=[el no fue alli], target=[he didnt go there], prediction=[he wont there]
src=[estas borracho], target=[youre drunk], prediction=[youre you drunk]
src=[como va todo], target=[how is everything], prediction=[how all like like]
src=[dios existe], target=[god exists], prediction=[god exists]
src=[hagamoslo], target=[lets do it], prediction=[lets do it]
src=[quiero comermelo], target=[i want to eat it], prediction=[i want to]
src=[dejanos], target=[leave us], prediction=[leave us]
src=[ya has votado], target=[did you vote yet], prediction=[youre look come]
src=[cualquier libro servira], target=[any book will do], prediction=[any you do will]
src=[sabiamos eso], target=[we knew that], prediction=[we knew that]
src=[cierra la caja fuerte], target=[close the safe], prediction=[open the box]
src=[esta en mi bolsillo], target=[its in my pocket], prediction=[its in pocket pocket]
src=[que significa], target=[what does it mean], prediction=[how lovely]
src=[aquel hombre es tomas], target=[that