In [6]:
import pathlib
import random
import pickle
import string
import re
import json
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras import layers
from keras.layers import TextVectorization

In [7]:
# Temporary cell: downloads the spa-eng corpus only to obtain sample Spanish
# sentences for the translation demo later in the notebook. It should be
# removed once test sentences come from somewhere else.
# NOTE(review): this split is produced independently of whatever split was used
# when the saved model was trained, so `test_pairs` may overlap the training
# data — confirm before reporting any quality numbers from it.
SPLIT_SEED = 1234  # fixed seed so the split is the same on every run

text_file = keras.utils.get_file(
    fname="spa-eng.zip",
    # HTTPS: the archive is extracted locally, so do not fetch it in clear text.
    origin="https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip",
    extract=True,
)
text_file = pathlib.Path(text_file).parent / "spa-eng" / "spa.txt"

with open(text_file, encoding="utf8") as f:
    # Skip empty lines instead of blindly dropping the last element, so the
    # final pair is kept even when the file has no trailing newline.
    lines = [raw_line for raw_line in f.read().split("\n") if raw_line]

# Each line is "<english>\t<spanish>"; pairs are stored as (spanish, english)
# with the English side wrapped in [start]/[end] markers.
text_pairs = []
for line in lines:
    eng, spa = line.split("\t")
    eng = "[start] " + eng + " [end]"
    text_pairs.append((spa, eng))


# A dedicated RNG keeps the split reproducible without reseeding the global
# `random` module that other cells use.
random.Random(SPLIT_SEED).shuffle(text_pairs)

# 15% validation, 15% test, the remainder for training.
num_val_samples = int(0.15 * len(text_pairs))
num_train_samples = len(text_pairs) - 2 * num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples : num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples :]

print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")

118964 total pairs
83276 training pairs
17844 validation pairs
17844 test pairs


In [8]:
# Characters deleted during standardization. The Spanish set adds the inverted
# question mark; the English set keeps "[" and "]" so the "[start]" / "[end]"
# markers survive.
spa_strip_chars = string.punctuation + "¿"
eng_strip_chars = "".join(ch for ch in string.punctuation if ch not in "[]")

# Vectorizer / batching sizes.
vocab_size = 15000
sequence_length = 25
batch_size = 64


def eng_custom_standardization(input_string):
    """Lowercase `input_string` and delete every character in `eng_strip_chars`.

    Returns the result of `tf.strings.regex_replace` applied to the lowercased
    input, with a character class built from the escaped strip characters.
    """
    strip_pattern = f"[{re.escape(eng_strip_chars)}]"
    lowered = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowered, strip_pattern, "")

def spa_custom_standardization(input_string):
    """Lowercase `input_string` and delete every character in `spa_strip_chars`.

    Returns the result of `tf.strings.regex_replace` applied to the lowercased
    input, with a character class built from the escaped strip characters.
    """
    strip_pattern = f"[{re.escape(spa_strip_chars)}]"
    lowered = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowered, strip_pattern, "")

# Settings common to both vectorizers: integer token ids, vocabulary capped at
# `vocab_size`.
_vectorizer_settings = {"max_tokens": vocab_size, "output_mode": "int"}

# The English layer emits one more position than the Spanish one.
# Note: both names are rebound later in the notebook when the saved
# vectorizers are restored from their pickle files.
eng_vectorization = TextVectorization(
    standardize=eng_custom_standardization,
    output_sequence_length=sequence_length + 1,
    **_vectorizer_settings,
)
spa_vectorization = TextVectorization(
    standardize=spa_custom_standardization,
    output_sequence_length=sequence_length,
    **_vectorizer_settings,
)

In [9]:
# Restore the Spanish and English TextVectorization layers and the trained
# transformer from disk. Each pickle is expected to hold a dict with a
# 'config' entry (layer config) and a 'weights' entry (layer weights).
# NOTE(security): pickle.load can execute arbitrary code while loading; only
# load .pkl files that you created yourself or otherwise trust.
with open("spa_vectorization.pkl", "rb") as pkl_file:  # closes the handle promptly
    from_disk = pickle.load(pkl_file)
spa_vectorization = TextVectorization.from_config(from_disk['config'])
# You have to call `adapt` with some dummy data (BUG in Keras)
spa_vectorization.adapt(tf.data.Dataset.from_tensor_slices(["xyz"]))
spa_vectorization.set_weights(from_disk['weights'])


with open("eng_vectorization.pkl", "rb") as pkl_file:  # closes the handle promptly
    from_disk = pickle.load(pkl_file)
eng_vectorization = TextVectorization.from_config(from_disk['config'])
# You have to call `adapt` with some dummy data (BUG in Keras)
eng_vectorization.adapt(tf.data.Dataset.from_tensor_slices(["xyz"]))
eng_vectorization.set_weights(from_disk['weights'])


# Trained translation model exported in SavedModel format.
transformer = tf.saved_model.load('translator-transformer')

In [10]:
# Build an id -> token table from the English vocabulary so predicted indices
# can be turned back into words.
eng_vocab = eng_vectorization.get_vocabulary()
eng_index_lookup = dict(enumerate(eng_vocab))

# Maximum number of tokens generated per sentence by `decode_sequence`.
max_decoded_sentence_length = 25


def decode_sequence(input_sentence):
    """Translate one sentence by greedy, token-by-token decoding.

    The sentence is vectorized with `spa_vectorization`. Starting from
    "[start]", each step re-vectorizes the text produced so far with
    `eng_vectorization` (dropping the last column), runs `transformer`, and
    appends the vocabulary entry with the highest score at the current
    position. Decoding stops after "[end]" is produced or after
    `max_decoded_sentence_length` steps.

    Args:
        input_sentence: Source sentence as a plain string.

    Returns:
        The decoded string, beginning with "[start]" and containing "[end]"
        only if the model emitted it within the step limit.
    """
    source_tokens = spa_vectorization([input_sentence])
    output_words = ["[start]"]
    for step in range(max_decoded_sentence_length):
        # Feed everything decoded so far back in as the target-side input.
        target_tokens = eng_vectorization([" ".join(output_words)])[:, :-1]
        step_scores = transformer([source_tokens, target_tokens])[0, step, :]

        # Greedy choice: take the single best-scoring vocabulary index.
        next_word = eng_index_lookup[np.argmax(step_scores)]
        output_words.append(next_word)

        if next_word == "[end]":
            break
    return " ".join(output_words)

# Demo: translate 30 randomly chosen test sentences and print each source
# sentence followed by the model's output and a separator line.
test_spa_texts = [spa_text for spa_text, _eng_text in test_pairs]
for _ in range(30):
    input_sentence = random.choice(test_spa_texts)
    translated = decode_sequence(input_sentence)
    print(input_sentence, translated, "===================", sep="\n")

['Esa es mi escuela.', 'No estaba borracho.', 'El anciano era querido por todos.', 'Quiero una respuesta a esa pregunta.', 'Pensé que tal vez quisieran saber.', 'En el alfabeto, la B va después de la A.', 'Cerrá los ojos por tres minutos.', 'Para mi sorpresa, ella no pudo contestar a la pregunta.', 'Ella participó en el concurso.', 'Me costó un rato largo el asimilar lo que ella estaba diciendo.']
Él es muy guapo.
[start] he is very goodlooking [end]
Extraño tanto París.
[start] i miss paris so much [end]
Sin su ayuda, yo estaría muerto.
[start] without her help i would be dead [end]
Tom no parece querer nuestra ayuda, ¿verdad?
[start] tom doesnt seem to want our help [end]
Quiero ver a tu hermana mayor.
[start] i want to see your older sister [end]
Me lo encontré de casualidad por la calle.
[start] i met him on the street [end]
Tom es el amigo de Mary.
[start] tom is marys friend [end]
El detective se disfrazó de viejo caballero.
[start] the the [end]
Tom se puso a cubierto.
[start] t