In [1]:
import pathlib
import random
import string
import re
import json
import numpy as np
import tensorflow as tf
from tensorflow.python.client import device_lib
from tensorflow import keras
from keras import layers
from keras.layers import TextVectorization
import pickle

In [3]:
# Report the TensorFlow build in use and the compute devices it can see.
print(tf.__version__)
print(device_lib.list_local_devices())
print("==================")
# Two arguments on purpose: print() inserts its own separator before the count.
print("Num GPUs Available: ", len(tf.config.list_physical_devices("GPU")))

2.9.1
[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 14203181691374009
xla_global_id: -1
]
Num GPUs Available:  0


In [7]:
# text_file = keras.utils.get_file(
#     fname="spa-eng.zip",
#     origin="http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip",
#     extract=True,
# )
# text_file = pathlib.Path(text_file).parent / "spa-eng" / "spa.txt"

In [8]:
# with open(text_file, encoding="utf8") as f:
#     lines = f.read().split("\n")[:-1]
# text_pairs = []
# for line in lines:
#     eng, spa = line.split("\t")
#     eng = "[start] " + eng + " [end]"
#     text_pairs.append((spa, eng))

In [9]:
def _read_lines(path):
    """Return the lines of the UTF-8 text file at ``path`` without newlines.

    Only the single empty string produced by a terminating newline is
    dropped, so a file whose last line has no trailing newline keeps that
    line (slicing with ``[:-1]`` would silently discard it).
    """
    with open(path, encoding="utf8") as f:
        lines = f.read().split("\n")
    if lines and lines[-1] == "":
        lines.pop()
    return lines


spa_text_file = './../corpus/es-en/spa.txt'
spa_lines = _read_lines(spa_text_file)

eng_text_file = './../corpus/es-en/eng.txt'
eng_lines = _read_lines(eng_text_file)

# The two files are paired line by line; zip() would silently truncate to the
# shorter one and hide a misaligned corpus, so fail loudly instead.
if len(spa_lines) != len(eng_lines):
    raise ValueError(
        f"Parallel corpus is misaligned: {len(spa_lines)} lines in "
        f"{spa_text_file} vs {len(eng_lines)} lines in {eng_text_file}"
    )

# Each pair is (spanish_source, english_target); the target is wrapped in
# "[start]" / "[end]" markers.
text_pairs = [
    (spa, "[start] " + eng + " [end]")
    for spa, eng in zip(spa_lines, eng_lines)
]

In [10]:
# Eyeball a handful of randomly drawn (spanish, english) pairs.
for _ in range(5):
    sample_pair = random.choice(text_pairs)
    print(sample_pair)

('Si bien comprendo las motivaciones de la Comisión, esta tarde quiero decir que, globalmente, no apruebo la propuesta de decisión que formula, por lo demás, en perfecta coherencia con la Comisión de Asuntos Económicos y Monetarios y de Política Industrial.', "[start] Whilst I understand the Commission's thinking, I have to say that, on the whole, I do not support its proposal for a decision, an attitude which is very much in line with that of the Committee on Economic and Monetary Affairs and Industrial Policy. [end]")
('¿Hay alguna observación?', '[start] Are there any comments? [end]')
('No olvidemos que mientras que en Europa gastamos entre el 10 % y el 20 % de los ingresos domésticos en los alimentos, hay sociedades en las que esta partida representa el 90 % o más, y es fácil imaginar lo que una subida de los precios significaría para estas vastas poblaciones tan necesitadas.', '[start] Let us not forget that while in Europe we spend 10-20% of household incomes on food, there are 

In [23]:
# Shuffle in place, then carve the list into train / validation / test.
random.shuffle(text_pairs)

# Validation and test each take 40% of the pairs (a 0.15 fraction was used
# previously); whatever is left becomes the training set.
num_val_samples = int(0.4 * len(text_pairs))
num_train_samples = len(text_pairs) - 2 * num_val_samples

train_pairs, val_pairs, test_pairs = (
    text_pairs[:num_train_samples],
    text_pairs[num_train_samples : num_train_samples + num_val_samples],
    text_pairs[num_train_samples + num_val_samples :],
)

print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")

1965734 total pairs
393148 training pairs
786293 validation pairs
786293 test pairs


In [12]:
# Characters removed from Spanish text during standardization. Spanish uses
# both inverted marks, so "¡" is stripped along with "¿"; otherwise a word
# such as "¡váyanse" would become a different token from "váyanse".
spa_strip_chars = string.punctuation + "¿¡"
# Characters removed from English text. "[" and "]" are kept so that the
# "[start]" / "[end]" markers survive standardization.
eng_strip_chars = string.punctuation.replace("[", "").replace("]", "")

vocab_size = 30000  # max_tokens for both TextVectorization layers
sequence_length = 25  # tokens per (padded / truncated) sentence
batch_size = 64


def eng_custom_standardization(input_string):
    """Lowercase the input and delete every character in ``eng_strip_chars``."""
    strip_pattern = "[%s]" % re.escape(eng_strip_chars)
    return tf.strings.regex_replace(
        tf.strings.lower(input_string), strip_pattern, ""
    )

def spa_custom_standardization(input_string):
    """Lowercase the input and delete every character in ``spa_strip_chars``."""
    strip_pattern = "[%s]" % re.escape(spa_strip_chars)
    return tf.strings.regex_replace(
        tf.strings.lower(input_string), strip_pattern, ""
    )

# Turn raw strings into integer sequences in which each integer is the index
# of a word in the vocabulary learned by adapt().
eng_vectorization = TextVectorization(
    standardize=eng_custom_standardization,
    max_tokens=vocab_size,
    output_mode="int",
    # One extra position: format_dataset slices this into decoder inputs
    # ([:-1]) and targets ([1:]), each of length sequence_length.
    output_sequence_length=sequence_length + 1,
)
spa_vectorization = TextVectorization(
    standardize=spa_custom_standardization,
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length,
)

# Vocabularies are learned from the training split only.
train_spa_texts = [spa_text for spa_text, _ in train_pairs]
train_eng_texts = [eng_text for _, eng_text in train_pairs]
eng_vectorization.adapt(train_eng_texts)
spa_vectorization.adapt(train_spa_texts)

In [13]:
def format_dataset(spa, eng):
    """Vectorize a batch of sentence pairs into ``(features, labels)``.

    ``features`` holds the Spanish token ids under ``"encoder_inputs"`` and
    the English token ids without their last position under
    ``"decoder_inputs"``. ``labels`` is the English token ids without their
    first position, i.e. the decoder inputs shifted left by one.
    """
    source_ids = spa_vectorization(spa)
    target_ids = eng_vectorization(eng)
    features = {
        "encoder_inputs": source_ids,
        "decoder_inputs": target_ids[:, :-1],
    }
    labels = target_ids[:, 1:]
    return features, labels


def make_dataset(pairs):
    """Build a batched, vectorized ``tf.data.Dataset`` from sentence pairs.

    Args:
        pairs: non-empty sequence of ``(spanish, english)`` string tuples.

    Returns:
        A dataset of ``(features, labels)`` batches as produced by
        ``format_dataset``.

    Raises:
        ValueError: if ``pairs`` is empty.
    """
    if not pairs:
        raise ValueError("make_dataset() requires at least one sentence pair")

    spa_texts, eng_texts = zip(*pairs)
    dataset = tf.data.Dataset.from_tensor_slices((list(spa_texts), list(eng_texts)))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset, num_parallel_calls=tf.data.AUTOTUNE)
    # Order matters:
    #   cache    - store the vectorized batches once, before any randomness;
    #   shuffle  - placed after the cache so batch order is re-drawn each epoch
    #              (shuffling before cache() would freeze the first epoch's order);
    #   prefetch - last, so input preparation overlaps with the training step.
    return dataset.cache().shuffle(2048).prefetch(16)


train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

# Sanity check: pull a single batch and print the shape of each tensor.
for inputs, targets in train_ds.take(1):
    encoder_shape = inputs["encoder_inputs"].shape
    decoder_shape = inputs["decoder_inputs"].shape
    print(f'inputs["encoder_inputs"].shape: {encoder_shape}')
    print(f'inputs["decoder_inputs"].shape: {decoder_shape}')
    print(f"targets.shape: {targets.shape}")

inputs["encoder_inputs"].shape: (64, 25)
inputs["decoder_inputs"].shape: (64, 25)
targets.shape: (64, 25)


In [14]:
class TransformerEncoder(layers.Layer):
    """Transformer encoder block: self-attention followed by a feed-forward net.

    Both sub-layers are wrapped in a residual connection and layer
    normalization.

    Args:
        embed_dim: size of the last axis of the inputs; also used as the
            per-head ``key_dim`` of the attention layer and as the output
            width of the feed-forward projection.
        dense_dim: hidden width of the feed-forward projection.
        num_heads: number of attention heads.
        **kwargs: forwarded to ``layers.Layer``.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super(TransformerEncoder, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [
                layers.Dense(dense_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, mask=None):
        """Apply the encoder block.

        Args:
            inputs: embedded source sequence.
            mask: optional mask for ``inputs``; it is indexed as a rank-2
                tensor and broadcast over the heads and query axes. When
                ``None``, attention is unmasked.

        Returns:
            A tensor with the same shape as ``inputs``.
        """
        # Default to "no mask" so the layer also works when Keras supplies no
        # mask; previously this name was unbound in that case.
        padding_mask = None
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, tf.newaxis, :], dtype="int32")
        attention_output = self.attention(
            query=inputs, value=inputs, key=inputs, attention_mask=padding_mask
        )
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)


class PositionalEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        sequence_length: number of distinct positions that can be embedded.
        vocab_size: number of distinct token ids that can be embedded.
        embed_dim: width of both embeddings.
        **kwargs: forwarded to ``layers.Layer``.
    """

    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        # Sub-layers are created token-first, then position, so their
        # auto-generated names and weight order stay the same.
        self.token_embeddings = layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=embed_dim
        )

    def call(self, inputs):
        """Embed token ids and add the embedding of each position index."""
        positions = tf.range(start=0, limit=tf.shape(inputs)[-1], delta=1)
        return self.token_embeddings(inputs) + self.position_embeddings(positions)

    def compute_mask(self, inputs, mask=None):
        """Mark every non-zero token id as valid; any incoming mask is ignored."""
        return tf.not_equal(inputs, 0)


class TransformerDecoder(layers.Layer):
    """Transformer decoder block.

    Consists of causal self-attention over the target sequence, attention
    over the encoder outputs, and a feed-forward projection, each wrapped in
    a residual connection and layer normalization.

    Args:
        embed_dim: size of the last axis of the inputs; also used as the
            per-head ``key_dim`` of both attention layers and as the output
            width of the feed-forward projection.
        latent_dim: hidden width of the feed-forward projection.
        num_heads: number of attention heads.
        **kwargs: forwarded to ``layers.Layer``.
    """

    def __init__(self, embed_dim, latent_dim, num_heads, **kwargs):
        super(TransformerDecoder, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [
                layers.Dense(latent_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, encoder_outputs, mask=None):
        """Apply the decoder block.

        Args:
            inputs: embedded target sequence.
            encoder_outputs: encoder output sequence to attend over.
            mask: optional mask for ``inputs`` (the target sequence), indexed
                as a rank-2 tensor. When ``None`` only the causal mask is
                applied.

        Returns:
            A tensor with the same shape as ``inputs``.
        """
        causal_mask = self.get_causal_attention_mask(inputs)
        if mask is not None:
            # `mask` describes the *target* tokens, so it belongs to the
            # self-attention: a position may attend only to earlier positions
            # that are also valid.
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
            self_attention_mask = tf.minimum(padding_mask, causal_mask)
        else:
            self_attention_mask = causal_mask

        attention_output_1 = self.attention_1(
            query=inputs,
            value=inputs,
            key=inputs,
            attention_mask=self_attention_mask,
        )
        out_1 = self.layernorm_1(inputs + attention_output_1)

        # Cross-attention gets no mask. The target-side causal/padding mask
        # must not be used here: its key axis is the target sequence, so it
        # would restrict target step i to source positions <= i.
        # NOTE(review): a source padding mask would be the right mask for
        # this call, but it is not available through this signature.
        attention_output_2 = self.attention_2(
            query=out_1,
            value=encoder_outputs,
            key=encoder_outputs,
        )
        out_2 = self.layernorm_2(out_1 + attention_output_2)

        proj_output = self.dense_proj(out_2)
        return self.layernorm_3(out_2 + proj_output)

    def get_causal_attention_mask(self, inputs):
        """Return an int32 lower-triangular mask of shape (batch, seq, seq).

        Entry ``[b, i, j]`` is 1 when ``i >= j`` and 0 otherwise, where
        ``batch`` and ``seq`` are the first two dimensions of ``inputs``.
        """
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        # Repeat the single (1, seq, seq) mask once per batch element.
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
            axis=0,
        )
        return tf.tile(mask, mult)


In [15]:
# Model hyper-parameters.
embed_dim = 256  # embedding width shared by encoder and decoder
latent_dim = 2048  # hidden width of the feed-forward projections
num_heads = 8  # attention heads per MultiHeadAttention layer

# --- Encoder: source token ids -> encoded sequence ---
encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="encoder_inputs")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, latent_dim, num_heads)(x)
# keras.Model groups the layers between the given inputs and outputs into one
# object with training and inference features.
encoder = keras.Model(encoder_inputs, encoder_outputs)

# --- Decoder: (target token ids, encoded sequence) -> next-token probabilities ---
decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_inputs")
# Stand-in input for the encoder output, so the decoder is a model of its own.
encoded_seq_inputs = keras.Input(shape=(None, embed_dim), name="decoder_state_inputs")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim, latent_dim, num_heads)(x, encoded_seq_inputs)
x = layers.Dropout(0.5)(x)
# Softmax over the vocabulary at every target position.
decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x)
decoder = keras.Model([decoder_inputs, encoded_seq_inputs], decoder_outputs)

# --- End-to-end model: feed the real encoder outputs into the decoder ---
decoder_outputs = decoder([decoder_inputs, encoder_outputs])
transformer = keras.Model(
    [encoder_inputs, decoder_inputs], decoder_outputs, name="transformer"
)

In [16]:
epochs = 1  # This should be at least 30 for convergence

transformer.summary()
transformer.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
transformer.fit(train_ds, validation_data=val_ds, epochs=epochs)


Model: "transformer"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 positional_embedding (Position  (None, None, 256)   3846400     ['encoder_inputs[0][0]']         
 alEmbedding)                                                                                     
                                                                                                  
 decoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 transformer_encoder (Transform  (None, None, 256)   3155456     ['positional_embedding[

KeyboardInterrupt: 

In [None]:
# Export the trained model in SavedModel format.
tf.saved_model.save(transformer, export_dir='translator-transformer2')

# Persist each vectorizer's config and weights next to the model. The files
# are opened in a `with` block so they are flushed and closed even if
# pickling fails; previously the handles were never closed.
with open("spa_vectorization2.pkl", "wb") as spa_pickle_file:
    pickle.dump({'config': spa_vectorization.get_config(),
                 'weights': spa_vectorization.get_weights()},
                spa_pickle_file)
with open("eng_vectorization2.pkl", "wb") as eng_pickle_file:
    pickle.dump({'config': eng_vectorization.get_config(),
                 'weights': eng_vectorization.get_weights()},
                eng_pickle_file)



INFO:tensorflow:Assets written to: translator-transformer\assets


INFO:tensorflow:Assets written to: translator-transformer\assets


In [None]:
# Index -> word table used to turn predicted token ids back into text.
eng_vocab = eng_vectorization.get_vocabulary()
eng_index_lookup = dict(enumerate(eng_vocab))
# Upper bound on the number of tokens generated per sentence.
max_decoded_sentence_length = 25


def decode_sequence(input_sentence):
    """Greedily translate one Spanish sentence.

    Starting from "[start]", the model is re-run on the text generated so
    far and the highest-scoring token at the current step is appended, until
    "[end]" is produced or ``max_decoded_sentence_length`` tokens have been
    generated.

    Returns:
        The generated sentence, including the leading "[start]" marker (and
        the trailing "[end]" marker when one was produced).
    """
    source_tokens = spa_vectorization([input_sentence])
    words = ["[start]"]
    for step in range(max_decoded_sentence_length):
        # Drop the last position to match the decoder input length.
        target_tokens = eng_vectorization([" ".join(words)])[:, :-1]
        predictions = transformer([source_tokens, target_tokens])

        next_word = eng_index_lookup[np.argmax(predictions[0, step, :])]
        words.append(next_word)
        if next_word == "[end]":
            break
    return " ".join(words)


# Translate 30 randomly chosen Spanish sentences from the held-out test split.
test_spa_texts = [spa_text for spa_text, _ in test_pairs]
for _ in range(30):
    input_sentence = random.choice(test_spa_texts)
    translated = decode_sequence(input_sentence)
    print(input_sentence, translated, "===================", sep="\n")

El cuchillo era tan romo no pude usarlo para cortar la carne, así que tuve que depender de mi navaja de bolsillo.
[start] the knife is so that i cant cut the yellow idea with him so do that i have a key to my that i have
Tom come como un caballo.
[start] tom eats like a horse [end]
Él viajó alrededor del mundo.
[start] he traveled around the world [end]
Tom olvidó hacer lo que se suponía que hiciera.
[start] tom forgot to what he was supposed to do [end]
Llegaremos a nuestro destino para cuando el sol se ponga.
[start] we will arrive in our the foot in the sun the sun the foot [end]
Eso es realmente estúpido.
[start] thats really stupid [end]
Me duele mi estomago.
[start] i have a [end]
Tom le dijo a María que no tenía suficiente dinero.
[start] tom told mary that she didnt have enough money [end]
¡Váyanse a dormir!
[start] get it for sleep [end]
¿Cuántos CDs tenés?
[start] how many traffic do you have [end]
Descansemos un rato.
[start] i will write down for a while [end]
Tom se comió 