In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Disable tensorflow debugging logs
import pathlib
import random
import string
import re
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization

## Descargar dataset

In [2]:
text_file = keras.utils.get_file(
    fname="spa-eng.zip",
    origin="http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip",
    extract=True,
)
text_file = pathlib.Path(text_file).parent / "spa-eng" / "spa.txt"

## Preparar dataset

In [3]:
with open(text_file) as f:
    lines = f.read().split("\n")[:-1]
    
text_pairs = []
for line in lines:
    eng, spa = line.split("\t")
    spa = "[start] " + spa + " [end]"
    text_pairs.append((eng, spa))

In [4]:
for _ in range(5):
    print(random.choice(text_pairs))

('This smells like cheese.', '[start] Esto huele a queso. [end]')
('How old is this airplane?', '[start] ¿Qué antigüedad tiene el avión? [end]')
("As long as you're here, you'd better take a bath.", '[start] Ya que estás aquí, sería mejor que te metieras a la tina. [end]')
("I'll have to tell her the truth tomorrow.", '[start] Tendré que decirle la verdad mañana. [end]')
("Do you think it'll rain today?", '[start] ¿Crees que hoy va a llover? [end]')


In [5]:
random.shuffle(text_pairs)
num_val_samples = int(0.15 * len(text_pairs))
num_train_samples = len(text_pairs) - 2 * num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples : num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples :]

print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")

118964 total pairs
83276 training pairs
17844 validation pairs
17844 test pairs


## Tokenización

In [6]:
strip_chars = string.punctuation + "¿"
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")

vocab_size = 15000
sequence_length = 6
batch_size = 64

def custom_standardization(input_string):
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowercase, "[%s]" % re.escape(strip_chars), "")

eng_vectorization = TextVectorization(
    max_tokens=vocab_size, output_mode="int", 
    output_sequence_length=sequence_length,
)
spa_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length,
    standardize=custom_standardization,
)
train_eng_texts = [pair[0] for pair in train_pairs]
train_spa_texts = [pair[1] for pair in train_pairs]
eng_vectorization.adapt(train_eng_texts)
spa_vectorization.adapt(train_spa_texts)

In [7]:
eng_vectorization([['my name is'], ['my dog is']])

<tf.Tensor: shape=(2, 6), dtype=int64, numpy=
array([[ 19, 234,   8,   0,   0,   0],
       [ 19, 169,   8,   0,   0,   0]])>

In [8]:
AUTOTUNE = tf.data.experimental.AUTOTUNE

In [9]:
def format_dataset(eng, spa):
    eng = eng_vectorization(eng)
    spa = spa_vectorization(spa)
    return eng, spa


def make_dataset(pairs):
    eng_texts, spa_texts = zip(*pairs)
    eng_texts = list(eng_texts)
    spa_texts = list(spa_texts)
    dataset = tf.data.Dataset.from_tensor_slices((eng_texts, spa_texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset)
    return dataset.shuffle(2048).prefetch(AUTOTUNE).cache()


train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

In [10]:
for inputs, targets in train_ds.take(1):
    print(inputs[0], targets[0])

tf.Tensor([  21   54   42 4300  902    0], shape=(6,), dtype=int64) tf.Tensor([   2    6   26   25 1562   10], shape=(6,), dtype=int64)


## Definir modelo

<img src="../img/seq-to-seq.png" width="700"/>

__Imagen tomada de Sutskever, I., Vinyals, O., & Le, Q. V. (2014). Sequence to sequence learning with neural networks. Advances in neural information processing systems, 27.__



In [11]:
emb_dim = 256
model_dim = 1024

## Encoder

In [12]:
class Encoder(tf.keras.Model):
    def __init__(self, voc_size, emb_dim, model_dim):
        super(Encoder, self).__init__()
        self.embedding = tf.keras.layers.Embedding(voc_size,
                                                   emb_dim)
        self.gru = tf.keras.layers.GRU(model_dim,
                                       return_sequences=False,
                                       return_state=True)

    def call(self, x, state=None):
        x = self.embedding(x)
        x, state = self.gru(x, initial_state=state)
        return x, state
    
    
encoder = Encoder(eng_vectorization.vocabulary_size(),
                  emb_dim, model_dim)
output, enc_state = encoder(inputs)
enc_state

Could not load symbol cublasGetSmCountTarget from libcublas.so.11. Error: /usr/local/cuda/targets/x86_64-linux/lib/libcublas.so.11: undefined symbol: cublasGetSmCountTarget


<tf.Tensor: shape=(64, 1024), dtype=float32, numpy=
array([[ 1.4115733e-03, -5.6641502e-03,  7.7767288e-03, ...,
        -2.1856262e-03,  6.8079270e-03, -3.3179838e-03],
       [-1.5338659e-04,  8.9565560e-04,  1.4615036e-02, ...,
        -2.6571031e-03,  1.5832810e-02, -1.4347886e-02],
       [ 8.2857776e-03,  7.9951007e-03, -5.5211955e-03, ...,
         2.2308517e-03,  4.4789212e-03, -2.3887341e-03],
       ...,
       [ 3.7649382e-04, -3.6410286e-04,  1.4612420e-02, ...,
        -1.1922809e-03,  1.3411153e-02, -1.2046176e-02],
       [ 6.9277938e-03,  3.2389206e-03, -9.4464412e-03, ...,
         1.0154737e-02, -1.6668520e-05,  3.8765070e-03],
       [ 8.1081139e-03, -6.4555574e-03, -1.3942542e-03, ...,
         9.1293324e-03, -8.1743661e-04, -1.4563942e-02]], dtype=float32)>

In [13]:
encoder.summary()

Model: "encoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  3074048   
                                                                 
 gru (GRU)                   multiple                  3938304   
                                                                 
Total params: 7,012,352
Trainable params: 7,012,352
Non-trainable params: 0
_________________________________________________________________


In [14]:
targets[:, :1].shape

TensorShape([64, 1])

## Decoder

In [15]:
class Decoder(tf.keras.Model):
    def __init__(self, voc_size, emb_dim, model_dim):
        super().__init__(self)
        self.embedding = layers.Embedding(voc_size, emb_dim)
        self.gru = layers.GRU(model_dim,
                              return_sequences=True,
                              return_state=True)
        self.logits = layers.Dense(voc_size)

    def call(self, x, state, training=False):
        x = self.embedding(x, training=training)
        x, state = self.gru(x, initial_state=state, training=training)
        x = self.logits(x, training=training)

        return x, state


decoder = Decoder(voc_size=spa_vectorization.vocabulary_size(),
                  emb_dim=emb_dim,
                  model_dim=model_dim)

decoder(targets[:, :1], enc_state)

(<tf.Tensor: shape=(64, 1, 15000), dtype=float32, numpy=
 array([[[ 7.8998192e-04, -3.4712984e-03,  2.2515412e-03, ...,
           7.3303047e-05, -2.7270168e-03, -8.6499471e-04]],
 
        [[ 1.3638984e-03, -5.3860024e-03,  1.9304282e-03, ...,
           4.8100349e-04, -4.7541675e-03,  1.2959538e-03]],
 
        [[-2.4067070e-03,  2.5511777e-04,  3.1165795e-03, ...,
           1.5344174e-03, -2.0976199e-03,  1.0727898e-03]],
 
        ...,
 
        [[ 8.9684944e-04, -5.3197448e-03,  2.0059876e-03, ...,
           7.4011838e-04, -4.6217931e-03,  1.0152514e-03]],
 
        [[ 1.7780627e-03, -2.9438213e-03,  2.9751225e-03, ...,
           2.0170195e-03,  5.7476037e-04,  1.6370494e-04]],
 
        [[ 6.5653381e-04, -2.8479625e-03,  3.2723062e-03, ...,
           1.2746839e-03, -1.5705393e-03,  8.6521095e-04]]], dtype=float32)>,
 <tf.Tensor: shape=(64, 1024), dtype=float32, numpy=
 array([[-5.4189241e-03,  1.1348363e-03,  1.8043757e-03, ...,
         -2.6944506e-03, -4.5230445e-03, -2.327

In [16]:
decoder.summary()

Model: "decoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     multiple                  3840000   
                                                                 
 gru_1 (GRU)                 multiple                  3938304   
                                                                 
 dense (Dense)               multiple                  15375000  
                                                                 
Total params: 23,153,304
Trainable params: 23,153,304
Non-trainable params: 0
_________________________________________________________________


## Entrenamiento

In [17]:
opt = tf.keras.optimizers.Adam(0.001)
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

In [18]:
_, state = encoder(inputs)
state

<tf.Tensor: shape=(64, 1024), dtype=float32, numpy=
array([[ 1.4115733e-03, -5.6641502e-03,  7.7767288e-03, ...,
        -2.1856262e-03,  6.8079270e-03, -3.3179838e-03],
       [-1.5338659e-04,  8.9565560e-04,  1.4615036e-02, ...,
        -2.6571031e-03,  1.5832810e-02, -1.4347886e-02],
       [ 8.2857776e-03,  7.9951007e-03, -5.5211955e-03, ...,
         2.2308517e-03,  4.4789212e-03, -2.3887341e-03],
       ...,
       [ 3.7649382e-04, -3.6410286e-04,  1.4612420e-02, ...,
        -1.1922809e-03,  1.3411153e-02, -1.2046176e-02],
       [ 6.9277938e-03,  3.2389206e-03, -9.4464412e-03, ...,
         1.0154737e-02, -1.6668520e-05,  3.8765070e-03],
       [ 8.1081139e-03, -6.4555574e-03, -1.3942542e-03, ...,
         9.1293324e-03, -8.1743661e-04, -1.4563942e-02]], dtype=float32)>

In [19]:
train_loss_avg = tf.keras.metrics.Mean(name='train_loss')

In [20]:
for inputs, targets in train_ds.take(1):
    print(inputs[:3], targets[:3])

tf.Tensor(
[[  75   24    5 1174   55  216]
 [ 321  203 1547    0    0    0]
 [   6   43  715   26  368    9]], shape=(3, 6), dtype=int64) tf.Tensor(
[[   2   16   27   14 1124  234]
 [   2   13  103   17 5188    3]
 [   2    8  583    6   30  213]], shape=(3, 6), dtype=int64)


In [21]:
inputs[:, 1]

<tf.Tensor: shape=(64,), dtype=int64, numpy=
array([  24,  203,   43,   12,  187,    5,  266,  127,  176,   27,  564,
         59,  642,  239, 3910,   65,   38,    5,   57,   10,  701,    8,
          3,  597, 1537,  266,   49, 3900,   37,    9, 3698,    8,  388,
        131,   43,   99,  985,   28,   99,  151,   16,  145, 9522,   35,
        116,  244,  871,  112,   99,   22,  248,  537,   17, 1928,  318,
       1219, 1409,   14,   44,  618,    5,  396,  170,  148])>

- Utilizar _Teacher forcing_

In [22]:
@tf.function
def train_step(inp_batch, tar_batch):
    loss = tf.constant(0.0)

    with tf.GradientTape() as tape:
        _, state = encoder(inp_batch, training=True)

        for step in range(0, tar_batch.shape[1] - 1):
            dec_inp = tf.expand_dims(tar_batch[:, step], 1)
            pred, state = decoder(dec_inp, state, 
                                  training=True)
            loss += loss_function(tar_batch[:, step + 1], pred)
        total_loss = loss / tar_batch.shape[1]
    weights = encoder.trainable_weights + decoder.trainable_weights
    gradients = tape.gradient(total_loss, weights)   
    opt.apply_gradients(zip(gradients, weights))
    train_loss_avg(total_loss)
    
train_step(inputs, targets)

In [23]:
epochs = 5

for epoch in range(epochs):
    for inputs, targets in train_ds:
        train_step(inputs, targets)
        
    print(f'Loss: {train_loss_avg.result().numpy()}')
    train_loss_avg.reset_states()

Loss: 3.368617057800293
Loss: 1.766330361366272
Loss: 1.0137767791748047
Loss: 0.6187133193016052
Loss: 0.4228934645652771


- Diccionario para recuperar palabras de índices

In [24]:
spa_vocab = spa_vectorization.get_vocabulary()
spa_index_lookup = dict(zip(range(len(spa_vocab)), spa_vocab))

In [25]:
inp = eng_vectorization(['i love my dog'])
_, state = encoder(inp)
dec_inp = spa_vectorization(['[start]'])[:, :1]
output = []
pred_index = ''

while pred_index != '[end]':
    pred, state = decoder(dec_inp, state, training=False)
    dec_inp = tf.argmax(pred, axis=-1)
    pred_index = spa_index_lookup[dec_inp[0][0].numpy()]
    output.append(pred_index)
    
' '.join(output[:-1])

'me encanta mi perro'

- El modelo original Seq-to-seq es difícil de entrenar en secuencias largas debido a los problemas de las redes recurrentes, es por esto que estos modelos se complementan con mecanismos de atención o se remplazan completamente por arquitecturas como Transformers.

## Ejercicio
- Remplazar GRU por LSTM.
- Agregar __test_step__ para monitoriear el entrenamiento sin _Teacher forcing_.
- Mejorar el modelo con las técnicas propuestas en _Sutskever, I., Vinyals, O., & Le, Q. V. (2014). Sequence to sequence learning with neural networks. Advances in neural information processing systems, 27._
- Modificar longitud de secuencia, arquitecture e hiperparámetros, agreagar mecanismo de atención de _Bahdanau_.