# Seq2seq
- En este notebook se define una arquitectura seq2seq para traducir oraciones del inglés al español.

<img src="../img/seq-to-seq.png" width="700"/>

__Imagen tomada de Sutskever, I., Vinyals, O., & Le, Q. V. (2014). Sequence to sequence learning with neural networks. Advances in neural information processing systems, 27.__



In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Disable tensorflow debugging logs
import pathlib
import random
import string
import re
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization
import time

## 1.- Dataset

In [2]:
# Fetch the English-Spanish sentence-pair archive and point `text_file` at the
# extracted tab-separated corpus. The archive is extracted after download, so it is
# requested over HTTPS to protect it from tampering in transit.
text_file = keras.utils.get_file(
    fname="spa-eng.zip",
    origin="https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip",
    extract=True,
)
text_file = pathlib.Path(text_file).parent / "spa-eng" / "spa.txt"

In [3]:
# Parse the tab-separated corpus into (english, spanish) pairs.
# The Spanish side contains accented characters, so the encoding is given explicitly
# instead of relying on the platform default (which is not UTF-8 on every system).
with open(text_file, encoding="utf-8") as f:
    # The last element is dropped (presumably the empty string after a trailing newline).
    lines = f.read().split("\n")[:-1]

text_pairs = []
for line in lines:
    eng, spa = line.split("\t")
    # Wrap the target sentence in the markers used to start and stop decoding.
    spa = "[start] " + spa + " [end]"
    text_pairs.append((eng, spa))

In [4]:
# Eyeball a handful of randomly drawn (english, spanish) pairs.
num_examples = 5
for _ in range(num_examples):
    example_pair = random.choice(text_pairs)
    print(example_pair)

('This page has been intentionally left blank.', '[start] Esta página fue dejada en blanco intencionalmente. [end]')
('Would you eat the last cookie on the plate if other people were watching?', '[start] ¿Te comerías la última galletita del plato si otra persona estuviese mirando? [end]')
('We are men.', '[start] Somos hombres. [end]')
("I realized I wasn't ready.", '[start] Me di cuenta que no estaba lista. [end]')
('Tom injured himself when he jumped out of the window.', '[start] Tom se lastimó cuando saltó por la ventana. [end]')


In [5]:
# Shuffle in place, then slice into train / validation / test parts
# (15% each for validation and test, the remainder for training).
random.shuffle(text_pairs)
total_pairs = len(text_pairs)
num_val_samples = int(0.15 * total_pairs)
num_train_samples = total_pairs - 2 * num_val_samples
val_end = num_train_samples + num_val_samples

train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples:val_end]
test_pairs = text_pairs[val_end:]

print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")

118964 total pairs
83276 training pairs
17844 validation pairs
17844 test pairs


## 2.- Pipeline

In [6]:
# Characters removed from the Spanish text before tokenization.
# Spanish opens questions with "¿" and exclamations with "¡"; both are stripped so
# that e.g. "¡corre" and "corre" map to the same token.
strip_chars = string.punctuation + "¿¡"
# Keep the square brackets: they are part of the "[start]" / "[end]" markers.
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")

vocab_size = 15000  # maximum number of tokens kept by each vectorizer
maxlen = 10         # every sentence is padded / truncated to this many token ids
batch_size = 64

def custom_standardization(input_string):
    """Lowercase the text and delete every character listed in `strip_chars`.

    Args:
        input_string: String tensor with the raw sentences.

    Returns:
        String tensor of the same shape with the standardized sentences.
    """
    # encoding="utf-8" makes the lowercasing Unicode-aware; the default encoding only
    # lowercases ASCII letters, which would leave "É", "Á", "Ñ", ... capitalized.
    lowercase = tf.strings.lower(input_string, encoding="utf-8")
    return tf.strings.regex_replace(lowercase, "[%s]" % re.escape(strip_chars), "")

# English side: relies on the layer's default standardization.
eng_vectorization = TextVectorization(
    output_mode="int",
    max_tokens=vocab_size,
    output_sequence_length=maxlen,
)
# Spanish side: custom standardization that keeps the "[start]" / "[end]" brackets.
spa_vectorization = TextVectorization(
    standardize=custom_standardization,
    output_mode="int",
    max_tokens=vocab_size,
    output_sequence_length=maxlen,
)

# Both vocabularies are built from the training split only.
train_eng_texts = [eng_text for eng_text, _ in train_pairs]
train_spa_texts = [spa_text for _, spa_text in train_pairs]
eng_vectorization.adapt(train_eng_texts)
spa_vectorization.adapt(train_spa_texts)

In [7]:
# Sanity check: vectorize two short sentences with the English vectorizer.
eng_vectorization([['my name is'], ['my dog is']])

<tf.Tensor: shape=(2, 10), dtype=int64, numpy=
array([[ 19, 233,   8,   0,   0,   0,   0,   0,   0,   0],
       [ 19, 173,   8,   0,   0,   0,   0,   0,   0,   0]])>

In [8]:
# Sentinel that lets tf.data choose buffer sizes dynamically. `tf.data.AUTOTUNE` is the
# stable name; the `tf.data.experimental.AUTOTUNE` alias is deprecated.
AUTOTUNE = tf.data.AUTOTUNE

In [9]:
def preprocess(eng, spa):
    """Turn a batch of raw sentence pairs into model inputs and targets.

    Args:
        eng: Batch of English sentences (strings).
        spa: Batch of Spanish sentences (strings).

    Returns:
        A tuple (encoder_input, decoder_input, decoder_target): the English token ids
        reversed along axis 1, the Spanish ids without their last position, and the
        Spanish ids without their first position (i.e. shifted one step ahead).
    """
    source_ids = eng_vectorization(eng)
    target_ids = spa_vectorization(spa)
    encoder_input = tf.reverse(source_ids, [1])
    decoder_input = target_ids[:, :-1]
    decoder_target = target_ids[:, 1:]
    return encoder_input, decoder_input, decoder_target


def make_dataset(pairs):
    """Build a batched, vectorized tf.data pipeline from (english, spanish) pairs.

    Args:
        pairs: Sequence of (english_sentence, spanish_sentence) string tuples.

    Returns:
        A `tf.data.Dataset` yielding the three tensors produced by `preprocess`
        for each batch of `batch_size` sentence pairs.
    """
    eng_texts, spa_texts = zip(*pairs)
    dataset = tf.data.Dataset.from_tensor_slices((list(eng_texts), list(spa_texts)))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(preprocess)
    # Order matters: cache the (deterministic) vectorized batches first, shuffle after
    # the cache so every epoch sees a new batch order, and prefetch last. Caching after
    # the shuffle would freeze the order produced during the first epoch.
    return dataset.cache().shuffle(2048).prefetch(AUTOTUNE)


# Pipelines for the training and validation splits, built in that order.
train_ds, val_ds = (make_dataset(split) for split in (train_pairs, val_pairs))

In [10]:
# Pull one batch and show its first example: encoder input, decoder input, decoder target.
for inp_enc, inp_dec, tar_dec in train_ds.take(1):
    first_example = (inp_enc[0], inp_dec[0], tar_dec[0])
    print(*first_example)

tf.Tensor([   0  487   12  495 1720  100  254    4  217    3], shape=(10,), dtype=int64) tf.Tensor([   2  399 1244   13 1120    6   17  574    4], shape=(9,), dtype=int64) tf.Tensor([ 399 1244   13 1120    6   17  574    4  101], shape=(9,), dtype=int64)


## 3.- Modelo

In [11]:
# Embedding size and GRU width passed to both the encoder and the decoder.
emb_dim, model_dim = 256, 512

### Encoder

In [12]:
class Encoder(tf.keras.Model):
    """Embedding + GRU encoder that returns the GRU's last output and final state.

    Args:
        voc_size: Size of the source vocabulary (input dimension of the embedding).
        emb_dim: Dimensionality of the token embeddings.
        model_dim: Number of GRU units.
    """

    def __init__(self, voc_size, emb_dim, model_dim):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(voc_size, emb_dim)
        # return_sequences=False: only the last time step's output is returned,
        # together with the final state (return_state=True).
        self.gru = tf.keras.layers.GRU(model_dim,
                                       return_sequences=False,
                                       return_state=True)

    def call(self, x, state=None, training=False):
        """Encode a batch of token ids.

        Args:
            x: Integer tensor of token ids.
            state: Optional initial GRU state; `None` uses the layer's default.
            training: Whether the layers run in training mode. Declared explicitly
                (as in `Decoder.call`) because callers pass it.

        Returns:
            Tuple `(output, state)` as produced by the GRU layer.
        """
        x = self.embedding(x, training=training)
        x, state = self.gru(x, initial_state=state, training=training)
        return x, state
    
    
# Instantiate the encoder and run it once on the sample batch; the final state is
# displayed below and reused as the decoder's initial state.
encoder = Encoder(
    voc_size=eng_vectorization.vocabulary_size(),
    emb_dim=emb_dim,
    model_dim=model_dim,
)
output, enc_state = encoder(inp_enc)
enc_state

<tf.Tensor: shape=(64, 512), dtype=float32, numpy=
array([[ 0.01345986, -0.01282817,  0.00078651, ..., -0.00138945,
        -0.00193483,  0.00081795],
       [-0.00586681, -0.00865781,  0.00233197, ...,  0.00886651,
         0.01459937,  0.01042136],
       [-0.0097565 ,  0.01415568,  0.00180495, ..., -0.00564828,
        -0.00428695, -0.00428702],
       ...,
       [-0.01033887, -0.00555763,  0.00349164, ..., -0.00078032,
         0.00973689,  0.00733916],
       [-0.00175286, -0.0080623 ,  0.00452699, ...,  0.00399484,
         0.0069435 ,  0.01251333],
       [ 0.00117152,  0.00446815,  0.00635694, ...,  0.00548611,
         0.00730795,  0.01092208]], dtype=float32)>

In [13]:
# Shape of the encoder output returned for the sample batch.
output.shape

TensorShape([64, 512])

In [14]:
# Print the encoder's layers and parameter counts.
encoder.summary()

Model: "encoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  3073024   
                                                                 
 gru (GRU)                   multiple                  1182720   
                                                                 
Total params: 4,255,744
Trainable params: 4,255,744
Non-trainable params: 0
_________________________________________________________________


### Decoder

In [15]:
class Decoder(tf.keras.Model):
    """Embedding + GRU + dense decoder that outputs vocabulary logits per time step.

    Args:
        voc_size: Size of the target vocabulary (embedding input dim and logits dim).
        emb_dim: Dimensionality of the token embeddings.
        model_dim: Number of GRU units.
    """

    def __init__(self, voc_size, emb_dim, model_dim):
        # No positional arguments: passing `self` again is rejected by Keras versions
        # whose Layer constructor only accepts keyword arguments.
        super().__init__()
        self.embedding = layers.Embedding(voc_size, emb_dim)
        self.gru = layers.GRU(model_dim,
                              return_sequences=True,
                              return_state=True)
        self.logits = layers.Dense(voc_size)

    def call(self, x, states, return_state=False, training=False):
        """Compute next-token logits for every position of `x`.

        Args:
            x: Integer tensor of target token ids.
            states: Initial GRU state (the encoder's final state, or the state
                returned by a previous decoding step).
            return_state: If True, also return the GRU's final state.
            training: Whether the layers run in training mode.

        Returns:
            The logits tensor, or the tuple `(logits, states)` when `return_state`
            is True.
        """
        x = self.embedding(x, training=training)
        x, states = self.gru(x, initial_state=states, training=training)
        x = self.logits(x, training=training)

        if return_state:
            return x, states
        return x


# Instantiate the decoder and run a single step: the first token of every target
# sequence in the sample batch, starting from the encoder's final state.
decoder = Decoder(
    voc_size=spa_vectorization.vocabulary_size(),
    emb_dim=emb_dim,
    model_dim=model_dim,
)

first_tokens = inp_dec[:, :1]
decoder(first_tokens, enc_state)

<tf.Tensor: shape=(64, 1, 15000), dtype=float32, numpy=
array([[[-4.2371978e-03,  2.5792955e-03,  1.1927417e-03, ...,
          1.4199783e-03, -1.8674907e-03,  4.3898684e-04]],

       [[-3.1932073e-03, -3.2564742e-04, -2.0393071e-04, ...,
          1.0287821e-03,  2.3111093e-03,  1.7360961e-03]],

       [[-1.0320020e-03, -2.5059234e-03,  1.3493659e-03, ...,
          1.5876164e-03,  1.4532279e-03,  1.5037364e-03]],

       ...,

       [[-3.3585376e-03, -1.5357733e-03,  1.7665242e-04, ...,
          8.2846957e-05, -2.3197941e-03,  9.5731323e-04]],

       [[-3.1409599e-03, -9.0502808e-04,  1.8727144e-03, ...,
          8.9994294e-04, -2.2213692e-03, -4.5712304e-04]],

       [[-3.5127893e-03, -6.7628070e-04,  1.7415372e-03, ...,
         -9.8129280e-04, -3.4677403e-03,  4.5821469e-04]]], dtype=float32)>

In [16]:
# Print the decoder's layers and parameter counts.
decoder.summary()

Model: "decoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     multiple                  3840000   
                                                                 
 gru_1 (GRU)                 multiple                  1182720   
                                                                 
 dense (Dense)               multiple                  7695000   
                                                                 
Total params: 12,717,720
Trainable params: 12,717,720
Non-trainable params: 0
_________________________________________________________________


## 4.- Entrenamiento

In [17]:
# Single Adam optimizer; train_step uses it for the encoder and decoder weights together.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

def loss_function(label, pred):
    """Sparse categorical cross-entropy averaged over positions whose label is not 0.

    Args:
        label: Integer tensor of target token ids; id 0 is excluded from the loss.
        pred: Logits tensor with one score per vocabulary entry for each position.

    Returns:
        Scalar tensor: summed loss of the kept positions divided by their count.
    """
    per_token_loss = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')(label, pred)
    keep = tf.cast(label != 0, dtype=per_token_loss.dtype)
    return tf.reduce_sum(per_token_loss * keep) / tf.reduce_sum(keep)

In [18]:
# Running mean of the per-batch loss; updated in train_step and read once per epoch.
train_loss = tf.keras.metrics.Mean(name='train_loss')

In [19]:
# Show the first three examples of one batch: encoder inputs, decoder inputs, targets.
for inp_enc, inp_dec, tar_dec in train_ds.take(1):
    first_three = (inp_enc[:3], inp_dec[:3], tar_dec[:3])
    print(*first_three)

tf.Tensor(
[[   0    0 1743    4 6271  492   43  743  729    2]
 [   0    0   90  789    2   10  349  925    8  152]
 [   0    0    0    0    0 3384    7  913    8    6]], shape=(3, 10), dtype=int64) tf.Tensor(
[[   2   19  417  424  264 2664    1   22 2140]
 [   2  809   39  238    9  582   11    9  565]
 [   2   94  296   12 5434    3    0    0    0]], shape=(3, 9), dtype=int64) tf.Tensor(
[[  19  417  424  264 2664    1   22 2140    3]
 [ 809   39  238    9  582   11    9  565   11]
 [  94  296   12 5434    3    0    0    0    0]], shape=(3, 9), dtype=int64)


In [20]:
# Check that the encoder state and the decoder tensors agree on the batch dimension.
_, state = encoder(inp_enc, training=True)
tuple(tensor.shape for tensor in (state, inp_dec, tar_dec))

(TensorShape([64, 512]), TensorShape([64, 9]), TensorShape([64, 9]))

In [21]:
@tf.function
def train_step(inp_enc, inp_dec, tar_dec):
    """Run one optimization step on a batch and record its loss in `train_loss`.

    Args:
        inp_enc: Encoder input token ids.
        inp_dec: Decoder input token ids.
        tar_dec: Target token ids the decoder has to predict.
    """
    # Encoder and decoder are optimized jointly, so their weights form one list.
    trainable = encoder.trainable_weights + decoder.trainable_weights

    with tf.GradientTape() as tape:
        _, enc_final_state = encoder(inp_enc, training=True)
        logits = decoder(inp_dec, enc_final_state, training=True)
        batch_loss = loss_function(tar_dec, logits)

    grads = tape.gradient(batch_loss, trainable)
    optimizer.apply_gradients(zip(grads, trainable))
    train_loss(batch_loss)

In [22]:
# Inverse lookup (token id -> token string) over the Spanish vocabulary.
spa_vocabulary = spa_vectorization.get_vocabulary()
ids_to_text = tf.keras.layers.StringLookup(
    vocabulary=spa_vocabulary,
    mask_token='',
    invert=True,
)

In [23]:
# Fixed probe sentences translated after every epoch to watch the model improve.
sentences = [
    'i love my dog',
    'i love to sleep',
    'the cat wants to eat',
]

def print_translation(sentence, max_steps=50):
    """Greedily translate one English sentence and print input and prediction.

    Decoding starts from the '[start]' token and the encoder's final state, feeds
    the most likely token back into the decoder at every step, and stops at the
    first '[end]' token.

    Args:
        sentence: English sentence to translate.
        max_steps: Upper bound on the number of decoded tokens. Without it a model
            that never predicts '[end]' (e.g. an untrained one) would loop forever.
    """
    inp = eng_vectorization([sentence])
    inp = tf.reverse(inp, [1])  # same source reversal that preprocess() applies
    _, state = encoder(inp, training=False)
    dec_inp = spa_vectorization(['[start]'])[:, :1]
    output = []

    for _ in range(max_steps):
        logits, state = decoder(dec_inp, state, return_state=True, training=False)
        dec_inp = tf.argmax(logits, axis=-1)
        # Compare plain Python strings instead of a tensor against a str.
        token = ids_to_text(dec_inp)[0][0].numpy().decode('utf-8')
        if token == '[end]':
            break
        output.append(token)

    text = ' '.join(output)
    print(f'Input: {sentence}')
    print(f'Prediction: {text}')

In [24]:
epochs = 7

# range(1, epochs + 1) runs exactly `epochs` passes with 1-based numbering
# (range(1, epochs) would stop one epoch short).
for epoch in range(1, epochs + 1):
    start = time.time()
    for inp_enc, inp_dec, tar_dec in train_ds:
        train_step(inp_enc, inp_dec, tar_dec)

    print(f'\nTime taken for epoch {epoch} is: {time.time() - start:.2f} secs', end=' ')
    print(f'Loss: {train_loss.result():.4f}')
    # Clear the running mean so the next epoch's loss is not mixed with this one.
    # `reset_state` is the current name of the deprecated `reset_states`.
    train_loss.reset_state()

    # Translate the probe sentences to track qualitative progress.
    for s in sentences:
        print_translation(s)


Time taken for epoch 1 is: 19.94 secs Loss: 5.0653
Input: i love my dog
Prediction: no puedo creer que tom no es tan bueno
Input: i love to sleep
Prediction: no puedo creer que tom no es tan bueno
Input: the cat wants to eat
Prediction: el tren se detuvo

Time taken for epoch 2 is: 7.22 secs Loss: 3.7809
Input: i love my dog
Prediction: me gusta el helado de tom
Input: i love to sleep
Prediction: yo estaba [UNK]
Input: the cat wants to eat
Prediction: el hombre se detuvo a su madre

Time taken for epoch 3 is: 7.29 secs Loss: 2.7501
Input: i love my dog
Prediction: me encanta el helado
Input: i love to sleep
Prediction: me encanta la sandía
Input: the cat wants to eat
Prediction: el perro quiere ir

Time taken for epoch 4 is: 7.62 secs Loss: 1.9537
Input: i love my dog
Prediction: adoro mi perro
Input: i love to sleep
Prediction: me encanta dormir
Input: the cat wants to eat
Prediction: el perro quiere ir

Time taken for epoch 5 is: 6.62 secs Loss: 1.4104
Input: i love my dog
Predictio

## Ejercicio
- Agregar loop de evaluación.
- Mejorar el modelo con las técnicas propuestas en _Sutskever, I., Vinyals, O., & Le, Q. V. (2014). Sequence to sequence learning with neural networks. Advances in neural information processing systems, 27._
- Agregar mecanismo de atención de _Bahdanau_.