# Seq2seq
- En este notebook se define una arquitectura seq2seq para traducir oraciones del inglés al español.

<img src="../img/seq-to-seq.png" width="700"/>

__Imagen tomada de Sutskever, I., Vinyals, O., & Le, Q. V. (2014). Sequence to sequence learning with neural networks. Advances in neural information processing systems, 27.__



In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Disable tensorflow debugging logs
import pathlib
import random
import string
import re
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization
import time

## 1.- Dataset

In [2]:
text_file = keras.utils.get_file(
    fname="spa-eng.zip",
    origin="http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip",
    extract=True,
)
text_file = pathlib.Path(text_file).parent / "spa-eng" / "spa.txt"

In [3]:
with open(text_file) as f:
    lines = f.read().split("\n")[:-1]
    
text_pairs = []
for line in lines:
    eng, spa = line.split("\t")
    spa = "[start] " + spa + " [end]"
    text_pairs.append((eng, spa))

In [4]:
for _ in range(5):
    print(random.choice(text_pairs))

('Tom collected stamps.', '[start] Tom coleccionaba sellos. [end]')
('He repeated his question.', '[start] Él repitió su pregunta. [end]')
("I understand the sentence, but I'm not able to translate it.", '[start] Yo entiendo la frase, pero no logro traducirla. [end]')
('Tom persuaded Mary to help John.', '[start] Tom persuadió a Mary para ayudar a John. [end]')
('His jokes are not funny at all.', '[start] Sus chistes no son para nada graciosos. [end]')


In [5]:
random.shuffle(text_pairs)
num_val_samples = int(0.15 * len(text_pairs))
num_train_samples = len(text_pairs) - 2 * num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples : num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples :]

print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")

118964 total pairs
83276 training pairs
17844 validation pairs
17844 test pairs


## 2.- Pipeline

In [6]:
strip_chars = string.punctuation + "¿"
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")

vocab_size = 15000
maxlen = 10
batch_size = 64

def custom_standardization(input_string):
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowercase, "[%s]" % re.escape(strip_chars), "")

eng_vectorization = TextVectorization(
    max_tokens=vocab_size, output_mode="int", 
    output_sequence_length=maxlen,
)
spa_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=maxlen,
    standardize=custom_standardization,
)
train_eng_texts = [pair[0] for pair in train_pairs]
train_spa_texts = [pair[1] for pair in train_pairs]
eng_vectorization.adapt(train_eng_texts)
spa_vectorization.adapt(train_spa_texts)

In [7]:
eng_vectorization([['my name is'], ['my dog is']])

<tf.Tensor: shape=(2, 10), dtype=int64, numpy=
array([[ 19, 235,   8,   0,   0,   0,   0,   0,   0,   0],
       [ 19, 178,   8,   0,   0,   0,   0,   0,   0,   0]])>

In [8]:
AUTOTUNE = tf.data.experimental.AUTOTUNE

In [9]:
def preprocess(eng, spa):
    eng = eng_vectorization(eng)
    spa = spa_vectorization(spa)
    return tf.reverse(eng, [1]), spa[:, :-1], spa[:, 1:]


def make_dataset(pairs):
    eng_texts, spa_texts = zip(*pairs)
    eng_texts = list(eng_texts)
    spa_texts = list(spa_texts)
    dataset = tf.data.Dataset.from_tensor_slices((eng_texts, spa_texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(preprocess)
    return dataset.shuffle(2048).prefetch(AUTOTUNE).cache()


train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

In [10]:
for inp_enc, inp_dec, tar_dec in train_ds.take(1):
    print(inp_enc[0], inp_dec[0], tar_dec[0])

tf.Tensor([   0    0    0    0    0    0 1023    2    5   23], shape=(10,), dtype=int64) tf.Tensor([  2 132  10 987   3   0   0   0   0], shape=(9,), dtype=int64) tf.Tensor([132  10 987   3   0   0   0   0   0], shape=(9,), dtype=int64)


## 3.- Modelo

In [11]:
emb_dim = 256
model_dim = 1024

### Encoder

In [12]:
class Encoder(tf.keras.Model):
    def __init__(self, voc_size, emb_dim, model_dim):
        super(Encoder, self).__init__()
        self.embedding = tf.keras.layers.Embedding(voc_size,
                                                   emb_dim)
        self.gru = tf.keras.layers.GRU(model_dim,
                                       return_sequences=False,
                                       return_state=True)
        
    def call(self, x, state=None):
        x = self.embedding(x)
        x, state = self.gru(x, initial_state=state)
        return x, state
    
    
encoder = Encoder(eng_vectorization.vocabulary_size(),
                  emb_dim, model_dim)
output, enc_state = encoder(inp_enc)
enc_state

<tf.Tensor: shape=(64, 1024), dtype=float32, numpy=
array([[ 0.00452388,  0.0023647 , -0.0015519 , ...,  0.00635016,
         0.00875575,  0.01872421],
       [ 0.00123157, -0.00115991, -0.00740096, ..., -0.00825757,
        -0.00251782, -0.00243315],
       [-0.00630107, -0.00409868, -0.00735846, ...,  0.00684062,
         0.00467536, -0.00659731],
       ...,
       [-0.00764129, -0.01092764,  0.00017454, ...,  0.01025986,
        -0.00862891, -0.01394198],
       [-0.00279094, -0.00529953, -0.00042023, ..., -0.00011713,
         0.00366927, -0.00757017],
       [ 0.00152143,  0.00721026, -0.00376425, ..., -0.00796608,
         0.00185755,  0.00385784]], dtype=float32)>

In [13]:
output.shape

TensorShape([64, 1024])

In [14]:
encoder.summary()

Model: "encoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  3092992   
                                                                 
 gru (GRU)                   multiple                  3938304   
                                                                 
Total params: 7,031,296
Trainable params: 7,031,296
Non-trainable params: 0
_________________________________________________________________


### Decoder

In [15]:
class Decoder(tf.keras.Model):
    def __init__(self, voc_size, emb_dim, model_dim):
        super().__init__(self)
        self.embedding = layers.Embedding(voc_size, emb_dim)
        self.gru = layers.GRU(model_dim,
                              return_sequences=True,
                              return_state=True)
        self.logits = layers.Dense(voc_size)

    def call(self, x, states, return_state=False, training=False):
        x = self.embedding(x, training=training)
        x, states = self.gru(x, initial_state=states, training=training)
        x = self.logits(x, training=training)

        if return_state:
            return x, states
        else:
            return x 


decoder = Decoder(voc_size=spa_vectorization.vocabulary_size(),
                  emb_dim=emb_dim,
                  model_dim=model_dim)

decoder(inp_dec[:, :1], enc_state)

<tf.Tensor: shape=(64, 1, 15000), dtype=float32, numpy=
array([[[-8.50444310e-04, -7.19906110e-03,  2.85117159e-04, ...,
         -1.43465970e-03,  8.48916388e-05,  2.58170650e-03]],

       [[-7.43264391e-04, -5.14982687e-03, -1.91092945e-03, ...,
         -1.62706245e-03, -9.96784511e-05,  2.18255562e-03]],

       [[-2.51733552e-04, -4.67995415e-03, -5.42414316e-04, ...,
         -2.83832848e-03,  2.62653339e-03,  2.64436705e-03]],

       ...,

       [[-1.07496955e-04, -5.61239943e-03, -1.89118064e-03, ...,
         -3.06227896e-03,  1.09169065e-04,  3.16094514e-03]],

       [[-1.70495559e-03, -5.15167974e-03, -2.47775181e-03, ...,
         -2.03523342e-03, -1.08800930e-04,  3.08055524e-03]],

       [[-1.09492219e-03, -4.69784671e-03, -1.91600493e-03, ...,
         -1.51437439e-03, -4.36020200e-04,  3.40514031e-04]]],
      dtype=float32)>

In [16]:
decoder.summary()

Model: "decoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     multiple                  3840000   
                                                                 
 gru_1 (GRU)                 multiple                  3938304   
                                                                 
 dense (Dense)               multiple                  15375000  
                                                                 
Total params: 23,153,304
Trainable params: 23,153,304
Non-trainable params: 0
_________________________________________________________________


## 4.- Entrenamiento

In [17]:
optimizer = tf.keras.optimizers.Adam(0.001)

def loss_function(label, pred):
    mask = label != 0
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')
    loss = loss_object(label, pred)

    mask = tf.cast(mask, dtype=loss.dtype)
    loss *= mask

    loss = tf.reduce_sum(loss)/tf.reduce_sum(mask)
    return loss

In [18]:
train_loss = tf.keras.metrics.Mean(name='train_loss')

In [19]:
for inp_enc, inp_dec, tar_dec in train_ds.take(1):
    print(inp_enc[:3], inp_dec[:3], tar_dec[:3])

tf.Tensor(
[[   0    0  841 1071   20  404   30    4   82    5]
 [   0    0    0    0    0    0  180   55 5107    3]
 [   0    0    0    0   45   41    4 3092    8    9]], shape=(3, 10), dtype=int64) tf.Tensor(
[[   2   92    5  126  511   16  387  303    5]
 [   2   14  810   55    3    0    0    0    0]
 [   2   22 3701   16   63    3    0    0    0]], shape=(3, 9), dtype=int64) tf.Tensor(
[[  92    5  126  511   16  387  303    5 3470]
 [  14  810   55    3    0    0    0    0    0]
 [  22 3701   16   63    3    0    0    0    0]], shape=(3, 9), dtype=int64)


In [20]:
_, state = encoder(inp_enc, training=True)
state.shape, inp_dec.shape, tar_dec.shape

(TensorShape([64, 1024]), TensorShape([64, 9]), TensorShape([64, 9]))

In [21]:
@tf.function
def train_step(inp_enc, inp_dec, tar_dec):
    with tf.GradientTape() as tape:
        _, state = encoder(inp_enc, training=True)
        pred = decoder(inp_dec, state, training=True)
        loss_value = loss_function(tar_dec, pred)
        
    weights = encoder.trainable_weights + decoder.trainable_weights
    gradients = tape.gradient(loss_value, weights)
    optimizer.apply_gradients(zip(gradients, weights))
    train_loss(loss_value)

In [22]:
ids_to_text = tf.keras.layers.StringLookup(
                vocabulary=spa_vectorization.get_vocabulary(),
                mask_token='',
                invert=True)

In [23]:
sentences = ['i love my dog',
             'i love to sleep',
             'the cat wants to eat']

def generate_translation(sentence):
    inp = eng_vectorization([sentence])
    inp = tf.reverse(inp, [1])
    _, state = encoder(inp, training=False)
    dec_inp = spa_vectorization(['[start]'])[:, :1]
    output = []
    pred_index = ''

    while pred_index != '[end]':
        logits, state = decoder(dec_inp, state, return_state=True, training=False)
        dec_inp = tf.argmax(logits, axis=-1)
        pred_index = ids_to_text(dec_inp)
        output.append(pred_index[0][0].numpy().decode('utf-8'))

    text = ' '.join(output[:-1])
    return text

In [24]:
epochs = 5

for epoch in range(1, epochs):
    start = time.time()
    for inp_enc, inp_dec, tar_dec in train_ds:
        train_step(inp_enc, inp_dec, tar_dec)
        
    print(f'Time taken for epoch {epoch} is: {time.time() - start:.2f} secs', end=' ')
    print(f'Loss: {train_loss.result():.4f}')
    train_loss.reset_states()
    
    print('Output: ')
    for s in sentences:
        trans = generate_translation(s)
        print(f"{trans}")

Time taken for epoch 1 is: 23.94 secs Loss: 4.3734
Output: 
me gusta mi perro
me encantan las galletas
la policía quiere que [UNK]
Time taken for epoch 2 is: 10.79 secs Loss: 2.5515
Output: 
me encanta mi perro
me encantan las noches
el policía quiere comer
Time taken for epoch 3 is: 10.75 secs Loss: 1.5755
Output: 
amo a mi perro
me gusta dormir
el gato quiere comer
Time taken for epoch 4 is: 10.83 secs Loss: 0.9843
Output: 
amo a mi perro
me gusta dormir
el gato quiere comer


## Ejercicio
- Agregar loop de evaluación.
- Mejorar el modelo con las técnicas propuestas en _Sutskever, I., Vinyals, O., & Le, Q. V. (2014). Sequence to sequence learning with neural networks. Advances in neural information processing systems, 27._
- Agreagar mecanismo de atención de _Bahdanau_.