In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Disable tensorflow debugging logs
import tensorflow as tf
from tensorflow.keras import layers
import matplotlib.pyplot as plt

## Importar dataset

In [2]:
import tensorflow_datasets as tfds

In [3]:
# Load the IMDB reviews dataset; with as_supervised=True each element is a
# (text, label) pair, which is how the splits are iterated below.
dataset = tfds.load('imdb_reviews', as_supervised=True)

In [4]:
# Keep the two splits used in this notebook: 'train' for fitting and 'test'
# for the validation loss.
raw_train_ds, raw_test_ds = dataset['train'], dataset['test']

In [5]:
# Peek at a single raw example: the review text (bytes) and its integer label.
for text, label in raw_train_ds.take(1):
    print(text.numpy(), label.numpy())

b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it." 0


## Preparar dataset

In [6]:
# Sentinel that lets tf.data pick parallelism / prefetch depth at runtime.
AUTOTUNE = tf.data.AUTOTUNE
# Number of training examples (scalar tensor); used as the shuffle buffer size
# so the whole split is shuffled.
BUFFER_SIZE = raw_train_ds.cardinality()
BUFFER_SIZE.numpy()

25000

In [7]:
# Hyperparameters shared by both models.
batch_size = 128
voc_size = 5000

# Training pipeline: shuffle over the full split, then batch and prefetch.
train_ds = raw_train_ds.shuffle(BUFFER_SIZE)
train_ds = train_ds.batch(batch_size, num_parallel_calls=AUTOTUNE)
train_ds = train_ds.prefetch(AUTOTUNE)

# Evaluation pipeline: same batching, but the original order is kept.
test_ds = raw_test_ds.batch(batch_size, num_parallel_calls=AUTOTUNE)
test_ds = test_ds.prefetch(AUTOTUNE)

In [8]:
# Show the first review of one (shuffled) training batch.
for text, label in train_ds.take(1):
    print(text[0])

tf.Tensor(b"Gwyneth Paltrow is absolutely great in this movie, but the story is, unfortunately, half-baked, and David Schwimmer's energy is sort of like cold mush. When he closes his mouth and gets serious for a moment or two there is a rush of what-might-have-been. Who thought 25-year-old kiddies would be entertaining?", shape=(), dtype=string)


## Tokenización

In [9]:
# Fixed length of the token-id sequence produced for every review.
seq_length = 128

# Maps raw strings to integer token ids over a vocabulary of `voc_size` entries.
vectorize_layer = layers.TextVectorization(
    output_mode='int',
    output_sequence_length=seq_length,
    max_tokens=voc_size,
)

- Adaptar la capa

In [10]:
# Build the vocabulary from the training texts only; labels are dropped first.
vectorize_layer_ds = train_ds.map(lambda review, _sentiment: review)
vectorize_layer.adapt(vectorize_layer_ds)

In [11]:
# First vocabulary entries: index 0 is the empty (padding) token and index 1
# is the out-of-vocabulary token '[UNK]'.
vectorize_layer.get_vocabulary()[:10]

['', '[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it']

- Probar vectorize_layer con batch de prueba

In [12]:
# Single-sentence batch of shape (1, 1), reused below as a smoke-test input
# for the vectorizer and both models.
test_batch = tf.constant([['Hi there']])
vectorize_layer(test_batch)

<tf.Tensor: shape=(1, 128), dtype=int64, numpy=
array([[ 1, 48,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])>

## Definir LSTM

RNN:
\begin{equation}
h_t = f(Wx_t + Uh_{t-1} + b)
\end{equation}



LSTM:

\begin{align}
i_t & = \sigma(W^ix_t + U^ih_{t-1} + b^i) \\
f_t & = \sigma(W^fx_t + U^fh_{t-1} + b^f) \\
o_t & = \sigma(W^ox_t + U^oh_{t-1} + b^o) \\
g_t & = \text{tanh}(W^gx_t + U^gh_{t-1} + b^g) \\
c_t & = f_t \odot c_{t-1} + i_t \odot g_t\\
h_t & = o_t \odot \text{tanh}(c_t)
\end{align}

In [13]:
# Bidirectional LSTM classifier that consumes raw review strings.
lstm = tf.keras.Sequential()
lstm.add(vectorize_layer)  # strings -> token ids
lstm.add(layers.Embedding(input_dim=voc_size, output_dim=128))
lstm.add(layers.Bidirectional(layers.LSTM(128)))
lstm.add(layers.Dense(64, activation='relu'))
lstm.add(layers.Dense(1))  # one unnormalised score (logit) per review

- Probar LSTM con batch de prueba

In [14]:
# Forward pass on the smoke-test batch; expected output shape is (1, 1).
lstm(test_batch)

Could not load symbol cublasGetSmCountTarget from libcublas.so.11. Error: /usr/local/cuda/targets/x86_64-linux/lib/libcublas.so.11: undefined symbol: cublasGetSmCountTarget


<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.00374339]], dtype=float32)>

- Información del modelo

In [15]:
# Print the per-layer output shapes and parameter counts.
lstm.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 128)              0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 128, 128)          640000    
                                                                 
 bidirectional (Bidirectiona  (None, 256)              263168    
 l)                                                              
                                                                 
 dense (Dense)               (None, 64)                16448     
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 919,681
Trainable params: 919,681
Non-trainable params: 0
_________________________________________________________________

## Entrenamiento LSTM

In [16]:
# from_logits=True: the classifier's final Dense(1) has no activation, so the
# loss receives raw scores rather than probabilities.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)

In [17]:
# Optimizer used only for the LSTM model's weights.
lstm_opt = tf.keras.optimizers.Adam(learning_rate=0.0001)

In [18]:
# Running means of the per-batch losses; read and reset once per epoch by the
# training loop.
train_loss_avg = tf.keras.metrics.Mean(name='train_loss')
val_loss_avg = tf.keras.metrics.Mean(name='val_loss')

In [19]:
# Number of full passes over the training set.
epochs = 5

In [20]:
@tf.function
def train_step(text, target):
    """Run one optimisation step of the LSTM model on a single batch.

    Args:
        text: batch of raw review strings.
        target: batch of integer labels; cast to float32 for the loss.

    Side effects:
        Updates the weights of `lstm` through `lstm_opt` and adds the batch
        loss to `train_loss_avg`.
    """
    labels = tf.cast(target, tf.float32)
    with tf.GradientTape() as grad_tape:
        batch_loss = loss(labels, lstm(text, training=True))

    weights = lstm.trainable_weights
    grads = grad_tape.gradient(batch_loss, weights)
    lstm_opt.apply_gradients(zip(grads, weights))
    train_loss_avg(batch_loss)
    
@tf.function
def test_step(text, target):
    """Evaluate the LSTM model on one batch and accumulate its loss.

    No gradient tape is opened here: evaluation never differentiates, so
    recording the forward pass would only cost memory and time.

    Args:
        text: batch of raw review strings.
        target: batch of integer labels; cast to float32 for the loss.

    Side effects:
        Adds the batch loss to `val_loss_avg`.
    """
    logits = lstm(text, training=False)
    loss_value = loss(tf.cast(target, tf.float32), logits)
    val_loss_avg(loss_value)

In [21]:
# Inspect the labels of one training batch: a 1-D int64 tensor of 0/1 values.
for text, target in train_ds.take(1):
    print(target)

tf.Tensor(
[1 0 0 0 1 1 0 1 0 1 0 0 1 1 0 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 0 0 0
 0 0 0 0 1 0 1 1 1 1 1 1 0 1 1 0 1 1 0 0 1 1 1 0 1 0 1 0 0 1 0 1 1 0 0 0 1
 1 1 0 0 1 1 1 0 0 1 1 1 0 0 0 1 0 0 0 0 1 1 0 1 1 0 0 1 1 0 1 1 1 0 0 1 1
 1 1 1 0 0 1 0 0 0 0 1 1 1 0 0 1 1], shape=(128,), dtype=int64)


In [22]:
# Train the LSTM for `epochs` passes and report the mean loss of each split.
for epoch in range(epochs):
    # Reset at the start of the epoch (not the end) so that a previously
    # interrupted run of this cell cannot leak partial sums into the averages.
    # reset_state() is the current Keras API; reset_states() is deprecated.
    train_loss_avg.reset_state()
    val_loss_avg.reset_state()

    for text, target in train_ds:
        train_step(text, target)

    print(f'Epoch: {epoch} Train loss: {train_loss_avg.result().numpy()}')

    for text, target in test_ds:
        test_step(text, target)

    print(f'Val loss: {val_loss_avg.result().numpy()}')

Epoch: 0 Train loss: 0.6462168097496033
Val loss: 0.489723801612854
Epoch: 1 Train loss: 0.40697240829467773
Val loss: 0.3893091380596161
Epoch: 2 Train loss: 0.32604965567588806
Val loss: 0.38884618878364563
Epoch: 3 Train loss: 0.29338598251342773
Val loss: 0.38962098956108093
Epoch: 4 Train loss: 0.2743459939956665
Val loss: 0.3940851390361786


## Definir Transformer

<img src="../img/dot_product.png" width="500"/>

__Imagen tomada de Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., ... & Polosukhin, I. (2017). Attention is all you need. Advances in neural information processing systems, 30.__

\begin{equation}
\mbox{MultiHead}(Q, K, V) = \text{Concat}(\mbox{head}_1,\mbox{head}_2,\ldots,\mbox{head}_h)W^O,
\end{equation}

\begin{equation}
\mbox{head}_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V) = \text{softmax}\left[\frac{QW_i^Q(KW_i^K)^T}{\sqrt{d_k}}\right]VW_i^V,
\label{eq:selfattention}
\end{equation}

In [23]:
def scaled_dot_product(q, k, v):
    """Scaled dot-product attention: softmax(q @ k^T / sqrt(d_k)) @ v.

    Args:
        q: queries, shape (..., seq_q, depth).
        k: keys, shape (..., seq_k, depth).
        v: values, shape (..., seq_k, depth_v).

    Returns:
        Attention output of shape (..., seq_q, depth_v).
    """
    qk = tf.matmul(q, k, transpose_b=True)
    # Cast the scale to the dtype of the logits instead of a fixed float32:
    # TensorFlow does not auto-promote, so a float32 divisor would fail for
    # float16/bfloat16/float64 inputs (e.g. under a mixed-precision policy).
    dk = tf.cast(tf.shape(k)[-1], qk.dtype)
    scaled_qk = qk / tf.math.sqrt(dk)
    # Normalise over the key axis so each query's weights sum to 1.
    attn_weights = tf.nn.softmax(scaled_qk, axis=-1)
    output = tf.matmul(attn_weights, v)
    return output

In [24]:
class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head scaled dot-product attention.

    Queries, keys and values are each projected to `model_dim` features,
    split into `n_heads` heads of size `model_dim // n_heads`, attended per
    head with `scaled_dot_product`, merged back together and passed through a
    final dense projection.
    """

    def __init__(self, model_dim, n_heads):
        """Create the projections.

        Args:
            model_dim: feature size of the layer's input and output.
            n_heads: number of attention heads; must divide `model_dim`.
        """
        super().__init__()
        self.n_heads = n_heads
        self.model_dim = model_dim

        # Every head must receive an equal slice of the feature dimension.
        assert model_dim % n_heads == 0
        self.depth = model_dim // n_heads

        # Input projections for queries, keys and values.
        self.wq = layers.Dense(model_dim)
        self.wk = layers.Dense(model_dim)
        self.wv = layers.Dense(model_dim)

        # Output projection applied to the re-merged heads.
        self.dense = layers.Dense(model_dim)

    def split_into_heads(self, x, batch_size):
        """Reshape (batch, seq, model_dim) into (batch, n_heads, seq, depth)."""
        per_head = tf.reshape(x, (batch_size, -1, self.n_heads, self.depth))
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(self, q, k, v):
        """Attend `q` over `k`/`v`; returns a (batch, seq_q, model_dim) tensor."""
        n_batch = tf.shape(q)[0]

        # Project, then move the head axis in front of the sequence axis.
        q_heads = self.split_into_heads(self.wq(q), n_batch)
        k_heads = self.split_into_heads(self.wk(k), n_batch)
        v_heads = self.split_into_heads(self.wv(v), n_batch)

        context = scaled_dot_product(q_heads, k_heads, v_heads)

        # Undo the head split: (batch, heads, seq, depth) -> (batch, seq, model_dim).
        context = tf.transpose(context, perm=[0, 2, 1, 3])
        merged = tf.reshape(context, (n_batch, -1, self.model_dim))

        return self.dense(merged)

- Definir embedding de tokens y de posición

In [25]:
class TokenEmbedding(layers.Layer):
    """Sum of learned token and learned position embeddings, then dropout."""

    def __init__(self, maxlen, vocab_size, emb_dim,
                 rate=0.1):
        """Create the embedding tables.

        Args:
            maxlen: number of rows in the position table, i.e. the longest
                sequence the layer can embed.
            vocab_size: number of rows in the token table.
            emb_dim: size of each embedding vector.
            rate: dropout rate applied to the summed embeddings.
        """
        super(TokenEmbedding, self).__init__()
        self.max_len = maxlen
        self.token_emb = layers.Embedding(
            input_dim=vocab_size, output_dim=emb_dim)
        self.position_emb = layers.Embedding(
            input_dim=maxlen, output_dim=emb_dim)
        self.dropout = layers.Dropout(rate)

    def call(self, x):
        """Embed token ids `x` of shape (batch, seq) into (batch, seq, emb_dim)."""
        token_embeddings = self.token_emb(x)
        # Take the positions from the actual sequence length of `x` rather
        # than always `max_len`; otherwise any input shorter than `max_len`
        # fails on the broadcast add below.
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.position_emb(positions)
        # (seq, emb_dim) broadcasts over the batch axis of the token embeddings.
        return self.dropout(token_embeddings + positions)
    
# Smoke test: embed a dummy (1, 128) batch of ids and inspect the result.
pos_emb = TokenEmbedding(maxlen=128, vocab_size=voc_size, emb_dim=256)
pos_emb(tf.ones((1, 128)))

<tf.Tensor: shape=(1, 128, 256), dtype=float32, numpy=
array([[[ 0.07766364, -0.0579814 ,  0.02177919, ...,  0.06702017,
          0.05156231,  0.01449088],
        [ 0.08085793,  0.02139024, -0.05316234, ...,  0.03676517,
          0.06496051,  0.03297361],
        [ 0.04436861, -0.03274007, -0.02505894, ...,  0.03902562,
          0.07708043,  0.04683213],
        ...,
        [ 0.02859744,  0.02100952, -0.01246803, ...,  0.01458673,
          0.00282642, -0.0247736 ],
        [ 0.01724117, -0.02486933,  0.00193879, ...,  0.0335523 ,
          0.0177079 , -0.00016836],
        [ 0.07892369,  0.00209139, -0.06094391, ...,  0.03320872,
          0.00024458, -0.01195301]]], dtype=float32)>

In [26]:
class TransformerBlock(tf.keras.layers.Layer):
    """Transformer encoder block.

    Self-attention followed by a two-layer MLP; each sub-layer has dropout, a
    residual connection and a LayerNormalization applied after the residual
    sum.
    """

    def __init__(self, model_dim, n_heads=2, mlp_dim=512,
                 rate=0.0, eps=1e-6):
        """Create the sub-layers.

        Args:
            model_dim: feature size of the block's input and output.
            n_heads: number of attention heads.
            mlp_dim: hidden size of the MLP.
            rate: dropout rate applied after attention and after the MLP.
            eps: epsilon of both LayerNormalization layers.
        """
        super(TransformerBlock, self).__init__()
        self.attn = MultiHeadAttention(model_dim, n_heads)
        self.mlp = tf.keras.Sequential([
            layers.Dense(mlp_dim, activation='gelu'),
            layers.Dense(model_dim),
        ])
        self.norm1 = layers.LayerNormalization(epsilon=eps)
        self.norm2 = layers.LayerNormalization(epsilon=eps)
        self.drop1 = layers.Dropout(rate)
        self.drop2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to `inputs`.

        Args:
            inputs: tensor whose last dimension is `model_dim`.
            training: whether dropout is active. Defaults to None, following
                the Keras `call(inputs, training=None)` convention, so the
                method can also be called without the argument.

        Returns:
            Tensor with the same shape as `inputs`.
        """
        # Self-attention: queries, keys and values are all `inputs`.
        attn_output = self.attn(inputs, inputs, inputs)
        attn_output = self.drop1(attn_output, training=training)
        x_norm1 = self.norm1(attn_output + inputs)

        mlp_output = self.mlp(x_norm1)
        mlp_output = self.drop2(mlp_output, training=training)
        return self.norm2(mlp_output + x_norm1)
    
# Construction smoke test with model_dim=128; `block` is not referenced again
# in the cells shown here.
block = TransformerBlock(128)

In [27]:
class Transformer(tf.keras.models.Model):
    """Single-block Transformer encoder for binary text classification.

    Pipeline: raw strings -> token ids (module-level `vectorize_layer`) ->
    token + position embeddings -> one TransformerBlock -> average pooling
    over the sequence -> one logit.
    """

    def __init__(self, model_dim, voc_size, mlp_dim=256,
                 seq_length=128, heads=4):
        """Create the sub-layers.

        Args:
            model_dim: embedding / feature size used throughout the model.
            voc_size: vocabulary size of the token embedding.
            mlp_dim: hidden size of the block's MLP.
            seq_length: sequence length of the position embedding.
            heads: number of attention heads.
        """
        super(Transformer, self).__init__()
        self.emb = TokenEmbedding(seq_length, voc_size, model_dim)
        self.block = TransformerBlock(model_dim, heads, mlp_dim)
        self.out = tf.keras.Sequential([
            layers.GlobalAveragePooling1D(),
            layers.Dense(1)
        ])

    def call(self, x, training=None):
        """Return one logit per input string.

        Args:
            x: batch of raw strings.
            training: training-mode flag. The model is invoked with
                `training=True/False` by the train/test steps, so the argument
                is declared here and forwarded explicitly to the block.

        Returns:
            Tensor of shape (batch, 1) with unnormalised scores.
        """
        x = vectorize_layer(x)
        x = self.emb(x)
        x = self.block(x, training=training)
        x = self.out(x)
        return x
    
# Instantiate the classifier and run the smoke-test batch through it.
transformer = Transformer(model_dim=128, voc_size=voc_size)
transformer(test_batch)

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[1.4868903]], dtype=float32)>

In [28]:
# Print the per-layer parameter counts of the Transformer classifier.
transformer.summary()

Model: "transformer"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 token_embedding_1 (TokenEmb  multiple                 656384    
 edding)                                                         
                                                                 
 transformer_block_1 (Transf  multiple                 132480    
 ormerBlock)                                                     
                                                                 
 sequential_3 (Sequential)   (1, 1)                    129       
                                                                 
Total params: 788,993
Trainable params: 788,993
Non-trainable params: 0
_________________________________________________________________


## Entrenamiento Transformer
- Utilizar los mismos parámetros de lstm

In [29]:
# Same loss as for the LSTM; from_logits=True because the model's final
# Dense(1) has no activation.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)

In [30]:
# Separate Adam instance for the Transformer, with the same learning rate as
# the LSTM's optimizer.
trans_opt = tf.keras.optimizers.Adam(learning_rate=0.0001)

In [31]:
# Fresh running means for the Transformer run (these rebind the names used
# during LSTM training).
train_loss_avg = tf.keras.metrics.Mean(name='train_loss')
val_loss_avg = tf.keras.metrics.Mean(name='val_loss')

In [32]:
# Same number of epochs as the LSTM run, for a like-for-like comparison.
epochs = 5

In [33]:
@tf.function
def train_step(text, target):
    """Run one optimisation step of the Transformer on a single batch.

    Args:
        text: batch of raw review strings.
        target: batch of integer labels; cast to float32 for the loss.

    Side effects:
        Updates the weights of `transformer` through `trans_opt` and adds the
        batch loss to `train_loss_avg`.
    """
    labels = tf.cast(target, tf.float32)
    with tf.GradientTape() as grad_tape:
        batch_loss = loss(labels, transformer(text, training=True))

    weights = transformer.trainable_weights
    grads = grad_tape.gradient(batch_loss, weights)
    trans_opt.apply_gradients(zip(grads, weights))
    train_loss_avg(batch_loss)
    
@tf.function
def test_step(text, target):
    """Evaluate the Transformer on one batch and accumulate its loss.

    No gradient tape is opened here: evaluation never differentiates, so
    recording the forward pass would only cost memory and time.

    Args:
        text: batch of raw review strings.
        target: batch of integer labels; cast to float32 for the loss.

    Side effects:
        Adds the batch loss to `val_loss_avg`.
    """
    logits = transformer(text, training=False)
    loss_value = loss(tf.cast(target, tf.float32), logits)
    val_loss_avg(loss_value)

In [34]:
# Train the Transformer for `epochs` passes and report the mean loss of each split.
for epoch in range(epochs):
    # Reset at the start of the epoch (not the end) so that a previously
    # interrupted run of this cell cannot leak partial sums into the averages.
    # reset_state() is the current Keras API; reset_states() is deprecated.
    train_loss_avg.reset_state()
    val_loss_avg.reset_state()

    for text, target in train_ds:
        train_step(text, target)

    print(f'Epoch: {epoch} Train loss: {train_loss_avg.result().numpy()}')

    for text, target in test_ds:
        test_step(text, target)

    print(f'Val loss: {val_loss_avg.result().numpy()}')

Epoch: 0 Train loss: 0.5820030570030212
Val loss: 0.44539397954940796
Epoch: 1 Train loss: 0.38116198778152466
Val loss: 0.38284048438072205
Epoch: 2 Train loss: 0.32424184679985046
Val loss: 0.38564532995224
Epoch: 3 Train loss: 0.2971893548965454
Val loss: 0.38352257013320923
Epoch: 4 Train loss: 0.2794248163700104
Val loss: 0.3927634060382843


## Ejercicio

- Modificar hiperparámetros de los modelos para obtener mejores resultados.
- Modificar las arquitecturas, comparar resultados con GRU.
- Agregar y modificar regularización.