In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Disable tensorflow debugging logs
import tensorflow as tf
from tensorflow.keras import layers
import matplotlib.pyplot as plt

## Importar dataset

In [2]:
import tensorflow_datasets as tfds

In [3]:
# Load the 'imdb_reviews' dataset through TensorFlow Datasets.
# as_supervised=True: elements are iterated below as (text, label) pairs.
dataset = tfds.load('imdb_reviews', as_supervised=True)

In [4]:
# Pull the 'train' and 'test' splits out of the loaded dataset mapping.
raw_train_ds, raw_test_ds = dataset['train'], dataset['test']

In [5]:
# Peek at a single (text, label) example to see the raw, untokenized format.
for text, label in raw_train_ds.take(1):
    print(text.numpy(), label.numpy())

b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it." 0


## Preparar dataset

In [6]:
# Let tf.data pick the parallelism / prefetch depth at runtime.
# Uses the stable alias instead of the older tf.data.experimental one.
AUTOTUNE = tf.data.AUTOTUNE
# Number of training examples (scalar int64 tensor); it is used as the shuffle
# buffer size so the shuffle buffer can hold the entire training split.
BUFFER_SIZE = raw_train_ds.cardinality()
BUFFER_SIZE.numpy()

25000

In [7]:
batch_size = 128
voc_size = 5000

# Training pipeline: shuffle over the whole split, batch, then prefetch.
train_ds = raw_train_ds.shuffle(BUFFER_SIZE)
train_ds = train_ds.batch(batch_size, num_parallel_calls=AUTOTUNE)
train_ds = train_ds.prefetch(AUTOTUNE)

# Validation pipeline: same batching and prefetching, but no shuffling.
val_ds = raw_test_ds.batch(batch_size, num_parallel_calls=AUTOTUNE)
val_ds = val_ds.prefetch(AUTOTUNE)

In [8]:
# Take one batch from the training pipeline and show its first review.
for text, label in train_ds.take(1):
    print(text[0])

tf.Tensor(b"What is most disturbing about this film is not that school killing sprees like the one depicted actually happen, but that the truth is they are carried out by teenagers like Cal and Andre...normal kids with normal families. By using a hand held camera technique a la Blair Witch, Ben Coccio succeeds in bringing us into the lives of two friends who have some issues with high school, although we aren't ever told exactly what is behind those issues. They seem to be typical -a lot of people hate high school, so what? A part of you just doesn't believe they will ever carry out the very well thought out massacre on Zero Day. The surveillance camera scenes in the school during the shooting are made all the more powerful for that reason. You can't believe it's really happening, and that it's really happened. The hand held camera technique also creates the illusion that this is not a scripted movie, a brilliant idea given the subject matter.", shape=(), dtype=string)


## Tokenización

In [9]:
seq_length = 128
# Maps raw strings to integer token ids. The vocabulary is capped at voc_size
# entries and every output row has exactly seq_length ids (zero-padded, as the
# test batch output further down shows).
vectorize_layer = layers.TextVectorization(
    max_tokens=voc_size,
    output_mode='int',
    output_sequence_length=seq_length)

- Adaptar la capa

In [10]:
# The vocabulary is built from text only, so drop the labels before adapting.
vectorize_layer_ds = train_ds.map(lambda text, label: text)
vectorize_layer.adapt(vectorize_layer_ds)

In [11]:
# Show the first ten vocabulary entries (index 0 is '' and index 1 is '[UNK]').
vectorize_layer.get_vocabulary()[:10]

['', '[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it']

- Probar vectorize_layer con batch de prueba

In [12]:
# Sanity check: a (1, 1) batch holding one string is turned into a (1, 128)
# row of token ids.
test_batch = tf.constant([['Hi there']])
vectorize_layer(test_batch)

<tf.Tensor: shape=(1, 128), dtype=int64, numpy=
array([[ 1, 48,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])>

## Definir Transformer

<img src="../img/linformer.png" width="700"/>

__Imagen tomada de Wang, S., Li, B. Z., Khabsa, M., Fang, H., & Ma, H. (2020). Linformer: Self-attention with linear complexity. arXiv preprint arXiv:2006.04768.__

\begin{equation}
\mbox{MultiHead}(Q, K, V) = \text{Concat}(\mbox{head}_1,\mbox{head}_2,\ldots,\mbox{head}_h)W^O,
\end{equation}

Dot-product attention:

\begin{equation}
\mbox{head}_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V) = \text{softmax}\left[\frac{QW_i^Q(KW_i^K)^T}{\sqrt{d_k}}\right]VW_i^V,
\label{eq:selfattention}
\end{equation}

Low-Rank attention:
\begin{align}
\text{head}_i &= \mbox{Attention}(QW_i^Q, E_iKW_i^K, F_iVW_i^V)\notag\\
&=\underbrace{\mbox{softmax}\left(\frac{QW_i^Q(E_iKW_i^K)^T}{\sqrt{d_k}}\right)}_{\bar{P}: n\times k}\cdot\underbrace{F_iVW_i^V}_{k\times d},\label{eq:linearattenion}
\end{align}

In [13]:
def scaled_dot_product(q, k, v):
    """Compute scaled dot-product attention.

    Args:
        q: Query tensor; its last axis is the feature axis.
        k: Key tensor; its last axis must match the last axis of ``q``.
        v: Value tensor; its second-to-last axis must match that of ``k``.

    Returns:
        ``softmax(q @ k^T / sqrt(d_k)) @ v``, where ``d_k`` is the size of the
        last axis of ``k``.
    """
    scores = tf.matmul(q, k, transpose_b=True)
    # Cast the scale to the dtype of the scores instead of hard-coding float32,
    # so float16 / bfloat16 / float64 inputs do not fail with a dtype mismatch
    # in the division. For float32 inputs the result is unchanged.
    dk = tf.cast(tf.shape(k)[-1], scores.dtype)
    attn_weights = tf.nn.softmax(scores / tf.math.sqrt(dk), axis=-1)
    return tf.matmul(attn_weights, v)

In [17]:
class LinformerAttention(tf.keras.layers.Layer):
    """Multi-head attention with low-rank (Linformer-style) keys and values.

    After the usual per-token projections, keys and values are projected along
    the sequence axis down to ``k`` positions by the ``E`` and ``F`` layers, and
    scaled dot-product attention is applied to the shortened tensors. The same
    ``E`` and ``F`` layers are applied to every head.

    Args:
        model_dim: Feature width of the output; must be divisible by ``n_heads``.
        n_heads: Number of attention heads.
        k: Length of the projected key/value sequence.
    """

    def __init__(self, model_dim, n_heads, k):
        super().__init__()
        self.n_heads = n_heads
        self.model_dim = model_dim

        assert model_dim % self.n_heads == 0

        # Feature width handled by each individual head.
        self.depth = model_dim // self.n_heads

        # Per-token projections for queries, keys and values.
        self.wq = layers.Dense(model_dim)
        self.wk = layers.Dense(model_dim)
        self.wv = layers.Dense(model_dim)

        # Sequence-axis projections for keys (E) and values (F).
        self.E = layers.Dense(k)
        self.F = layers.Dense(k)

        # Output projection applied after the heads are merged back together.
        self.dense = layers.Dense(model_dim)

    def split_into_heads(self, x, batch_size):
        """Reshape ``(batch, seq, model_dim)`` into ``(batch, n_heads, seq, depth)``."""
        per_head = tf.reshape(x, (batch_size, -1, self.n_heads, self.depth))
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    @staticmethod
    def _project_sequence(x, projection):
        """Apply ``projection`` along the sequence axis of a 4-D head tensor."""
        # Dense acts on the last axis, so swap the last two axes, project,
        # and swap them back.
        swap_last_two = [0, 1, 3, 2]
        projected = projection(tf.transpose(x, swap_last_two))
        return tf.transpose(projected, swap_last_two)

    def call(self, q, k, v):
        """Return the attention output with ``model_dim`` features per query position."""
        n_batch = tf.shape(q)[0]

        queries = self.split_into_heads(self.wq(q), n_batch)
        keys = self.split_into_heads(self.wk(k), n_batch)
        values = self.split_into_heads(self.wv(v), n_batch)

        keys = self._project_sequence(keys, self.E)
        values = self._project_sequence(values, self.F)

        heads = scaled_dot_product(queries, keys, values)

        # (batch, heads, seq, depth) -> (batch, seq, heads, depth)
        # -> (batch, seq, model_dim)
        heads = tf.transpose(heads, perm=[0, 2, 1, 3])
        merged = tf.reshape(heads, (n_batch, -1, self.model_dim))

        return self.dense(merged)
    

# Smoke test: a (1, 256, 32) input with 2 heads and k=64 returns a tensor of
# the same (1, 256, 32) shape.
x = tf.ones([1, 256, 32])
LinformerAttention(32, 2, 64)(x, x, x)

<tf.Tensor: shape=(1, 256, 32), dtype=float32, numpy=
array([[[-0.24511753, -0.0791252 , -0.10442789, ..., -0.22079368,
          0.313533  , -0.3346887 ],
        [-0.24511753, -0.0791252 , -0.10442789, ..., -0.22079368,
          0.313533  , -0.3346887 ],
        [-0.24511753, -0.0791252 , -0.10442789, ..., -0.22079368,
          0.313533  , -0.3346887 ],
        ...,
        [-0.24511753, -0.0791252 , -0.10442789, ..., -0.22079368,
          0.313533  , -0.3346887 ],
        [-0.24511753, -0.0791252 , -0.10442789, ..., -0.22079368,
          0.313533  , -0.3346887 ],
        [-0.24511753, -0.0791252 , -0.10442789, ..., -0.22079368,
          0.313533  , -0.3346887 ]]], dtype=float32)>

- Definir embedding de tokens y de posición

In [18]:
class TokenEmbedding(layers.Layer):
    """Sum of learned token and position embeddings, followed by dropout.

    Args:
        maxlen: Number of rows in the position-embedding table, i.e. the
            longest sequence the layer has position vectors for.
        vocab_size: Number of rows in the token-embedding table.
        emb_dim: Width of both embeddings.
        rate: Dropout rate applied to the summed embeddings.
    """

    def __init__(self, maxlen, vocab_size, emb_dim,
                 rate=0.1):
        super(TokenEmbedding, self).__init__()
        self.max_len = maxlen
        self.token_emb = layers.Embedding(
            input_dim=vocab_size, output_dim=emb_dim)
        self.position_emb = layers.Embedding(
            input_dim=maxlen, output_dim=emb_dim)
        self.dropout = layers.Dropout(rate)

    def call(self, x, training=None):
        """Embed a ``(batch, seq)`` tensor of token ids into ``(batch, seq, emb_dim)``.

        Args:
            x: Token ids; the last axis is the sequence axis.
            training: Forwarded to the dropout layer; ``None`` lets Keras
                decide from the calling context.
        """
        token_embeddings = self.token_emb(x)
        # Build positions from the actual input length rather than max_len, so
        # inputs shorter than max_len still broadcast against the token
        # embeddings. For inputs of length max_len the result is unchanged.
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.position_emb(positions)
        return self.dropout(token_embeddings + positions, training=training)
    
# Smoke test: a (1, 128) tensor of ones stands in for token ids and is
# embedded into a (1, 128, 256) tensor.
pos_emb = TokenEmbedding(128, voc_size, 256)   
pos_emb(tf.ones([1, 128]))

<tf.Tensor: shape=(1, 128, 256), dtype=float32, numpy=
array([[[-0.00509334,  0.02292543,  0.0129603 , ...,  0.02890614,
         -0.06115935, -0.03547627],
        [ 0.07007627, -0.07065073, -0.01966025, ...,  0.0226925 ,
         -0.04687798,  0.03227063],
        [ 0.03769179, -0.01679782,  0.00937578, ...,  0.08238624,
         -0.03400761, -0.00760723],
        ...,
        [ 0.03905872,  0.01341741, -0.07383601, ...,  0.09142151,
         -0.05027041,  0.01305287],
        [ 0.00667087, -0.05134981,  0.01395455, ...,  0.04614391,
         -0.00087783,  0.02853852],
        [ 0.00494627, -0.06228298, -0.06462884, ...,  0.0880885 ,
          0.03035603,  0.02115138]]], dtype=float32)>

In [19]:
class TransformerBlock(tf.keras.layers.Layer):
    """Post-norm Transformer encoder block built on Linformer attention.

    Structure: self-attention -> dropout -> residual + LayerNorm ->
    two-layer GELU MLP -> dropout -> residual + LayerNorm.

    Args:
        model_dim: Feature width of the block's input and output.
        n_heads: Number of attention heads.
        mlp_dim: Hidden width of the feed-forward MLP.
        rate: Dropout rate used after the attention and after the MLP.
        eps: Epsilon for both LayerNormalization layers.
        k: Projected key/value sequence length passed to LinformerAttention.
    """

    def __init__(self, model_dim, n_heads=2, mlp_dim=512,
                 rate=0.0, eps=1e-6, k=64):
        super(TransformerBlock, self).__init__()
        self.attn = LinformerAttention(model_dim, n_heads, k)
        self.mlp = tf.keras.Sequential([
            layers.Dense(mlp_dim, activation='gelu'),
            layers.Dense(model_dim),
        ])
        self.norm1 = layers.LayerNormalization(epsilon=eps)
        self.norm2 = layers.LayerNormalization(epsilon=eps)
        self.drop1 = layers.Dropout(rate)
        self.drop2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to ``inputs``.

        Args:
            inputs: Tensor with ``model_dim`` features on its last axis.
            training: Forwarded to both dropout layers. Defaults to ``None``
                so the block can be called without it; Keras then decides
                from the calling context.
        """
        # Self-attention: queries, keys and values all come from the input.
        attn_output = self.attn(inputs, inputs, inputs)
        attn_output = self.drop1(attn_output, training=training)
        x_norm1 = self.norm1(attn_output + inputs)

        mlp_output = self.mlp(x_norm1)
        mlp_output = self.drop2(mlp_output, training=training)
        return self.norm2(mlp_output + x_norm1)
    
# Instantiate a block with model_dim=128 and default hyperparameters to check
# that construction works.
block = TransformerBlock(128)

In [20]:
class Transformer(tf.keras.models.Model):
    """Binary text classifier operating on raw strings.

    Pipeline: vectorize -> token + position embedding -> one Linformer
    Transformer block -> average over the sequence -> single output unit
    (no activation, i.e. a logit).

    NOTE: ``call`` uses the notebook-level ``vectorize_layer`` rather than a
    layer owned by the model, so that layer must be defined and adapted first.

    Args:
        model_dim: Embedding / feature width.
        voc_size: Vocabulary size for the token embedding.
        mlp_dim: Hidden width of the block's MLP.
        seq_length: Number of positions in the position embedding.
        heads: Number of attention heads.
    """

    def __init__(self, model_dim, voc_size, mlp_dim=256,
                 seq_length=128, heads=4):
        super(Transformer, self).__init__()
        self.emb = TokenEmbedding(seq_length, voc_size, model_dim)
        self.block = TransformerBlock(model_dim, heads, mlp_dim)
        self.out = tf.keras.Sequential([
            layers.GlobalAveragePooling1D(),
            layers.Dense(1)
        ])

    def call(self, x, training=None):
        """Return one logit per input string.

        Args:
            x: Batch of raw strings.
            training: Accepted explicitly (the training step passes
                ``training=True``) and forwarded to the sub-layers, instead
                of relying on Keras' implicit propagation.
        """
        x = vectorize_layer(x)
        x = self.emb(x, training=training)
        x = self.block(x, training=training)
        x = self.out(x)
        return x
    
# Build the model (model_dim=128) and run the test batch through it once so
# its weights are created; the output is a single (1, 1) logit.
transformer = Transformer(128, voc_size)
transformer(test_batch)

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.7470248]], dtype=float32)>

In [21]:
# Print the per-layer parameter counts of the built model.
transformer.summary()

Model: "transformer"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 token_embedding_2 (TokenEmb  multiple                 656384    
 edding)                                                         
                                                                 
 transformer_block_2 (Transf  multiple                 148992    
 ormerBlock)                                                     
                                                                 
 sequential_2 (Sequential)   (1, 1)                    129       
                                                                 
Total params: 805,505
Trainable params: 805,505
Non-trainable params: 0
_________________________________________________________________


## Entrenamiento Transformer
- Utilizar los mismos parámetros del modelo LSTM

In [22]:
# from_logits=True because the model's final Dense(1) has no activation.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)

In [23]:
# Adam optimizer used by train_step to update the transformer's weights.
trans_opt = tf.keras.optimizers.Adam(learning_rate=0.0001)

In [24]:
# Running means of the per-batch losses, one for training and one for validation.
train_loss_avg = tf.keras.metrics.Mean(name='train_loss')
# Named 'val_loss' (it was 'train_loss' by copy-paste) so the two metrics can
# be told apart by name.
val_loss_avg = tf.keras.metrics.Mean(name='val_loss')

In [25]:
# Number of full passes over the training set.
epochs = 5

In [26]:
@tf.function
def train_step(text, target):
    """Run one optimization step on a batch and record its loss.

    Args:
        text: Batch of raw review strings fed to the model.
        target: Batch of labels; cast to float32 before the loss is computed.
    """
    labels = tf.cast(target, tf.float32)

    # Only the forward pass and the loss need to be recorded for differentiation.
    with tf.GradientTape() as tape:
        batch_loss = loss(labels, transformer(text, training=True))

    weights = transformer.trainable_weights
    grads = tape.gradient(batch_loss, weights)
    trans_opt.apply_gradients(zip(grads, weights))

    # Fold this batch's loss into the running training mean.
    train_loss_avg(batch_loss)
    
@tf.function
def val_step(text, target):
    """Evaluate one batch in inference mode and record its loss.

    No gradients are needed for validation, so the forward pass runs without
    a ``tf.GradientTape``; the tape previously opened here was never used.

    Args:
        text: Batch of raw review strings fed to the model.
        target: Batch of labels; cast to float32 before the loss is computed.
    """
    logits = transformer(text, training=False)
    loss_value = loss(tf.cast(target, tf.float32), logits)

    # Fold this batch's loss into the running validation mean.
    val_loss_avg(loss_value)

In [27]:
# One epoch = a full pass over train_ds followed by a full pass over val_ds.
for epoch in range(epochs):
    for text, target in train_ds:
        train_step(text, target)

    print(f'Epoch: {epoch} Train loss: {train_loss_avg.result().numpy()}')
    # reset_state() replaces the deprecated reset_states(); it clears the
    # running mean so the next epoch is averaged on its own.
    train_loss_avg.reset_state()

    for text, target in val_ds:
        val_step(text, target)

    print(f'Val loss: {val_loss_avg.result().numpy()}')
    val_loss_avg.reset_state()

Epoch: 0 Train loss: 0.6602440476417542
Val loss: 0.6189302802085876
Epoch: 1 Train loss: 0.5468369126319885
Val loss: 0.5205957293510437
Epoch: 2 Train loss: 0.41801717877388
Val loss: 0.4163615107536316
Epoch: 3 Train loss: 0.33927294611930847
Val loss: 0.4019140601158142
Epoch: 4 Train loss: 0.30369260907173157
Val loss: 0.4012596011161804


## Ejercicio

- Modificar la arquitectura para mejorar los resultados del modelo.