In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Disable tensorflow debugging logs
import tensorflow as tf
from tensorflow.keras import layers
import matplotlib.pyplot as plt

## Importar dataset

In [2]:
import tensorflow_datasets as tfds

In [3]:
# Load the IMDB reviews dataset from TFDS; as_supervised=True makes every
# example a (text, label) tuple, which the loops below unpack directly.
dataset = tfds.load('imdb_reviews', as_supervised=True)

In [4]:
# Keep the raw train and test splits under their own names.
raw_train_ds = dataset['train']
raw_test_ds = dataset['test']

In [5]:
# Show a single raw example as plain Python/NumPy values.
for text, label in raw_train_ds.take(1):
    review_bytes, sentiment = text.numpy(), label.numpy()
    print(review_bytes, sentiment)

b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it." 0


## Preparar dataset

In [6]:
# Let tf.data choose parallelism and prefetch depth at runtime.
# (`tf.data.AUTOTUNE` is the stable spelling of `tf.data.experimental.AUTOTUNE`.)
AUTOTUNE = tf.data.AUTOTUNE
# Size the shuffle buffer to the whole training split so shuffling covers
# every example. `Dataset.cardinality()` replaces the deprecated
# `tf.data.experimental.cardinality(ds)` and returns a scalar int64 tensor.
BUFFER_SIZE = raw_train_ds.cardinality()
BUFFER_SIZE.numpy()

25000

In [7]:
# Mini-batch size and vocabulary size.
batch_size = 128
voc_size = 5000

# Training pipeline: shuffle over the full buffer, then batch and prefetch.
shuffled_train_ds = raw_train_ds.shuffle(BUFFER_SIZE)
batched_train_ds = shuffled_train_ds.batch(
    batch_size, num_parallel_calls=AUTOTUNE)
train_ds = batched_train_ds.prefetch(AUTOTUNE)

# Evaluation pipeline: same batching and prefetching, no shuffling.
batched_test_ds = raw_test_ds.batch(
    batch_size, num_parallel_calls=AUTOTUNE)
test_ds = batched_test_ds.prefetch(AUTOTUNE)

In [8]:
# Print the first review of one batch to check the batched pipeline.
for text, label in train_ds.take(1):
    first_review = text[0]
    print(first_review)

tf.Tensor(b'This film illustrates the worst part of surviving war, the memories. For many soldiers, men and women alike, returning home can be the beginning of real problems. I am reminded of my father and his brothers returning from WWII. For one of my uncles the war was never over. He survived the D-Day invasion, something akin to the first 20 minutes of Saving Private Ryan. For him the memories not only lingered but tortured him. He became an alcoholic as did several of my cousins, his sons. Jump ahead 60 years and place the soldiers in a different war, in a different country, the result is the same. When I saw this at the KC FilmFest, I was reminded that there are somethings about war that never change. The idealistic young men and women are not spared the emotional torment of what happened in Iraq, and especially if you are against the war you will come away with more compassion for the soldiers there trying to do what they believe or have been told is right.<br /><br />The tag li

## Tokenización

In [9]:
# Every review is padded/truncated to this many token ids.
seq_length = 128

# Tokenizer settings: integer ids, vocabulary capped at `voc_size`,
# fixed-length output.
vectorizer_config = dict(
    max_tokens=voc_size,
    output_mode='int',
    output_sequence_length=seq_length,
)
vectorize_layer = layers.TextVectorization(**vectorizer_config)

- Adaptar la capa

In [10]:
def _text_only(text, label):
    """Drop the label so that only the review text is passed on."""
    return text


# Build the vocabulary from the training texts only.
vectorize_layer_ds = train_ds.map(_text_only)
vectorize_layer.adapt(vectorize_layer_ds)

In [11]:
# Inspect the first ten entries of the vocabulary learned by `adapt`.
vectorize_layer.get_vocabulary()[:10]

['', '[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it']

- Probar vectorize_layer con batch de prueba

In [12]:
# One-sentence string batch of shape (1, 1); reused later to smoke-test the model.
test_batch = tf.constant([['Hi there']])
# The layer returns `seq_length` integer ids per sentence.
vectorize_layer(test_batch)

<tf.Tensor: shape=(1, 128), dtype=int64, numpy=
array([[ 1, 48,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])>

## Definir Transformer

<img src="../img/linformer.png" width="700"/>

__Imagen tomada de Wang, S., Li, B. Z., Khabsa, M., Fang, H., & Ma, H. (2020). Linformer: Self-attention with linear complexity. arXiv preprint arXiv:2006.04768.__

\begin{equation}
\mbox{MultiHead}(Q, K, V) = \text{Concat}(\mbox{head}_1,\mbox{head}_2,\ldots,\mbox{head}_h)W^O,
\end{equation}

Dot-product attention:

\begin{equation}
\mbox{head}_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V) = \text{softmax}\left[\frac{QW_i^Q(KW_i^K)^T}{\sqrt{d_k}}\right]VW_i^V,
\label{eq:selfattention}
\end{equation}

Low-Rank attention:
\begin{align}
\text{head}_i &= \mbox{Attention}(QW_i^Q, E_iKW_i^K, F_iVW_i^V)\notag\\
&=\underbrace{\mbox{softmax}\left(\frac{QW_i^Q(E_iKW_i^K)^T}{\sqrt{d_k}}\right)}_{\bar{P}: n\times k}\cdot\underbrace{F_iVW_i^V}_{k\times d},\label{eq:linearattenion}
\end{align}

In [13]:
def scaled_dot_product(q, k, v):
    """Scaled dot-product attention: softmax(q @ k^T / sqrt(d_k)) @ v.

    Args:
        q: Query tensor whose last two axes are (n_queries, d_k).
        k: Key tensor whose last two axes are (n_keys, d_k).
        v: Value tensor whose last two axes are (n_keys, d_v).

    Returns:
        Tensor whose last two axes are (n_queries, d_v).
    """
    logits = tf.matmul(q, k, transpose_b=True)
    # Cast the scale to the logits' dtype: a hard-coded float32 makes the
    # division fail for float16 / float64 inputs (e.g. mixed precision).
    dk = tf.cast(tf.shape(k)[-1], logits.dtype)
    scaled_qk = logits / tf.math.sqrt(dk)
    # Normalise over the key axis so each query's weights sum to 1.
    attn_weights = tf.nn.softmax(scaled_qk, axis=-1)
    return tf.matmul(attn_weights, v)

In [14]:
class LinformerAttention(tf.keras.layers.Layer):
    """Multi-head attention with Linformer-style low-rank keys and values.

    Keys and values are projected along the sequence axis down to `k`
    positions (layers `E` and `F`) before scaled dot-product attention.

    Args:
        model_dim: Width of the model; must be divisible by `n_heads`.
        n_heads: Number of attention heads.
        k: Number of positions the key/value sequence is projected to.
    """

    def __init__(self, model_dim, n_heads, k):
        super().__init__()
        assert model_dim % n_heads == 0

        self.n_heads = n_heads
        self.model_dim = model_dim
        # Per-head feature width.
        self.depth = model_dim // n_heads

        # Query / key / value projections.
        self.wq = layers.Dense(model_dim)
        self.wk = layers.Dense(model_dim)
        self.wv = layers.Dense(model_dim)

        # Sequence-axis projections for keys (E) and values (F).
        self.E = layers.Dense(k)
        self.F = layers.Dense(k)

        # Output projection applied after the heads are merged.
        self.dense = layers.Dense(model_dim)

    def split_into_heads(self, x, batch_size):
        """Reshape (batch, seq, model_dim) into (batch, n_heads, seq, depth)."""
        per_head = tf.reshape(x, (batch_size, -1, self.n_heads, self.depth))
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def _project_sequence(self, x, projection):
        """Apply `projection` along the sequence axis of a (batch, heads, seq, depth) tensor."""
        # Dense acts on the last axis, so move the sequence axis there and back.
        seq_last = tf.transpose(x, [0, 1, 3, 2])
        return tf.transpose(projection(seq_last), [0, 1, 3, 2])

    def call(self, q, k, v):
        """Return the attention output with shape (batch, seq, model_dim)."""
        batch_size = tf.shape(q)[0]

        q_heads = self.split_into_heads(self.wq(q), batch_size)
        k_heads = self.split_into_heads(self.wk(k), batch_size)
        v_heads = self.split_into_heads(self.wv(v), batch_size)

        # Low-rank step: shrink the key/value sequence length to `k`.
        k_low = self._project_sequence(k_heads, self.E)
        v_low = self._project_sequence(v_heads, self.F)

        context = scaled_dot_product(q_heads, k_low, v_low)

        # Back to (batch, seq, n_heads, depth), then merge the heads.
        context = tf.transpose(context, perm=[0, 2, 1, 3])
        merged = tf.reshape(context, (batch_size, -1, self.model_dim))

        return self.dense(merged)
    
# Smoke test: an all-ones (1, 256, 32) batch used as query, key and value.
x = tf.ones(shape=(1, 256, 32))
LinformerAttention(model_dim=32, n_heads=2, k=64)(x, x, x)

<tf.Tensor: shape=(1, 256, 32), dtype=float32, numpy=
array([[[-0.33220053, -0.1177115 , -0.18873256, ...,  0.3294554 ,
         -0.25064963, -0.52764153],
        [-0.33220053, -0.1177115 , -0.18873256, ...,  0.3294554 ,
         -0.25064963, -0.52764153],
        [-0.33220053, -0.1177115 , -0.18873256, ...,  0.3294554 ,
         -0.25064963, -0.52764153],
        ...,
        [-0.33220053, -0.1177115 , -0.18873256, ...,  0.3294554 ,
         -0.25064963, -0.52764153],
        [-0.33220053, -0.1177115 , -0.18873256, ...,  0.3294554 ,
         -0.25064963, -0.52764153],
        [-0.33220053, -0.1177115 , -0.18873256, ...,  0.3294554 ,
         -0.25064963, -0.52764153]]], dtype=float32)>

- Definir embedding de posición

In [15]:
class TokenEmbedding(layers.Layer):
    """Learned token embedding plus learned position embedding, then dropout.

    Args:
        maxlen: Number of positions in the position-embedding table, i.e. the
            longest sequence the layer can embed.
        vocab_size: Number of rows in the token-embedding table.
        emb_dim: Embedding width shared by both tables.
        rate: Dropout rate applied to the summed embeddings.
    """

    def __init__(self, maxlen, vocab_size, emb_dim,
                 rate=0.1):
        super(TokenEmbedding, self).__init__()
        self.max_len = maxlen
        self.token_emb = layers.Embedding(
            input_dim=vocab_size, output_dim=emb_dim)
        self.position_emb = layers.Embedding(
            input_dim=maxlen, output_dim=emb_dim)
        self.dropout = layers.Dropout(rate)

    def call(self, x, training=None):
        """Embed token ids `x` of shape (batch, seq) into (batch, seq, emb_dim).

        `training` is forwarded to the dropout layer; `None` lets Keras decide.
        """
        token_embeddings = self.token_emb(x)
        # Build position ids from the actual input length rather than the
        # fixed `max_len`, so sequences shorter than `max_len` still
        # broadcast against the token embeddings.
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.position_emb(positions)
        return self.dropout(token_embeddings + positions, training=training)
    
# Smoke test: embed one all-ones sequence of length 128.
pos_emb = TokenEmbedding(maxlen=128, vocab_size=voc_size, emb_dim=256)
pos_emb(tf.ones(shape=(1, 128)))

<tf.Tensor: shape=(1, 128, 256), dtype=float32, numpy=
array([[[-0.03453249, -0.04783712, -0.00181229, ...,  0.04056312,
          0.07154232, -0.01234774],
        [-0.03350101, -0.01041274, -0.00781648, ...,  0.04535033,
          0.00481171, -0.09241173],
        [ 0.01315409, -0.08314388,  0.01213972, ...,  0.02761938,
          0.07785832, -0.06993081],
        ...,
        [-0.04500681, -0.06120592,  0.03584955, ...,  0.0241451 ,
          0.00501101, -0.00959498],
        [ 0.00099356,  0.00575273,  0.01831534, ...,  0.01458627,
          0.0823791 , -0.08451447],
        [-0.06487374, -0.07539803,  0.01486404, ...,  0.03923857,
          0.06017464, -0.01822667]]], dtype=float32)>

In [16]:
class TransformerBlock(tf.keras.layers.Layer):
    """Post-norm encoder block: Linformer attention followed by a GELU MLP.

    Each sub-layer is wrapped as LayerNorm(x + Dropout(sublayer(x))).

    Args:
        model_dim: Width of the block's input and output.
        n_heads: Number of attention heads.
        mlp_dim: Hidden width of the feed-forward network.
        rate: Dropout rate applied after each sub-layer.
        eps: Epsilon used by both layer normalisations.
        k: Projected key/value sequence length for the Linformer attention.
    """

    def __init__(self, model_dim, n_heads=2, mlp_dim=512,
                 rate=0.0, eps=1e-6, k=64):
        super(TransformerBlock, self).__init__()
        self.attn = LinformerAttention(model_dim, n_heads, k)
        self.mlp = tf.keras.Sequential([
            layers.Dense(mlp_dim, activation='gelu'),
            layers.Dense(model_dim),
        ])
        self.norm1 = layers.LayerNormalization(epsilon=eps)
        self.norm2 = layers.LayerNormalization(epsilon=eps)
        self.drop1 = layers.Dropout(rate)
        self.drop2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to `inputs`; the output has the same shape.

        `training` now defaults to None so the block can be called as
        `block(x)` without relying on the framework to supply the argument.
        """
        # Self-attention: the same tensor is used as query, key and value.
        attn_output = self.attn(inputs, inputs, inputs)
        attn_output = self.drop1(attn_output, training=training)
        x_norm1 = self.norm1(attn_output + inputs)

        mlp_output = self.mlp(x_norm1)
        mlp_output = self.drop2(mlp_output, training=training)
        return self.norm2(mlp_output + x_norm1)
    
# Instantiate a block with default hyper-parameters to check construction.
block = TransformerBlock(model_dim=128)

In [17]:
class Transformer(tf.keras.models.Model):
    """Single-block Linformer text classifier that returns one logit per input.

    Args:
        model_dim: Embedding / model width.
        voc_size: Vocabulary size of the token embedding.
        mlp_dim: Hidden width of the block's feed-forward network.
        seq_length: Number of positions in the position embedding.
        heads: Number of attention heads.
    """

    def __init__(self, model_dim, voc_size, mlp_dim=256,
                 seq_length=128, heads=4):
        super(Transformer, self).__init__()
        self.emb = TokenEmbedding(seq_length, voc_size, model_dim)
        self.block = TransformerBlock(model_dim, heads, mlp_dim)
        # Mean-pool over the sequence axis, then project to a single logit
        # (no activation, so the loss must be configured for logits).
        self.out = tf.keras.Sequential([
            layers.GlobalAveragePooling1D(),
            layers.Dense(1)
        ])

    def call(self, x, training=None):
        """Map a batch of raw strings to logits of shape (batch, 1).

        `training` is declared explicitly because the train/test steps pass
        it; it is forwarded to the transformer block.
        """
        # NOTE: uses the notebook-level `vectorize_layer`, which must already
        # be adapted before the model is called.
        x = vectorize_layer(x)
        x = self.emb(x)
        x = self.block(x, training=training)
        x = self.out(x)
        return x
    
# Build the model and run it once on the sample batch to create its weights.
transformer = Transformer(model_dim=128, voc_size=voc_size)
transformer(test_batch)

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.27303243]], dtype=float32)>

In [18]:
# Print per-layer parameter counts; the model was already called once above.
transformer.summary()

Model: "transformer"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 token_embedding_1 (TokenEmb  multiple                 656384    
 edding)                                                         
                                                                 
 transformer_block_1 (Transf  multiple                 148992    
 ormerBlock)                                                     
                                                                 
 sequential_2 (Sequential)   (1, 1)                    129       
                                                                 
Total params: 805,505
Trainable params: 805,505
Non-trainable params: 0
_________________________________________________________________


## Entrenamiento Transformer
- Utilizar los mismos parámetros que en el modelo LSTM

In [19]:
# from_logits=True because the model's final Dense(1) has no activation
# and therefore outputs raw logits.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)

In [20]:
# Adam optimizer used by `train_step` to update the transformer's weights.
trans_opt = tf.keras.optimizers.Adam(learning_rate=0.0001)

In [21]:
# Running means of the per-batch losses, reported and reset once per epoch.
train_loss_avg = tf.keras.metrics.Mean(name='train_loss')
val_loss_avg = tf.keras.metrics.Mean(name='val_loss')

In [22]:
# Number of full passes over `train_ds`.
epochs = 5

In [23]:
@tf.function
def train_step(text, target):
    """Run one optimisation step on a batch and record its loss.

    Args:
        text: Batch of raw review strings.
        target: Batch of integer labels; cast to float32 for the loss.
    """
    labels = tf.cast(target, tf.float32)

    with tf.GradientTape() as tape:
        predictions = transformer(text, training=True)
        batch_loss = loss(labels, predictions)

    # Read the weight list after the forward pass, once the model is built.
    weights = transformer.trainable_weights
    grads = tape.gradient(batch_loss, weights)
    trans_opt.apply_gradients(zip(grads, weights))

    # Accumulate into the running training-loss mean.
    train_loss_avg(batch_loss)
    
@tf.function
def test_step(text, target):
    """Evaluate one batch in inference mode and record its loss.

    Args:
        text: Batch of raw review strings.
        target: Batch of integer labels; cast to float32 for the loss.
    """
    # No GradientTape here: evaluation never computes gradients, so
    # recording the forward pass on a tape is wasted work.
    logits = transformer(text, training=False)
    loss_value = loss(tf.cast(target, tf.float32), logits)

    # Accumulate into the running validation-loss mean.
    val_loss_avg(loss_value)

In [24]:
# One epoch = a full pass over train_ds followed by a full pass over test_ds.
for epoch in range(epochs):
    for text, target in train_ds:
        train_step(text, target)

    print(f'Epoch: {epoch} Train loss: {train_loss_avg.result().numpy()}')
    # `reset_state()` is the current name of the deprecated `reset_states()`;
    # clearing the mean makes the next epoch start from scratch.
    train_loss_avg.reset_state()

    for text, target in test_ds:
        test_step(text, target)

    print(f'Val loss: {val_loss_avg.result().numpy()}')
    val_loss_avg.reset_state()

Epoch: 0 Train loss: 0.6515284180641174
Val loss: 0.5990391969680786
Epoch: 1 Train loss: 0.5182429552078247
Val loss: 0.49494099617004395
Epoch: 2 Train loss: 0.38116446137428284
Val loss: 0.4060381352901459
Epoch: 3 Train loss: 0.3227705955505371
Val loss: 0.3978257477283478
Epoch: 4 Train loss: 0.2942725718021393
Val loss: 0.3904440402984619


## Ejercicio

- Modificar la arquitectura para obtener mejores resultados.