In [1]:
# The MIT License (MIT) Copyright (c) 2023 milmor
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of 
# this software and associated documentation files (the "Software"), to deal in the Software without 
# restriction, including without limitation the rights to use, copy, modify, merge, publish, 
# distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the 
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or 
# substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES 
# OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

# Linformer

In [2]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Disable tensorflow debugging logs
import tensorflow as tf
from tensorflow.keras import layers
import matplotlib.pyplot as plt

## 1.- Dataset

In [3]:
import tensorflow_datasets as tfds

In [4]:
# Load the IMDB reviews dataset; the cells below unpack each element
# as a (text, label) pair.
dataset = tfds.load(name='imdb_reviews', as_supervised=True)

In [5]:
# Keep the two splits under separate names.
raw_train_ds = dataset['train']
raw_test_ds = dataset['test']

In [6]:
# Show a single raw example (review bytes and its integer label).
for review, sentiment in raw_train_ds.take(1):
    print(review.numpy(), sentiment.numpy())

b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it." 0


## 2.- Pipeline

In [7]:
# Use the stable tf.data API rather than the deprecated
# `tf.data.experimental` aliases.
AUTOTUNE = tf.data.AUTOTUNE
# The shuffle buffer spans the whole training split.
BUFFER_SIZE = raw_train_ds.cardinality()
BUFFER_SIZE.numpy()

25000

In [8]:
batch_size = 128
voc_size = 5000

# Training data: shuffle, then batch and prefetch.
train_ds = (
    raw_train_ds
    .shuffle(BUFFER_SIZE)
    .batch(batch_size, num_parallel_calls=AUTOTUNE)
    .prefetch(AUTOTUNE)
)

# Test data keeps its original order: batch and prefetch only.
test_ds = (
    raw_test_ds
    .batch(batch_size, num_parallel_calls=AUTOTUNE)
    .prefetch(AUTOTUNE)
)

In [9]:
# Print the first review of one training batch.
for reviews, labels in train_ds.take(1):
    print(reviews[0])

tf.Tensor(b"Domini Enfilren (Marlene Dietrich) has spent most of her life caring for her father. Now that he has died she is free--but doesn't know what she wants. Boris Androvsky (Charles Boyer) is a monk who has fled a monastery to taste more of life. They meet accidentally in Algiers, fall in love and get married. But he can't leave his past behind and she can't live without him...<br /><br />WAY overdone romance full of hysterically bad dialogue and situations. Dietrich and Boyer do their best to give good performances but NOBODY could get away with some of their lines! Still, in a way, it is a classic. It's shot in gorgeous Technicolor (try to see it on DVD) where every frame is breath-takingly beautiful. Dietrich is always dressed to the 9s (even in the middle of the desert) and strikes hysterical poses to show off the clothes and her body. Boyer just walks around looking stricken (no shock there). Still I was never bored. It was wonderful to look at and the non-stop stupid dialo

### Tokenización

In [10]:
seq_length = 128

# Text -> integer token ids, with a vocabulary capped at `voc_size` and an
# output length of `seq_length` per example.
vectorize_layer = layers.TextVectorization(
    output_sequence_length=seq_length,
    output_mode='int',
    max_tokens=voc_size,
)

In [11]:
# Drop the labels: the vectorizer is adapted on the review text only.
vectorize_layer_ds = train_ds.map(lambda reviews, labels: reviews)
vectorize_layer.adapt(vectorize_layer_ds)

In [12]:
# Peek at the first ten entries of the adapted vocabulary.
vectorize_layer.get_vocabulary()[:10]

['', '[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it']

In [13]:
# A 1x1 batch holding one raw string, used to smoke-test the vectorizer
# (and, further down, the full model).
test_batch = tf.constant([['Hi there']])
vectorize_layer(test_batch)

<tf.Tensor: shape=(1, 128), dtype=int64, numpy=
array([[ 1, 48,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])>

## 3.- Modelo

<img src="../img/linformer.png" width="700"/>

__Imagen tomada de Wang, S., Li, B. Z., Khabsa, M., Fang, H., & Ma, H. (2020). Linformer: Self-attention with linear complexity. arXiv preprint arXiv:2006.04768.__

\begin{equation}
\mbox{MultiHead}(Q, K, V) = \text{Concat}(\mbox{head}_1,\mbox{head}_2,\ldots,\mbox{head}_h)W^O,
\end{equation}

Dot-product attention:

\begin{equation}
\mbox{head}_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V) = \text{softmax}\left[\frac{QW_i^Q(KW_i^K)^T}{\sqrt{d_k}}\right]VW_i^V,
\label{eq:selfattention}
\end{equation}

Low-Rank attention:
\begin{align}
\text{head}_i &= \mbox{Attention}(QW_i^Q, E_iKW_i^K, F_iVW_i^V)\notag\\
&=\underbrace{\mbox{softmax}\left(\frac{QW_i^Q(E_iKW_i^K)^T}{\sqrt{d_k}}\right)}_{\bar{P}: n\times k}\cdot\underbrace{F_iVW_i^V}_{k\times d},\label{eq:linearattenion}
\end{align}

In [14]:
class LinformerAttention(layers.Layer):
    """Multi-head attention with low-rank (Linformer) key/value projections.

    Queries, keys and values are linearly projected to `model_dim` and split
    into `n_heads` heads. Keys and values are then projected along the
    sequence axis down to length `k` by the dense layers `E` and `F`
    (the same kernels are applied to every head), so the attention weights
    have shape (batch, n_heads, seq_q, k).

    Args:
        model_dim: Width of the q/k/v/output projections; must be divisible
            by `n_heads`.
        n_heads: Number of attention heads.
        k: Projected sequence length for keys and values.
        rate: Dropout rate applied to the attention weights and to the output.
        initializer: Kernel initializer used by every dense projection.
    """

    def __init__(self, model_dim, n_heads, k, rate=0.1, initializer='glorot_uniform'):
        super(LinformerAttention, self).__init__()
        assert model_dim % n_heads == 0, 'model_dim must be divisible by n_heads'

        self.n_heads = n_heads
        self.model_dim = model_dim
        self.head_dim = model_dim // n_heads

        self.wq = layers.Dense(model_dim, kernel_initializer=initializer)
        self.wk = layers.Dense(model_dim, kernel_initializer=initializer)
        self.wv = layers.Dense(model_dim, kernel_initializer=initializer)

        # Sequence-axis projections. They now honour `initializer` like the
        # other projections (unchanged for the default 'glorot_uniform').
        # NOTE(review): a Dense kernel is sized from the last input axis on
        # first use, so the key/value sequence length presumably has to stay
        # fixed across calls.
        self.E = layers.Dense(k, kernel_initializer=initializer)
        self.F = layers.Dense(k, kernel_initializer=initializer)

        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

        self.wo = layers.Dense(model_dim, kernel_initializer=initializer)

    def split_heads(self, x, batch_size):
        """Reshape (batch, seq, model_dim) -> (batch, n_heads, seq, head_dim)."""
        x = tf.reshape(x, (batch_size, -1, self.n_heads, self.head_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    @staticmethod
    def _project_sequence(x, projection):
        """Apply `projection` along the sequence axis of (batch, heads, seq, dim)."""
        swapped = tf.transpose(x, [0, 1, 3, 2])
        return tf.transpose(projection(swapped), [0, 1, 3, 2])

    def call(self, q, k, v, training=None):
        """Compute low-rank multi-head attention.

        Args:
            q, k, v: Tensors whose leading axis is the batch and whose
                remaining axes reshape to (seq, model_dim) after projection.
            training: Forwarded to the dropout layers; when None, Keras
                decides the mode.

        Returns:
            Tensor of shape (batch, seq_q, model_dim).
        """
        batch_size = tf.shape(q)[0]

        q = self.split_heads(self.wq(q), batch_size)
        k = self.split_heads(self.wk(k), batch_size)
        v = self.split_heads(self.wv(v), batch_size)

        # Compress keys/values from seq to k positions.
        k = self._project_sequence(k, self.E)
        v = self._project_sequence(v, self.F)

        scores = tf.matmul(q, k, transpose_b=True)
        # Cast the scale to the scores' dtype so the division also works when
        # the layer computes in a dtype other than float32.
        scores = scores / tf.math.sqrt(tf.cast(self.head_dim, scores.dtype))

        weights = self.dropout1(tf.nn.softmax(scores, axis=-1), training=training)
        context = tf.matmul(weights, v)

        # Merge the heads back: (batch, heads, seq, head_dim) -> (batch, seq, model_dim).
        context = tf.transpose(context, perm=[0, 2, 1, 3])
        context = tf.reshape(context, (batch_size, -1, self.model_dim))

        return self.dropout2(self.wo(context), training=training)
    
# Smoke test: self-attention over a constant (1, 256, 32) input.
x = tf.ones(shape=(1, 256, 32))
LinformerAttention(model_dim=32, n_heads=2, k=64)(x, x, x)

<tf.Tensor: shape=(1, 256, 32), dtype=float32, numpy=
array([[[-0.19955666, -0.03870107, -0.02954612, ..., -0.2653634 ,
         -0.06839337, -0.04166785],
        [-0.19955666, -0.03870107, -0.02954612, ..., -0.2653634 ,
         -0.06839337, -0.04166785],
        [-0.19955666, -0.03870107, -0.02954612, ..., -0.2653634 ,
         -0.06839337, -0.04166785],
        ...,
        [-0.19955666, -0.03870107, -0.02954612, ..., -0.2653634 ,
         -0.06839337, -0.04166785],
        [-0.19955666, -0.03870107, -0.02954612, ..., -0.2653634 ,
         -0.06839337, -0.04166785],
        [-0.19955666, -0.03870107, -0.02954612, ..., -0.2653634 ,
         -0.06839337, -0.04166785]]], dtype=float32)>

### Transformer block

In [15]:
class TransformerBlock(tf.keras.layers.Layer):
    """Encoder block: Linformer self-attention followed by an MLP.

    Each sub-layer is wrapped in a residual connection and then layer
    normalisation (post-norm).

    Args:
        model_dim: Feature width of the block's input and output.
        n_heads: Number of attention heads.
        mlp_dim: Hidden width of the MLP.
        rate: Dropout rate after the attention output and inside the MLP.
        eps: Epsilon for both layer normalisations.
        k: Projected key/value length passed to `LinformerAttention`.
    """

    def __init__(self, model_dim, n_heads=2, mlp_dim=512,
                 rate=0.0, eps=1e-6, k=64):
        super(TransformerBlock, self).__init__()
        # NOTE(review): `rate` is not forwarded here, so the attention layer
        # uses its own default dropout rate. Confirm this is intended.
        self.attn = LinformerAttention(model_dim, n_heads, k)
        self.mlp = tf.keras.Sequential([
            layers.Dense(mlp_dim, activation='gelu'),
            layers.Dense(model_dim),
            layers.Dropout(rate)
        ])
        self.ln1 = layers.LayerNormalization(epsilon=eps)
        self.ln2 = layers.LayerNormalization(epsilon=eps)
        self.drop1 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block.

        Args:
            inputs: Tensor whose last axis has size `model_dim`.
            training: Dropout mode. Defaults to None so the block can be
                called without the flag, as `Transformer.call` does.

        Returns:
            Tensor with the same shape as `inputs`.
        """
        attended = self.attn(inputs, inputs, inputs, training=training)
        x = self.drop1(attended, training=training)
        x = self.ln1(x + inputs)
        # Pass the flag explicitly so the MLP's dropout follows the same mode.
        return self.ln2(self.mlp(x, training=training) + x)
    
# Instantiate a block with default hyper-parameters as a quick check.
block = TransformerBlock(model_dim=128)

### Positional embedding

In [16]:
class TokenEmbedding(layers.Layer):
    """Sum of learned token embeddings and learned position embeddings.

    Args:
        maxlen: Number of rows in the position embedding table, i.e. the
            longest sequence the layer can embed.
        vocab_size: Number of rows in the token embedding table.
        emb_dim: Embedding width.
        rate: Dropout rate applied to the summed embeddings.
    """

    def __init__(self, maxlen, vocab_size, emb_dim,
                 rate=0.0):
        super(TokenEmbedding, self).__init__()
        self.max_len = maxlen
        self.token_emb = layers.Embedding(
            input_dim=vocab_size, output_dim=emb_dim)
        self.position_emb = layers.Embedding(
            input_dim=maxlen, output_dim=emb_dim)
        self.dropout = layers.Dropout(rate)

    def call(self, x):
        """Embed integer token ids of shape (..., seq) -> (..., seq, emb_dim).

        Position indices follow the actual length of `x` rather than always
        spanning `maxlen`, so inputs shorter than `maxlen` also work.
        The length must not exceed `maxlen`.
        """
        token_embeddings = self.token_emb(x)
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.position_emb(positions)
        # (seq, emb_dim) broadcasts over the leading batch axis.
        return self.dropout(token_embeddings + positions)

### Transformer

In [17]:
class Transformer(tf.keras.models.Model):
    """Text classifier built from a single Linformer encoder block.

    Pipeline: raw strings -> token ids -> embeddings -> encoder block ->
    average over the sequence -> one output unit (a logit; no activation).

    Args:
        model_dim: Embedding and block width.
        voc_size: Vocabulary size of the token embedding.
        mlp_dim: Hidden width of the block's MLP.
        maxlen: Maximum sequence length for the position embedding.
        heads: Number of attention heads.
    """

    def __init__(self, model_dim, voc_size, mlp_dim=256,
                 maxlen=128, heads=4):
        super(Transformer, self).__init__()
        self.emb = TokenEmbedding(maxlen, voc_size, model_dim)
        self.block = TransformerBlock(model_dim, heads, mlp_dim)
        self.out = tf.keras.Sequential([
            layers.GlobalAveragePooling1D(),
            layers.Dense(1)
        ])

    def call(self, x, training=None):
        """Map a batch of raw strings to logits of shape (batch, 1).

        Args:
            x: Batch of raw text.
            training: Mode flag, threaded explicitly to the encoder block.
        """
        # Relies on the notebook-level `vectorize_layer`, which must already
        # be adapted before the model is called.
        x = vectorize_layer(x)
        x = self.emb(x)
        x = self.block(x, training=training)
        return self.out(x)
    
# Build the model and run one forward pass on the sample batch.
transformer = Transformer(model_dim=128, voc_size=voc_size)
transformer(test_batch)

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.2790028]], dtype=float32)>

In [18]:
# Print the layer list and parameter counts of the built model.
transformer.summary()

Model: "transformer"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 token_embedding (TokenEmbed  multiple                 656384    
 ding)                                                           
                                                                 
 transformer_block_1 (Transf  multiple                 148992    
 ormerBlock)                                                     
                                                                 
 sequential_2 (Sequential)   (1, 1)                    129       
                                                                 
Total params: 805,505
Trainable params: 805,505
Non-trainable params: 0
_________________________________________________________________


## 4.- Entrenamiento

In [19]:
# from_logits=True: the model's final Dense(1) has no activation.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)

In [20]:
# Adam optimizer for the transformer's weights.
trans_opt = tf.keras.optimizers.Adam(learning_rate=1e-4)

In [21]:
# Running means of the per-batch losses; reset after every epoch in the
# training loop.
train_loss_avg = tf.keras.metrics.Mean(name='train_loss')
val_loss_avg = tf.keras.metrics.Mean(name='val_loss')

In [22]:
# Number of full passes over the training set.
epochs = 5

In [23]:
@tf.function
def train_step(text, target):
    """Run one optimisation step on a batch and record its loss.

    Args:
        text: Batch of raw review strings.
        target: Batch of integer labels; cast to float32 for the loss.
    """
    labels = tf.cast(target, tf.float32)

    with tf.GradientTape() as tape:
        predictions = transformer(text, training=True)
        batch_loss = loss(labels, predictions)

    weights = transformer.trainable_weights
    grads = tape.gradient(batch_loss, weights)
    trans_opt.apply_gradients(zip(grads, weights))

    # Accumulate into the running training-loss mean.
    train_loss_avg(batch_loss)
    
@tf.function
def test_step(text, target):
    """Evaluate one batch in inference mode and record its loss.

    No gradient tape is opened: nothing is differentiated during
    evaluation, so recording the forward pass would only waste memory.

    Args:
        text: Batch of raw review strings.
        target: Batch of integer labels; cast to float32 for the loss.
    """
    logits = transformer(text, training=False)
    loss_value = loss(tf.cast(target, tf.float32), logits)

    # Accumulate into the running validation-loss mean.
    val_loss_avg(loss_value)

In [24]:
# Train for `epochs` passes, reporting the mean training and validation
# loss after each one.
for epoch in range(epochs):
    for text, target in train_ds:
        train_step(text, target)

    print(f'Epoch: {epoch} Train loss: {train_loss_avg.result():.4f}')
    # `reset_state()` replaces the deprecated `reset_states()` alias.
    train_loss_avg.reset_state()

    for text, target in test_ds:
        test_step(text, target)

    print(f'Val loss: {val_loss_avg.result():.4f}')
    val_loss_avg.reset_state()

Epoch: 0 Train loss: 0.6535
Val loss: 0.6038
Epoch: 1 Train loss: 0.5198
Val loss: 0.4720
Epoch: 2 Train loss: 0.3852
Val loss: 0.4055
Epoch: 3 Train loss: 0.3184
Val loss: 0.4029
Epoch: 4 Train loss: 0.2876
Val loss: 0.4169
