In [1]:
# The MIT License (MIT) Copyright (c) 2023 Emilio Morales
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of 
# this software and associated documentation files (the "Software"), to deal in the Software without 
# restriction, including without limitation the rights to use, copy, modify, merge, publish, 
# distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the 
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or 
# substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES 
# OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/milmor/NLP/blob/main/Notebooks/26_Linformer.ipynb">
    <img src="https://www.tensorflow.org/images/colab_logo_32px.png" />
    Run in Google Colab</a>
  </td>
</table>

# Linformer
  
- Dataset: https://www.kaggle.com/datasets/andrewmvd/cyberbullying-classification

In [2]:
import torch
import pandas as pd

torch.__version__

'2.1.1'

In [3]:
# Seed PyTorch's default random number generator so that runs are repeatable.
torch.manual_seed(77)

<torch._C.Generator at 0x7f2c3bdb5df0>

## 1.- Conjuntos de entrenamiento y validación

In [4]:
# Load the tweets CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('./cyberbullying_tweets.csv')   

In [5]:
df.head()

Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was cra...",not_cyberbullying
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying


In [6]:
df['cyberbullying_type'].unique()

array(['not_cyberbullying', 'gender', 'religion', 'other_cyberbullying',
       'age', 'ethnicity'], dtype=object)

In [7]:
# Count the number of rows per class
count = df['cyberbullying_type'].value_counts()
count

cyberbullying_type
religion               7998
age                    7992
gender                 7973
ethnicity              7961
not_cyberbullying      7945
other_cyberbullying    7823
Name: count, dtype: int64

In [8]:
# Dictionary mapping each label string to an integer class id
labels_dict = {
    'not_cyberbullying': 0,
    'gender': 1,
    'religion': 2,
    'other_cyberbullying': 3,
    'age': 4,
    'ethnicity': 5
}

# Overwrite the label column with its integer ids (the string labels are replaced in `df`).
df['cyberbullying_type'] = df['cyberbullying_type'].replace(labels_dict)
df.head()

Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was cra...",0
1,Why is #aussietv so white? #MKR #theblock #ImA...,0
2,@XochitlSuckkks a classy whore? Or more red ve...,0
3,"@Jason_Gio meh. :P thanks for the heads up, b...",0
4,@RudhoeEnglish This is an ISIS account pretend...,0


- Elimina puntuación y convierte a minúsculas
- Se utiliza el método __str.translate()__ para eliminar todos los caracteres de puntuación mediante una tabla de traducción creada con el método __str.maketrans__. La constante string.punctuation contiene todos los caracteres de puntuación ASCII, que se eliminan de los valores en la columna

In [9]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [10]:
# Lower-case every tweet, then delete all ASCII punctuation characters
# (the third argument of str.maketrans lists the characters to remove).
df['tweet_text'] = df['tweet_text'].str.lower().str.translate(str.maketrans('', '', string.punctuation))
df.head()

Unnamed: 0,tweet_text,cyberbullying_type
0,in other words katandandre your food was crapi...,0
1,why is aussietv so white mkr theblock imaceleb...,0
2,xochitlsuckkks a classy whore or more red velv...,0
3,jasongio meh p thanks for the heads up but no...,0
4,rudhoeenglish this is an isis account pretendi...,0


In [11]:
from sklearn.model_selection import train_test_split
# Hold out 15% of the rows for validation; random_state fixes the split.
train_df, val_df= train_test_split(df, test_size = 0.15, random_state = 123)

In [12]:
train_df['cyberbullying_type'].value_counts()

cyberbullying_type
0    6826
1    6821
5    6777
4    6763
2    6743
3    6608
Name: count, dtype: int64

## 2.- Pipeline

- Crea vocabulario y define tokenizer.

In [13]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import vocab as Vocab
from collections import Counter

In [14]:
# torchtext's built-in 'basic_english' tokenizer; it is called on each tweet string below.
tokenizer = get_tokenizer('basic_english')

In [15]:
def build_vocab(text, tokenizer, min_freq=5):
    """Build a torchtext vocabulary from an iterable of raw strings.

    Args:
        text: Iterable of strings to tokenize and count.
        tokenizer: Callable mapping a string to a sequence of tokens.
        min_freq: Frequency threshold forwarded to the vocabulary factory.

    Returns:
        The vocabulary object, created with the special tokens
        '<unk>' and '<pad>'.
    """
    # Counting from one flat token stream keeps the same first-seen
    # insertion order as updating the counter sentence by sentence.
    token_counts = Counter(
        token for sentence in text for token in tokenizer(sentence)
    )
    special_tokens = ['<unk>', '<pad>']
    return Vocab(token_counts, min_freq=min_freq, specials=special_tokens)


# Build the vocabulary from the training split only, with a minimum token frequency of 5.
vocab = build_vocab(train_df.tweet_text.values, tokenizer, 5)

In [16]:
# One more id than the vocabulary holds: index len(vocab) is later set as the
# vocabulary's default index, so the embedding table needs a row for it.
vocab_size = len(vocab) + 1
vocab_size

9952

In [17]:
# Tokens missing from the vocabulary map to index len(vocab) instead of raising
# a lookup error (note: this is a dedicated extra index, not the '<unk>' special).
vocab.set_default_index(len(vocab))

In [18]:
maxlen = 64  # tweets with more tokens than this are left out of the dataset

def data_process(x, y):
    """Convert raw tweets and labels into a list of ``(token_ids, label)`` tensor pairs.

    Uses the module-level ``tokenizer``, ``vocab`` and ``maxlen``.

    Args:
        x: Iterable of raw tweet strings.
        y: Iterable of integer class labels, aligned with ``x``.

    Returns:
        A list of ``(LongTensor of token ids, 0-dim int64 label tensor)``
        tuples. Tweets longer than ``maxlen`` tokens are dropped, not
        truncated, so the result may be shorter than the input.
    """
    samples = []
    for tweet, label in zip(x, y):
        token_ids = torch.tensor([vocab[tok] for tok in tokenizer(tweet)],
                                 dtype=torch.long)
        if token_ids.shape[0] > maxlen:
            continue
        # int64 to avoid CrossEntropyLoss "expected scalar type Long but found Float"
        samples.append((token_ids, torch.tensor(label, dtype=torch.int64)))
    return samples

# Tensorise both splits; the displayed lengths are the sample counts that
# remain after data_process drops tweets longer than maxlen tokens.
train_data = data_process(train_df.tweet_text.values, 
                          train_df.cyberbullying_type.values)
val_data = data_process(val_df.tweet_text.values, 
                        val_df.cyberbullying_type.values)
len(train_data), len(val_data)

(40523, 7149)

In [19]:
batch_size = 64
# Index of the '<pad>' special token; used as the fill value when padding batches.
PAD_IDX = vocab['<pad>']

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

def generate_batch(data_batch):
    """Collate ``(token_ids, label)`` pairs into one padded batch.

    Args:
        data_batch: Sequence of ``(1-D token id tensor, label)`` pairs.

    Returns:
        Tuple ``(x, y)``: ``x`` is a ``(batch, longest_sequence)`` tensor padded
        with ``PAD_IDX``; ``y`` is a 1-D int64 tensor of labels.
    """
    pairs = list(data_batch)
    sequences = [token_ids for (token_ids, _) in pairs]
    labels = [label for (_, label) in pairs]

    padded = pad_sequence(sequences, batch_first=True, padding_value=PAD_IDX)
    # int64 to avoid CrossEntropyLoss "expected scalar type Long but found Float"
    return padded, torch.tensor(labels, dtype=torch.int64)


# Training batches are reshuffled on every pass over the data.
train_loader = DataLoader(train_data, batch_size=batch_size,
                          shuffle=True, collate_fn=generate_batch,
                          num_workers=4, pin_memory=True)
# Validation is not shuffled: a fixed order keeps the batch composition (and so
# the per-batch-averaged validation loss) identical from one evaluation to the next.
val_loader = DataLoader(val_data, batch_size=batch_size,
                        shuffle=False, collate_fn=generate_batch,
                        num_workers=4, pin_memory=True)

In [20]:
%%timeit
train_batch, target_batch = next(iter(train_loader))

64.8 ms ± 1.46 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [21]:
# Pull a single batch; train_batch is reused further down to smoke-test the model.
train_batch, target_batch = next(iter(train_loader))

In [22]:
train_batch.shape, target_batch.shape

(torch.Size([64, 63]), torch.Size([64]))

## 3.- Modelo

In [23]:
import torch.nn as nn
from torch import optim
import time

<img src="../img/linformer.png" width="700"/>

__Imagen tomada de Wang, S., Li, B. Z., Khabsa, M., Fang, H., & Ma, H. (2020). Linformer: Self-attention with linear complexity. arXiv preprint arXiv:2006.04768.__

\begin{equation}
\mbox{MultiHead}(Q, K, V) = \text{Concat}(\mbox{head}_1,\mbox{head}_2,\ldots,\mbox{head}_h)W^O,
\end{equation}

Dot-product attention:

\begin{equation}
\mbox{head}_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V) = \text{softmax}\left[\frac{QW_i^Q(KW_i^K)^T}{\sqrt{d_k}}\right]VW_i^V,
\end{equation}

Low-Rank attention:
\begin{align}
\text{head}_i &= \mbox{Attention}(QW_i^Q, E_iKW_i^K, F_iVW_i^V)\notag\\
&=\underbrace{\mbox{softmax}\left(\frac{QW_i^Q(E_iKW_i^K)^T}{\sqrt{d_k}}\right)}_{\bar{P}: n\times k}\cdot\underbrace{F_iVW_i^V}_{k\times d},
\end{align}

In [24]:
class LinformerAttention(nn.Module):
    """Multi-head self-attention with Linformer-style low-rank projections.

    Keys and values are projected along the sequence axis from length ``L``
    down to ``k`` using the learned matrices ``E`` and ``F`` (the same pair is
    applied to every head), so each attention map has shape ``(L, k)`` instead
    of ``(L, L)``.

    Args:
        seq_len: Maximum supported sequence length (number of rows of E and F).
        dim: Model dimension; must be divisible by ``n_heads``.
        n_heads: Number of attention heads.
        k: Length of the projected key/value sequence.
        bias: Whether the query/key/value/output linear layers use a bias.

    Raises:
        ValueError: If ``dim`` is not divisible by ``n_heads``.
    """

    def __init__(self, seq_len, dim, n_heads, k, bias=True):
        super().__init__()
        if dim % n_heads != 0:
            # Otherwise the per-head reshape in forward() fails with an opaque error.
            raise ValueError(
                f'dim ({dim}) must be divisible by n_heads ({n_heads})')
        self.n_heads = n_heads
        self.scale = (dim // n_heads) ** -0.5  # 1 / sqrt(head_dim)
        self.qw = nn.Linear(dim, dim, bias = bias)
        self.kw = nn.Linear(dim, dim, bias = bias)
        self.vw = nn.Linear(dim, dim, bias = bias)

        # Sequence-axis projections for keys (E) and values (F).
        self.E = nn.Parameter(torch.randn(seq_len, k))
        self.F = nn.Parameter(torch.randn(seq_len, k))

        self.ow = nn.Linear(dim, dim, bias = bias)

    def forward(self, x):
        """Apply low-rank self-attention.

        Args:
            x: Tensor of shape ``(B, L, dim)`` with ``L <= seq_len``.

        Returns:
            Tensor of shape ``(B, L, dim)``.

        Raises:
            ValueError: If ``L`` exceeds the ``seq_len`` given at construction.
        """
        q = self.qw(x)
        k = self.kw(x)
        v = self.vw(x)

        B, L, D = q.shape
        max_len = self.E.shape[0]
        if L > max_len:
            raise ValueError(
                f'sequence length {L} exceeds the configured seq_len {max_len}')

        # q: (B, heads, L, head_dim)
        q = torch.reshape(q, [B, L, self.n_heads, -1])
        q = torch.permute(q, [0, 2, 1, 3])
        # k, v: (B, heads, head_dim, L) so the sequence axis is last and can be
        # right-multiplied by the (L, k) projection matrices.
        k = torch.reshape(k, [B, L, self.n_heads, -1])
        k = torch.permute(k, [0, 2, 3, 1])
        v = torch.reshape(v, [B, L, self.n_heads, -1])
        v = torch.permute(v, [0, 2, 3, 1])

        # Shorter inputs use only the first L rows of E and F.
        k = torch.matmul(k, self.E[:L, :])  # (B, heads, head_dim, k)
        v = torch.matmul(v, self.F[:L, :])  # (B, heads, head_dim, k)
        v = torch.permute(v, [0, 1, 3, 2])  # (B, heads, k, head_dim)

        qk = torch.matmul(q, k) * self.scale  # (B, heads, L, k)
        attn = torch.softmax(qk, dim=-1)
        v_attn = torch.matmul(attn, v)  # (B, heads, L, head_dim)
        # Merge the heads back into the model dimension.
        v_attn = torch.permute(v_attn, [0, 2, 1, 3])
        v_attn = torch.reshape(v_attn, [B, L, D])

        return self.ow(v_attn)

# Smoke test: a length-255 input (one shorter than seq_len=256) keeps its shape.
test_layer = LinformerAttention(256, 32, 2, 64)
test_layer(torch.ones([1, 255, 32])).shape

torch.Size([1, 255, 32])

In [25]:
class TransformerBlock(nn.Module):
    """Pre-norm encoder block: Linformer attention followed by a GELU MLP.

    Each sub-layer is applied to a LayerNorm of its input and added back
    to that input through a residual connection.

    Args:
        seq_len: Maximum sequence length passed to the attention layer.
        dim: Model dimension.
        heads: Number of attention heads.
        mlp_dim: Hidden width of the feed-forward network.
        k: Projected sequence length used by the attention layer.
        rate: Dropout probability inside the feed-forward network.
    """

    def __init__(self, seq_len, dim, heads, mlp_dim, k, rate=0.0):
        super().__init__()
        self.ln_1 = nn.LayerNorm(dim)
        self.attn = LinformerAttention(seq_len, dim, heads, k)
        self.ln_2 = nn.LayerNorm(dim)
        feed_forward = [
            nn.Linear(dim, mlp_dim),
            nn.GELU(),
            nn.Dropout(rate),
            nn.Linear(mlp_dim, dim),
            nn.Dropout(rate),
        ]
        self.mlp = nn.Sequential(*feed_forward)

    def forward(self, x):
        """Return the block output; same shape as ``x``."""
        attended = self.attn(self.ln_1(x)) + x
        transformed = self.mlp(self.ln_2(attended))
        return transformed + attended

# Smoke test: the block returns a tensor with the same shape as its input.
test_layer = TransformerBlock(256, 64, 2, 256, 64)
test_layer(torch.ones([1, 256, 64])).shape

torch.Size([1, 256, 64])

In [26]:
class Transformer(nn.Module):
    """Token-level Linformer encoder with a linear classification head.

    Token ids are embedded, a learned positional embedding is added, the
    result passes through ``depth`` TransformerBlocks, and the representation
    at position 0 is fed to a LayerNorm + Linear head.

    Args:
        seq_len: Maximum sequence length handed to every attention layer.
        dim: Model (embedding) dimension.
        vocab_size: Number of rows of the token embedding table.
        maxlen: Number of positions in the learned positional embedding.
        depth: Number of transformer blocks.
        heads: Number of attention heads per block.
        mlp_dim: Hidden width of each block's feed-forward network.
        k: Projected sequence length used by the attention layers.
        rate: Dropout probability inside the feed-forward networks.
        num_classes: Number of output logits (defaults to 6, the previously
            hard-coded value).
    """

    def __init__(self, seq_len, dim, vocab_size, maxlen, depth=3,
                 heads=4, mlp_dim=512, k=32, rate=0.0, num_classes=6):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, dim)
        self.pos_embedding = nn.Parameter(
            torch.randn(1, maxlen, dim))

        self.transformer = nn.Sequential(*[
            TransformerBlock(seq_len, dim, heads, mlp_dim, k, rate)
            for _ in range(depth)
        ])

        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes),
        )

    def forward(self, x):
        """Classify a batch of token-id sequences.

        Args:
            x: Integer tensor of shape ``(B, L)`` with ``L <= maxlen``.

        Returns:
            Logits of shape ``(B, num_classes)``.

        Raises:
            ValueError: If ``L`` exceeds the number of learned positions.
        """
        _, L = x.shape
        max_positions = self.pos_embedding.shape[1]
        if L > max_positions:
            raise ValueError(
                f'sequence length {L} exceeds maxlen {max_positions}')
        # Out-of-place add: do not mutate the embedding layer's output tensor.
        x = self.embedding(x) + self.pos_embedding[:, :L]
        x = self.transformer(x)
        # The representation at the first position summarises the sequence.
        return self.mlp_head(x[:, 0])

# Model hyperparameters
model_dim = 128
depth = 1
mlp_dim = 256
heads = 4

# maxlen is passed twice: positionally as the attention layers' seq_len and by
# keyword as the size of the positional embedding.
transformer = Transformer(maxlen, 
    dim=model_dim, vocab_size=vocab_size, 
    maxlen=maxlen, depth=depth, heads=heads, mlp_dim=mlp_dim)

# Smoke test on the batch sampled earlier: one row of 6 logits per example.
test_out = transformer(train_batch)
test_out.shape

torch.Size([64, 6])

## 4.- Entrenamiento

In [27]:
# Module-level loss shared by the train() and test() functions defined below.
loss_fn = torch.nn.CrossEntropyLoss()

In [28]:
def train(model, device, train_loader, optimizer, epoch, criterion=None):
    """Train ``model`` for one pass over ``train_loader`` and print a summary.

    Args:
        model: Module mapping an input batch to ``(batch, classes)`` logits.
        device: Device the batches are moved to before the forward pass.
        train_loader: Iterable yielding ``(inputs, labels)`` batches.
        optimizer: Optimizer updating ``model``'s parameters.
        epoch: Epoch number, used only in the printed message.
        criterion: Loss callable ``criterion(outputs, labels)``. Defaults to
            the module-level ``loss_fn``.

    Returns:
        The mean of the per-batch losses (0.0 if the loader yields nothing).
    """
    if criterion is None:
        criterion = loss_fn

    start = time.time()
    running_loss = 0.0
    num_batches = 0
    model.train()
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        # The logits are used as returned: squeezing them would collapse a
        # batch of one from (1, C) to (C,), which no longer matches the
        # (1,)-shaped labels and makes the loss raise.
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1

    avg_loss = running_loss / max(num_batches, 1)
    print(f'\nTime for epoch {epoch} is {time.time()-start:.4f} sec Train loss: {avg_loss:.4f}')
    return avg_loss

In [29]:
def test(model, device, test_loader):
    start = time.time()
    running_loss = 0.0
    running_acc = 0.0
    with torch.no_grad():
        model.eval()
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = model(inputs)
            outputs = outputs.squeeze()
            loss = loss_fn(outputs, labels)
            _, pred = torch.max(outputs.data, 1)
            running_acc += (pred == labels).sum().item()
            running_loss += loss.item()

    print(f'Time for eval is {time.time()-start:.4f} sec Val loss: {running_loss / len(test_loader):.4f}')
    print(f'Val acc: {running_acc / len(test_loader.dataset):.4f}')

In [30]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [31]:
# Move the model to the target device before creating the optimizer over its parameters.
transformer.to(device)

transformer_optimizer = optim.Adam(transformer.parameters(), lr=0.001)

In [32]:
epochs = 7

# One training pass followed by one validation pass per epoch (epochs are numbered from 0).
for epoch in range(epochs):
    train(transformer, device, train_loader, transformer_optimizer, epoch)
    test(transformer, device, val_loader)


Time for epoch 0 is 1.8189 sec Train loss: 1.1645
Time for eval is 0.2072 sec Val loss: 0.7601
Val acc: 0.6948

Time for epoch 1 is 1.7263 sec Train loss: 0.6239
Time for eval is 0.1767 sec Val loss: 0.5574
Val acc: 0.7607

Time for epoch 2 is 1.6697 sec Train loss: 0.4916
Time for eval is 0.1882 sec Val loss: 0.5001
Val acc: 0.7976

Time for epoch 3 is 1.6289 sec Train loss: 0.4265
Time for eval is 0.1731 sec Val loss: 0.4708
Val acc: 0.7966

Time for epoch 4 is 1.7037 sec Train loss: 0.3760
Time for eval is 0.1797 sec Val loss: 0.4704
Val acc: 0.8028

Time for epoch 5 is 1.6937 sec Train loss: 0.3469
Time for eval is 0.1766 sec Val loss: 0.4818
Val acc: 0.8141

Time for epoch 6 is 1.6928 sec Train loss: 0.3162
Time for eval is 0.1973 sec Val loss: 0.4897
Val acc: 0.8197
