<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/milmor/NLP/blob/main/Notebooks/18_DistilBERT_hf_cyberbullying.ipynb">
    <img src="https://www.tensorflow.org/images/colab_logo_32px.png" />
    Run in Google Colab</a>
  </td>
</table>

# Clasificación de cyberbullying con DistilBERT

- Dataset: https://www.kaggle.com/datasets/andrewmvd/cyberbullying-classification

In [1]:
import warnings
warnings.simplefilter('ignore')
import torch
import pandas as pd
import transformers
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import DistilBertTokenizer, DistilBertModel

torch.__version__, transformers.__version__

('2.2.1+cu121', '4.39.1')

## 1.- Conjuntos de datos

In [2]:
# Load the raw tweets CSV from the notebook's working directory.
df = pd.read_csv('./cyberbullying_tweets.csv')   

In [3]:
# List the distinct class names present in the label column.
df['cyberbullying_type'].unique()

array(['not_cyberbullying', 'gender', 'religion', 'other_cyberbullying',
       'age', 'ethnicity'], dtype=object)

In [4]:
# Count how many rows fall into each class.
count = df['cyberbullying_type'].value_counts()
count

cyberbullying_type
religion               7998
age                    7992
gender                 7973
ethnicity              7961
not_cyberbullying      7945
other_cyberbullying    7823
Name: count, dtype: int64

In [5]:
# Map every class name to the integer id used as the training target.
labels_dict = {
    'not_cyberbullying': 0,
    'gender': 1,
    'religion': 2,
    'other_cyberbullying': 3,
    'age': 4,
    'ethnicity': 5
}

# `replace` leaves values that are not keys of the mapping untouched, so re-running
# this cell on already-converted integers is harmless. The explicit cast pins the
# column dtype instead of relying on implicit downcasting, and raises immediately if
# some class name was not covered by `labels_dict`.
df['cyberbullying_type'] = df['cyberbullying_type'].replace(labels_dict).astype('int64')
df.head()

Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was cra...",0
1,Why is #aussietv so white? #MKR #theblock #ImA...,0
2,@XochitlSuckkks a classy whore? Or more red ve...,0
3,"@Jason_Gio meh. :P thanks for the heads up, b...",0
4,@RudhoeEnglish This is an ISIS account pretend...,0


In [6]:
# Keep only the tweet text and its integer label, exposed under the column
# names `text` and `labels`.
hf_df = df[['tweet_text', 'cyberbullying_type']].rename(
    columns={'tweet_text': 'text', 'cyberbullying_type': 'labels'}
)

In [7]:
# Preview the first rows of the two-column frame.
hf_df.head()

Unnamed: 0,text,labels
0,"In other words #katandandre, your food was cra...",0
1,Why is #aussietv so white? #MKR #theblock #ImA...,0
2,@XochitlSuckkks a classy whore? Or more red ve...,0
3,"@Jason_Gio meh. :P thanks for the heads up, b...",0
4,@RudhoeEnglish This is an ISIS account pretend...,0


## 2.- Pipeline

In [8]:
# Fixed sequence length passed to the dataset as `max_len`.
maxlen = 128
# Number of examples per batch, shared by the train and test DataLoaders.
batch_size = 4

# NOTE(review): `truncation=True` looks like an encode-time option rather than a
# tokenizer constructor option, so it is probably a no-op here — confirm. Truncation
# is requested explicitly when each example is encoded in the dataset class.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', truncation=True, do_lower_case=True)

In [9]:
class MultiLabelDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Despite the name, every item carries a single integer class id: the target
    is returned as a scalar int64 tensor.

    Args:
        dataframe: DataFrame with a `text` column and a `labels` column.
        tokenizer: Object exposing `encode_plus(...)` that returns a mapping with
            `input_ids`, `attention_mask` and `token_type_ids`.
        max_len: Sequence length every example is padded / truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.text)

    def __getitem__(self, index):
        """Tokenize the example at position `index`.

        Returns:
            dict with long tensors `ids`, `mask`, `token_type_ids` and a scalar
            int64 tensor `targets`.
        """
        # Positional access: `index` is a 0-based position, which only coincides
        # with the DataFrame's labels when its index has been reset.
        text = str(self.text.iloc[index])
        # Collapse every run of whitespace (including newlines) into one space.
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # Replaces the deprecated `pad_to_max_length=True`.
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.int64)
        }

In [10]:
# Sample 85% of the rows for training with a fixed seed; the remaining rows form
# the held-out set. Both frames get a fresh 0..n-1 index.
train_size = 0.85
train_df = hf_df.sample(frac=train_size, random_state=123)
test_df = hf_df.drop(train_df.index).reset_index(drop=True)
train_df = train_df.reset_index(drop=True)

# Wrap each frame in the tokenizing dataset.
train_data = MultiLabelDataset(train_df, tokenizer, maxlen)
test_data = MultiLabelDataset(test_df, tokenizer, maxlen)
len(train_data), len(test_data)

(40538, 7154)

In [11]:
# Both loaders share the batch size and load in the main process; only the
# training loader shuffles.
test_params = dict(batch_size=batch_size, num_workers=0)
train_params = dict(test_params, shuffle=True)

train_loader = DataLoader(train_data, **train_params)
test_loader = DataLoader(test_data, **test_params)

In [12]:
# Pull a single batch from the training loader to inspect / smoke-test with.
train_batch = next(iter(train_loader))

## 3.- Modelo

In [13]:
class DistilBERTClass(torch.nn.Module):
    """Pretrained DistilBERT encoder followed by a small classification head.

    The head is Linear -> tanh -> dropout -> Linear, applied to the hidden state
    of the first token of each sequence.

    Args:
        num_classes: Number of output logits. Defaults to 6.
        dropout: Dropout probability applied before the final layer. Defaults to 0.3.
    """

    def __init__(self, num_classes=6, dropout=0.3):
        super().__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        # Read the encoder width from its config instead of hard-coding it.
        hidden_size = self.l1.config.dim
        self.pre_classifier = torch.nn.Linear(hidden_size, hidden_size)
        self.dropout = torch.nn.Dropout(dropout)
        self.classifier = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Return one row of raw (unnormalized) logits per input sequence.

        Args:
            input_ids: Token ids, passed to the encoder.
            attention_mask: Padding mask, passed to the encoder.
            token_type_ids: Accepted for caller compatibility only; it is not
                forwarded to the encoder.
        """
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)

        # First element of the encoder output; keep only the first position of
        # every sequence as the pooled representation.
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)

        pooler = torch.tanh(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

# Unpack the sample batch into the tensors the model and loss expect.
ids = train_batch['ids']
mask = train_batch['mask']
token_type_ids = train_batch['token_type_ids']
targets = train_batch['targets']

# Build the model and run one forward pass on the sample batch as a sanity check;
# the cell displays the resulting logits (one row per example, one column per class).
model = DistilBERTClass()
outputs = model(ids, mask, token_type_ids)
outputs

tensor([[ 0.0675,  0.2344,  0.1693,  0.0497, -0.1053,  0.2172],
        [-0.0024,  0.1938,  0.2003, -0.0589,  0.0151,  0.0638],
        [-0.0726,  0.2102,  0.0098,  0.0473, -0.0786,  0.0720],
        [-0.1934,  0.1338,  0.0042,  0.1424, -0.0898, -0.1170]],
       grad_fn=<AddmmBackward0>)

In [14]:
# Use the first CUDA device when one is available; otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [15]:
# Move the model to the selected device before building the optimizer over its parameters.
model.to(device)
# Learning rate handed to Adam below.
lr = 1e-05
# Cross-entropy between the model's raw logits and the integer class targets.
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

## 4.- Entrenamiento

In [16]:
import time
from tqdm import tqdm

In [17]:
def test(model, device, test_loader):
    start = time.time()
    running_loss = 0.0
    running_acc = 0.0
    with torch.no_grad():
        model.eval()
        for data in test_loader:
            ids = data['ids'].to(device)
            mask = data['mask'].to(device)
            token_type_ids = data['token_type_ids'].to(device)
            labels = data['targets'].to(device)

            outputs = model(ids, mask, token_type_ids)
            loss = loss_fn(outputs, labels)
            _, pred = torch.max(outputs.data, 1)
            running_acc += (pred == labels).sum().item()
            running_loss += loss.item()

    val_acc = running_acc / len(test_loader.dataset)
    print(f'Time for eval is {time.time()-start:.4f} sec Val loss: {running_loss / len(test_loader):.4f}')
    print(f'Val acc: {val_acc:.4f}')
    return val_acc

In [18]:
# Number of training examples behind the loader.
len(train_loader.dataset)

40538

In [19]:
def train(epoch, train_loader, test_loader, interval=300, target_acc=0.82):
    """Run one pass over `train_loader`, validating every `interval` steps.

    Uses the notebook-level `model`, `device`, `optimizer` and `loss_fn`.
    Validation also runs at step 0, i.e. before the first weight update.

    Args:
        epoch: Index of the current epoch. Not used by the body; kept so existing
            calls keep working.
        train_loader: Iterable of dict batches with keys `ids`, `mask`,
            `token_type_ids` and `targets`.
        test_loader: Loader handed to `test` for the periodic validation.
        interval: Number of training steps between two validations.
        target_acc: Validation accuracy above which the pass stops early.
    """
    model.train()
    # Wrap the loader itself (not the enumerate object) so tqdm can show the total.
    for step, data in enumerate(tqdm(train_loader)):
        ids = data['ids'].to(device)
        mask = data['mask'].to(device)
        token_type_ids = data['token_type_ids'].to(device)
        labels = data['targets'].to(device)

        optimizer.zero_grad()
        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, labels)

        if step % interval == 0:
            print(f'Train loss: {loss.item():.4f}')
            val_acc = test(model, device, test_loader)
            if val_acc > target_acc:
                # Good enough: stop without applying this step's update.
                break
            # Validation may have left the model in eval mode; re-enable training
            # behaviour (dropout) for the steps that follow.
            model.train()

        loss.backward()
        optimizer.step()

In [20]:
# Number of passes over the training loader.
epochs = 1

In [21]:
# Run the configured number of passes; each call trains and periodically validates.
for epoch in range(epochs):
    train(epoch, train_loader, test_loader)

0it [00:00, ?it/s]

Train loss: 1.7862


8it [00:09,  1.12it/s]

Time for eval is 9.5582 sec Val loss: 1.7965
Val acc: 0.1698


296it [00:14, 56.55it/s]

Train loss: 0.8581


308it [00:24,  2.85it/s]

Time for eval is 9.5105 sec Val loss: 0.5894
Val acc: 0.7810


596it [00:29, 56.47it/s]

Train loss: 0.6005


608it [00:39,  2.84it/s]

Time for eval is 9.5617 sec Val loss: 0.4974
Val acc: 0.8030


896it [00:44, 56.09it/s]

Train loss: 0.4793


908it [00:54,  2.84it/s]

Time for eval is 9.5418 sec Val loss: 0.4719
Val acc: 0.8194


1196it [00:59, 56.35it/s]

Train loss: 0.1986


1208it [01:09,  2.84it/s]

Time for eval is 9.5611 sec Val loss: 0.4597
Val acc: 0.8126


1496it [01:14, 55.86it/s]

Train loss: 0.2052


1500it [01:24, 17.79it/s]

Time for eval is 9.6957 sec Val loss: 0.4303
Val acc: 0.8260





- Guardar el modelo entrenado y el vocabulario del tokenizador

In [22]:
import os

In [23]:
# Create the output directory if it does not exist yet.
os.makedirs('./models/', exist_ok=True)
output_model_file = './models/pytorch_distilbert_cyberbullying.bin'
output_vocab_file = './models/vocab_distilbert_cyberbullying.bin'

# NOTE(review): this serializes the whole module object rather than its
# `state_dict()`, so loading it presumably requires the `DistilBERTClass`
# definition to be available to the loader — confirm, and consider saving the
# state dict instead if the checkpoint must be used outside this notebook.
torch.save(model, output_model_file)
# NOTE(review): the vocabulary is written to a path ending in `.bin`; it is
# presumably a plain-text vocab file — confirm the extension is intended.
tokenizer.save_vocabulary(output_vocab_file)

('./models/vocab_distilbert_cyberbullying.bin',)