# Imports

In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch import nn
from torch.utils.data import Dataset, TensorDataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Tokenizador e modelo do BERT
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
# Dataset de treino
train = pd.read_csv('./data/train.csv')
train.head(5)

Unnamed: 0,text,tokenized_text,homophobia,obscene,insult,racism,sexism,xenophobia
0,tipo lomotif quero fazer final ano,"['tipo', 'lomotif', 'querer', 'fazer', 'final'...",0,0,0,0,0,0
1,carioca exaltando rio janeiro coisas dao ranço...,"['carioca', 'exaltar', 'rio', 'Janeiro', 'cois...",0,0,0,0,0,0
2,dejen tomarse fotos trabajen verdad sean ridíc...,"['dejen', 'tomarse', 'foto', 'trabajen', 'verd...",0,0,1,0,0,0
3,lembro nada sei queria fechar pau ontem kkkkkk...,"['lembro', 'nada', 'saber', 'querer', 'fechar'...",0,1,0,0,0,0
4,deve saber diferenciar gênero orientação sexua...,"['dever', 'saber', 'diferenciar', 'gênero', 'o...",0,0,1,0,0,0


In [5]:
# Dataset de teste
test = pd.read_csv('test.csv')
test.head(5)

Unnamed: 0,text,tokenized_text,homophobia,obscene,insult,racism,sexism,xenophobia
0,caralho merda semana inteira pra escola vou de...,"['Caralho', 'Merda', 'semano', 'inteira', 'pra...",0,0,1,0,0,0
1,voc锚 diz desistiu incontrolavelmente continua ...,"['voc锚', 'dizer', 'desistir', 'incontrolavelme...",0,0,0,0,0,0
2,nom fica tranquilo vivo,"['nom', 'ficar', 'tranquilo', 'vivo']",0,0,0,0,0,0
3,beijo offgun tol puta merdaaahskdj,"['beijor', 'offgun', 'tol', 'puta', 'merdaaahs...",0,1,0,0,0,0
4,raiva vsf,"['raiva', 'vsf']",0,0,0,0,0,0


# Preparo dos dados

## Dataset de Treino

In [6]:
# Tokenização e conversão pra tensores
inputs = tokenizer(train['tokenized_text'].tolist(), truncation=True, padding=True, max_length=512)
input_ids = torch.tensor(inputs['input_ids'])
attention_mask = torch.tensor(inputs['attention_mask'])

In [7]:
# Preparo dos rótulos
labels = train[['homophobia', 'obscene', 'insult', 'racism', 'sexism', 'xenophobia']].values
labels = torch.tensor(labels, dtype=torch.float)

In [8]:
# Criação do dataset
train_dataset = TensorDataset(input_ids, attention_mask, labels)

In [9]:
class CustomDataset(Dataset):
    def __init__(self, tensor_dataset):
        self.tensor_dataset = tensor_dataset

    def __getitem__(self, idx):
        input_ids, attention_mask, labels = self.tensor_dataset[idx]
        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}

    def __len__(self):
        return len(self.tensor_dataset)

# Convertendo TensorDataset para CustomDataset
train_dataset = CustomDataset(train_dataset)

In [10]:
print(train_dataset[0])

{'input_ids': tensor([  101,  1031,  1005,  5955,  2080,  1005,  1010,  1005,  8840, 18938,
        10128,  1005,  1010,  1005, 10861, 14544,  1005,  1010,  1005,  6904,
         6290,  1005,  1010,  1005,  2345,  1005,  1010,  1005,  2019,  2080,
         1005,  1033,   102,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0, 

## Dataset de Validação

In [11]:
# Tokenização e conversão pra tensores
inputs = tokenizer(test['tokenized_text'].tolist(), truncation=True, padding=True, max_length=512)
input_ids = torch.tensor(inputs['input_ids'])
attention_mask = torch.tensor(inputs['attention_mask'])

In [12]:
# Preparo dos rótulos
labels = test[['homophobia', 'obscene', 'insult', 'racism', 'sexism', 'xenophobia']].values
labels = torch.tensor(labels, dtype=torch.float)

In [13]:
# Criação do dataset
test_dataset = TensorDataset(input_ids, attention_mask, labels)

In [14]:
test_dataset = CustomDataset(test_dataset)

In [15]:
print(test_dataset[0])

{'input_ids': tensor([  101,  1031,  1005, 14418, 28061,  1005,  1010,  1005, 21442,  2850,
         1005,  1010,  1005,  7367,  2386,  2080,  1005,  1010,  1005, 20014,
        21302,  1005,  1010,  1005, 10975,  2050,  1005,  1010,  1005,  9686,
        26289,  1005,  1010,  1005, 20868,  1005,  1010,  1005, 14866,  2595,
         2080,  1005,  1010,  1005,  9253, 21748,  2080,  1005,  1010,  1005,
         1047, 19658, 19658, 19658, 19658,  2243,  1005,  1010,  1005,  9353,
        28040, 17119,  1005,  1010,  1005,  4012, 14031,  1005,  1033,   102,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0, 

# Treinamento do Modelo

In [16]:
# Argumentos de treinamento
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',           
)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [17]:
# Inicializar o Trainer
trainer = Trainer(
    model=model,                         # O modelo a ser treinado
    args=training_args,                  # Argumentos de treinamento
    train_dataset=train_dataset,         # Conjunto de dados de treinamento
    eval_dataset=test_dataset            # Conjunto de dados de avaliação
)

In [18]:
# Check the type of the dataset
print("Type of test_dataset:", type(test_dataset))

# Check the length of the dataset
print("Length of test_dataset:", len(test_dataset))

# Check the type and value of the first item in the dataset
first_item = test_dataset[0]
print("Type of first item in test_dataset:", type(first_item))
print("First item in test_dataset:", first_item)

Type of test_dataset: <class '__main__.CustomDataset'>
Length of test_dataset: 6988
Type of first item in test_dataset: <class 'dict'>
First item in test_dataset: {'input_ids': tensor([  101,  1031,  1005, 14418, 28061,  1005,  1010,  1005, 21442,  2850,
         1005,  1010,  1005,  7367,  2386,  2080,  1005,  1010,  1005, 20014,
        21302,  1005,  1010,  1005, 10975,  2050,  1005,  1010,  1005,  9686,
        26289,  1005,  1010,  1005, 20868,  1005,  1010,  1005, 14866,  2595,
         2080,  1005,  1010,  1005,  9253, 21748,  2080,  1005,  1010,  1005,
         1047, 19658, 19658, 19658, 19658,  2243,  1005,  1010,  1005,  9353,
        28040, 17119,  1005,  1010,  1005,  4012, 14031,  1005,  1033,   102,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0, 

In [19]:
# check if cuda is available
print(torch.cuda.is_available())

True


In [20]:
trainer.train()

  1%|          | 92/13105 [11:18<26:59:44,  7.47s/it]

KeyboardInterrupt: 