In [72]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
import torch
import datasets
import evaluate
import pandas as pd
import numpy as np
import accelerate


# Load both CSV splits and discard the 'id' column right away.
# NOTE: the file named 'test_ptbr.csv' is the split used for evaluation
# during training further down in this notebook.
train_dataset = pd.read_csv('train_ptbr.csv').drop(columns=['id'])
validation_dataset = pd.read_csv('test_ptbr.csv').drop(columns=['id'])

# Every column after the first one is treated as a label name.
# Two lookup tables are derived from that list: name -> index and index -> name.
classes = list(train_dataset.columns[1:])
class2id = dict(zip(classes, range(len(classes))))
id2class = {index: name for name, index in class2id.items()}

def create_labels(row):
    """Return the label columns of ``row`` as a list of floats.

    Args:
        row: A row (pandas Series) that contains every column named in the
            module-level ``classes`` list.

    Returns:
        list[float]: The values of the ``classes`` columns, in ``classes``
        order, cast to float.
    """
    label_values = row[classes]
    return label_values.astype(float).tolist()

# Add a 'labels' column to each split holding the per-row float label vector
# produced by create_labels.
train_dataset = train_dataset.assign(
    labels=train_dataset.apply(create_labels, axis=1)
)
validation_dataset = validation_dataset.assign(
    labels=validation_dataset.apply(create_labels, axis=1)
)

In [73]:
# Environment check: dump the GPU status through the NVIDIA command-line tool,
# then report whether PyTorch itself can see a CUDA device.
!nvidia-smi
print("GPU available:", torch.cuda.is_available())

Thu Jul 10 03:13:00 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 576.57                 Driver Version: 576.57         CUDA Version: 12.9     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce GTX 1070      WDDM  |   00000000:01:00.0  On |                  N/A |
|  0%   49C    P0             34W /  180W |    3681MiB /   8192MiB |      7%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [74]:
# Checkpoint to fine-tune. Other candidates noted for this experiment:
#   - google-bert/bert-base-multilingual-cased
#   - distilbert/distilbert-base-multilingual-cased
model_name = "neuralmind/bert-base-portuguese-cased"

# Tokenizer loaded from the same checkpoint name as the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_function(example):
    """Tokenize the 'text' field of ``example`` with the module-level tokenizer.

    Truncation is enabled and padding is set to 'max_length' with a
    max_length of 128.

    Args:
        example: Mapping with a 'text' entry.

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    tokenizer_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': 128,
    }
    return tokenizer(example['text'], **tokenizer_options)

# Convert each pandas split into a datasets.Dataset and tokenize it.
# NOTE: `train_dataset` / `validation_dataset` are rebound from DataFrames to
# Dataset objects here, so re-running this cell on its own (without re-running
# the loading cell first) will likely fail.

# Training split.
train_dataset = datasets.Dataset.from_pandas(train_dataset)
train_tokenized_dataset = train_dataset.map(preprocess_function)

# Evaluation split.
validation_dataset = datasets.Dataset.from_pandas(validation_dataset)
validation_tokenized_dataset = validation_dataset.map(preprocess_function)

# Collator that batches examples using the same tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map: 100%|██████████| 2226/2226 [00:00<00:00, 5705.85 examples/s]
Map: 100%|██████████| 2226/2226 [00:00<00:00, 5555.66 examples/s]


In [75]:
# Single metric object that bundles accuracy, F1, precision and recall;
# used by compute_metrics below.
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

def sigmoid(x):
    """Element-wise logistic function ``1 / (1 + exp(-x))``, overflow-safe.

    The naive formula evaluates ``exp(-x)``, which overflows (and emits a
    RuntimeWarning) for large negative inputs. This version only ever
    exponentiates a non-positive number, so it cannot overflow.

    Args:
        x: Scalar or array-like of real numbers (e.g. model logits).

    Returns:
        Values in [0, 1] with the same shape as ``x``. Array inputs give an
        array (floating dtype is preserved); scalar inputs give a NumPy scalar.
    """
    x = np.asarray(x)
    # exp(-|x|) always lies in (0, 1], regardless of the sign or size of x.
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- algebraically identical.
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () unwraps a 0-d result to a scalar and is a no-op otherwise.
    return result[()]

def compute_metrics(eval_pred):
    """Score thresholded predictions against the reference labels.

    The logits go through ``sigmoid`` and are turned into 0/1 decisions with a
    0.5 threshold. Predictions and references are then flattened, so every
    individual (example, label) decision counts as one binary prediction for
    the combined metrics.

    Args:
        eval_pred: A pair ``(predictions, labels)`` of arrays.

    Returns:
        The result of ``clf_metrics.compute`` on the flattened arrays.
    """
    logits, references = eval_pred
    probabilities = sigmoid(logits)
    flat_predictions = (probabilities > 0.5).astype(int).reshape(-1)
    flat_references = references.astype(int).reshape(-1)
    return clf_metrics.compute(predictions=flat_predictions, references=flat_references)

In [76]:
# Load the pretrained checkpoint for sequence classification. The number of
# labels equals the number of label columns, and both label-name mappings are
# handed over together with the multi-label problem type.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(classes),
    id2label=id2class,
    label2id=class2id,
    problem_type="multi_label_classification"
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [77]:
# Per-class positive weights for the BCE loss: (#negative rows / #positive rows),
# computed from the training labels so that rarer labels weigh more.
label_matrix = np.stack(train_dataset['labels'])
class_counts = label_matrix.sum(axis=0)
# Clamp the denominator to at least 1: a label with zero positive rows would
# otherwise divide by zero and yield an infinite weight, which turns the loss
# into NaN. Labels with at least one positive row are unaffected.
class_weights = (label_matrix.shape[0] - class_counts) / np.maximum(class_counts, 1)
class_weights = torch.tensor(class_weights, dtype=torch.float32)

def custom_compute_loss(outputs, labels, num_items_in_batch=None):
    """Class-weighted binary cross-entropy on the model logits.

    Args:
        outputs: Model output object exposing a ``logits`` attribute.
        labels: Target tensor; cast to float before the loss is computed.
        num_items_in_batch: Accepted for signature compatibility; not used.

    Returns:
        The BCE-with-logits loss, using the module-level ``class_weights``
        (moved to the logits' device) as ``pos_weight``.
    """
    logits = outputs.logits
    weighted_bce = torch.nn.BCEWithLogitsLoss(
        pos_weight=class_weights.to(logits.device)
    )
    return weighted_bce(logits, labels.float())

In [78]:
# Fine-tuning configuration.
training_args = TrainingArguments(
    # Where outputs are written.
    output_dir="multilabel_emotion",
    # Optimisation settings.
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=3,
    per_device_eval_batch_size=3,
    # Evaluate and save once per epoch, then reload the best checkpoint
    # when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Wire model, data, metrics and the class-weighted loss into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_dataset,
    eval_dataset=validation_tokenized_dataset,
    # `tokenizer=` is deprecated in Trainer.__init__ (it triggers a
    # FutureWarning); `processing_class=` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # Replaces the model's built-in loss with the pos_weight-ed BCE above.
    compute_loss_func=custom_compute_loss,
)

# Run fine-tuning; as the last expression of the cell, the returned
# TrainOutput is displayed.
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.2184,1.051377,0.874513,0.598851,0.565041,0.636965
2,0.9974,1.040492,0.870246,0.609245,0.546742,0.687882


TrainOutput(global_step=1484, training_loss=1.0244938318298833, metrics={'train_runtime': 233.775, 'train_samples_per_second': 19.044, 'train_steps_per_second': 6.348, 'total_flos': 292853121878016.0, 'train_loss': 1.0244938318298833, 'epoch': 2.0})

In [80]:
text = "to com tanta dor na gengiva que ta me dando dor de cabeça pqp"

# Tokenize the sentence with the same truncation/padding settings as
# preprocess_function and move the tensors to the model's device.
encoding = tokenizer(text, truncation=True, padding='max_length', max_length=128, return_tensors='pt')
encoding.to(trainer.model.device)

# Inference only: put the model in evaluation mode explicitly (rather than
# relying on the state the Trainer left it in) and skip autograd bookkeeping.
trainer.model.eval()
with torch.no_grad():
    outputs = trainer.model(**encoding)

# Logits -> per-label probabilities -> 0/1 decisions at the 0.5 threshold.
predictions = outputs.logits.cpu().numpy()
predictions = sigmoid(predictions)
filtered_predictions = (predictions > 0.5).astype(int).reshape(-1)

predicted_labels = [id2class[i] for i, pred in enumerate(filtered_predictions) if pred == 1]
print(f"Texto: {text}\n")
print(f"Labels previstos: {predicted_labels}\n")


# Drop the batch dimension (a single sentence was encoded) and list every
# label from most to least probable.
predictions = predictions[0]
sorted_indices = np.argsort(predictions)[::-1]
print("Labels e suas respectivas probabilidades (desc):")
for idx in sorted_indices:
    print(f"- {id2class[idx]}: {predictions[idx]:.4f}")

Texto: to com tanta dor na gengiva que ta me dando dor de cabeça pqp

Labels previstos: ['fear', 'sadness']

Labels e suas respectivas probabilidades (desc):
- sadness: 0.9468
- fear: 0.6491
- anger: 0.4898
- surprise: 0.3998
- disgust: 0.3391
- joy: 0.0872
