## Construcción de un modelo de clasificación de texto

In [9]:
from transformers import AutoTokenizer
from transformers import TrainingArguments
from transformers import Trainer
from transformers import AutoModelForSequenceClassification
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.metrics import accuracy_score, f1_score
from datasets import load_dataset
import numpy as np
import matplotlib.pyplot as plt
import torch
import numpy as np

# Single switch for hardware selection. Leave True to keep everything on the
# CPU; set to False to use CUDA when torch reports it as available.
FORCE_CPU = True

# String device name used when moving the model.
device = "cpu" if FORCE_CPU or not torch.cuda.is_available() else "cuda"

# Forwarded to TrainingArguments(use_cpu=...); derived from `device` so the two
# settings can never disagree.
use_cpu = device == "cpu"

In [5]:
# Checkpoint identifier; the same name is used further down to load the
# classification model, so tokenizer and model come from one checkpoint.
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [6]:
def tokenize(batch):
    """Tokenize the ``"text"`` entry of a batch with the notebook's tokenizer.

    Args:
        batch: Mapping whose ``"text"`` value holds the input strings.

    Returns:
        The result of calling ``tokenizer`` on those strings with
        ``padding=True`` and ``truncation=True``.
    """
    return tokenizer(batch["text"], padding=True, truncation=True)

def compute_metrics(pred):
    """Score a set of predictions with accuracy and weighted F1.

    Args:
        pred: Object exposing ``label_ids`` (the reference labels) and
            ``predictions`` (scores; the arg-max over the last axis is taken
            as the predicted class).

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_hat),
        "f1": f1_score(y_true, y_hat, average="weighted"),
    }

def plot_confusion_matrix(y_preds, y_true, labels):
    """Show a normalized confusion matrix for the given predictions.

    Args:
        y_preds: Predicted class ids.
        y_true: Reference class ids.
        labels: Display names used for the matrix ticks.

    The matrix is computed with ``normalize="true"`` and drawn on a 6x6 inch
    figure with two-decimal cell values and no colorbar.
    """
    matrix = confusion_matrix(y_true, y_preds, normalize="true")
    figure, axes = plt.subplots(figsize=(6, 6))
    ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=labels).plot(
        cmap="Blues",
        values_format=".2f",
        ax=axes,
        colorbar=False,
    )
    plt.title("Normalized confusion matrix")
    plt.show()

In [7]:
# Load the "emotion" dataset; the splits used later in this notebook are
# "train" and "validation".
emotions = load_dataset("emotion")

# Apply `tokenize` to every split in batched mode.
# NOTE(review): batch_size=None presumably hands each split to `tokenize` as a
# single batch, so padding=True yields one padded length per split — confirm
# against the `datasets` map documentation.
emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [10]:

# --- Training configuration -------------------------------------------------
batch_size = 64
# Roughly one log entry per epoch: train examples // batch size.
logging_steps = len(emotions_encoded["train"]) // batch_size
# `model_ckpt` comes from the tokenizer cell; it is deliberately not redefined
# here so the tokenizer and the model always share one checkpoint name.
model_name = f"{model_ckpt}-finetuned-emotion"

training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=2,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    disable_tqdm=False,
    logging_steps=logging_steps,
    # NOTE(review): presumably needs Hugging Face Hub credentials in the
    # environment — set to False for a purely local run.
    push_to_hub=True,
    use_cpu=use_cpu,
    log_level="error",
)

# --- Model ------------------------------------------------------------------
# Derive the size of the classification head from the dataset instead of
# hard-coding it.
num_labels = len(emotions_encoded["train"].features["label"].names)
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt, num_labels=num_labels
).to(device)

# --- Trainer ----------------------------------------------------------------
# `processing_class` replaces the deprecated `tokenizer` argument of Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=emotions_encoded["train"],
    eval_dataset=emotions_encoded["validation"],
    processing_class=tokenizer,
)

trainer.train()

  trainer = Trainer(model=model, args=training_args,


Step,Training Loss
250,0.8285
500,0.2505


TrainOutput(global_step=500, training_loss=0.5395275268554688, metrics={'train_runtime': 1768.9912, 'train_samples_per_second': 18.089, 'train_steps_per_second': 0.283, 'total_flos': 720342861696000.0, 'train_loss': 0.5395275268554688, 'epoch': 2.0})

In [None]:
# Run the trained model over the validation split; the result's `predictions`
# and `metrics` attributes are used below.
preds_output = trainer.predict(emotions_encoded["validation"])

# Last expression of the cell: displayed as its output.
preds_output.metrics

In [None]:
# Class names, read from the label feature of the training split.
labels = emotions["train"].features["label"].names

# Reference labels of the validation split as an array.
y_valid = np.array(emotions_encoded["validation"]["label"])

# Predicted class per example: index of the highest score along axis 1.
y_preds = preds_output.predictions.argmax(axis=1)

In [None]:
# Note the argument order of this helper: predictions first, then references.
plot_confusion_matrix(y_preds, y_valid, labels)

In [None]:
# Free-form sentences used to try out the fine-tuned classifier.
textos_prueba = [
    "I love the basketball court, but I don't like that it's unpainted, but I would definitely go back.",
    "I'm feeling really down today.",
    "This movie was absolutely hilarious!",
    "I have no opinion about the event.",
]

# Class names, indexed by class id.
label_names = emotions_encoded["train"].features["label"].names

inputs = tokenizer(textos_prueba, return_tensors="pt", padding=True, truncation=True)

# Move the inputs to wherever the model lives. A dedicated name is used so the
# notebook-level `device` setting is not overwritten with a different type.
model_device = model.device
inputs = {k: v.to(model_device) for k, v in inputs.items()}

# Switch to inference mode (e.g. dropout off) so the probabilities do not
# depend on whether an evaluation/prediction cell was run beforehand.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    probs = torch.softmax(logits, dim=-1)
    predicted_class_ids = torch.argmax(probs, dim=-1)

# Top prediction and its probability for each sentence.
for texto, class_id, prob in zip(textos_prueba, predicted_class_ids, probs):
    predicted_label = label_names[class_id.item()]
    confidence = prob[class_id].item()
    print(f"Texto: {texto}")
    print(f"→ Predicción: {predicted_label} (Confianza: {confidence:.4f})\n")

# Full probability distribution over all classes for each sentence.
for texto, prob in zip(textos_prueba, probs):
    print(f"Texto: {texto}")
    for label, p in zip(label_names, prob):
        print(f"  {label:12}: {p.item():.4f}")
    print()