<a href="https://colab.research.google.com/github/mjl-ai/TFM-Sentimiento-Politico-Transformers/blob/main/notebooks/TransformerLigeros_caso_balanceado_y_desbalanceado.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ✅ CÓDIGO 1 (COLAB) — DESBALANCEADO

Copia y pega tal cual en una celda (o en varias, pero es un solo script).

In [None]:
# ============================================================
# TFM - NLP Sentimiento Político (Transformers ligeros)
# NOTEBOOK 1/2: DESBALANCEADO
# Google Colab - transformers==4.44.2
#
# Entrenamiento SOLO con:
#   - text_tr         (normalizado con tildes, preserva contexto)
#   - text_tr_noacc   (normalizado sin tildes)
#
# EDA (solo análisis):
#   - EDA agresivo (stopwords+limpieza fuerte) para TF-IDF / Chi2 / WordCloud
#   - TF-IDF y Chi2: filtra términos {"marca","numero","número","escribe"}
#
# ZIP: incluye SOLO lo necesario para el TFM (PNG, CSV, MD, TXT, JSON)
#     NO incluye checkpoints pesados -> evita errores de memoria
# ============================================================

!pip -q install -U "transformers==4.44.2" datasets evaluate accelerate scikit-learn wordcloud ftfy

import os, re, json, gc, random, zipfile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch

from ftfy import fix_text
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc, roc_auc_score

import evaluate
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, DataCollatorWithPadding,
    set_seed, EarlyStoppingCallback
)
from wordcloud import WordCloud


# ---------------------------
# 1) CONFIG
# ---------------------------
# Single seed shared by transformers (set_seed), the stdlib RNG and NumPy,
# and reused later for the data split and TrainingArguments.
SEED = 42
set_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

# CSV that is read with pd.read_csv in the dataset-loading section.
GITHUB_RAW_URL = "https://raw.githubusercontent.com/danielalva2008/TFM/master/TFM_UNIR/DATASETS/2.-Dataset%20Trabajados/data_extended.csv"

# Scenario tag used in log lines and output file names.
SCENARIO = "desbalanceado"
ZIP_NAME = "TFM_Desbalanceado_outputs.zip"

# Output tree: figures and per-run reports live under OUTPUT_DIR.
# Both sub-directories are created immediately (module-level side effect).
OUTPUT_DIR = "/content/outputs_desbalanceado"
FIG_DIR    = os.path.join(OUTPUT_DIR, "figures")
REP_DIR    = os.path.join(OUTPUT_DIR, "reports")
os.makedirs(FIG_DIR, exist_ok=True)
os.makedirs(REP_DIR, exist_ok=True)

# Display name -> Hugging Face checkpoint id for every model to fine-tune.
MODELS = {
    "DistilBERT-es": "dccuchile/distilbert-base-spanish-uncased",
    "ALBETO Tiny": "dccuchile/albert-tiny-spanish",
    "MiniLM Multilingual": "microsoft/Multilingual-MiniLM-L12-H384",
}

# Column names expected in the CSV and the integer label -> name mapping.
TEXT_COL  = "full_text"
LABEL_COL = "label"
LABEL_NAME = {0: "negativo", 1: "neutro", 2: "positivo"}
VALID_LABELS = set(LABEL_NAME.keys())

# Per-class colors (bar charts)
CLASS_COLORS = {0: "red", 1: "gray", 2: "green"}

# Training
MAX_LEN = 128       # tokenizer max_length (inputs are truncated to this)
TEST_SIZE = 0.15    # fraction of the full dataset used for the test split
VAL_SIZE  = 0.15    # fraction of the full dataset used for the validation split

BATCH_SIZE = 16
GRAD_ACC = 1
# Optional alternative: derive GRAD_ACC from a target effective batch size.
#TARGET_EFFECTIVE_BATCH = 32           # <- adjust if desired
#GRAD_ACC = max(1, TARGET_EFFECTIVE_BATCH // BATCH_SIZE)

EPOCHS = 4
LR = 2e-5
# Mixed precision is enabled only when a CUDA device is available.
USE_FP16 = torch.cuda.is_available()
EARLY_STOPPING_PATIENCE = 2

# Only these two text variants are used for training
TEXT_VARIANTS_FOR_TRAIN = ["text_tr", "text_tr_noacc"]

# Terms filtered out ONLY in the TF-IDF / Chi2 keyword rankings
TFIDF_CHI2_BANNED = {"marca", "numero", "número", "escribe"}


# ---------------------------
# 2) HELPERS
# ---------------------------
def clear_gpu():
    """Best-effort release of Python garbage and cached CUDA memory.

    The garbage collector runs first so that objects which are no longer
    referenced (models, tensors caught in reference cycles) are destroyed
    before ``torch.cuda.empty_cache()`` is asked to release the cached,
    unoccupied GPU memory. Running the two steps in the opposite order
    leaves the memory freed by the collector sitting in the cache.

    Returns:
        None. Errors are swallowed on purpose: cleanup must never abort a
        training run.
    """
    try:
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    except Exception:
        # Deliberately best-effort: a failed cleanup is not worth a crash.
        pass

def save_fig(path):
    """Tighten the layout of the current figure, write it to ``path`` and close it.

    The file is written at 220 dpi with a tight bounding box.
    """
    current = plt.gcf()
    current.tight_layout()
    current.savefig(path, dpi=220, bbox_inches="tight")
    plt.close(current)

def fix_text_utf8(s: str) -> str:
    """Coerce ``s`` to ``str`` and pass it through ``ftfy.fix_text``."""
    as_text = str(s)
    return fix_text(as_text)

def remove_accents(text: str) -> str:
    """Replace accented Spanish vowels (and ü) with their plain counterparts.

    Both lowercase and uppercase forms are mapped; every other character,
    including ñ/Ñ, is left untouched. The input is coerced to ``str`` first.
    """
    plain_for = {'á': 'a', 'é': 'e', 'í': 'i', 'ó': 'o', 'ú': 'u', 'ü': 'u'}
    table = {}
    for accented, plain in plain_for.items():
        table[ord(accented)] = plain
        table[ord(accented.upper())] = plain.upper()
    # One translate pass is equivalent to the chained replacements because
    # no replacement character is itself a key of the table.
    return str(text).translate(table)

# ---------------------------
# 3) SLANG + STOPWORDS (Alva + ajuste mrd)
# ---------------------------
# Ordered list of (regex pattern, replacement) pairs. apply_slang() runs
# re.sub for each pair in this order, so later patterns see the output of
# earlier ones. The first group expands chat abbreviations; the second maps
# colloquial terms to a plainer word.
# NOTE(review): the "\+" rule inserts "mas" with no surrounding spaces, so
# "a+b" becomes "amasb" — confirm whether " mas " was intended.
SLANG = [
    (r"\bmrd\b", "mierda"),
    (r"\bd\b", "de"),
    (r"\b[qk]\b", "que"),
    (r"\bxo\b", "pero"),
    (r"\bxa\b", "para"),
    (r"\b[xp]q\b", "porque"),
    (r"\bes[qk]\b", "es que"),
    (r"\bfvr\b", "favor"),
    (r"\b(xfa|xf|pf|plis|pls|porfa)\b", "por favor"),
    (r"\bdnd\b", "donde"),
    (r"\btb\b", "también"),
    (r"\b(tq|tk)\b", "te quiero"),
    (r"\b(tqm|tkm)\b", "te quiero mucho"),
    (r"\bx\b", "por"),
    (r"\+", "mas"),
    (r"\bpiña\b", "mala suerte"),
    (r"\bagarre\b", "adulterio"),
    (r"\bampay\b", "verguenza"),
    (r"\bbacan\b", "alegria"),
    (r"\bbamba\b", "falsificado"),
    (r"\bcabeceador\b", "ladron"),
    (r"\bcabro\b", "homosexual"),
    (r"\bcachaciento\b", "burlon"),
    (r"\bcalabacita\b", "tonta"),
    (r"\bcaleta\b", "secreto"),
    (r"\bcana\b", "carcel"),
    (r"\bchucha\b", "molestia"),
    (r"\bchoro\b", "ladron"),
    (r"\bconchán\b", "conchudo"),
    (r"\bcutra\b", "ilicito"),
    (r"\bdark\b", "horrible"),
    (r"\blenteja\b", "torpe"),
    (r"\blorna\b", "tonto"),
    (r"\bmancar\b", "morir"),
    (r"\bmonse\b", "tonto"),
    (r"\bpiñata\b", "mala suerte"),
]

# Stop words (large list taken from Alva). They are used ONLY by the
# aggressive EDA cleaning (eda_aggressive); transformer training text keeps
# its stop words.
# NOTE(review): eda_aggressive tests individual whitespace-separated tokens
# against this set, so the multi-word entry 'por qué' can never match.
stop_words = set([
    'a', 'adelante', 'además', 'afirmó', 'agregó', 'ahi', 'ahora', 'ahí', 'al',
    'algo', 'alguna', 'algunas', 'alguno', 'algunos', 'algún', 'alrededor', 'ambos',
    'ampleamos', 'ante', 'anterior', 'antes', 'apenas', 'aproximadamente', 'aquel',
    'aquellas', 'aquellos', 'aqui', 'aquí', 'arriba', 'as', 'aseguró', 'asi', 'así',
    'atras', 'aunque', 'aún', 'ayer', 'b', 'bajo', 'bastante', 'bien', 'buen',
    'buena', 'buenas', 'bueno', 'buenos', 'c', 'cada', 'casi', 'cc', 'cerca',
    'cierta', 'ciertas', 'cierto', 'ciertos', 'cinco', 'comentó', 'como', 'con',
    'conocer', 'conseguimos', 'conseguir', 'considera', 'consideró', 'consigue',
    'consiguen', 'consigues', 'consigo', 'contra', 'cosas', 'creo', 'cual',
    'cuales', 'cualquier', 'cuando', 'cuanto', 'cuatro', 'cuenta', 'cómo', 'd',
    'da', 'dado', 'dan', 'dar', 'de', 'debe', 'deben', 'debido', 'decir', 'dejó',
    'del', 'demás', 'dentro', 'desde', 'después', 'dia', 'dice', 'dicen', 'dicho',
    'dieron', 'diferente', 'diferentes', 'dijeron', 'dijo', 'dio', 'do', 'don',
    'donde', 'dos', 'durante', 'e', 'ejemplo', 'el', 'ella', 'ellas', 'ello',
    'ellos', 'embargo', 'empleais', 'emplean', 'emplear', 'empleas', 'empleo',
    'en', 'encima', 'encuentra', 'entonces', 'entre', 'era', 'eramos', 'eran',
    'eras', 'eres', 'es', 'esa', 'esas', 'ese', 'eso', 'esos', 'esta', 'estaba',
    'estaban', 'estado', 'estais', 'estamos', 'estan', 'estar', 'estará', 'estas',
    'este', 'esto', 'estos', 'estoy', 'estuvo', 'ex', 'existe', 'existen',
    'explicó', 'expresó', 'f', 'fin', 'fue', 'fuera', 'fueron', 'fui', 'fuimos',
    'g', 'gran', 'grandes', 'gueno', 'h', 'ha', 'haber', 'había', 'habían',
    'habrá', 'hace', 'haceis', 'hacemos', 'hacen', 'hacer', 'hacerlo', 'haces',
    'hacia', 'haciendo', 'hago', 'han', 'has', 'hasta', 'hay', 'haya', 'he',
    'hecho', 'hemos', 'hicieron', 'hizo', 'hoy', 'hubo', 'i', 'igual', 'incluso',
    'indicó', 'informó', 'intenta', 'intentais', 'intentamos', 'intentan',
    'intentar', 'intentas', 'intento', 'ir', 'is', 'j', 'junto', 'k', 'l', 'la',
    'lado', 'largo', 'las', 'le', 'les', 'llegó', 'lleva', 'llevar', 'lo', 'los',
    'luego', 'lugar', 'm', 'manera', 'manifestó', 'mas', 'mayor', 'me', 'mediante',
    'mejor', 'mencionó', 'menos', 'mi', 'mientras', 'mio', 'mis', 'misma',
    'mismas', 'mismo', 'mismos', 'mo', 'modo', 'momento', 'mucha', 'muchas',
    'mucho', 'muchos', 'muy', 'más', 'n', 'nada', 'nadie', 'ni', 'ninguna',
    'ningunas', 'ninguno', 'ningunos', 'ningún', 'nos', 'nosotras', 'nosotros',
    'nuestra', 'nuestras', 'nuestro', 'nuestros', 'nueva', 'nuevas', 'nuevo',
    'nuevos', 'nunca', 'o', 'ocho', 'of', 'otra', 'otras', 'otro', 'otros',
    'p', 'pa', 'para', 'parece', 'parte', 'partir', 'pasada', 'pasado', 'pero',
    'pesar', 'poca', 'pocas', 'poco', 'pocos', 'podeis', 'podemos', 'poder',
    'podra', 'podrán', 'podria', 'podriais', 'podriamos', 'podrian', 'podrias',
    'podría', 'podrían', 'poner', 'por', 'porque', 'por qué', 'posible', 'pq',
    'primer', 'primera', 'primero', 'primeros', 'principalmente', 'propia',
    'propias', 'propio', 'propios', 'próximo', 'próximos', 'pt', 'pudo', 'pueda',
    'puede', 'pueden', 'puedo', 'pues', 'q', 'qie', 'qu', 'que', 'quedó',
    'queremos', 'quien', 'quienes', 'quiere', 'qué', 'quién', 'r', 'realizado',
    'realizar', 'realizó', 'respecto', 's', 'sabe', 'sabeis', 'sabemos', 'saben',
    'saber', 'sabes', 'se', 'sea', 'sean', 'segunda', 'segundo', 'según', 'seis',
    'sera', 'será', 'serán', 'sería', 'si', 'sido', 'siempre', 'siendo', 'siete',
    'sigue', 'siguiente', 'sin', 'sino', 'sobre', 'sois', 'sola', 'solamente',
    'solas', 'solo', 'solos', 'somos', 'son', 'soy', 'su', 'sua', 'sus', 'sí',
    'sólo', 'señaló', 't', 'tal', 'también', 'tampoco', 'tan', 'tanto', 'te',
    'tendrá', 'tendrán', 'teneis', 'tenemos', 'tener', 'tenga', 'tengo', 'tenía',
    'tenido', 'tercera', 'the', 'ti', 'tiempo', 'tiene', 'tienen', 'to', 'toda',
    'todas', 'todavía', 'todo', 'todos', 'total', 'trabaja', 'trabajais',
    'trabajamos', 'trabajan', 'trabajar', 'trabajas', 'trabajo', 'tras',
    'trata', 'través', 'tres', 'tu', 'tus', 'tuvo', 'tuyo', 'u', 'un', 'una',
    'unas', 'uno', 'unos', 'us', 'usa', 'usais', 'usamos', 'usan', 'usar',
    'usas', 'uso', 'usted', 'v', 'va', 'vais', 'valor', 'vamos', 'van',
    'varias', 'varios', 'vaya', 'veces', 'ven', 'ver', 'verdad', 'verdadera',
    'verdadero', 'vez', 'via', 'vosotras', 'vosotros', 'voy', 'w', 'x', 'y',
    'ya', 'yo', 'z', 'á', 'ésta', 'éste', 'éstas', 'éstos', 'última', 'últimas',
    'ultimo', 'último', 'últimos'
])


def apply_slang(t: str) -> str:
    """Return ``t`` after applying every SLANG (pattern, replacement) pair in order.

    Each substitution operates on the result of the previous one.
    """
    expanded = t
    for pattern, replacement in SLANG:
        expanded = re.sub(pattern, replacement, expanded)
    return expanded

# ---------------------------
# 4) TEXTO PARA ENTRENAMIENTO (preserva contexto)
# ---------------------------
def normalize_for_transformer(text: str, remove_tilde: bool = False) -> str:
    """Lightly normalize a tweet for transformer fine-tuning.

    Steps, in order:
      1. repair mojibake (ftfy) and lowercase;
      2. blank out URLs, @mentions and #hashtags;
      3. expand slang via ``apply_slang`` (e.g. mrd -> mierda);
      4. keep only a-z, accented vowels, ü, ñ and whitespace;
      5. collapse whitespace runs and trim.
    Stop words are NOT removed, so sentence context is preserved.

    Args:
        text: Raw tweet text (coerced to ``str`` by ``fix_text_utf8``).
        remove_tilde: When True, accents are additionally stripped and the
            text is re-filtered to a-z, ñ and whitespace.

    Returns:
        The normalized string.
    """
    def _keep_only(s, forbidden):
        # Blank out every character matched by `forbidden`, then squeeze spaces.
        s = re.sub(forbidden, " ", s)
        return re.sub(r"\s+", " ", s).strip()

    cleaned = fix_text_utf8(text).lower()
    for noise in (r"http\S+|www\.\S+", r"@\w+", r"#\w+"):
        cleaned = re.sub(noise, " ", cleaned)
    cleaned = apply_slang(cleaned)

    # Letters, accented vowels, ñ/ü and whitespace survive this pass.
    cleaned = _keep_only(cleaned, r"[^a-záéíóúñü\s]")
    if not remove_tilde:
        return cleaned

    return _keep_only(remove_accents(cleaned), r"[^a-zñ\s]")

# ---------------------------
# 5) TEXTO PARA EDA (agresivo, solo análisis)
# ---------------------------
# Lazily built, accent-free copy of stop_words (filled on first use).
_STOPWORDS_NOACC_CACHE = {}


def _stopwords_noacc():
    """Return ``stop_words`` with accents stripped, building the set once."""
    if "set" not in _STOPWORDS_NOACC_CACHE:
        _STOPWORDS_NOACC_CACHE["set"] = frozenset(remove_accents(w) for w in stop_words)
    return _STOPWORDS_NOACC_CACHE["set"]


def eda_aggressive(text: str) -> str:
    """Aggressively clean a tweet for exploratory analysis only.

    Steps, in order:
      1. repair mojibake (ftfy) and lowercase;
      2. blank out URLs, @mentions and #hashtags;
      3. expand slang via ``apply_slang``;
      4. strip accents;
      5. keep only a-z, ñ and whitespace, then collapse whitespace;
      6. drop tokens of length <= 2 and stop words.

    URLs, mentions and hashtags are removed *before* slang expansion (the
    same order ``normalize_for_transformer`` uses); otherwise a handle such
    as "@tq" is rewritten to "@te quiero" and "quiero" leaks into the text.

    Tokens are accent-free by the time they are filtered, so they are
    compared against an accent-free copy of ``stop_words``; comparing with
    the raw list lets words such as "tambien" or "despues" slip through
    because only their accented forms are listed.

    Args:
        text: Raw tweet text (coerced to ``str`` by ``fix_text_utf8``).

    Returns:
        The surviving tokens joined by single spaces (possibly empty).
    """
    t = fix_text_utf8(text).lower()
    t = re.sub(r"http\S+|www\.\S+", " ", t)
    t = re.sub(r"@\w+", " ", t)
    t = re.sub(r"#\w+", " ", t)
    t = apply_slang(t)
    t = remove_accents(t)
    t = re.sub(r"[^a-zñ\s]", " ", t)
    t = re.sub(r"\s+", " ", t).strip()
    blocked = _stopwords_noacc()
    return " ".join(w for w in t.split() if len(w) > 2 and w not in blocked)

def term_allowed_for_tfidf(term: str) -> bool:
    """Return True when no whitespace-separated word of ``term`` is in TFIDF_CHI2_BANNED."""
    words = term.split()
    return TFIDF_CHI2_BANNED.isdisjoint(words)

# ---------------------------
# 6) CARGA DATASET
# ---------------------------
# Download the CSV and reduce it to clean (text, label) rows.
print("📥 Leyendo CSV desde GitHub RAW ...")
df = pd.read_csv(GITHUB_RAW_URL)

# Fail early if the expected columns are missing.
assert TEXT_COL in df.columns, f"Falta columna: {TEXT_COL}"
assert LABEL_COL in df.columns, f"Falta columna: {LABEL_COL}"

# Keep only the two relevant columns; drop rows whose text is empty after trimming.
df = df[[TEXT_COL, LABEL_COL]].copy()
df[TEXT_COL] = df[TEXT_COL].fillna("").astype(str).str.strip()
df = df[df[TEXT_COL].str.len() > 0]

# Labels: non-numeric values become NaN and are dropped, the rest are cast
# to int and restricted to the ids declared in LABEL_NAME.
df[LABEL_COL] = pd.to_numeric(df[LABEL_COL], errors="coerce")
df = df.dropna(subset=[LABEL_COL])
df[LABEL_COL] = df[LABEL_COL].astype(int)
df = df[df[LABEL_COL].isin(VALID_LABELS)].reset_index(drop=True)

# Deduplicate on the raw text (rows that only become equal after
# normalization are kept).
df = df.drop_duplicates(subset=[TEXT_COL]).reset_index(drop=True)

# Derived columns: two training variants plus the aggressive EDA text.
print("🧹 Creando columnas...")
df["text_tr"] = df[TEXT_COL].apply(lambda x: normalize_for_transformer(x, remove_tilde=False))
df["text_tr_noacc"] = df[TEXT_COL].apply(lambda x: normalize_for_transformer(x, remove_tilde=True))
df["text_eda"] = df[TEXT_COL].apply(eda_aggressive)

df["label_name"] = df[LABEL_COL].map(LABEL_NAME)

print("✅ Shape final:", df.shape)
print("Distribución labels (ORIGINAL):")
print(df[LABEL_COL].value_counts().sort_index())

# Preview to validate the raw text against the training variants.
# NOTE(review): `display` is not imported in this file; presumably the
# notebook (IPython/Colab) provides it — confirm before running as a script.
print("\n🔎 EJEMPLO (raw vs text_tr vs text_tr_noacc):")
display(df[[TEXT_COL, "text_tr", "text_tr_noacc", LABEL_COL, "label_name"]].head(8))


# ---------------------------
# 7) SPLITS (DESBALANCEADO)
# ---------------------------
def split_stratified(df_in: pd.DataFrame, label_col: str, test_size=0.15, val_size=0.15, seed=42):
    """Split ``df_in`` into train / validation / test frames, stratified by label.

    A hold-out of size ``test_size + val_size`` is carved out first and then
    divided so that the test part keeps ``test_size`` of the original data.

    Args:
        df_in: Frame to split.
        label_col: Column used for stratification in both splits.
        test_size: Fraction of ``df_in`` for the test split.
        val_size: Fraction of ``df_in`` for the validation split.
        seed: ``random_state`` passed to both ``train_test_split`` calls.

    Returns:
        ``(train, validation, test)`` frames, each with a fresh 0..n-1 index.
    """
    holdout_fraction = test_size + val_size
    train_part, holdout_part = train_test_split(
        df_in,
        test_size=holdout_fraction,
        stratify=df_in[label_col],
        random_state=seed,
    )

    # Share of the hold-out that goes to the test split.
    test_share = test_size / holdout_fraction
    val_part, test_part = train_test_split(
        holdout_part,
        test_size=test_share,
        stratify=holdout_part[label_col],
        random_state=seed,
    )

    return tuple(
        part.reset_index(drop=True) for part in (train_part, val_part, test_part)
    )

# Stratified train/validation/test split; class proportions are left as they
# are in the data (no re-balancing is applied here).
train_df, val_df, test_df = split_stratified(df, LABEL_COL, TEST_SIZE, VAL_SIZE, SEED)

# Preview of the training split (raw text next to both training variants).
print("\n📌 HEAD TRAIN (desbalanceado):")
display(train_df[[TEXT_COL, "text_tr", "text_tr_noacc", LABEL_COL, "label_name"]].head(8))

# ---------------------------
# 8) EDA FIGURES (distribución + comparación)
# ---------------------------
def plot_class_distribution(df_in: pd.DataFrame, title: str, fname: str):
    """Save a bar chart of the number of rows per label to FIG_DIR/``fname``.

    Bars are ordered by label id, named after LABEL_NAME (capitalized) and
    colored with CLASS_COLORS (falling back to blue).
    """
    per_class = df_in[LABEL_COL].value_counts().sort_index()

    tick_labels = []
    bar_colors = []
    for class_id in per_class.index:
        tick_labels.append(LABEL_NAME[class_id].capitalize())
        bar_colors.append(CLASS_COLORS.get(class_id, "blue"))

    plt.figure(figsize=(6,4))
    plt.bar(tick_labels, per_class.values, color=bar_colors)
    plt.title(title)
    plt.ylabel("Número de tuits")
    plt.xlabel("Etiqueta")

    target = os.path.join(FIG_DIR, fname)
    save_fig(target)

# Class distribution of the full (deduplicated) dataset, before splitting.
plot_class_distribution(df, "Distribución de clases (original)", "01_dist_original.png")

# WordCloud por clase (usando text_eda)
def plot_wordclouds(df_in: pd.DataFrame, text_col: str, prefix: str):
    """Save one word cloud per label (0, 1, 2) built from ``text_col``.

    Labels whose concatenated text is empty are skipped. ``prefix`` is used
    in the figure title only; file names are derived from SCENARIO and the
    label id.
    """
    for lab in (0, 1, 2):
        class_rows = df_in[df_in[LABEL_COL] == lab]
        corpus = " ".join(class_rows[text_col].astype(str).tolist()).strip()
        if corpus:
            cloud = WordCloud(width=1200, height=600, background_color="white").generate(corpus)
            plt.figure(figsize=(12,6))
            plt.imshow(cloud, interpolation="bilinear")
            plt.axis("off")
            plt.title(f"{prefix} - WordCloud ({LABEL_NAME[lab]})")
            out_name = f"04_{SCENARIO}_wordcloud_{lab}.png"
            save_fig(os.path.join(FIG_DIR, out_name))

# Word clouds are built from the aggressive EDA text of the full dataset.
plot_wordclouds(df, "text_eda", "Dataset desbalanceado (EDA)")

# ---------------------------
# 9) TF-IDF TOP + TF-IDF+chi2 TOP (filtrando marca/numero/número/escribe)
# ---------------------------
def tfidf_top_by_class(df_in: pd.DataFrame, text_col="text_eda", top_n=20, ngram_range=(1,2), min_df=3):
    """Rank terms per label by their mean TF-IDF weight.

    A single TfidfVectorizer is fitted on all rows; for each label (0, 1, 2)
    the TF-IDF columns are averaged over that label's rows and the highest
    terms that pass ``term_allowed_for_tfidf`` are kept.

    Args:
        df_in: Frame holding ``text_col`` and LABEL_COL.
        text_col: Column with the text to vectorize.
        top_n: Maximum number of terms returned per label.
        ngram_range: Passed to TfidfVectorizer.
        min_df: Passed to TfidfVectorizer.

    Returns:
        ``{label: [(term, mean_tfidf), ...]}`` sorted by descending score.
        A label with no rows maps to an empty list instead of averaging an
        empty matrix.
    """
    texts = df_in[text_col].astype(str).tolist()
    y = df_in[LABEL_COL].astype(int).values

    vec = TfidfVectorizer(lowercase=True, ngram_range=ngram_range, min_df=min_df)
    X = vec.fit_transform(texts)
    terms = np.array(vec.get_feature_names_out())

    out = {}
    for lab in [0,1,2]:
        idx = np.where(y==lab)[0]
        if idx.size == 0:
            # No documents for this label: nothing to average.
            out[lab] = []
            continue
        mean_tfidf = np.asarray(X[idx].mean(axis=0)).ravel()
        order = np.argsort(mean_tfidf)[::-1]
        picked = []
        for j in order:
            # Checked before appending so that top_n=0 yields no terms.
            if len(picked) >= top_n:
                break
            term = terms[j]
            if term_allowed_for_tfidf(term):
                picked.append((term, float(mean_tfidf[j])))
        out[lab] = picked
    return out

def chi2_top_by_class(df_in: pd.DataFrame, text_col="text_eda", top_n=20, ngram_range=(1,2), min_df=3):
    """Rank terms per label by a one-vs-rest chi-squared score on TF-IDF features.

    A single TfidfVectorizer is fitted on all rows; for each label (0, 1, 2)
    ``chi2`` is computed against the binary target "row has this label" and
    the highest-scoring terms that pass ``term_allowed_for_tfidf`` are kept.

    Args:
        df_in: Frame holding ``text_col`` and LABEL_COL.
        text_col: Column with the text to vectorize.
        top_n: Maximum number of terms returned per label.
        ngram_range: Passed to TfidfVectorizer.
        min_df: Passed to TfidfVectorizer.

    Returns:
        ``{label: [(term, chi2_score), ...]}`` sorted by descending score.
        A label that is absent from the data (or is the only label present)
        maps to an empty list: the one-vs-rest target is then constant and
        the chi-squared scores are undefined (NaN), which a reversed argsort
        would place at the top of the ranking.
    """
    texts = df_in[text_col].astype(str).tolist()
    y = df_in[LABEL_COL].astype(int).values

    vec = TfidfVectorizer(lowercase=True, ngram_range=ngram_range, min_df=min_df)
    X = vec.fit_transform(texts)
    terms = np.array(vec.get_feature_names_out())

    out = {}
    for lab in [0,1,2]:
        target = (y==lab).astype(int)
        if target.min() == target.max():
            # Constant target: no contrast between "this label" and "rest".
            out[lab] = []
            continue
        scores, _ = chi2(X, target)
        order = np.argsort(scores)[::-1]
        picked = []
        for j in order:
            # Checked before appending so that top_n=0 yields no terms.
            if len(picked) >= top_n:
                break
            term = terms[j]
            if term_allowed_for_tfidf(term):
                picked.append((term, float(scores[j])))
        out[lab] = picked
    return out

def plot_kw_dict(kw_dict, title_prefix, fname_prefix, color_map=None):
    """Save one bar chart of (term, score) pairs per label.

    Args:
        kw_dict: ``{label: [(term, score), ...]}`` as returned by the
            TF-IDF / chi2 ranking helpers. Labels with no items are skipped.
        title_prefix: Text placed before the label name in the title.
        fname_prefix: File name prefix; the label id and ".png" are appended
            and the file is written under FIG_DIR.
        color_map: Optional ``{label: color}`` mapping. When omitted,
            CLASS_COLORS is used (previously this argument was ignored).
            Labels missing from the mapping are drawn in blue.
    """
    colors = CLASS_COLORS if color_map is None else color_map
    for lab in [0,1,2]:
        items = kw_dict.get(lab, [])
        if not items:
            continue
        words = [w for w, _ in items]
        vals = [s for _, s in items]
        plt.figure(figsize=(7,4))
        plt.bar(words, vals, color=colors.get(lab, "blue"))
        plt.title(f"{title_prefix} ({LABEL_NAME[lab]})")
        plt.xticks(rotation=75, ha="right")
        save_fig(os.path.join(FIG_DIR, f"{fname_prefix}_{lab}.png"))

# Top-20 terms per class by mean TF-IDF, computed on the aggressive EDA text.
kw_tfidf = tfidf_top_by_class(df, "text_eda", top_n=20)
plot_kw_dict(kw_tfidf, "TF-IDF Top 20 - desbalanceado", "05_desbalanceado_tfidf_top20")

# Top-20 terms per class by one-vs-rest chi-squared score on TF-IDF features.
kw_chi2 = chi2_top_by_class(df, "text_eda", top_n=20)
plot_kw_dict(kw_chi2, "TF-IDF + χ² Top 20 - desbalanceado", "06_desbalanceado_chi2_top20")


# ---------------------------
# 10) HF DATASETS (solo columnas necesarias -> menos RAM)
# ---------------------------
def to_hf_dataset(df_train, df_val, df_test):
    """Wrap the three pandas splits in a ``DatasetDict``.

    Only LABEL_COL and the columns in TEXT_VARIANTS_FOR_TRAIN are carried
    over; each frame's index is reset before conversion. The resulting keys
    are "train", "validation" and "test".
    """
    keep_cols = [LABEL_COL] + TEXT_VARIANTS_FOR_TRAIN
    frames = {"train": df_train, "validation": df_val, "test": df_test}
    return DatasetDict({
        split_name: Dataset.from_pandas(frame[keep_cols].reset_index(drop=True))
        for split_name, frame in frames.items()
    })

# Hugging Face view of the three splits, shared by every training run.
ds = to_hf_dataset(train_df, val_df, test_df)


# ---------------------------
# 11) METRICS
# ---------------------------
# Metric objects are loaded once here and reused by compute_metrics.
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 / precision / recall.

    Args:
        eval_pred: A ``(logits, labels)`` pair; predictions are the argmax
            of the logits over the last axis.

    Returns:
        Dict with the keys ``accuracy``, ``f1_macro``, ``precision_macro``
        and ``recall_macro``.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    scores = {
        "accuracy": accuracy_metric.compute(predictions=preds, references=labels)["accuracy"],
    }
    macro_metrics = (
        ("f1", f1_metric),
        ("precision", precision_metric),
        ("recall", recall_metric),
    )
    for metric_name, metric in macro_metrics:
        result = metric.compute(predictions=preds, references=labels, average="macro")
        scores[f"{metric_name}_macro"] = result[metric_name]
    return scores


# ---------------------------
# 12) PLOTS: CM azul con números legibles + ROC multiclass
# ---------------------------
def plot_confusion_matrix_blue(cm, title, fname):
    """Save a 3x3 confusion matrix as a blue heat map with bold cell counts.

    Cell text is white when the count exceeds half of the matrix maximum
    and black otherwise. Rows are labelled "Real" and columns "Predicción";
    the file is written to FIG_DIR/``fname``.
    """
    class_names = [LABEL_NAME[i] for i in (0, 1, 2)]
    positions = np.arange(3)

    # Text-color threshold: half of the largest cell (0.5 for an all-zero matrix).
    peak = cm.max()
    cutoff = peak / 2.0 if peak > 0 else 0.5

    plt.figure(figsize=(6,5))
    plt.imshow(cm, interpolation="nearest", cmap="Blues")
    plt.title(title)
    plt.colorbar()
    plt.xticks(positions, class_names)
    plt.yticks(positions, class_names)

    for row, col in np.ndindex(3, 3):
        count = cm[row, col]
        text_color = "white" if count > cutoff else "black"
        plt.text(col, row, int(count),
                 ha="center", va="center",
                 color=text_color,
                 fontsize=12, fontweight="bold")

    plt.ylabel("Real")
    plt.xlabel("Predicción")
    save_fig(os.path.join(FIG_DIR, fname))

def plot_multiclass_roc(y_true, y_proba, title, fname):
    """Save one-vs-rest ROC curves for labels 0, 1, 2 plus a micro-average curve.

    Args:
        y_true: True label ids.
        y_proba: Per-class scores, indexed as ``y_proba[:, class_id]``.
        title: Figure title.
        fname: File name under FIG_DIR.
    """
    class_ids = [0,1,2]
    onehot = label_binarize(y_true, classes=class_ids)

    plt.figure(figsize=(7,6))

    # One curve per class, each annotated with its own AUC.
    for col, class_id in enumerate(class_ids):
        fpr, tpr, _ = roc_curve(onehot[:, col], y_proba[:, col])
        class_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, label=f"{LABEL_NAME[class_id]} (AUC={class_auc:.3f})")

    # Micro-average: every (sample, class) cell is pooled into one binary problem.
    micro_fpr, micro_tpr, _ = roc_curve(onehot.ravel(), y_proba.ravel())
    micro_auc = auc(micro_fpr, micro_tpr)
    plt.plot(micro_fpr, micro_tpr, linestyle="--", label=f"micro-avg (AUC={micro_auc:.3f})")

    # Diagonal reference line.
    plt.plot([0,1],[0,1], linestyle=":", linewidth=1)
    plt.xlim([0.0,1.0])
    plt.ylim([0.0,1.05])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title(title)
    plt.legend(loc="lower right")
    save_fig(os.path.join(FIG_DIR, fname))


# ---------------------------
# 13) CARGA SEGURA (ALBETO use_fast=False preferido)
# ---------------------------
def load_tokenizer_and_model(model_key: str, model_ckpt: str):
    """Load tokenizer and 3-label classification model, retrying with the other tokenizer kind.

    The first attempt uses the slow tokenizer when the checkpoint id contains
    "albert" and the fast one otherwise; if anything in that attempt raises,
    a second attempt is made with the opposite ``use_fast`` value.

    Args:
        model_key: Display name of the model (not used by this function).
        model_ckpt: Hugging Face checkpoint id.

    Returns:
        ``(tokenizer, model, use_fast, "OK")`` on success, or
        ``(None, None, None, "FAIL: <last error>")`` when both attempts fail.
    """
    prefer_fast = "albert" not in model_ckpt.lower()
    failure = None

    for use_fast in (prefer_fast, not prefer_fast):
        id_to_name = {i: LABEL_NAME[i] for i in [0,1,2]}
        name_to_id = {LABEL_NAME[i]: i for i in [0,1,2]}
        try:
            tokenizer = AutoTokenizer.from_pretrained(model_ckpt, use_fast=use_fast)
            model = AutoModelForSequenceClassification.from_pretrained(
                model_ckpt,
                num_labels=3,
                id2label=id_to_name,
                label2id=name_to_id,
            )
        except Exception as exc:
            # Remember the error and fall through to the next attempt.
            failure = exc
        else:
            return tokenizer, model, use_fast, "OK"

    return None, None, None, f"FAIL: {failure}"

def tokenize_dataset(ds: DatasetDict, tokenizer, text_col: str):
    """Tokenize ``text_col`` of every split, truncating to MAX_LEN tokens.

    Returns the result of ``ds.map`` run in batched mode; padding is not
    applied here.
    """
    def _encode(batch):
        texts = batch[text_col]
        return tokenizer(texts, truncation=True, max_length=MAX_LEN)

    return ds.map(_encode, batched=True)


# ---------------------------
# 14) TRAIN / EVAL
# ---------------------------
def train_and_eval(model_name_key: str, model_ckpt: str, ds: DatasetDict, text_for_model: str):
    """Fine-tune one checkpoint on one text variant and evaluate it on the test split.

    Side effects: writes ``test_metrics.json`` and ``classification_report.txt``
    into a per-run folder under REP_DIR (the Trainer also saves its
    checkpoints there) and saves a confusion-matrix and a ROC figure under
    FIG_DIR.

    The test split is run through the model once: ``Trainer.predict`` is
    called with ``metric_key_prefix="eval"`` so that the metrics it returns
    keep the ``eval_*`` key names, instead of calling ``evaluate`` and
    ``predict`` back to back on the same data.

    Args:
        model_name_key: Display name of the model (used in logs and file names).
        model_ckpt: Hugging Face checkpoint id.
        ds: DatasetDict with "train", "validation" and "test" splits.
        text_for_model: Name of the text column to tokenize.

    Returns:
        A flat dict with scenario, text variant, model name, accuracy,
        macro F1 / precision / recall, macro one-vs-rest AUC, the tokenizer
        kind that loaded, and a status string. When loading fails the metric
        fields are NaN and ``status`` carries the failure message.
    """
    clear_gpu()
    print("\n==============================")
    print(f"Modelo: {model_name_key} | Escenario: {SCENARIO} | Texto: {text_for_model}")
    print(f"Checkpoint: {model_ckpt}")
    print(f"BATCH_SIZE={BATCH_SIZE} | GRAD_ACC={GRAD_ACC} | fp16={USE_FP16}")
    print("==============================")

    tokenizer, model, used_fast, status = load_tokenizer_and_model(model_name_key, model_ckpt)
    if tokenizer is None:
        print("❌ No se pudo cargar:", status)
        return {
            "scenario": SCENARIO, "text_for_model": text_for_model, "model": model_name_key,
            "accuracy": np.nan, "f1_macro": np.nan, "precision_macro": np.nan, "recall_macro": np.nan,
            "auc_macro_ovr": np.nan, "tokenizer_use_fast": None, "status": status
        }

    ds_tok = tokenize_dataset(ds, tokenizer, text_for_model)
    # Pads each batch dynamically instead of padding everything to MAX_LEN.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    run_dir = os.path.join(REP_DIR, f"{SCENARIO}_{text_for_model}_{model_name_key}".replace(" ","_"))
    os.makedirs(run_dir, exist_ok=True)

    args = TrainingArguments(
        output_dir=run_dir,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=2,                 # cap the number of checkpoints kept on disk
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        greater_is_better=True,
        learning_rate=LR,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRAD_ACC,
        num_train_epochs=EPOCHS,
        weight_decay=0.01,
        logging_steps=50,
        report_to="none",
        seed=SEED,
        fp16=USE_FP16,
        remove_unused_columns=True
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=ds_tok["train"],
        eval_dataset=ds_tok["validation"],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=EARLY_STOPPING_PATIENCE)]
    )

    trainer.train()

    # Single pass over the test split: predictions and eval_* metrics together.
    preds_out = trainer.predict(ds_tok["test"], metric_key_prefix="eval")
    test_metrics = preds_out.metrics

    y_true = preds_out.label_ids
    y_pred = np.argmax(preds_out.predictions, axis=-1)

    # Per-class report
    report = classification_report(
        y_true, y_pred,
        target_names=[LABEL_NAME[i] for i in [0,1,2]],
        digits=4
    )

    with open(os.path.join(run_dir, "test_metrics.json"), "w", encoding="utf-8") as f:
        json.dump(test_metrics, f, ensure_ascii=False, indent=2)
    with open(os.path.join(run_dir, "classification_report.txt"), "w", encoding="utf-8") as f:
        f.write(report)

    # Confusion matrix (blue, readable counts)
    cm = confusion_matrix(y_true, y_pred, labels=[0,1,2])
    cm_fname = f"cm_{SCENARIO}_{text_for_model}_{model_name_key}.png".replace(" ","_")
    plot_confusion_matrix_blue(cm, f"Matriz de confusión - {model_name_key} ({SCENARIO}) [{text_for_model}]", cm_fname)

    # ROC curves from softmax probabilities
    logits = preds_out.predictions
    probs = torch.softmax(torch.tensor(logits), dim=-1).numpy()
    roc_fname = f"roc_{SCENARIO}_{text_for_model}_{model_name_key}.png".replace(" ","_")
    plot_multiclass_roc(y_true, probs, f"ROC - {model_name_key} ({SCENARIO}) [{text_for_model}]", roc_fname)

    try:
        y_bin = label_binarize(y_true, classes=[0,1,2])
        auc_macro = roc_auc_score(y_bin, probs, average="macro", multi_class="ovr")
    except Exception as e:
        # Best-effort: keep the run, but say why the AUC is missing.
        print("⚠️ AUC macro (OvR) no disponible:", e)
        auc_macro = np.nan

    return {
        "scenario": SCENARIO,
        "text_for_model": text_for_model,
        "model": model_name_key,
        "accuracy": float(test_metrics.get("eval_accuracy", np.nan)),
        "f1_macro": float(test_metrics.get("eval_f1_macro", np.nan)),
        "precision_macro": float(test_metrics.get("eval_precision_macro", np.nan)),
        "recall_macro": float(test_metrics.get("eval_recall_macro", np.nan)),
        "auc_macro_ovr": float(auc_macro),
        "tokenizer_use_fast": bool(used_fast),
        "status": "OK"
    }


# ---------------------------
# 15) RUN EXPERIMENTS
# ---------------------------
# Train and evaluate every (model, text variant) combination sequentially.
results = []
for model_name_key, ckpt in MODELS.items():
    for text_for_model in TEXT_VARIANTS_FOR_TRAIN:
        results.append(train_and_eval(model_name_key, ckpt, ds, text_for_model))

# One row per run, grouped by text variant and ordered by descending macro F1.
res_df = pd.DataFrame(results).sort_values(["text_for_model","f1_macro"], ascending=[True, False]).reset_index(drop=True)

print("\n==============================")
print("RESULTADOS FINALES (DESBALANCEADO):")
print(res_df[["scenario","text_for_model","model","accuracy","f1_macro","precision_macro","recall_macro","auc_macro_ovr","status"]])
print("==============================\n")

# Persist the results table as CSV (all columns) and Markdown (summary columns).
csv_path = os.path.join(OUTPUT_DIR, "resultados_desbalanceado.csv")
md_path  = os.path.join(OUTPUT_DIR, "resultados_desbalanceado.md")
res_df.to_csv(csv_path, index=False, encoding="utf-8")

# The Markdown export is optional: any failure is reported and skipped.
try:
    md = res_df[["scenario","text_for_model","model","accuracy","f1_macro","precision_macro","recall_macro","auc_macro_ovr","status"]].to_markdown(index=False)
    with open(md_path, "w", encoding="utf-8") as f:
        f.write(md)
    print("\nTabla Markdown (guardada):", md_path)
except Exception as e:
    print("No se pudo generar MD:", e)

print("✅ CSV guardado:", csv_path)


# ---------------------------
# 16) ZIP (solo artefactos TFM, NO pesos)
# ---------------------------
# Bundle the lightweight artefacts (figures, tables, reports) into one ZIP.
zip_path = os.path.join("/content", ZIP_NAME)

# Only files with these extensions are archived.
INCLUDE_EXT = {".png", ".csv", ".md", ".txt", ".json"}

with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as z:
    for root, dirs, files in os.walk(OUTPUT_DIR):
        # The Trainer saves its checkpoints inside the per-run report folders.
        # Those "checkpoint-*" directories also hold .json/.txt files
        # (config, tokenizer, vocabulary, trainer state) that would pass the
        # extension filter, so prune them in place to keep os.walk out.
        dirs[:] = [d for d in dirs if not d.startswith("checkpoint-")]
        for f in files:
            ext = os.path.splitext(f)[1].lower()
            if ext in INCLUDE_EXT:
                full = os.path.join(root, f)
                rel  = os.path.relpath(full, OUTPUT_DIR)
                # Keep the output folder name as the top-level entry in the ZIP.
                z.write(full, arcname=os.path.join(os.path.basename(OUTPUT_DIR), rel))

print("\n✅ ZIP generado:", zip_path)
print("   En Colab: Files -> /content ->", ZIP_NAME)
print("✅ Figuras:", FIG_DIR)
print("✅ Reportes:", REP_DIR)


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m49.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m515.2/515.2 kB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m61.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m566.3/566.3 kB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.6/47.6 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Unnamed: 0,full_text,text_tr,text_tr_noacc,label,label_name
0,"Sin ser fan de Juan Gabriel, siempre supe que ...",sin ser fan de juan gabriel siempre supe que e...,sin ser fan de juan gabriel siempre supe que e...,2,positivo
1,Sabes que no tendrás un buen día cuando lo pri...,sabes que no tendrás un buen día cuando lo pri...,sabes que no tendras un buen dia cuando lo pri...,0,negativo
2,"En situaciones en las que no sepas que hacer, ...",en situaciones en las que no sepas que hacer s...,en situaciones en las que no sepas que hacer s...,0,negativo
3,ayer preguntaban y dónde están las solteras!!!...,ayer preguntaban y dónde están las solteras to...,ayer preguntaban y donde estan las solteras to...,1,neutro
4,Que el finde sea para hacer cualquier cosa que...,que el finde sea para hacer cualquier cosa que...,que el finde sea para hacer cualquier cosa que...,1,neutro
5,En el examen de geometría me estoy esforzando ...,en el examen de geometría me estoy esforzando ...,en el examen de geometria me estoy esforzando ...,1,neutro
6,"Elige amar, así duela, así parezca un imposibl...",elige amar así duela así parezca un imposible ...,elige amar asi duela asi parezca un imposible ...,1,neutro
7,Mi mamá compra pollo a la brasa de almuerzo y ...,mi mamá compra pollo a la brasa de almuerzo y ...,mi mama compra pollo a la brasa de almuerzo y ...,0,negativo



📌 HEAD TRAIN (desbalanceado):


Unnamed: 0,full_text,text_tr,text_tr_noacc,label,label_name
0,¡que la rata nos proteja!mi columna de hoy en ...,que la rata nos proteja mi columna de hoy en d...,que la rata nos proteja mi columna de hoy en d...,0,negativo
1,periodismoperu: ❗ no va más ❗ #danielmora ya n...,periodismoperu no va más ya no postula al por ...,periodismoperu no va mas ya no postula al por ...,0,negativo
2,aunque te revientes '@costagino voy a votar po...,aunque te revientes voy a votar por la estrell...,aunque te revientes voy a votar por la estrell...,2,positivo
3,#zenaidasolis del #partidomorado otra que se d...,del otra que se declara ahora anti fujimorista...,del otra que se declara ahora anti fujimorista...,0,negativo
4,@vivianxhv se alejaron fue doloroso. Zayn dice...,se alejaron fue doloroso zayn dice que se habl...,se alejaron fue doloroso zayn dice que se habl...,0,negativo
5,fujimorista ratas fuera de aqui rosabartra v...,fujimorista ratas fuera de aqui rosabartra vit...,fujimorista ratas fuera de aqui rosabartra vit...,0,negativo
6,duchodepe: ¿quién está detrás de la debacle de...,duchodepe quién está detrás de la debacle del ...,duchodepe quien esta detras de la debacle del ...,0,negativo
7,Soy una mrd de persona con buenos deseos,soy una mierda de persona con buenos deseos,soy una mierda de persona con buenos deseos,1,neutro


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]


Modelo: DistilBERT-es | Escenario: desbalanceado | Texto: text_tr
Checkpoint: dccuchile/distilbert-base-spanish-uncased
BATCH_SIZE=16 | GRAD_ACC=1 | fp16=False


tokenizer_config.json:   0%|          | 0.00/361 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/530 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/269M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at dccuchile/distilbert-base-spanish-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3364 [00:00<?, ? examples/s]

Map:   0%|          | 0/721 [00:00<?, ? examples/s]

Map:   0%|          | 0/722 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Macro,Recall Macro
1,0.7532,0.750664,0.682386,0.604357,0.679019,0.613949
2,0.5922,0.697754,0.712899,0.692227,0.691944,0.696422
3,0.4682,0.699775,0.719834,0.692246,0.70007,0.688248
4,0.3744,0.721872,0.718447,0.694248,0.698606,0.691882







Modelo: DistilBERT-es | Escenario: desbalanceado | Texto: text_tr_noacc
Checkpoint: dccuchile/distilbert-base-spanish-uncased
BATCH_SIZE=16 | GRAD_ACC=1 | fp16=False


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at dccuchile/distilbert-base-spanish-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3364 [00:00<?, ? examples/s]

Map:   0%|          | 0/721 [00:00<?, ? examples/s]

Map:   0%|          | 0/722 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Macro,Recall Macro
1,0.7682,0.739889,0.693481,0.629041,0.705202,0.63109
2,0.6134,0.674494,0.708738,0.683858,0.68497,0.685584
3,0.4528,0.68778,0.726768,0.700568,0.710313,0.695881
4,0.3611,0.704503,0.719834,0.697261,0.700601,0.694772







Modelo: ALBETO Tiny | Escenario: desbalanceado | Texto: text_tr
Checkpoint: dccuchile/albert-tiny-spanish
BATCH_SIZE=16 | GRAD_ACC=1 | fp16=False


tokenizer_config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]



config.json:   0%|          | 0.00/828 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/21.7M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at dccuchile/albert-tiny-spanish and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3364 [00:00<?, ? examples/s]

Map:   0%|          | 0/721 [00:00<?, ? examples/s]

Map:   0%|          | 0/722 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Macro,Recall Macro
1,0.9321,0.886999,0.622746,0.521,0.622033,0.543305
2,0.8067,0.818526,0.635229,0.567053,0.586649,0.583428
3,0.7125,0.787537,0.658807,0.600816,0.618871,0.603266
4,0.6791,0.781083,0.654646,0.598024,0.614428,0.601217







Modelo: ALBETO Tiny | Escenario: desbalanceado | Texto: text_tr_noacc
Checkpoint: dccuchile/albert-tiny-spanish
BATCH_SIZE=16 | GRAD_ACC=1 | fp16=False


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at dccuchile/albert-tiny-spanish and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3364 [00:00<?, ? examples/s]

Map:   0%|          | 0/721 [00:00<?, ? examples/s]

Map:   0%|          | 0/722 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Macro,Recall Macro
1,0.9591,0.938842,0.556172,0.433244,0.568509,0.464288
2,0.8198,0.829103,0.653259,0.601518,0.624146,0.606182
3,0.7341,0.803899,0.672677,0.625157,0.661887,0.622003
4,0.6975,0.791612,0.669903,0.627285,0.649527,0.624371







Modelo: MiniLM Multilingual | Escenario: desbalanceado | Texto: text_tr
Checkpoint: microsoft/Multilingual-MiniLM-L12-H384
BATCH_SIZE=16 | GRAD_ACC=1 | fp16=False


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/430 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/471M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/Multilingual-MiniLM-L12-H384 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3364 [00:00<?, ? examples/s]

Map:   0%|          | 0/721 [00:00<?, ? examples/s]

Map:   0%|          | 0/722 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Macro,Recall Macro
1,0.9817,0.937945,0.590846,0.431873,0.387379,0.502209
2,0.8661,0.852394,0.619972,0.465127,0.404815,0.547844
3,0.8023,0.831784,0.632455,0.507176,0.575507,0.566595
4,0.7785,0.840582,0.631068,0.515332,0.584295,0.572001


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])





Modelo: MiniLM Multilingual | Escenario: desbalanceado | Texto: text_tr_noacc
Checkpoint: microsoft/Multilingual-MiniLM-L12-H384
BATCH_SIZE=16 | GRAD_ACC=1 | fp16=False


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/Multilingual-MiniLM-L12-H384 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3364 [00:00<?, ? examples/s]

Map:   0%|          | 0/721 [00:00<?, ? examples/s]

Map:   0%|          | 0/722 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Macro,Recall Macro
1,0.9556,0.889179,0.61165,0.45223,0.400747,0.526363
2,0.8418,0.876755,0.600555,0.479402,0.521535,0.550734
3,0.7812,0.81784,0.63939,0.537617,0.605062,0.577723
4,0.7494,0.814008,0.653259,0.567443,0.628121,0.59673


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])





RESULTADOS FINALES (DESBALANCEADO):
        scenario text_for_model                model  accuracy  f1_macro  \
0  desbalanceado        text_tr        DistilBERT-es  0.759003  0.740672   
1  desbalanceado        text_tr          ALBETO Tiny  0.689751  0.637341   
2  desbalanceado        text_tr  MiniLM Multilingual  0.655125  0.539884   
3  desbalanceado  text_tr_noacc        DistilBERT-es  0.757618  0.733786   
4  desbalanceado  text_tr_noacc          ALBETO Tiny  0.695291  0.650411   
5  desbalanceado  text_tr_noacc  MiniLM Multilingual  0.675900  0.588735   

   precision_macro  recall_macro  auc_macro_ovr status  
0         0.743067      0.738655       0.890446     OK  
1         0.664072      0.637165       0.833209     OK  
2         0.642436      0.592050       0.813958     OK  
3         0.740841      0.729671       0.891670     OK  
4         0.665917      0.650655       0.836964     OK  
5         0.653066      0.615426       0.824920     OK  


Tabla Markdown (guardada): /c

# ✅ CÓDIGO 2 (COLAB) — BALANCEADO

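Es el mismo script que el Código 1, pero aquí se aplica undersampling (todas las clases se reducen al tamaño de la clase minoritaria) antes de hacer el split en train/validation/test.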

In [None]:
# ============================================================
# TFM - NLP Sentimiento Político (Transformers ligeros)
# NOTEBOOK 2/2: BALANCEADO
# Google Colab - transformers==4.44.2
#
# (Mismas reglas que el desbalanceado)
# ============================================================

!pip -q install -U "transformers==4.44.2" datasets evaluate accelerate scikit-learn wordcloud ftfy

import os, re, json, gc, random, zipfile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch

from ftfy import fix_text
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc, roc_auc_score

import evaluate
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, DataCollatorWithPadding,
    set_seed, EarlyStoppingCallback
)
from wordcloud import WordCloud


# ---------------------------
# 1) CONFIG
# ---------------------------
# Seed every RNG used by the script (transformers' set_seed plus Python's
# `random` and NumPy) with the same value.
SEED = 42
set_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

# CSV with the labelled texts; read with pandas in the dataset-loading section.
GITHUB_RAW_URL = "https://raw.githubusercontent.com/danielalva2008/TFM/master/TFM_UNIR/DATASETS/2.-Dataset%20Trabajados/data_extended.csv"

# Scenario tag: appears in console logs, figure file names and run directories.
SCENARIO = "balanceado"
ZIP_NAME = "TFM_Balanceado_outputs.zip"

# Output folders for figures and reports; both are created here if missing.
OUTPUT_DIR = "/content/outputs_balanceado"
FIG_DIR    = os.path.join(OUTPUT_DIR, "figures")
REP_DIR    = os.path.join(OUTPUT_DIR, "reports")
os.makedirs(FIG_DIR, exist_ok=True)
os.makedirs(REP_DIR, exist_ok=True)

# Display name -> Hugging Face checkpoint id.
MODELS = {
    "DistilBERT-es": "dccuchile/distilbert-base-spanish-uncased",
    "ALBETO Tiny": "dccuchile/albert-tiny-spanish",
    "MiniLM Multilingual": "microsoft/Multilingual-MiniLM-L12-H384",
}

# Input columns of the CSV and the integer-label -> class-name mapping.
TEXT_COL  = "full_text"
LABEL_COL = "label"
LABEL_NAME = {0: "negativo", 1: "neutro", 2: "positivo"}
VALID_LABELS = set(LABEL_NAME.keys())

# Bar colour per class id, used by the plotting helpers.
CLASS_COLORS = {0: "red", 1: "gray", 2: "green"}

# Tokenizer truncation length and hold-out fractions for the stratified split.
MAX_LEN = 128
TEST_SIZE = 0.15
VAL_SIZE  = 0.15

# Per-device batch size and gradient-accumulation steps passed to the Trainer.
BATCH_SIZE = 16
GRAD_ACC = 1
# Disabled alternative: derive GRAD_ACC from a target effective batch size.
#TARGET_EFFECTIVE_BATCH = 32
#GRAD_ACC = max(1, TARGET_EFFECTIVE_BATCH // BATCH_SIZE)

# Training schedule; mixed precision is requested only when CUDA is available.
EPOCHS = 4
LR = 2e-5
USE_FP16 = torch.cuda.is_available()
EARLY_STOPPING_PATIENCE = 2

# Normalised text columns kept in the Hugging Face datasets.
TEXT_VARIANTS_FOR_TRAIN = ["text_tr", "text_tr_noacc"]

# Tokens that disqualify a term from the TF-IDF / chi2 keyword rankings.
TFIDF_CHI2_BANNED = {"marca", "numero", "número", "escribe"}


# ---------------------------
# 2) HELPERS
# ---------------------------
def clear_gpu():
    """Best-effort release of Python garbage and cached CUDA memory.

    Never raises: cleanup between runs must not abort the experiment loop.
    """
    try:
        # Collect first: tensors kept alive only by reference cycles are
        # destroyed here, so their memory is back in the caching allocator
        # before empty_cache() is asked to release it.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    except Exception:
        # Deliberate best-effort: a failed cleanup is not worth a crash.
        pass

def save_fig(path):
    """Write the current matplotlib figure to `path` and close it.

    The layout is tightened before saving; the file is rendered at 220 dpi
    with a tight bounding box.
    """
    current = plt.gcf()
    current.tight_layout()
    current.savefig(path, dpi=220, bbox_inches="tight")
    plt.close(current)

def fix_text_utf8(s: str) -> str:
    """Coerce `s` to `str` and run it through ftfy's `fix_text`."""
    as_text = str(s)
    return fix_text(as_text)

def remove_accents(text: str) -> str:
    """Replace accented Spanish vowels (and ü) with their plain letters.

    Both lower- and upper-case forms are mapped; every other character,
    including ñ, is left untouched. The input is coerced with `str()` first.
    """
    plain_table = str.maketrans("áéíóúüÁÉÍÓÚÜ", "aeiouuAEIOUU")
    return str(text).translate(plain_table)

# Ordered (regex, replacement) rules that expand chat abbreviations and
# Peruvian slang. Rules are applied top to bottom by apply_slang(), and the
# patterns are written for lower-cased text.
SLANG = [
    (r"\bmrd\b", "mierda"),
    (r"\bd\b", "de"),
    (r"\b[qk]\b", "que"),
    (r"\bxo\b", "pero"),
    (r"\bxa\b", "para"),
    (r"\b[xp]q\b", "porque"),
    (r"\bes[qk]\b", "es que"),
    (r"\bfvr\b", "favor"),
    (r"\b(xfa|xf|pf|plis|pls|porfa)\b", "por favor"),
    (r"\bdnd\b", "donde"),
    (r"\btb\b", "también"),
    (r"\b(tq|tk)\b", "te quiero"),
    (r"\b(tqm|tkm)\b", "te quiero mucho"),
    (r"\bx\b", "por"),
    # Padded with spaces: "+" usually sits directly between two tokens
    # ("bueno+bonito"), and an unpadded "mas" would fuse them into one word.
    # Callers collapse repeated whitespace afterwards.
    (r"\+", " mas "),
    (r"\bpiña\b", "mala suerte"),
    (r"\bagarre\b", "adulterio"),
    (r"\bampay\b", "verguenza"),
    (r"\bbacan\b", "alegria"),
    (r"\bbamba\b", "falsificado"),
    (r"\bcabeceador\b", "ladron"),
    (r"\bcabro\b", "homosexual"),
    (r"\bcachaciento\b", "burlon"),
    (r"\bcalabacita\b", "tonta"),
    (r"\bcaleta\b", "secreto"),
    (r"\bcana\b", "carcel"),
    (r"\bchucha\b", "molestia"),
    (r"\bchoro\b", "ladron"),
    (r"\bconchán\b", "conchudo"),
    (r"\bcutra\b", "ilicito"),
    (r"\bdark\b", "horrible"),
    (r"\blenteja\b", "torpe"),
    (r"\blorna\b", "tonto"),
    (r"\bmancar\b", "morir"),
    (r"\bmonse\b", "tonto"),
    (r"\bpiñata\b", "mala suerte"),
]

def apply_slang(t: str) -> str:
    """Apply every SLANG rule to `t`, in table order, and return the result.

    The "+" rule may leave extra spaces around "mas"; no whitespace
    normalisation is done here.
    """
    for pat, rep in SLANG:
        t = re.sub(pat, rep, t)
    return t

# Stop-word list used by eda_aggressive() to drop uninformative tokens from
# the EDA text. Mostly Spanish, plus single letters and a few English and
# chat tokens.
# NOTE(review): many entries are accented ('también', 'además', ...); any
# consumer that strips accents before the lookup must account for that.
# NOTE(review): 'por qué' contains a space, so it can never equal a single
# whitespace-split token.
stop_words = set([
    'a', 'adelante', 'además', 'afirmó', 'agregó', 'ahi', 'ahora', 'ahí', 'al',
    'algo', 'alguna', 'algunas', 'alguno', 'algunos', 'algún', 'alrededor', 'ambos',
    'ampleamos', 'ante', 'anterior', 'antes', 'apenas', 'aproximadamente', 'aquel',
    'aquellas', 'aquellos', 'aqui', 'aquí', 'arriba', 'as', 'aseguró', 'asi', 'así',
    'atras', 'aunque', 'aún', 'ayer', 'b', 'bajo', 'bastante', 'bien', 'buen',
    'buena', 'buenas', 'bueno', 'buenos', 'c', 'cada', 'casi', 'cc', 'cerca',
    'cierta', 'ciertas', 'cierto', 'ciertos', 'cinco', 'comentó', 'como', 'con',
    'conocer', 'conseguimos', 'conseguir', 'considera', 'consideró', 'consigue',
    'consiguen', 'consigues', 'consigo', 'contra', 'cosas', 'creo', 'cual',
    'cuales', 'cualquier', 'cuando', 'cuanto', 'cuatro', 'cuenta', 'cómo', 'd',
    'da', 'dado', 'dan', 'dar', 'de', 'debe', 'deben', 'debido', 'decir', 'dejó',
    'del', 'demás', 'dentro', 'desde', 'después', 'dia', 'dice', 'dicen', 'dicho',
    'dieron', 'diferente', 'diferentes', 'dijeron', 'dijo', 'dio', 'do', 'don',
    'donde', 'dos', 'durante', 'e', 'ejemplo', 'el', 'ella', 'ellas', 'ello',
    'ellos', 'embargo', 'empleais', 'emplean', 'emplear', 'empleas', 'empleo',
    'en', 'encima', 'encuentra', 'entonces', 'entre', 'era', 'eramos', 'eran',
    'eras', 'eres', 'es', 'esa', 'esas', 'ese', 'eso', 'esos', 'esta', 'estaba',
    'estaban', 'estado', 'estais', 'estamos', 'estan', 'estar', 'estará', 'estas',
    'este', 'esto', 'estos', 'estoy', 'estuvo', 'ex', 'existe', 'existen',
    'explicó', 'expresó', 'f', 'fin', 'fue', 'fuera', 'fueron', 'fui', 'fuimos',
    'g', 'gran', 'grandes', 'gueno', 'h', 'ha', 'haber', 'había', 'habían',
    'habrá', 'hace', 'haceis', 'hacemos', 'hacen', 'hacer', 'hacerlo', 'haces',
    'hacia', 'haciendo', 'hago', 'han', 'has', 'hasta', 'hay', 'haya', 'he',
    'hecho', 'hemos', 'hicieron', 'hizo', 'hoy', 'hubo', 'i', 'igual', 'incluso',
    'indicó', 'informó', 'intenta', 'intentais', 'intentamos', 'intentan',
    'intentar', 'intentas', 'intento', 'ir', 'is', 'j', 'junto', 'k', 'l', 'la',
    'lado', 'largo', 'las', 'le', 'les', 'llegó', 'lleva', 'llevar', 'lo', 'los',
    'luego', 'lugar', 'm', 'manera', 'manifestó', 'mas', 'mayor', 'me', 'mediante',
    'mejor', 'mencionó', 'menos', 'mi', 'mientras', 'mio', 'mis', 'misma',
    'mismas', 'mismo', 'mismos', 'mo', 'modo', 'momento', 'mucha', 'muchas',
    'mucho', 'muchos', 'muy', 'más', 'n', 'nada', 'nadie', 'ni', 'ninguna',
    'ningunas', 'ninguno', 'ningunos', 'ningún', 'nos', 'nosotras', 'nosotros',
    'nuestra', 'nuestras', 'nuestro', 'nuestros', 'nueva', 'nuevas', 'nuevo',
    'nuevos', 'nunca', 'o', 'ocho', 'of', 'otra', 'otras', 'otro', 'otros',
    'p', 'pa', 'para', 'parece', 'parte', 'partir', 'pasada', 'pasado', 'pero',
    'pesar', 'poca', 'pocas', 'poco', 'pocos', 'podeis', 'podemos', 'poder',
    'podra', 'podrán', 'podria', 'podriais', 'podriamos', 'podrian', 'podrias',
    'podría', 'podrían', 'poner', 'por', 'porque', 'por qué', 'posible', 'pq',
    'primer', 'primera', 'primero', 'primeros', 'principalmente', 'propia',
    'propias', 'propio', 'propios', 'próximo', 'próximos', 'pt', 'pudo', 'pueda',
    'puede', 'pueden', 'puedo', 'pues', 'q', 'qie', 'qu', 'que', 'quedó',
    'queremos', 'quien', 'quienes', 'quiere', 'qué', 'quién', 'r', 'realizado',
    'realizar', 'realizó', 'respecto', 's', 'sabe', 'sabeis', 'sabemos', 'saben',
    'saber', 'sabes', 'se', 'sea', 'sean', 'segunda', 'segundo', 'según', 'seis',
    'sera', 'será', 'serán', 'sería', 'si', 'sido', 'siempre', 'siendo', 'siete',
    'sigue', 'siguiente', 'sin', 'sino', 'sobre', 'sois', 'sola', 'solamente',
    'solas', 'solo', 'solos', 'somos', 'son', 'soy', 'su', 'sua', 'sus', 'sí',
    'sólo', 'señaló', 't', 'tal', 'también', 'tampoco', 'tan', 'tanto', 'te',
    'tendrá', 'tendrán', 'teneis', 'tenemos', 'tener', 'tenga', 'tengo', 'tenía',
    'tenido', 'tercera', 'the', 'ti', 'tiempo', 'tiene', 'tienen', 'to', 'toda',
    'todas', 'todavía', 'todo', 'todos', 'total', 'trabaja', 'trabajais',
    'trabajamos', 'trabajan', 'trabajar', 'trabajas', 'trabajo', 'tras',
    'trata', 'través', 'tres', 'tu', 'tus', 'tuvo', 'tuyo', 'u', 'un', 'una',
    'unas', 'uno', 'unos', 'us', 'usa', 'usais', 'usamos', 'usan', 'usar',
    'usas', 'uso', 'usted', 'v', 'va', 'vais', 'valor', 'vamos', 'van',
    'varias', 'varios', 'vaya', 'veces', 'ven', 'ver', 'verdad', 'verdadera',
    'verdadero', 'vez', 'via', 'vosotras', 'vosotros', 'voy', 'w', 'x', 'y',
    'ya', 'yo', 'z', 'á', 'ésta', 'éste', 'éstas', 'éstos', 'última', 'últimas',
    'ultimo', 'último', 'últimos'
])

def normalize_for_transformer(text: str, remove_tilde: bool = False) -> str:
    """Light normalisation used to build the text columns fed to the models.

    Steps: repair encoding and lower-case; blank out URLs, @mentions and
    #hashtags; expand slang; keep only Spanish letters and whitespace;
    collapse whitespace.

    Args:
        text: Raw input text.
        remove_tilde: When true, accents are additionally stripped and the
            result is restricted to a-z, ñ and whitespace.

    Returns:
        The normalised string.
    """
    cleaned = fix_text_utf8(text).lower()
    for noise_pattern in (r"http\S+|www\.\S+", r"@\w+", r"#\w+"):
        cleaned = re.sub(noise_pattern, " ", cleaned)
    cleaned = apply_slang(cleaned)
    cleaned = re.sub(r"[^a-záéíóúñü\s]", " ", cleaned)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()
    if not remove_tilde:
        return cleaned

    unaccented = remove_accents(cleaned)
    unaccented = re.sub(r"[^a-zñ\s]", " ", unaccented)
    return re.sub(r"\s+", " ", unaccented).strip()

def _stop_words_for_eda():
    """Return `stop_words` plus their accent-stripped spellings.

    Built on first use and cached on the function object; later changes to
    `stop_words` are not picked up until this function is redefined.
    """
    cached = getattr(_stop_words_for_eda, "_cached", None)
    if cached is None:
        cached = frozenset(stop_words) | frozenset(remove_accents(w) for w in stop_words)
        _stop_words_for_eda._cached = cached
    return cached

def eda_aggressive(text: str) -> str:
    """Heavy cleaning for exploratory analysis (word clouds, TF-IDF, chi2).

    Steps: repair encoding and lower-case; expand slang; strip accents; blank
    out URLs, @mentions and #hashtags; keep only a-z, ñ and whitespace; drop
    tokens of one or two characters and stop words.

    Args:
        text: Raw input text.

    Returns:
        The surviving tokens joined by single spaces (may be empty).
    """
    t = fix_text_utf8(text).lower()
    t = apply_slang(t)
    t = remove_accents(t)
    t = re.sub(r"http\S+|www\.\S+", " ", t)
    t = re.sub(r"@\w+", " ", t)
    t = re.sub(r"#\w+", " ", t)
    t = re.sub(r"[^a-zñ\s]", " ", t)
    t = re.sub(r"\s+", " ", t).strip()
    # Tokens no longer carry accents here, so they must be compared against
    # accent-stripped stop words; otherwise 'tambien', 'ademas', 'despues', ...
    # (listed only with accents) would slip through the filter.
    banned = _stop_words_for_eda()
    toks = [w for w in t.split() if (len(w) > 2 and w not in banned)]
    return " ".join(toks)

def term_allowed_for_tfidf(term: str) -> bool:
    """Return True when no whitespace-separated word of `term` is banned.

    `term` may be an n-gram; each of its words is checked against
    TFIDF_CHI2_BANNED.
    """
    return TFIDF_CHI2_BANNED.isdisjoint(term.split())


# ---------------------------
# 3) CARGA DATASET
# ---------------------------
# Download the CSV and keep only the text and label columns.
print("📥 Leyendo CSV desde GitHub RAW ...")
df = pd.read_csv(GITHUB_RAW_URL)

df = df[[TEXT_COL, LABEL_COL]].copy()
# Missing texts become "", everything is stripped, and empty rows are dropped.
df[TEXT_COL] = df[TEXT_COL].fillna("").astype(str).str.strip()
df = df[df[TEXT_COL].str.len() > 0]

# Labels that cannot be parsed as numbers become NaN and are dropped; the
# remaining ones are cast to int and restricted to the known class ids.
df[LABEL_COL] = pd.to_numeric(df[LABEL_COL], errors="coerce")
df = df.dropna(subset=[LABEL_COL])
df[LABEL_COL] = df[LABEL_COL].astype(int)
df = df[df[LABEL_COL].isin(VALID_LABELS)].reset_index(drop=True)

# De-duplicate on the raw text.
# NOTE(review): texts that differ only in URLs/mentions/punctuation can still
# collapse to the same normalised string after this step.
df = df.drop_duplicates(subset=[TEXT_COL]).reset_index(drop=True)

# Derived columns: two model inputs (with / without accents), the EDA text,
# and the human-readable class name.
df["text_tr"] = df[TEXT_COL].apply(lambda x: normalize_for_transformer(x, remove_tilde=False))
df["text_tr_noacc"] = df[TEXT_COL].apply(lambda x: normalize_for_transformer(x, remove_tilde=True))
df["text_eda"] = df[TEXT_COL].apply(eda_aggressive)
df["label_name"] = df[LABEL_COL].map(LABEL_NAME)

print("✅ Shape final:", df.shape)
print("Distribución labels (ORIGINAL):")
print(df[LABEL_COL].value_counts().sort_index())

# `display` is the notebook built-in; it renders the preview as a table.
print("\n🔎 EJEMPLO (raw vs text_tr vs text_tr_noacc):")
display(df[[TEXT_COL, "text_tr", "text_tr_noacc", LABEL_COL, "label_name"]].head(8))


# ---------------------------
# 4) BALANCEO (undersampling)
# ---------------------------
def make_balanced_undersampling(df_in: pd.DataFrame, label_col: str, seed=42) -> pd.DataFrame:
    """Undersample every class down to the size of the smallest one.

    Args:
        df_in: Input frame.
        label_col: Name of the class column.
        seed: Random state used both for the per-class draws and the final shuffle.

    Returns:
        A shuffled frame with the same number of rows per class and a fresh
        0..n-1 index.
    """
    class_sizes = df_in[label_col].value_counts()
    target = class_sizes.min()
    per_class = [
        df_in.loc[df_in[label_col] == cls].sample(n=target, random_state=seed)
        for cls in sorted(class_sizes.index)
    ]
    shuffled = pd.concat(per_class).sample(frac=1, random_state=seed)
    return shuffled.reset_index(drop=True)

# Balance the full de-duplicated dataset; the train/val/test split is taken
# from this balanced frame.
df_bal = make_balanced_undersampling(df, LABEL_COL, seed=SEED)

print("\nDistribución labels (BALANCEADO):")
print(df_bal[LABEL_COL].value_counts().sort_index())

# `display` is the notebook built-in; it renders the preview as a table.
print("\n📌 HEAD BALANCEADO:")
display(df_bal[[TEXT_COL, "text_tr", "text_tr_noacc", LABEL_COL, "label_name"]].head(8))


# ---------------------------
# 5) SPLITS (BALANCEADO)
# ---------------------------
def split_stratified(df_in: pd.DataFrame, label_col: str, test_size=0.15, val_size=0.15, seed=42):
    """Split `df_in` into train / validation / test, stratified by label.

    A hold-out of `test_size + val_size` is carved off first and then divided
    between validation and test, again stratified.

    Args:
        df_in: Frame to split.
        label_col: Column used for stratification.
        test_size: Fraction of the whole frame assigned to test.
        val_size: Fraction of the whole frame assigned to validation.
        seed: Random state for both splits.

    Returns:
        `(train, validation, test)` frames, each with a reset index.
    """
    holdout_frac = test_size + val_size
    df_train, holdout = train_test_split(
        df_in,
        test_size=holdout_frac,
        stratify=df_in[label_col],
        random_state=seed,
    )
    df_val, df_test = train_test_split(
        holdout,
        test_size=test_size / holdout_frac,
        stratify=holdout[label_col],
        random_state=seed,
    )
    return tuple(part.reset_index(drop=True) for part in (df_train, df_val, df_test))

# Stratified train / validation / test frames drawn from the balanced data.
train_df, val_df, test_df = split_stratified(df_bal, LABEL_COL, TEST_SIZE, VAL_SIZE, SEED)


# ---------------------------
# 6) EDA FIGURES (distribución + comparación)
# ---------------------------
def plot_class_distribution(df_in: pd.DataFrame, title: str, fname: str):
    """Save a bar chart with the number of rows per class.

    Args:
        df_in: Frame containing the LABEL_COL column.
        title: Chart title.
        fname: File name, written inside FIG_DIR.
    """
    per_class = df_in[LABEL_COL].value_counts().sort_index()
    class_ids = list(per_class.index)
    tick_names = [LABEL_NAME[c].capitalize() for c in class_ids]
    bar_colors = [CLASS_COLORS.get(c, "blue") for c in class_ids]

    plt.figure(figsize=(6,4))
    plt.bar(tick_names, per_class.values, color=bar_colors)
    plt.title(title)
    plt.xlabel("Etiqueta")
    plt.ylabel("Número de tuits")
    target_path = os.path.join(FIG_DIR, fname)
    save_fig(target_path)

def plot_class_comparison(df_orig: pd.DataFrame, df_balanced: pd.DataFrame, title: str, fname: str):
    """Save a grouped bar chart comparing class counts before and after balancing.

    Classes 0, 1 and 2 are always shown; a class missing from a frame counts
    as zero.

    Args:
        df_orig: Original frame.
        df_balanced: Balanced frame.
        title: Chart title.
        fname: File name, written inside FIG_DIR.
    """
    class_ids = [0,1,2]
    tick_names = [LABEL_NAME[c].capitalize() for c in class_ids]
    positions = np.arange(len(tick_names))
    bar_width = 0.35
    half = bar_width/2

    before = df_orig[LABEL_COL].value_counts().reindex(class_ids, fill_value=0)
    after = df_balanced[LABEL_COL].value_counts().reindex(class_ids, fill_value=0)

    plt.figure(figsize=(8,5))
    plt.bar(positions - half, before.values, bar_width, label="Original")
    plt.bar(positions + half, after.values, bar_width, label="Balanceado")
    plt.title(title)
    plt.xlabel("Etiqueta")
    plt.ylabel("Número de tuits")
    plt.xticks(positions, tick_names)
    plt.legend()
    save_fig(os.path.join(FIG_DIR, fname))

# Class-distribution figures: original data, balanced data, and both side by side.
plot_class_distribution(df, "Distribución de clases (original)", "01_dist_original.png")
plot_class_distribution(df_bal, "Distribución de clases (balanceado)", "02_dist_balanceado.png")
plot_class_comparison(df, df_bal, "Comparación de distribución de clases", "03_comp_dist_original_vs_balanceado.png")

def plot_wordclouds(df_in: pd.DataFrame, text_col: str, prefix: str):
    """Save one word cloud per class (0, 1, 2) built from `text_col`.

    Classes whose concatenated text is empty are skipped.

    Args:
        df_in: Frame containing LABEL_COL and `text_col`.
        text_col: Column whose texts are concatenated per class.
        prefix: Text placed at the start of each figure title.
    """
    for lab in [0,1,2]:
        class_texts = df_in.loc[df_in[LABEL_COL]==lab, text_col].astype(str).tolist()
        corpus = " ".join(class_texts).strip()
        if corpus:
            cloud = WordCloud(width=1200, height=600, background_color="white").generate(corpus)
            plt.figure(figsize=(12,6))
            plt.imshow(cloud, interpolation="bilinear")
            plt.axis("off")
            plt.title(f"{prefix} - WordCloud ({LABEL_NAME[lab]})")
            out_name = f"04_{SCENARIO}_wordcloud_{lab}.png"
            save_fig(os.path.join(FIG_DIR, out_name))

# Word clouds are built from the aggressively cleaned EDA text of the balanced data.
plot_wordclouds(df_bal, "text_eda", "Dataset balanceado (EDA)")

def tfidf_top_by_class(df_in: pd.DataFrame, text_col="text_eda", top_n=20, ngram_range=(1,2), min_df=3):
    """Rank terms by mean TF-IDF weight within each class (0, 1, 2).

    A single TfidfVectorizer is fitted on all rows; for each class the
    columns are averaged over that class's rows and sorted in descending
    order. Terms rejected by term_allowed_for_tfidf() are skipped.

    Args:
        df_in: Frame containing LABEL_COL and `text_col`.
        text_col: Column with the texts to vectorise.
        top_n: Maximum number of terms kept per class.
        ngram_range: Passed to TfidfVectorizer.
        min_df: Passed to TfidfVectorizer.

    Returns:
        dict mapping class id -> list of `(term, mean_tfidf)` pairs. A class
        with no rows maps to an empty list.
    """
    texts = df_in[text_col].astype(str).tolist()
    y = df_in[LABEL_COL].astype(int).values
    vec = TfidfVectorizer(lowercase=True, ngram_range=ngram_range, min_df=min_df)
    X = vec.fit_transform(texts)
    terms = np.array(vec.get_feature_names_out())
    out = {}
    for lab in [0,1,2]:
        idx = np.where(y==lab)[0]
        if idx.size == 0:
            # Averaging zero rows is undefined; report "no keywords" instead,
            # which the plotting helpers already treat as "skip this class".
            out[lab] = []
            continue
        mean_tfidf = np.asarray(X[idx].mean(axis=0)).ravel()
        order = np.argsort(mean_tfidf)[::-1]
        picked = []
        for j in order:
            term = terms[j]
            if term_allowed_for_tfidf(term):
                picked.append((term, float(mean_tfidf[j])))
            if len(picked) >= top_n:
                break
        out[lab] = picked
    return out

def chi2_top_by_class(df_in: pd.DataFrame, text_col="text_eda", top_n=20, ngram_range=(1,2), min_df=3):
    """Rank terms by one-vs-rest chi-squared score for each class (0, 1, 2).

    A single TfidfVectorizer is fitted on all rows; for each class, chi2 is
    computed between the TF-IDF matrix and the binary "is this class" target,
    and terms are sorted by descending score. Terms rejected by
    term_allowed_for_tfidf() are skipped.

    Args:
        df_in: Frame containing LABEL_COL and `text_col`.
        text_col: Column with the texts to vectorise.
        top_n: Maximum number of terms kept per class.
        ngram_range: Passed to TfidfVectorizer.
        min_df: Passed to TfidfVectorizer.

    Returns:
        dict mapping class id -> list of `(term, chi2_score)` pairs. A class
        that is absent, or that covers every row, maps to an empty list.
    """
    texts = df_in[text_col].astype(str).tolist()
    y = df_in[LABEL_COL].astype(int).values
    vec = TfidfVectorizer(lowercase=True, ngram_range=ngram_range, min_df=min_df)
    X = vec.fit_transform(texts)
    terms = np.array(vec.get_feature_names_out())
    out = {}
    for lab in [0,1,2]:
        is_class = (y==lab)
        if is_class.all() or not is_class.any():
            # With a constant target there is nothing to contrast against and
            # the chi2 scores are undefined (NaN would sort to the top after
            # reversing the argsort), so report no keywords for this class.
            out[lab] = []
            continue
        scores, _ = chi2(X, is_class.astype(int))
        order = np.argsort(scores)[::-1]
        picked = []
        for j in order:
            term = terms[j]
            if term_allowed_for_tfidf(term):
                picked.append((term, float(scores[j])))
            if len(picked) >= top_n:
                break
        out[lab] = picked
    return out

def plot_kw_dict(kw_dict, title_prefix, fname_prefix):
    """Save one keyword bar chart per class (0, 1, 2).

    Args:
        kw_dict: Mapping class id -> list of `(term, score)` pairs; classes
            that are missing or empty are skipped.
        title_prefix: Start of each figure title; the class name is appended.
        fname_prefix: Start of each file name; `_<class id>.png` is appended.
    """
    for lab in [0,1,2]:
        ranked = kw_dict.get(lab, [])
        if ranked:
            bar_labels = [term for term, _ in ranked]
            bar_heights = [score for _, score in ranked]
            plt.figure(figsize=(7,4))
            plt.bar(bar_labels, bar_heights, color=CLASS_COLORS.get(lab, "blue"))
            plt.title(f"{title_prefix} ({LABEL_NAME[lab]})")
            plt.xticks(rotation=75, ha="right")
            out_name = f"{fname_prefix}_{lab}.png"
            save_fig(os.path.join(FIG_DIR, out_name))

# Top-20 keyword charts per class on the balanced data: first ranked by mean
# TF-IDF, then by chi-squared score computed on the TF-IDF features.
kw_tfidf = tfidf_top_by_class(df_bal, "text_eda", top_n=20)
plot_kw_dict(kw_tfidf, "TF-IDF Top 20 - balanceado", "05_balanceado_tfidf_top20")
kw_chi2 = chi2_top_by_class(df_bal, "text_eda", top_n=20)
plot_kw_dict(kw_chi2, "TF-IDF + χ² Top 20 - balanceado", "06_balanceado_chi2_top20")


# ---------------------------
# 7) HF DATASETS (solo columnas necesarias)
# ---------------------------
def to_hf_dataset(df_train, df_val, df_test):
    """Wrap the three pandas splits in a Hugging Face DatasetDict.

    Only the label column and the columns listed in TEXT_VARIANTS_FOR_TRAIN
    are carried over.

    Returns:
        DatasetDict with the keys `train`, `validation` and `test`.
    """
    wanted = [LABEL_COL] + TEXT_VARIANTS_FOR_TRAIN
    frames = {"train": df_train, "validation": df_val, "test": df_test}
    return DatasetDict({
        split_name: Dataset.from_pandas(frame[wanted].reset_index(drop=True))
        for split_name, frame in frames.items()
    })

# Hugging Face view of the three splits, shared by every training run.
ds = to_hf_dataset(train_df, val_df, test_df)


# ---------------------------
# 8) MÉTRICAS
# ---------------------------
# Metric objects are loaded once at module level and reused by compute_metrics().
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 / precision / recall.

    Args:
        eval_pred: Pair `(logits, labels)`. The predicted class is the argmax
            of the logits over the last axis.

    Returns:
        dict with the keys `accuracy`, `f1_macro`, `precision_macro` and
        `recall_macro`.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_metric.compute(predictions=preds, references=labels)["accuracy"],
        "f1_macro": f1_metric.compute(predictions=preds, references=labels, average="macro")["f1"],
        # zero_division=0: a class that is never predicted (or never present)
        # still contributes 0 to the macro average, as it did by default, but
        # without emitting an "ill-defined" warning on every evaluation.
        "precision_macro": precision_metric.compute(
            predictions=preds, references=labels, average="macro", zero_division=0
        )["precision"],
        "recall_macro": recall_metric.compute(
            predictions=preds, references=labels, average="macro", zero_division=0
        )["recall"],
    }


# ---------------------------
# 9) CM + ROC
# ---------------------------
def plot_confusion_matrix_blue(cm, title, fname):
    """Save a 3x3 confusion-matrix heat map (blue colour map) with cell counts.

    Args:
        cm: Confusion matrix indexed as `cm[real, predicted]`.
        title: Figure title.
        fname: File name, written inside FIG_DIR.
    """
    class_names = [LABEL_NAME[i] for i in [0,1,2]]
    ticks = np.arange(3)

    plt.figure(figsize=(6,5))
    plt.imshow(cm, interpolation="nearest", cmap="Blues")
    plt.title(title)
    plt.colorbar()
    plt.xticks(ticks, class_names)
    plt.yticks(ticks, class_names)

    # Cells darker than half of the maximum get white text so they stay readable.
    cutoff = cm.max()/2.0 if cm.max()>0 else 0.5
    for row, col in np.ndindex(3, 3):
        cell = cm[row, col]
        text_color = "white" if cell > cutoff else "black"
        plt.text(col, row, int(cell),
                 ha="center", va="center",
                 color=text_color,
                 fontsize=12, fontweight="bold")
    plt.ylabel("Real")
    plt.xlabel("Predicción")
    save_fig(os.path.join(FIG_DIR, fname))

def plot_multiclass_roc(y_true, y_proba, title, fname):
    """Save one-vs-rest ROC curves for classes 0, 1, 2 plus the micro-average.

    Args:
        y_true: True class ids.
        y_proba: Per-class scores; column `i` is used for class `i`.
        title: Figure title.
        fname: File name, written inside FIG_DIR.
    """
    classes = [0,1,2]
    onehot = label_binarize(y_true, classes=classes)

    plt.figure(figsize=(7,6))
    for col, lab in enumerate(classes):
        class_fpr, class_tpr, _ = roc_curve(onehot[:, col], y_proba[:, col])
        class_auc = auc(class_fpr, class_tpr)
        plt.plot(class_fpr, class_tpr, label=f"{LABEL_NAME[lab]} (AUC={class_auc:.3f})")

    # Micro-average: every (sample, class) pair is treated as one binary decision.
    micro_fpr, micro_tpr, _ = roc_curve(onehot.ravel(), y_proba.ravel())
    micro_auc = auc(micro_fpr, micro_tpr)
    plt.plot(micro_fpr, micro_tpr, linestyle="--", label=f"micro-avg (AUC={micro_auc:.3f})")

    # Chance diagonal.
    plt.plot([0,1],[0,1], linestyle=":", linewidth=1)
    plt.xlim([0.0,1.0])
    plt.ylim([0.0,1.05])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title(title)
    plt.legend(loc="lower right")
    save_fig(os.path.join(FIG_DIR, fname))


# ---------------------------
# 10) LOAD + TOKENIZE (ALBETO safe)
# ---------------------------
def load_tokenizer_and_model(model_key: str, model_ckpt: str):
    """Load the tokenizer and a 3-label classification model for a checkpoint.

    The slow tokenizer is tried first for checkpoints whose id contains
    "albert", the fast one first otherwise; if the first attempt raises, the
    other tokenizer flavour is tried.

    Args:
        model_key: Display name of the model (not used inside this function).
        model_ckpt: Hugging Face checkpoint id.

    Returns:
        `(tokenizer, model, use_fast, "OK")` on success, or
        `(None, None, None, "FAIL: <last error>")` when both attempts fail.
    """
    prefer_fast = "albert" not in model_ckpt.lower()
    failure = None
    for use_fast in (prefer_fast, not prefer_fast):
        try:
            tokenizer = AutoTokenizer.from_pretrained(model_ckpt, use_fast=use_fast)
            model = AutoModelForSequenceClassification.from_pretrained(
                model_ckpt,
                num_labels=3,
                id2label={i: LABEL_NAME[i] for i in [0,1,2]},
                label2id={LABEL_NAME[i]: i for i in [0,1,2]},
            )
        except Exception as exc:
            # Remember the error and fall through to the other tokenizer flavour.
            failure = exc
        else:
            return tokenizer, model, use_fast, "OK"
    return None, None, None, f"FAIL: {failure}"

def tokenize_dataset(ds: DatasetDict, tokenizer, text_col: str):
    """Tokenize `text_col` of every split, truncating to MAX_LEN tokens.

    Padding is not applied here.

    Returns:
        The result of `ds.map(...)` run in batched mode.
    """
    def encode_batch(examples):
        batch_texts = examples[text_col]
        return tokenizer(batch_texts, truncation=True, max_length=MAX_LEN)

    tokenized = ds.map(encode_batch, batched=True)
    return tokenized


# ---------------------------
# 11) TRAIN / EVAL
# ---------------------------
def train_and_eval(model_name_key: str, model_ckpt: str, ds: DatasetDict, text_for_model: str):
    """Fine-tune one checkpoint on one text column and score it on the test split.

    Steps: load tokenizer and model, tokenize ``ds``, train with per-epoch
    evaluation and early stopping (the best checkpoint by ``f1_macro`` is
    reloaded at the end), run the test split once, then write
    ``test_metrics.json``, ``classification_report.txt``, a confusion-matrix
    figure and a ROC figure.

    Args:
        model_name_key: Display name of the model; used in logs, file names
            and the returned row.
        model_ckpt: Checkpoint identifier passed to ``load_tokenizer_and_model``.
        ds: Dataset with ``"train"``, ``"validation"`` and ``"test"`` splits.
        text_for_model: Name of the text column to tokenize.

    Returns:
        A dict with keys ``scenario``, ``text_for_model``, ``model``,
        ``accuracy``, ``f1_macro``, ``precision_macro``, ``recall_macro``,
        ``auc_macro_ovr``, ``tokenizer_use_fast`` and ``status``. When the
        tokenizer/model cannot be loaded, every metric is NaN and ``status``
        carries the failure message.
    """
    clear_gpu()
    print("\n==============================")
    print(f"Modelo: {model_name_key} | Escenario: {SCENARIO} | Texto: {text_for_model}")
    print(f"BATCH_SIZE={BATCH_SIZE} | GRAD_ACC={GRAD_ACC} | fp16={USE_FP16}")
    print("==============================")

    tokenizer, model, used_fast, status = load_tokenizer_and_model(model_name_key, model_ckpt)
    if tokenizer is None:
        # Loading failed with both tokenizer kinds: report a NaN row so the
        # remaining model/text combinations can still run.
        print("❌ No se pudo cargar:", status)
        return {
            "scenario": SCENARIO, "text_for_model": text_for_model, "model": model_name_key,
            "accuracy": np.nan, "f1_macro": np.nan, "precision_macro": np.nan, "recall_macro": np.nan,
            "auc_macro_ovr": np.nan, "tokenizer_use_fast": None, "status": status
        }

    ds_tok = tokenize_dataset(ds, tokenizer, text_for_model)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    run_dir = os.path.join(REP_DIR, f"{SCENARIO}_{text_for_model}_{model_name_key}".replace(" ","_"))
    os.makedirs(run_dir, exist_ok=True)

    args = TrainingArguments(
        output_dir=run_dir,
        # `eval_strategy` replaces the deprecated `evaluation_strategy`
        # (available in the pinned transformers 4.44.2).
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        greater_is_better=True,
        learning_rate=LR,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRAD_ACC,
        num_train_epochs=EPOCHS,
        weight_decay=0.01,
        logging_steps=50,
        report_to="none",
        seed=SEED,
        fp16=USE_FP16,
        remove_unused_columns=True
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=ds_tok["train"],
        eval_dataset=ds_tok["validation"],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=EARLY_STOPPING_PATIENCE)]
    )

    trainer.train()

    # Single pass over the test split: predict() returns the logits, the
    # labels and the metrics. The "eval" prefix keeps the `eval_*` key names
    # that the JSON report and the returned row rely on.
    preds_out = trainer.predict(ds_tok["test"], metric_key_prefix="eval")
    test_metrics = preds_out.metrics

    y_true = preds_out.label_ids
    logits = preds_out.predictions
    y_pred = np.argmax(logits, axis=-1)

    report = classification_report(
        y_true, y_pred,
        target_names=[LABEL_NAME[i] for i in [0,1,2]],
        digits=4
    )

    with open(os.path.join(run_dir, "test_metrics.json"), "w", encoding="utf-8") as f:
        json.dump(test_metrics, f, ensure_ascii=False, indent=2)
    with open(os.path.join(run_dir, "classification_report.txt"), "w", encoding="utf-8") as f:
        f.write(report)

    cm = confusion_matrix(y_true, y_pred, labels=[0,1,2])
    cm_fname = f"cm_{SCENARIO}_{text_for_model}_{model_name_key}.png".replace(" ","_")
    plot_confusion_matrix_blue(cm, f"Matriz de confusión - {model_name_key} ({SCENARIO}) [{text_for_model}]", cm_fname)

    # Softmax turns the logits into per-class probabilities for ROC/AUC.
    probs = torch.softmax(torch.tensor(logits), dim=-1).numpy()
    roc_fname = f"roc_{SCENARIO}_{text_for_model}_{model_name_key}.png".replace(" ","_")
    plot_multiclass_roc(y_true, probs, f"ROC - {model_name_key} ({SCENARIO}) [{text_for_model}]", roc_fname)

    try:
        y_bin = label_binarize(y_true, classes=[0,1,2])
        auc_macro = roc_auc_score(y_bin, probs, average="macro", multi_class="ovr")
    except Exception as err:
        # Best-effort metric: keep the run alive, but say why AUC is missing
        # instead of silently writing NaN.
        print(f"[WARN] auc_macro_ovr could not be computed: {err}")
        auc_macro = np.nan

    return {
        "scenario": SCENARIO,
        "text_for_model": text_for_model,
        "model": model_name_key,
        # The eval_* entries are expected to come from compute_metrics;
        # missing keys fall back to NaN.
        "accuracy": float(test_metrics.get("eval_accuracy", np.nan)),
        "f1_macro": float(test_metrics.get("eval_f1_macro", np.nan)),
        "precision_macro": float(test_metrics.get("eval_precision_macro", np.nan)),
        "recall_macro": float(test_metrics.get("eval_recall_macro", np.nan)),
        "auc_macro_ovr": float(auc_macro),
        "tokenizer_use_fast": bool(used_fast),
        "status": "OK"
    }

# Train every (model, text column) combination. Rows are appended one at a
# time so completed runs stay in `results` if a later run is interrupted.
results = []
for model_name_key in MODELS:
    ckpt = MODELS[model_name_key]
    for text_for_model in TEXT_VARIANTS_FOR_TRAIN:
        run_row = train_and_eval(model_name_key, ckpt, ds, text_for_model)
        results.append(run_row)

# Summary table: grouped by text variant, best macro-F1 first within each group.
res_df = (
    pd.DataFrame(results)
    .sort_values(["text_for_model", "f1_macro"], ascending=[True, False])
    .reset_index(drop=True)
)

# Columns shown on screen and exported to Markdown.
summary_cols = [
    "scenario", "text_for_model", "model",
    "accuracy", "f1_macro", "precision_macro", "recall_macro",
    "auc_macro_ovr", "status",
]

print("\n==============================")
print("RESULTADOS FINALES (BALANCEADO):")
print(res_df[summary_cols])
print("==============================\n")

csv_path = os.path.join(OUTPUT_DIR, "resultados_balanceado.csv")
md_path = os.path.join(OUTPUT_DIR, "resultados_balanceado.md")
res_df.to_csv(csv_path, index=False, encoding="utf-8")

# The Markdown export is best-effort: any failure is reported and skipped so
# the rest of the cell still runs.
try:
    markdown_table = res_df[summary_cols].to_markdown(index=False)
    with open(md_path, "w", encoding="utf-8") as md_file:
        md_file.write(markdown_table)
    print("\nTabla Markdown (guardada):", md_path)
except Exception as md_err:
    print("No se pudo generar MD:", md_err)

print("✅ CSV guardado:", csv_path)

# Package the deliverables: only files under OUTPUT_DIR whose extension is in
# INCLUDE_EXT are added to the archive.
zip_path = os.path.join("/content", ZIP_NAME)
INCLUDE_EXT = {".png", ".csv", ".md", ".txt", ".json"}

# Entries are stored under the output folder's own name inside the archive.
archive_root = os.path.basename(OUTPUT_DIR)

with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as archive:
    for folder, _, filenames in os.walk(OUTPUT_DIR):
        for filename in filenames:
            suffix = os.path.splitext(filename)[1].lower()
            if suffix not in INCLUDE_EXT:
                continue
            source_path = os.path.join(folder, filename)
            inner_path = os.path.relpath(source_path, OUTPUT_DIR)
            archive.write(source_path, arcname=os.path.join(archive_root, inner_path))

print("\n✅ ZIP generado:", zip_path)
print("   En Colab: Files -> /content ->", ZIP_NAME)
print("✅ Figuras:", FIG_DIR)
print("✅ Reportes:", REP_DIR)


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m47.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m515.2/515.2 kB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m47.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m566.3/566.3 kB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.6/47.6 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Unnamed: 0,full_text,text_tr,text_tr_noacc,label,label_name
0,"Sin ser fan de Juan Gabriel, siempre supe que ...",sin ser fan de juan gabriel siempre supe que e...,sin ser fan de juan gabriel siempre supe que e...,2,positivo
1,Sabes que no tendrás un buen día cuando lo pri...,sabes que no tendrás un buen día cuando lo pri...,sabes que no tendras un buen dia cuando lo pri...,0,negativo
2,"En situaciones en las que no sepas que hacer, ...",en situaciones en las que no sepas que hacer s...,en situaciones en las que no sepas que hacer s...,0,negativo
3,ayer preguntaban y dónde están las solteras!!!...,ayer preguntaban y dónde están las solteras to...,ayer preguntaban y donde estan las solteras to...,1,neutro
4,Que el finde sea para hacer cualquier cosa que...,que el finde sea para hacer cualquier cosa que...,que el finde sea para hacer cualquier cosa que...,1,neutro
5,En el examen de geometría me estoy esforzando ...,en el examen de geometría me estoy esforzando ...,en el examen de geometria me estoy esforzando ...,1,neutro
6,"Elige amar, así duela, así parezca un imposibl...",elige amar así duela así parezca un imposible ...,elige amar asi duela asi parezca un imposible ...,1,neutro
7,Mi mamá compra pollo a la brasa de almuerzo y ...,mi mamá compra pollo a la brasa de almuerzo y ...,mi mama compra pollo a la brasa de almuerzo y ...,0,negativo



Distribución labels (BALANCEADO):
label
0    1250
1    1250
2    1250
Name: count, dtype: int64

📌 HEAD BALANCEADO:


Unnamed: 0,full_text,text_tr,text_tr_noacc,label,label_name
0,brunoascenzo #partidomorado el 6 y estoy pensa...,brunoascenzo el y estoy pensando entre el u ot...,brunoascenzo el y estoy pensando entre el u ot...,1,neutro
1,del doble cañon a la doble cara. ¡que ridiculo...,del doble cañon a la doble cara que ridiculo m...,del doble cañon a la doble cara que ridiculo m...,0,negativo
2,@RicardoMoran '@larepublica_pe me hace muy feliz,me hace muy feliz,me hace muy feliz,2,positivo
3,fuerzapopular con 13 la gente no aprende e...,fuerzapopular con la gente no aprende en serio...,fuerzapopular con la gente no aprende en serio...,0,negativo
4,opounidad clarísima para el #partidomorado de ...,opounidad clarísima para el de demostrar qué c...,opounidad clarisima para el de demostrar que c...,2,positivo
5,la candidata de fuerzapopular con el numero u...,la candidata de fuerzapopular con el numero un...,la candidata de fuerzapopular con el numero un...,1,neutro
6,asco y repugnancia dan estas 💩💩 d idl_r socios...,asco y repugnancia dan estas de idl r socios de,asco y repugnancia dan estas de idl r socios de,0,negativo
7,rosah2022: 𝐷𝐸𝑅𝑅𝐼𝐵𝐴𝑁𝐷𝑂 𝑀𝐼𝑇𝑂𝑆=================es...,rosah estimados ciudadanos les comparto el vid...,rosah estimados ciudadanos les comparto el vid...,1,neutro


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]


Modelo: DistilBERT-es | Escenario: balanceado | Texto: text_tr
BATCH_SIZE=16 | GRAD_ACC=2 | fp16=True


tokenizer_config.json:   0%|          | 0.00/361 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/530 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/269M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at dccuchile/distilbert-base-spanish-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2625 [00:00<?, ? examples/s]

Map:   0%|          | 0/562 [00:00<?, ? examples/s]

Map:   0%|          | 0/563 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Macro,Recall Macro
0,1.0176,0.820779,0.635231,0.629468,0.6311,0.635008
2,0.6124,0.776957,0.649466,0.648733,0.648197,0.64942
3,0.5148,0.772329,0.649466,0.649005,0.648854,0.649382



Modelo: DistilBERT-es | Escenario: balanceado | Texto: text_tr_noacc
BATCH_SIZE=16 | GRAD_ACC=2 | fp16=True


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at dccuchile/distilbert-base-spanish-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2625 [00:00<?, ? examples/s]

Map:   0%|          | 0/562 [00:00<?, ? examples/s]

Map:   0%|          | 0/563 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Macro,Recall Macro
0,1.0178,0.830742,0.626335,0.616229,0.619815,0.626048
2,0.6048,0.779459,0.626335,0.626147,0.62752,0.626323
3,0.5053,0.778469,0.653025,0.651347,0.650756,0.652947



Modelo: ALBETO Tiny | Escenario: balanceado | Texto: text_tr
BATCH_SIZE=16 | GRAD_ACC=2 | fp16=True


tokenizer_config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]



config.json:   0%|          | 0.00/828 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/21.7M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at dccuchile/albert-tiny-spanish and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2625 [00:00<?, ? examples/s]

Map:   0%|          | 0/562 [00:00<?, ? examples/s]

Map:   0%|          | 0/563 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Macro,Recall Macro
0,1.0841,1.012672,0.569395,0.562085,0.563328,0.569102
2,0.9096,0.908244,0.597865,0.576423,0.602082,0.597698
3,0.8408,0.890416,0.6121,0.59445,0.607681,0.611873



Modelo: ALBETO Tiny | Escenario: balanceado | Texto: text_tr_noacc
BATCH_SIZE=16 | GRAD_ACC=2 | fp16=True


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at dccuchile/albert-tiny-spanish and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2625 [00:00<?, ? examples/s]

Map:   0%|          | 0/562 [00:00<?, ? examples/s]

Map:   0%|          | 0/563 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Macro,Recall Macro
0,1.0846,1.015854,0.581851,0.576789,0.581099,0.581579
2,0.9123,0.912953,0.590747,0.573138,0.593543,0.590596



Modelo: MiniLM Multilingual | Escenario: balanceado | Texto: text_tr
BATCH_SIZE=16 | GRAD_ACC=2 | fp16=True


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/430 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/471M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/Multilingual-MiniLM-L12-H384 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2625 [00:00<?, ? examples/s]

Map:   0%|          | 0/562 [00:00<?, ? examples/s]

Map:   0%|          | 0/563 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Macro,Recall Macro
0,1.0958,0.978723,0.537367,0.439979,0.623255,0.536845
2,0.9032,0.911313,0.604982,0.584617,0.60325,0.604743
3,0.8345,0.905684,0.599644,0.575508,0.598358,0.599386



Modelo: MiniLM Multilingual | Escenario: balanceado | Texto: text_tr_noacc
BATCH_SIZE=16 | GRAD_ACC=2 | fp16=True


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/Multilingual-MiniLM-L12-H384 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2625 [00:00<?, ? examples/s]

Map:   0%|          | 0/562 [00:00<?, ? examples/s]

Map:   0%|          | 0/563 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Macro,Recall Macro
0,1.0943,0.980451,0.530249,0.443342,0.552023,0.529687
2,0.8681,0.900596,0.590747,0.574459,0.586173,0.590539
3,0.8148,0.886883,0.601423,0.583867,0.604784,0.601149



RESULTADOS FINALES (BALANCEADO):
     scenario text_for_model                model  accuracy  f1_macro  \
0  balanceado        text_tr        DistilBERT-es  0.673179  0.671917   
1  balanceado        text_tr  MiniLM Multilingual  0.609236  0.589086   
2  balanceado        text_tr          ALBETO Tiny  0.563055  0.543282   
3  balanceado  text_tr_noacc        DistilBERT-es  0.669627  0.668285   
4  balanceado  text_tr_noacc  MiniLM Multilingual  0.619893  0.607842   
5  balanceado  text_tr_noacc          ALBETO Tiny  0.538188  0.532132   

   precision_macro  recall_macro  auc_macro_ovr status  
0         0.671221      0.673294       0.836341     OK  
1         0.612571      0.609464       0.780437     OK  
2         0.561819      0.563308       0.749330     OK  
3         0.667387      0.669777       0.841893     OK  
4         0.627313      0.620141       0.792472     OK  
5         0.537546      0.538486       0.711719     OK  


Tabla Markdown (guardada): /content/outputs_balancead