In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

### CARGA Y PREPROCESAMIENTO DE DATOS

In [None]:
import pandas as pd
from datasets import load_dataset


dataset = load_dataset("yaful/MAGE", split="test")

SAMPLE_SIZE =10000  # MAX 64000
df_full = dataset.to_pandas()
df_sample = df_full.sample(n=SAMPLE_SIZE, random_state=42)

df_sample['Clase_Real'] = df_sample['label'].apply(lambda x: 'Humano' if x == 1 else 'IA')

print("\nPrimeras filas del DataFrame de muestra:")
print(df_sample[['text', 'Clase_Real', 'label']].head())


Primeras filas del DataFrame de muestra:
                                                    text Clase_Real  label
21764  Never again...never again!!' This place is ter...         IA      0
46722  put the carpet on the floor, they measure it, ...     Humano      1
49245  [substeps] You may do this process before you ...     Humano      1
30867  I believe mandatory minimum laws are unjust, c...     Humano      1
10010  Wales coach Warren Gatland has hailed Shane Wi...         IA      0


In [33]:
balance = df_sample['Clase_Real'].value_counts(normalize=True) * 100

print("\n--- Balance de Clases en la Muestra ---")
print(balance.to_string())

df_sample['Clase_Real_Binaria'] = df_sample['label']


--- Balance de Clases en la Muestra ---
Clase_Real
IA        50.09
Humano    49.91


In [34]:
df_sample['text_cleaned'] = df_sample['text'].str.replace('\s+', ' ', regex=True).str.strip()

df_sample = df_sample.dropna(subset=['text_cleaned'])

df_sample.reset_index(drop=True, inplace=True)
df_sample['Texto_ID'] = df_sample.index

### CARGA DE MODELOS Y CALCULO DE SCORES

In [None]:
import time
import torch
import psutil

def resource_wrapper(fn, *args, device='cuda', verbose=True):

    if device == 'cuda':
        torch.cuda.reset_peak_memory_stats()
    
    process = psutil.Process(os.getpid())
    mem_before = process.memory_info().rss / 1024**3  # en GB

    start_time = time.time()
    try:
        result = fn(*args)
    except Exception as e:
        if verbose:
            print(f"[ERROR] La función {fn.__name__} falló: {e}")
        raise e
    elapsed = time.time() - start_time

    # VRAM pico (solo si GPU)
    vram_peak = torch.cuda.max_memory_allocated() / 1024**3 if device == 'cuda' else 0.0

    # RAM usada
    mem_after = process.memory_info().rss / 1024**3
    cpu_mem = mem_after - mem_before

    if verbose:
        print(f"[RESOURCE WRAPPER] Función: {fn.__name__}")
        print(f"Tiempo: {elapsed:.2f}s | VRAM Pico: {vram_peak:.2f} GB | RAM usada: {cpu_mem:.2f} GB")

    return result, elapsed, vram_peak, cpu_mem


## SCORES DE LOS MODELOS
$$\text{Score}_{\text{Secuencia}} = - \frac{1}{N} \sum_{i=1}^{N-1} \text{Loss}(\text{predicciones}_i, \text{token real}_{i+1})$$

In [None]:
import torch.nn.functional as F

from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoModelForMaskedLM
from tqdm import tqdm


DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
DTYPE = torch.bfloat16 if DEVICE == 'cuda' else torch.float32

torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

LLADA_MODEL_NAME = 'GSAI-ML/LLaDA-8B-Base'
GPT_MODEL_NAME = 'gpt2-large'
LLAMA_MODEL_NAME = "NousResearch/Llama-2-7b-hf"
BERT_MODEL_NAME = "bert-base-uncased"
ROBERTA_MODEL_NAME = "roberta-base"
GPT3_PROXY_MODEL_NAME = "EleutherAI/gpt-neo-2.7B"  



MAX_LENGTH = 512 
BATCH_SIZE = 4           

bnb_config = BitsAndBytesConfig(load_in_4bit=True)

# a) LLaDA
print("\nCargando LLaDA-8B-Base...")
tokenizer_llada = AutoTokenizer.from_pretrained(LLADA_MODEL_NAME, trust_remote_code=True)

model_llada = AutoModel.from_pretrained(
    LLADA_MODEL_NAME,
    quantization_config=bnb_config,
    trust_remote_code=True,
    dtype=DTYPE
).eval()

if hasattr(model_llada, "tie_weights"):
    model_llada.tie_weights()

LLADA_DEVICE = next(model_llada.parameters()).device
print("LLaDA cargado en:", LLADA_DEVICE)

# b) GPT

print("\nCargando GPT-2 Large (Proxy)...")
tokenizer_gpt = AutoTokenizer.from_pretrained(GPT_MODEL_NAME)

if tokenizer_gpt.pad_token is None:
    tokenizer_gpt.pad_token = tokenizer_gpt.eos_token

model_gpt = AutoModelForCausalLM.from_pretrained(
    GPT_MODEL_NAME,
    dtype=DTYPE
).to(DEVICE).eval()

if tokenizer_gpt.pad_token_id >= model_gpt.config.vocab_size:
    model_gpt.resize_token_embeddings(len(tokenizer_gpt))
print("GPT cargado en:", DEVICE)


# c) LLaMA

print("\nCargando LLaMA...")
tokenizer_llama = AutoTokenizer.from_pretrained(LLAMA_MODEL_NAME)

model_llama = AutoModelForCausalLM.from_pretrained(
    LLAMA_MODEL_NAME,
    quantization_config=bnb_config,
    dtype=DTYPE
).eval()

LLAMA_DEVICE = next(model_llama.parameters()).device
print("LLaMA cargado en:", LLAMA_DEVICE)



# d) BERT

print("\nCargando BERT...")
tokenizer_bert = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)

model_bert = AutoModelForMaskedLM.from_pretrained(
    BERT_MODEL_NAME,
    torch_dtype=DTYPE
).to(DEVICE).eval()


BERT_DEVICE = next(model_bert.parameters()).device
print("BERT cargado en:", BERT_DEVICE)


# e) RoBERTa

print("\nCargando RoBERTa...")
tokenizer_roberta = AutoTokenizer.from_pretrained(ROBERTA_MODEL_NAME)

model_roberta = AutoModelForMaskedLM.from_pretrained(
    ROBERTA_MODEL_NAME,
    torch_dtype=DTYPE
).to(DEVICE).eval()


ROBERTA_DEVICE = next(model_roberta.parameters()).device
print("RoBERTa cargado en:", ROBERTA_DEVICE)


# f) GPT-3

print("\nCargando GPT-3 Proxy (Neo)...")
tokenizer_gpt3 = AutoTokenizer.from_pretrained(GPT3_PROXY_MODEL_NAME)

if tokenizer_gpt3.pad_token is None:
    tokenizer_gpt3.pad_token = tokenizer_gpt3.eos_token

model_gpt3 = AutoModelForCausalLM.from_pretrained(
    GPT3_PROXY_MODEL_NAME,
    quantization_config=bnb_config,
    dtype=DTYPE
).eval()

if tokenizer_gpt3.pad_token_id >= model_gpt3.config.vocab_size:
    model_gpt3.resize_token_embeddings(len(tokenizer_gpt3))

GPT3_DEVICE = next(model_gpt3.parameters()).device
print("GPT-3 Proxy cargado en:", GPT3_DEVICE)


def batch_llada_scores(texts, model, tokenizer, batch_size, device):
    scores = []

    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i+batch_size].tolist()

        inputs = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=MAX_LENGTH
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad(), torch.amp.autocast("cuda", dtype=DTYPE):
            outputs = model(**inputs)

            logits = outputs.logits[:, :-1, :]         # (B, T-1, V)
            labels = inputs["input_ids"][:, 1:]        # (B, T-1)

            # Loss por TOKEN
            loss_per_token = F.cross_entropy(
                logits.reshape(-1, logits.size(-1)),
                labels.reshape(-1),
                reduction="none"
            ).view(labels.shape)                       # (B, T-1)

            loss_per_sequence = loss_per_token.mean(dim=1)  # (B,)

            scores.extend((-loss_per_sequence).cpu().tolist())

    return scores

def batch_autoregressive_scores(texts, model, tokenizer, batch_size, device):
    scores = []

    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i+batch_size].tolist()

        inputs = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=MAX_LENGTH
        ).to(device)

        with torch.no_grad(), torch.cuda.amp.autocast(dtype=DTYPE):
            outputs = model(**inputs, labels=inputs["input_ids"])
            loss = outputs.loss
            scores.extend((-loss.detach().cpu()).repeat(len(batch)).tolist())

    return scores


def batch_mlm_scores(texts, model, tokenizer, batch_size, device):
    scores = []
    vocab_size = model.config.vocab_size

    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i+batch_size].tolist()

        inputs = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=MAX_LENGTH
        ).to(device)

        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]

        input_ids = input_ids.clamp(0, vocab_size - 1)

        with torch.no_grad(), torch.amp.autocast("cuda", dtype=DTYPE):
            outputs = model(**inputs)
            logits = outputs.logits

            token_log_probs = torch.log_softmax(logits, dim=-1)

            ll = token_log_probs.gather(
                2, input_ids.unsqueeze(-1)
            ).squeeze(-1)

            ll = (ll * attention_mask).sum(dim=1) / attention_mask.sum(dim=1)

            scores.extend(ll.cpu().tolist())

    return scores

texts = df_sample['text_cleaned']

print("\nCalculando Score_LLaDA (batch)...")
df_sample['Score_LLaDA'], t_llada, v_llada, ram_llada  = resource_wrapper(
    batch_llada_scores, texts, model_llada, tokenizer_llada, BATCH_SIZE, LLADA_DEVICE
)

print("\nCalculando Score_GPT (batch)...")
df_sample['Score_GPT'], t_gpt2, v_gpt2, ram_gtp2 = resource_wrapper(
    batch_autoregressive_scores, texts, model_gpt, tokenizer_gpt, BATCH_SIZE, DEVICE
)

print("\nCalculando Score_LLaMA (batch)...")
df_sample['Score_LLaMA'], t_llama, v_llama, ram_llama = resource_wrapper(
    batch_autoregressive_scores, texts, model_llama, tokenizer_llama, BATCH_SIZE, LLAMA_DEVICE
)

print("\nCalculando Score_GPT-3 (batch)...")
df_sample['Score_GPT3'], t_gpt3, v_gpt3, ram_gpt3 = resource_wrapper(
    batch_autoregressive_scores, texts, model_gpt3, tokenizer_gpt3, BATCH_SIZE, GPT3_DEVICE
)

print("\nCalculando Score_BERT (batch)...")
df_sample['Score_BERT'], t_bert, v_bert, ram_bert = resource_wrapper(
    batch_mlm_scores, texts, model_bert, tokenizer_bert, BATCH_SIZE, BERT_DEVICE
)

print("\nCalculando Score_RoBERTa (batch)...")
df_sample['Score_RoBERTa'], t_roberta, v_roberta, ram_roberta = resource_wrapper(
    batch_mlm_scores, texts, model_roberta, tokenizer_roberta, BATCH_SIZE, ROBERTA_DEVICE
)

df_final = df_sample.dropna(subset=['Score_LLaDA', 'Score_GPT'], how='all')

print("\n--- Vista Previa de Scores Calculados ---")
print(df_final[['Texto_ID', 'Clase_Real', 'Clase_Real_Binaria', 'Score_LLaDA', 'Score_GPT']].head())

# Liberar memoria
#del model_llada
#del model_gpt
#del model_llama
#del model_bert
#del model_roberta
#del model_gpt3

torch.cuda.empty_cache()

df_final.to_csv('df_final_scores.csv', index=False)



Cargando LLaDA-8B-Base...


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

LLaDA cargado en: cuda:0

Cargando GPT-2 Large (Proxy)...
GPT cargado en: cuda

Cargando LLaMA...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

LLaMA cargado en: cuda:0

Cargando BERT...


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERT cargado en: cuda:0

Cargando RoBERTa...
RoBERTa cargado en: cuda:0

Cargando GPT-3 Proxy (Neo)...
GPT-3 Proxy cargado en: cuda:0

Calculando Score_LLaDA (batch)...


100%|██████████| 2500/2500 [26:50<00:00,  1.55it/s]


[RESOURCE WRAPPER] Función: batch_llada_scores
Tiempo: 1610.03s | VRAM Pico: 18.39 GB | RAM usada: 0.01 GB

Calculando Score_GPT (batch)...


  with torch.no_grad(), torch.cuda.amp.autocast(dtype=DTYPE):
100%|██████████| 2500/2500 [06:08<00:00,  6.79it/s]


[RESOURCE WRAPPER] Función: batch_autoregressive_scores
Tiempo: 368.03s | VRAM Pico: 17.85 GB | RAM usada: 0.01 GB

Calculando Score_LLaMA (batch)...


  with torch.no_grad(), torch.cuda.amp.autocast(dtype=DTYPE):
100%|██████████| 2500/2500 [25:11<00:00,  1.65it/s]


[RESOURCE WRAPPER] Función: batch_autoregressive_scores
Tiempo: 1511.09s | VRAM Pico: 18.73 GB | RAM usada: 0.01 GB

Calculando Score_GPT-3 (batch)...


  with torch.no_grad(), torch.cuda.amp.autocast(dtype=DTYPE):
100%|██████████| 2500/2500 [12:55<00:00,  3.22it/s]


[RESOURCE WRAPPER] Función: batch_autoregressive_scores
Tiempo: 775.42s | VRAM Pico: 19.65 GB | RAM usada: 0.01 GB

Calculando Score_BERT (batch)...


100%|██████████| 2500/2500 [01:21<00:00, 30.55it/s]


[RESOURCE WRAPPER] Función: batch_mlm_scores
Tiempo: 81.84s | VRAM Pico: 16.79 GB | RAM usada: 0.01 GB

Calculando Score_RoBERTa (batch)...


100%|██████████| 2500/2500 [01:20<00:00, 31.14it/s]


[RESOURCE WRAPPER] Función: batch_mlm_scores
Tiempo: 80.28s | VRAM Pico: 17.32 GB | RAM usada: 0.01 GB

--- Vista Previa de Scores Calculados ---
   Texto_ID Clase_Real  Clase_Real_Binaria  Score_LLaDA  Score_GPT
0         0         IA                   0    -8.229975  -8.639172
1         1     Humano                   1    -3.688435  -8.639172
2         2     Humano                   1    -7.827047  -8.639172
3         3     Humano                   1   -12.513290  -8.639172
4         4         IA                   0   -12.785653  -5.654218


In [40]:
resource_df = pd.DataFrame({
    "Modelo": ["LLaDA", "GPT-2", "LLaMA", "GPT-3 Proxy", "BERT", "RoBERTa"],
    "Tiempo (s)": [t_llada, t_gpt2, t_llama, t_gpt3, t_bert, t_roberta],
    "VRAM Pico (GB)": [v_llada, v_gpt2, v_llama, v_gpt3, v_bert, v_roberta],
    "RAM Pico (GB)": [ram_llada, ram_gtp2, ram_llama, ram_gpt3, ram_bert, ram_roberta]
})

print(resource_df)

        Modelo   Tiempo (s)  VRAM Pico (GB)  RAM Pico (GB)
0        LLaDA  1610.034923       18.387353       0.007900
1        GPT-2   368.034197       17.845390       0.010357
2        LLaMA  1511.086120       18.729352       0.008083
3  GPT-3 Proxy   775.420411       19.651498       0.008476
4         BERT    81.844312       16.794811       0.005630
5      RoBERTa    80.275546       17.322048       0.009373


### BENCHMARK  Y COMPARACIÓN

In [None]:
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score, f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
import xgboost as xgb

import csv
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
RND = 42

df = pd.read_csv(
    "df_final_scores.csv",
    sep=",",
    engine="python",
    on_bad_lines="skip"
)

score_cols = ["Score_LLaDA","Score_GPT","Score_LLaMA","Score_GPT3","Score_BERT","Score_RoBERTa"]

target = "Clase_Real_Binaria"

df[target] = pd.to_numeric(df[target], errors="coerce")

for c in score_cols:
    df[c] = pd.to_numeric(df[c].astype(str).str.replace(',','.'), errors="coerce")

df = df.dropna(subset=[target] + score_cols)

X = df[score_cols]
y = df[target].astype(int)


cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RND)

scoring = {
    "roc_auc": "roc_auc",
    "accuracy": make_scorer(accuracy_score),
    "f1": make_scorer(f1_score),
    "precision": make_scorer(precision_score),
    "recall": make_scorer(recall_score),
}

results = []

rf_param_grid = {
    "clf__n_estimators": [100, 300],
    "clf__max_depth": [None, 10, 30],
    "clf__min_samples_leaf": [1, 3],
}

mlp_param_grid = {
    "clf__hidden_layer_sizes": [(50,), (100,), (100,50)],
    "clf__alpha": [1e-4, 1e-3],
    "clf__learning_rate_init": [1e-3, 1e-4],
    "clf__max_iter": [500],
}

results = []

for col in score_cols:
    print(f"=== Procesando columna de score: {col} ===")
    Xi = df[[col]].values

    # LOGISTIC REGRESSION
    lr = LogisticRegression(max_iter=2000)
    cv_res_lr = cross_validate(lr, Xi, y, cv=cv, scoring=scoring, n_jobs=-1)
    mean_lr = {m: np.mean(cv_res_lr[f"test_{m}"]) for m in scoring.keys()}

    # RANDOM FOREST
    rf = RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        min_samples_leaf=1,
        random_state=RND
    )
    cv_res_rf = cross_validate(rf, Xi, y, cv=cv, scoring=scoring, n_jobs=-1)
    mean_rf = {m: np.mean(cv_res_rf[f"test_{m}"]) for m in scoring.keys()}

    # XGBOOST
    xg = xgb.XGBClassifier(
        n_estimators=300,
        max_depth=3,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.9,
        eval_metric="logloss",
        random_state=RND
    )
    cv_res_xg = cross_validate(xg, Xi, y, cv=cv, scoring=scoring, n_jobs=-1)
    mean_xg = {m: np.mean(cv_res_xg[f"test_{m}"]) for m in scoring.keys()}

    # XGBOOST CALIBRADO (PROBABILIDADES REALES)
    cal = CalibratedClassifierCV(xg, method="isotonic", cv=3)
    cv_res_cal = cross_validate(cal, Xi, y, cv=cv, scoring=scoring, n_jobs=-1)
    mean_cal = {m: np.mean(cv_res_cal[f"test_{m}"]) for m in scoring.keys()}

    results.append({
        "score_col": col,

        "lr_roc_auc": mean_lr["roc_auc"],
        "lr_accuracy": mean_lr["accuracy"],
        "lr_f1": mean_lr["f1"],
        "lr_precision": mean_lr["precision"],
        "lr_recall": mean_lr["recall"],

        "rf_roc_auc": mean_rf["roc_auc"],
        "rf_accuracy": mean_rf["accuracy"],
        "rf_f1": mean_rf["f1"],
        "rf_precision": mean_rf["precision"],
        "rf_recall": mean_rf["recall"],

        "xg_roc_auc": mean_xg["roc_auc"],
        "xg_accuracy": mean_xg["accuracy"],
        "xg_f1": mean_xg["f1"],
        "xg_precision": mean_xg["precision"],
        "xg_recall": mean_xg["recall"],

        "cal_roc_auc": mean_cal["roc_auc"],
        "cal_accuracy": mean_cal["accuracy"],
        "cal_f1": mean_cal["f1"],
        "cal_precision": mean_cal["precision"],
        "cal_recall": mean_cal["recall"],
    })

    print(
        f"LR AUC: {mean_lr['roc_auc']:.4f} | "
        f"RF AUC: {mean_rf['roc_auc']:.4f} | "
        f"XG AUC: {mean_xg['roc_auc']:.4f} | "
        f"CAL AUC: {mean_cal['roc_auc']:.4f}\n"
    )

res_df = pd.DataFrame(results)

print("\n===== COMPARATIVA FINAL =====\n")
print(res_df.sort_values(by="xg_roc_auc", ascending=False).reset_index(drop=True))

res_df.to_csv("model_comparison_by_score_single_feature_ADVANCED.csv", index=False)
print("\nResultados guardados en 'model_comparison_by_score_single_feature_ADVANCED.csv'")

=== Procesando columna de score: Score_LLaDA ===
LR AUC: 0.5403 | RF AUC: 0.5414 | XG AUC: 0.5787 | CAL AUC: 0.5787

=== Procesando columna de score: Score_GPT ===
LR AUC: 0.5161 | RF AUC: 0.4903 | XG AUC: 0.5060 | CAL AUC: 0.5052

=== Procesando columna de score: Score_LLaMA ===
LR AUC: 0.5108 | RF AUC: 0.4883 | XG AUC: 0.4904 | CAL AUC: 0.4994

=== Procesando columna de score: Score_GPT3 ===
LR AUC: 0.5145 | RF AUC: 0.4906 | XG AUC: 0.5004 | CAL AUC: 0.4960

=== Procesando columna de score: Score_BERT ===
LR AUC: 0.5314 | RF AUC: 0.5231 | XG AUC: 0.5489 | CAL AUC: 0.5445

=== Procesando columna de score: Score_RoBERTa ===
LR AUC: 0.6004 | RF AUC: 0.5320 | XG AUC: 0.5986 | CAL AUC: 0.6004


===== COMPARATIVA FINAL =====

       score_col  lr_roc_auc  lr_accuracy     lr_f1  lr_precision  lr_recall  \
0  Score_RoBERTa    0.600436       0.5823  0.504096      0.618496   0.425565   
1    Score_LLaDA    0.540325       0.5340  0.518245      0.535319   0.502309   
2     Score_BERT    0.531444

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, classification_report
score_cols = ["Score_LLaDA","Score_GPT","Score_LLaMA","Score_GPT3","Score_BERT","Score_RoBERTa"]
y = df["Clase_Real_Binaria"].values

results = []

for col in score_cols:
    print(f"\n==============================")
    print(f"Entrenando Deep MLP para: {col}")
    print(f"==============================")

    X = df[[col]].values   

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    model = Sequential([
        Dense(128, activation="relu", input_shape=(1,)),
        BatchNormalization(),
        Dropout(0.3),

        Dense(64, activation="relu"),
        BatchNormalization(),
        Dropout(0.3),

        Dense(32, activation="relu"),
        BatchNormalization(),
        Dropout(0.2),

        Dense(16, activation="relu"),
        Dense(1, activation="sigmoid")
    ])

    model.compile(
        optimizer=Adam(learning_rate=1e-3),
        loss="binary_crossentropy",
        metrics=["accuracy"]
    )

    early_stop = EarlyStopping(
        monitor="val_loss",
        patience=20,
        restore_best_weights=True
    )

    history = model.fit(
        X_train, y_train,
        validation_split=0.2,
        epochs=300,
        batch_size=64,
        callbacks=[early_stop],
        verbose=0
    )

    y_prob = model.predict(X_test).ravel()
    y_pred = (y_prob >= 0.5).astype(int)

    roc = roc_auc_score(y_test, y_prob)

    print(f"ROC AUC ({col}): {roc:.4f}")
    print(classification_report(y_test, y_pred))

    model_name = f"deep_mlp_{col}.h5"
    model.save(model_name)

    results.append({
        "score_col": col,
        "roc_auc": roc,
        "accuracy": np.mean(y_pred == y_test),
        "model_path": model_name
    })


res_df = pd.DataFrame(results).sort_values(by="roc_auc", ascending=False).reset_index(drop=True)

print("\n===== COMPARATIVA FINAL DEEP MLP POR SCORE =====\n")
print(res_df)

res_df.to_csv("deep_mlp_by_score_comparison.csv", index=False)
print("\nResultados guardados en 'deep_mlp_by_score_comparison.csv'")


2025-12-10 10:20:26.105566: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-10 10:20:26.204711: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-12-10 10:20:27.722116: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.



Entrenando Deep MLP para: Score_LLaDA


I0000 00:00:1765362028.050794    1154 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 818 MB memory:  -> device: 0, name: NVIDIA A40, pci bus id: 0000:15:00.0, compute capability: 8.6
2025-12-10 10:20:31.735749: I external/local_xla/xla/service/service.cc:163] XLA service 0x7faf94010b70 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-12-10 10:20:31.735778: I external/local_xla/xla/service/service.cc:171]   StreamExecutor device (0): NVIDIA A40, Compute Capability 8.6
2025-12-10 10:20:31.816266: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-12-10 10:20:32.333848: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91002
2025-12-10 10:20:32.629277: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 19ms/step




ROC AUC (Score_LLaDA): 0.5847
              precision    recall  f1-score   support

           0       0.59      0.41      0.48      1002
           1       0.55      0.72      0.62       998

    accuracy                           0.56      2000
   macro avg       0.57      0.56      0.55      2000
weighted avg       0.57      0.56      0.55      2000


Entrenando Deep MLP para: Score_GPT
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step  




ROC AUC (Score_GPT): 0.5030
              precision    recall  f1-score   support

           0       0.49      0.23      0.31      1002
           1       0.50      0.77      0.60       998

    accuracy                           0.50      2000
   macro avg       0.50      0.50      0.46      2000
weighted avg       0.50      0.50      0.46      2000


Entrenando Deep MLP para: Score_LLaMA
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step  




ROC AUC (Score_LLaMA): 0.5039
              precision    recall  f1-score   support

           0       0.51      0.25      0.33      1002
           1       0.50      0.76      0.60       998

    accuracy                           0.50      2000
   macro avg       0.50      0.50      0.47      2000
weighted avg       0.50      0.50      0.47      2000


Entrenando Deep MLP para: Score_GPT3
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step  




ROC AUC (Score_GPT3): 0.5340
              precision    recall  f1-score   support

           0       0.52      0.52      0.52      1002
           1       0.52      0.52      0.52       998

    accuracy                           0.52      2000
   macro avg       0.52      0.52      0.52      2000
weighted avg       0.52      0.52      0.52      2000


Entrenando Deep MLP para: Score_BERT
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step  




ROC AUC (Score_BERT): 0.5621
              precision    recall  f1-score   support

           0       0.54      0.52      0.53      1002
           1       0.53      0.55      0.54       998

    accuracy                           0.54      2000
   macro avg       0.54      0.54      0.54      2000
weighted avg       0.54      0.54      0.54      2000


Entrenando Deep MLP para: Score_RoBERTa
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step  




ROC AUC (Score_RoBERTa): 0.6109
              precision    recall  f1-score   support

           0       0.59      0.64      0.61      1002
           1       0.60      0.55      0.57       998

    accuracy                           0.59      2000
   macro avg       0.60      0.59      0.59      2000
weighted avg       0.60      0.59      0.59      2000


===== COMPARATIVA FINAL DEEP MLP POR SCORE =====

       score_col   roc_auc  accuracy                 model_path
0  Score_RoBERTa  0.610874    0.5945  deep_mlp_Score_RoBERTa.h5
1    Score_LLaDA  0.584680    0.5605    deep_mlp_Score_LLaDA.h5
2     Score_BERT  0.562085    0.5365     deep_mlp_Score_BERT.h5
3     Score_GPT3  0.533958    0.5205     deep_mlp_Score_GPT3.h5
4    Score_LLaMA  0.503905    0.5025    deep_mlp_Score_LLaMA.h5
5      Score_GPT  0.503026    0.4965      deep_mlp_Score_GPT.h5

Resultados guardados en 'deep_mlp_by_score_comparison.csv'


## Metricas del PAWN


### Fórmulas de las 5 Métricas de PAWN 

A continuación, $P_{i,j}$ representa la probabilidad asignada al *token* $j$ en la posición $i$ de la secuencia, $t_{i+1}$ es el *token* que ocurrió realmente en la posición $i+1$, $N$ es la longitud de la secuencia y $V$ es el tamaño del vocabulario.

### 1. Log-Probability del Token Ocurrido
Esta es la Log-Probabilidad asignada por el modelo al *token* real que aparece después de la posición $i$.

$$M^{\text{log-prob}}_i = \log P_{i,t_{i+1}}$$

### 2. Entropía de la Distribución
Esta métrica mide la aleatoriedad (incertidumbre) de la predicción en la posición $i$.

$$M^{\text{entropy}}_i = -\sum_{j=1}^{V} P_{i,j} \log P_{i,j}$$

### 3. Log-Probability Máxima
Esta métrica mide la Log-Probabilidad del *token* más probable en el vocabulario en la posición $i$.

$$M^{\text{max-log-prob}}_i = \max_{j=1,...,V} \log P_{i,j}$$

### 4. Rango del Token (Cuantil Normalizado)
Esta métrica indica la posición ordenada del *token* real $t_{i+1}$ dentro de todas las opciones, normalizada por el tamaño del vocabulario $V$.

$$M^{\text{rank}}_i = \frac{\text{rank}(\log P_{i,:,t_{i+1}})}{V}$$

### 5. Suma de Probabilidades (Top-P Proxy)
Esta métrica es la suma de las probabilidades de todos los *tokens* que son tan probables o más probables que el *token* real $t_{i+1}$.

$$M^{\text{top-p}}_i = \sum_{j=1,...,V ; P_{i,j}\geq P_{i,t_{i+1}}} P_{i,j}$$

In [None]:
def calculate_five_metrics(logits, labels, attention_mask):
    """
    Calcula las 5 métricas del paper a partir de logits y el siguiente token real (labels).

    :param logits: Tensor de logits del modelo (B, T-1, V)
    :param labels: Tensor de los tokens reales siguientes (B, T-1)
    :param attention_mask: Tensor de la máscara de atención (B, T-1)
    :return: 5 listas de métricas (una por cada métrica), promediadas por secuencia.
    """
    B, T_minus_1, V = logits.shape

    log_probs = F.log_softmax(logits, dim=-1) # (B, T-1, V)
    probs = torch.exp(log_probs)             # (B, T-1, V)

    
    log_prob_occured = log_probs.gather(
        2, labels.unsqueeze(-1)
    ).squeeze(-1) # (B, T-1)
    
    Mlog_prob = log_prob_occured

    entropy = - (probs * log_probs).sum(dim=-1) # (B, T-1)
    
    Mentropy = entropy
    
    Mmax_log_prob, _ = log_probs.max(dim=-1) # (B, T-1)
    

    log_prob_occured_val = log_prob_occured.unsqueeze(-1) # (B, T-1, 1)

    rank_mask = (log_probs >= log_prob_occured_val) 
    
    Mrank = rank_mask.sum(dim=-1).float() / V # (B, T-1)

    prob_occured = probs.gather(
        2, labels.unsqueeze(-1)
    ).squeeze(-1).unsqueeze(-1) # (B, T-1, 1)

    top_p_mask = (probs >= prob_occured) # (B, T-1, V)

    Mtop_p = (probs * top_p_mask).sum(dim=-1) # (B, T-1)
    
    masked_metrics = [Mlog_prob, Mentropy, Mmax_log_prob, Mrank, Mtop_p]
    
    results = []
    sequence_lengths = attention_mask.sum(dim=1).float() # (B,)
    
    for M in masked_metrics:
        # M: (B, T-1)
        M_masked = M * attention_mask
        M_sum_per_seq = M_masked.sum(dim=1) # (B,)
        
        # Promedio: Suma / Longitud
        M_avg_per_seq = M_sum_per_seq / sequence_lengths
        results.append(M_avg_per_seq.cpu().tolist())
        
    return results

In [None]:
def batch_autoregressive_metrics(texts, model, tokenizer, batch_size, device):
    """
    Calcula las 5 métricas para modelos autoregresivos (LLaDA, GPT, LLaMA, GPT-3)
    """
    all_metrics = [[], [], [], [], []]
    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i+batch_size].tolist()

        inputs = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=MAX_LENGTH
        )
        inputs_on_device = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad(), torch.cuda.amp.autocast(dtype=DTYPE):
            outputs = model(**inputs_on_device)
            
            logits = outputs.logits[:, :-1, :]        # (B, T-1, V)
            labels = inputs_on_device["input_ids"][:, 1:] # (B, T-1)

            attention_mask = inputs_on_device["attention_mask"][:, 1:] # (B, T-1)
            
            metrics = calculate_five_metrics(logits, labels, attention_mask)
            
            for j in range(5):
                all_metrics[j].extend(metrics[j])
                
    return all_metrics


In [45]:
metrics_names = ['Mlog_prob', 'Mentropy', 'Mmax_log_prob', 'Mrank', 'Mtop_p']

# 1. LLaDA
print("\nCalculando 5 Métricas_LLaDA (batch)...")
results_llada = batch_autoregressive_metrics(texts, model_llada, tokenizer_llada, BATCH_SIZE, LLADA_DEVICE)

for i, metric_name in enumerate(metrics_names):
    df_sample[f'{metric_name}_LLaDA'] = results_llada[i]


# 2. GPT
print("\nCalculando 5 Métricas_GPT (batch)...")
results_gpt = batch_autoregressive_metrics(texts, model_gpt, tokenizer_gpt, BATCH_SIZE, DEVICE)

for i, metric_name in enumerate(metrics_names):
    df_sample[f'{metric_name}_GPT'] = results_gpt[i]

# 3. LLaMA
print("\nCalculando 5 Métricas_LLaMA (batch)...")
results_llama = batch_autoregressive_metrics(texts, model_llama, tokenizer_llama, BATCH_SIZE, LLAMA_DEVICE)

for i, metric_name in enumerate(metrics_names):
    df_sample[f'{metric_name}_LLaMA'] = results_llama[i]

# 4. GPT-3 Proxy
print("\nCalculando 5 Métricas_GPT-3 (batch)...")
results_gpt3 = batch_autoregressive_metrics(texts, model_gpt3, tokenizer_gpt3, BATCH_SIZE, GPT3_DEVICE)

for i, metric_name in enumerate(metrics_names):
    df_sample[f'{metric_name}_GPT3'] = results_gpt3[i]

# 5. BERT
print("\nCalculando 5 Métricas_BERT (batch)...")
results_bert = batch_autoregressive_metrics(texts, model_bert, tokenizer_bert, BATCH_SIZE, BERT_DEVICE)

for i, metric_name in enumerate(metrics_names):
    df_sample[f'{metric_name}_BERT'] = results_bert[i]

# 6. RoBERTa
print("\nCalculando 5 Métricas_RoBERTa (batch)...")
results_roberta = batch_autoregressive_metrics(texts, model_roberta, tokenizer_roberta, BATCH_SIZE, ROBERTA_DEVICE)

for i, metric_name in enumerate(metrics_names):
    df_sample[f'{metric_name}_RoBERTa'] = results_roberta[i]


df_final = df_sample.dropna(subset=['Mlog_prob_LLaDA', 'Mlog_prob_GPT'], how='all')

print("\n--- Vista Previa de Scores Calculados ---")
print(df_final[['Texto_ID', 'Clase_Real', 'Mlog_prob_LLaDA', 'Mentropy_LLaDA', 'Mlog_prob_GPT', 'Mentropy_GPT']].head())

df_final.to_csv('df_final_metrics.csv', index=False)


Calculando 5 Métricas_LLaDA (batch)...


  with torch.no_grad(), torch.cuda.amp.autocast(dtype=DTYPE):
100%|██████████| 2500/2500 [28:04<00:00,  1.48it/s]



Calculando 5 Métricas_GPT (batch)...


  with torch.no_grad(), torch.cuda.amp.autocast(dtype=DTYPE):
100%|██████████| 2500/2500 [06:39<00:00,  6.26it/s]



Calculando 5 Métricas_LLaMA (batch)...


  with torch.no_grad(), torch.cuda.amp.autocast(dtype=DTYPE):
100%|██████████| 2500/2500 [25:02<00:00,  1.66it/s]



Calculando 5 Métricas_GPT-3 (batch)...


  with torch.no_grad(), torch.cuda.amp.autocast(dtype=DTYPE):
100%|██████████| 2500/2500 [13:35<00:00,  3.07it/s]



--- Vista Previa de Scores Calculados ---
   Texto_ID Clase_Real  Mlog_prob_LLaDA  Mentropy_LLaDA  Mlog_prob_GPT  \
0         0         IA        -8.698508        1.945712      -3.582509   
1         1     Humano        -8.881931        1.905178      -3.740482   
2         2     Humano       -11.092595        0.685308      -3.754062   
3         3     Humano       -12.512227        0.530346      -2.835763   
4         4         IA       -14.274117        0.359901      -1.785728   

   Mentropy_GPT  
0      3.662330  
1      4.063519  
2      3.667406  
3      2.819254  
4      2.196720  


In [None]:
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

SEED = 42
BATCH_SIZE = 64
EPOCHS = 200
PATIENCE = 20
LR = 1e-4
WEIGHT_DECAY = 1e-5
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_DIR = "models_mlps"
os.makedirs(MODEL_DIR, exist_ok=True)

TARGET_COL = "Clase_Real_Binaria"  
metrics_names = ['Mlog_prob', 'Mentropy', 'Mmax_log_prob', 'Mrank', 'Mtop_p']
models_list = ['LLaDA', 'GPT', 'LLaMA', 'GPT3', 'BERT', 'RoBERTa']  

torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
if DEVICE == "cuda":
    torch.cuda.manual_seed_all(SEED)

class TabularDataset(Dataset):
    def __init__(self, X, y):
        self.X = X.astype(np.float32)
        self.y = y.astype(np.float32)
    def __len__(self):
        return len(self.y)
    def __getitem__(self, idx):
        return torch.from_numpy(self.X[idx]), torch.tensor(self.y[idx]).long()

class DeepMLP(nn.Module):
    def __init__(self, input_dim,
                 hidden_dims=(512, 256, 128, 64),
                 dropout=0.4,
                 batchnorm=True,
                 activation=nn.ReLU,
                 final_dropout=0.25):
        super().__init__()
        layers = []
        prev = input_dim
        for h in hidden_dims:
            layers.append(nn.Linear(prev, h))
            if batchnorm:
                layers.append(nn.BatchNorm1d(h))
            layers.append(activation())
            layers.append(nn.Dropout(dropout))
            prev = h
        # final classifier
        layers.append(nn.Linear(prev, 2))
        self.net = nn.Sequential(*layers)
    def forward(self, x):
        return self.net(x)

def compute_metrics(y_true, y_pred_probs):
    y_pred = (y_pred_probs[:,1] >= 0.5).astype(int)
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    try:
        auc = roc_auc_score(y_true, y_pred_probs[:,1])
    except Exception:
        auc = float("nan")
    return {"accuracy": acc, "precision": prec, "recall": rec, "f1": f1, "roc_auc": auc}

df = pd.read_csv(
    "df_final_metrics.csv",
    sep=",",
    engine="python",
    on_bad_lines="skip"
)
if TARGET_COL not in df.columns:
    raise RuntimeError(f"No encuentro la columna objetivo '{TARGET_COL}' en df_final_metrics.csv. Cambia TARGET_COL al nombre correcto.")

results = {}

for model_name in models_list:
    cols = [f"{m}_{model_name}" for m in metrics_names]
    missing = [c for c in cols if c not in df.columns]
    if missing:
        print(f"[WARN] Columns missing for {model_name}: {missing}. Saltando este modelo.")
        continue

    subset = df[cols + [TARGET_COL]].dropna()
    X = subset[cols].values
    y = subset[TARGET_COL].values.astype(int)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=SEED, stratify=y
    )

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    train_ds = TabularDataset(X_train, y_train)
    test_ds = TabularDataset(X_test, y_test)
    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=False)
    test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)

    classes, counts = np.unique(y_train, return_counts=True)
    if len(counts) == 1:
        class_weights = torch.tensor([1.0, 1.0], dtype=torch.float32, device=DEVICE)
    else:
        inv = 1.0 / counts
        weights = inv / inv.sum()
        cw = np.zeros(2, dtype=np.float32)
        for cls, w in zip(classes, weights):
            cw[int(cls)] = w
        class_weights = torch.tensor(cw, dtype=torch.float32, device=DEVICE)

    input_dim = X_train.shape[1]
    hidden_dims = (1024, 512, 256, 128)   # deep y ancho
    model = DeepMLP(input_dim, hidden_dims=hidden_dims, dropout=0.4, batchnorm=True).to(DEVICE)

    optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
    criterion = nn.CrossEntropyLoss(weight=class_weights)

    best_auc = -1.0
    best_state = None
    epochs_no_improve = 0
    history = {"train_loss": [], "test_auc": []}

    for epoch in range(EPOCHS):
        model.train()
        loss_epoch = 0.0
        for xb, yb in train_loader:
            xb = xb.to(DEVICE)
            yb = yb.to(DEVICE)
            optimizer.zero_grad()
            logits = model(xb)
            loss = criterion(logits, yb)
            loss.backward()
            optimizer.step()
            loss_epoch += loss.item() * xb.size(0)
        loss_epoch /= len(train_loader.dataset)
        history["train_loss"].append(loss_epoch)

        model.eval()
        ys = []
        yprobs = []
        with torch.no_grad():
            for xb, yb in test_loader:
                xb = xb.to(DEVICE)
                logits = model(xb)
                probs = torch.softmax(logits, dim=-1).cpu().numpy()
                yprobs.append(probs)
                ys.append(yb.numpy())
        yprobs = np.vstack(yprobs)
        ys = np.concatenate(ys)

        metrics_eval = compute_metrics(ys, yprobs)
        history["test_auc"].append(metrics_eval["roc_auc"])

        # early stopping
        if np.isfinite(metrics_eval["roc_auc"]) and metrics_eval["roc_auc"] > best_auc:
            best_auc = metrics_eval["roc_auc"]
            best_state = model.state_dict()
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1

        if epoch % 5 == 0 or epoch == EPOCHS-1:
            print(f"[{model_name}] Epoch {epoch+1}/{EPOCHS} loss={loss_epoch:.4f} test_auc={metrics_eval['roc_auc']:.4f} f1={metrics_eval['f1']:.4f}")

        if epochs_no_improve >= PATIENCE:
            print(f"[{model_name}] Early stopping en epoch {epoch+1}. Mejor AUC val: {best_auc:.4f}")
            break

    if best_state:
        model.load_state_dict(best_state)

    model.eval()
    ys = []
    yprobs = []
    with torch.no_grad():
        for xb, yb in test_loader:
            xb = xb.to(DEVICE)
            logits = model(xb)
            probs = torch.softmax(logits, dim=-1).cpu().numpy()
            yprobs.append(probs)
            ys.append(yb.numpy())
    yprobs = np.vstack(yprobs)
    ys = np.concatenate(ys)
    final_metrics = compute_metrics(ys, yprobs)
    print(f"\n>>> RESULTADOS FINALES para {model_name}: {final_metrics}\n")

    torch.save({
        "model_state": model.state_dict(),
        "scaler_mean": scaler.mean_,
        "scaler_scale": scaler.scale_,
        "cols": cols,
        "metrics": final_metrics,
        "history": history
    }, os.path.join(MODEL_DIR, f"mlp_{model_name}.pt"))

    results[model_name] = final_metrics

df_results = pd.DataFrame(results).T
print("===== Resumen comparativo =====")
print(df_results)
best_model = df_results['roc_auc'].idxmax()
print(f"\nMejor modelo según ROC-AUC: {best_model} -> {df_results.loc[best_model].to_dict()}")

# guardado de resultados
df_results.to_csv(os.path.join(MODEL_DIR, "comparison_results.csv"))
print(f"Modelos y resultados guardados en '{MODEL_DIR}/'")

[LLaDA] Epoch 1/200 loss=0.6781 test_auc=0.7030 f1=0.6879
[LLaDA] Epoch 6/200 loss=0.6314 test_auc=0.7152 f1=0.7004
[LLaDA] Epoch 11/200 loss=0.6209 test_auc=0.7196 f1=0.7025
[LLaDA] Epoch 16/200 loss=0.6213 test_auc=0.7197 f1=0.6983
[LLaDA] Epoch 21/200 loss=0.6159 test_auc=0.7225 f1=0.6982
[LLaDA] Epoch 26/200 loss=0.6175 test_auc=0.7198 f1=0.7055
[LLaDA] Epoch 31/200 loss=0.6142 test_auc=0.7226 f1=0.7038
[LLaDA] Epoch 36/200 loss=0.6152 test_auc=0.7227 f1=0.7049
[LLaDA] Epoch 41/200 loss=0.6123 test_auc=0.7217 f1=0.6951
[LLaDA] Epoch 46/200 loss=0.6116 test_auc=0.7213 f1=0.7009
[LLaDA] Epoch 51/200 loss=0.6100 test_auc=0.7232 f1=0.7024
[LLaDA] Epoch 56/200 loss=0.6109 test_auc=0.7240 f1=0.7068
[LLaDA] Epoch 61/200 loss=0.6125 test_auc=0.7241 f1=0.6962
[LLaDA] Epoch 66/200 loss=0.6056 test_auc=0.7248 f1=0.7005
[LLaDA] Epoch 71/200 loss=0.6071 test_auc=0.7230 f1=0.7010
[LLaDA] Epoch 76/200 loss=0.6043 test_auc=0.7237 f1=0.6978
[LLaDA] Epoch 81/200 loss=0.6029 test_auc=0.7252 f1=0.6965