# Métricas finales usando discriminador y modelos

Este notebook evalúa la calidad de los textos generados mediante el clasificador RoBERTa entrenado para detectar estilo Shakespeare.

Se mide el porcentaje de textos clasificados como Shakespeare-like sobre distintos
conjuntos de benchmark:

- textos positivos y negativos de control (de autores similares a Shakespeare)

- textos generados por el modelo base

- textos generados por el modelo fine-tuneado

El objetivo es mostrar cuantitativamente que el modelo fine-tuneado produce textos
más coherentes con el estilo Shakespeare que el modelo base.

## 1. Setup e imports.

In [None]:
!pip install -q torch transformers datasets seaborn tqdm

In [None]:
import os
import re
import torch
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification

## 2. Configuración.

In [None]:
# Root folders for the benchmark text sets and for the classifier checkpoints.
BASE_BENCHMARK_DIR = "/content/drive/MyDrive/StoryWriter/Data/Benchmark_data"
BASE_CLASSIFIER_DIR = "/content/drive/MyDrive/StoryWriter/Clasificador"

# Use the GPU when torch reports one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Number of files read and classified per forward pass.
BATCH_SIZE = 32
# Token cap handed to the tokenizer; longer texts are truncated to this length.
MAX_LENGTH = 512

## 3. Ruta del discriminador.

In [None]:
# Folder that holds the saved checkpoints of the "Grueso+fino" classifier.
ruta = os.path.join(BASE_CLASSIFIER_DIR, "Grueso+fino")
contenido = os.listdir(ruta)

# Keep only checkpoint directories that carry a numeric step. Each entry is
# checked on its own path: testing `ruta` itself would always be true.
checkpoints = [
    d for d in contenido
    if re.match(r"checkpoint-\d+", d)
    and os.path.isdir(os.path.join(ruta, d))
]
if not checkpoints:
    raise FileNotFoundError(
        f"No se encontraron directorios checkpoint-* en {ruta}"
    )

# Load the last checkpoint, i.e. the one with the highest step number.
checkpoints.sort(key=lambda x: int(re.search(r"\d+", x).group()))
# The names were listed from `ruta`, so the full path must be built from it.
ruta_grueso_fino = os.path.join(ruta, checkpoints[-1])
print(f"Usando checkpoint: {checkpoints[-1]}")

# Display name -> checkpoint path of every discriminator to evaluate.
RUTAS_MODELOS = {
    "Discriminador (Grueso + Fino)": ruta_grueso_fino
}

## 4. Datasets de benchmark.

In [None]:
# Benchmark subfolders to score. Each name is joined onto BASE_BENCHMARK_DIR
# and the .txt files found there are passed to the classifier.
TARGET_FOLDERS = [
    "control_positivo",
    "control_negativo_fino",
    "control_negativo_grueso",
    "mistral_base",
    "mistral_base_prompt_pro",
    "mistral_finetune",
    "mistral_finetune_prompt_pro",
]

## 5. Helpers.

In [None]:
def get_txt_files(folder_path):
    """Return the paths of the ``.txt`` files directly inside *folder_path*.

    Args:
        folder_path: Directory to scan. Subdirectories are not descended into.

    Returns:
        A list of paths (``folder_path`` joined with each file name), sorted
        by file name so repeated runs see the files in the same order. The
        list is empty when *folder_path* does not exist or is not a directory.
    """
    if not os.path.isdir(folder_path):
        return []
    candidates = (
        os.path.join(folder_path, name)
        for name in sorted(os.listdir(folder_path))
        if name.endswith(".txt")
    )
    # Skip directories that merely happen to be named "*.txt".
    return [path for path in candidates if os.path.isfile(path)]

def evaluar_dataset(
    model,
    tokenizer,
    file_paths,
    device,
    batch_size=None,
    max_length=None,
):
    """Return the percentage of texts classified as Shakespeare (label=1).

    Args:
        model: Classifier called as ``model(**inputs)``; its output must
            expose ``.logits``.
        tokenizer: Tokenizer called on a list of strings with
            ``return_tensors="pt"``; its result must support ``.to(device)``.
        file_paths: Paths of the text files to score.
        device: Device the tokenized batch is moved to.
        batch_size: Files per forward pass. Defaults to ``BATCH_SIZE``.
        max_length: Token cap used for truncation. Defaults to ``MAX_LENGTH``.

    Returns:
        ``100 * positives / evaluated`` as a float, where *evaluated* counts
        only the files that could be read. ``0.0`` when nothing was evaluated.
    """
    if not file_paths:
        return 0.0
    if batch_size is None:
        batch_size = BATCH_SIZE
    if max_length is None:
        max_length = MAX_LENGTH

    positivos = 0
    evaluados = 0

    for i in tqdm(range(0, len(file_paths), batch_size), leave=False):
        texts = []

        for fp in file_paths[i:i + batch_size]:
            try:
                with open(fp, "r", encoding="utf-8", errors="ignore") as f:
                    texts.append(f.read())
            except OSError as exc:
                # Best effort: skip unreadable files, but report them and keep
                # them out of the denominator rather than silently counting
                # them as "not Shakespeare".
                print(f"No se pudo leer {fp}: {exc}")

        if not texts:
            continue

        inputs = tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        ).to(device)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = model(**inputs)
        preds = torch.argmax(outputs.logits, dim=-1)

        # Count label 1 explicitly; summing the predicted ids would overcount
        # if the classifier had more than two labels.
        positivos += (preds == 1).sum().item()
        evaluados += len(texts)

    if evaluados == 0:
        return 0.0
    return 100 * positivos / evaluados

## 6. Evaluación.

In [None]:
# Score every benchmark folder with each discriminator and collect one row
# per (dataset, model) pair for the results table and the plot.
all_results = []

for model_name, model_path in RUTAS_MODELOS.items():
    if model_path is None:
        continue

    print(f"\n🤖 Cargando modelo: {model_name}")

    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_path,
        use_safetensors=True
    ).to(DEVICE)

    # Switch to evaluation mode before scoring.
    model.eval()

    for folder in TARGET_FOLDERS:
        path = os.path.join(BASE_BENCHMARK_DIR, folder)
        files = get_txt_files(path)

        # evaluar_dataset returns 0.0 for an empty file list, so a missing
        # or empty folder is reported as 0.00% with 0 samples.
        score = evaluar_dataset(model, tokenizer, files, DEVICE)

        print(f"   {folder:30s}: {score:6.2f}%  ({len(files)} muestras)")

        all_results.append({
            "Dataset": folder,
            "Modelo": model_name,
            "Score_Shakespeare": score
        })

## 7. Resultados.

In [None]:
# Long format: one row per (dataset, model) evaluation in all_results.
df = pd.DataFrame(all_results)

# Wide format: datasets as rows, one score column per model.
pivot = df.pivot(index="Dataset", columns="Modelo", values="Score_Shakespeare")

display(pivot)

In [None]:
# Grouped bar chart: one bar per dataset and model, showing the percentage
# of texts the classifier labelled as Shakespeare.
plt.figure(figsize=(12, 6))
sns.barplot(
    data=df,
    x="Dataset",
    y="Score_Shakespeare",
    hue="Modelo",
    palette="viridis"
)
# Folder names are long; tilt them so they do not overlap.
plt.xticks(rotation=45, ha="right")
plt.ylabel("% Clasificado como Shakespeare")
plt.title("Evaluación estilística con clasificador RoBERTa")
# Scores are percentages, so pin the axis to the full 0-100 range.
plt.ylim(0, 100)
plt.grid(axis="y", alpha=0.3)
plt.tight_layout()
plt.show()