In [None]:
# ==========================================
# SETUP COLAB (Carga desde Drive)
# ==========================================
import sys
import os

try:
    import google.colab
    IN_COLAB = True
    print("üöÄ En Colab: Montando Drive...")

    from google.colab import drive
    drive.mount('/content/drive')

    # --- CONFIGURACI√ìN ---
    # Ruta a tu carpeta en Drive
    DRIVE_BASE = "/content/drive/MyDrive/UCOM/proyecto_final/nlp_mental_health/antigravity"
    # ---------------------

    # Instalar librer√≠as
    !pip install -q transformers datasets accelerate scikit-learn seaborn matplotlib unidecode

    # Preparar entorno local en Colab
    !mkdir -p data/splits
    !mkdir -p notebooks

    # Copiar datos y utils desde Drive al entorno local de Colab (m√°s r√°pido que leer de Drive)
    print("‚è≥ Copiando archivos...")
    !cp -r "$DRIVE_BASE/data/splits/"* data/splits/
    !cp "$DRIVE_BASE/notebooks/utils_shared.py" notebooks/

    # Agregar carpeta notebooks al path para poder importar utils_shared
    sys.path.append('/content/notebooks')

    print("‚úÖ Setup listo. Archivos copiados.")

except ImportError:
    IN_COLAB = False
    print("üíª En Local: Setup omitido.")

# 04_baseline_transformers — BETO & RoBERTa

**Objetivo:** Implementar baselines con Transformers pre-entrenados en español.

**Modelos:**
1. **BETO** (dccuchile/bert-base-spanish-wwm-cased): General.
2. **RoBERTa Biomédico** (PlanTL-GOB-ES/roberta-base-biomedical-es): Biomédico.
3. **RoBERTa Clínico** (PlanTL-GOB-ES/roberta-base-biomedical-clinical-es): Clínico.

**Estrategia:**
- **Train:** `train_denoised.csv` (Señal pura, 814 casos).
- **Dev:** `dev_full.csv` (Realista, 641 casos).
- **Max Length:** 512 tokens (Crítico: ~32% de los textos exceden 256).

**Optimizado para:**
- ✅ Apple Silicon (M2/M3) con MPS
- ✅ NVIDIA GPU con CUDA
- ✅ CPU (fallback)

**Exportables:**
- `data/{model_name}_eval.csv`
- `data/{model_name}_classification_report.csv`

In [None]:
# ===============================================================
# Setup: Imports y configuraci√≥n de paths
# ===============================================================
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import re
import unicodedata
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)

# Import the shared project utilities and resolve the data directories.
# setup_paths() is indexed below with the keys 'DATA_PATH' and 'SPLITS_PATH';
# later cells join them with `/`, so they are presumably pathlib.Path objects
# (NOTE(review): confirm against utils_shared.setup_paths).
try:
    from utils_shared import setup_paths, load_splits, calculate_metrics, get_cv_splitter
    paths = setup_paths()
    DATA_PATH = paths['DATA_PATH']
    SPLITS_PATH = paths['SPLITS_PATH']
    print("[OK] Usando utils_shared.py")
except ImportError:
    # Print a hint about the likely cause, then re-raise so the notebook stops here.
    print("[ERROR] No se encontr√≥ utils_shared.py. Verifica que est√°s en el directorio correcto.")
    raise

# Device selection, in order of preference: CUDA GPU, Apple MPS, CPU.
def _select_backend():
    """Return (backend name, banner to print) for the best available device."""
    if torch.cuda.is_available():
        return "cuda", f"üöÄ Usando GPU: {torch.cuda.get_device_name(0)}"
    if torch.backends.mps.is_available():
        return "mps", "üöÄ Usando Apple Silicon (MPS)"
    return "cpu", "‚ö†Ô∏è  Usando CPU (lento)"


_backend, _banner = _select_backend()
device = torch.device(_backend)
# Flag consumed by later cells when building TrainingArguments.
use_mps_device = _backend == "mps"
print(_banner)

# Hyperparameters shared by every training run in this notebook.
MAX_LENGTH, BATCH_SIZE, EPOCHS = 512, 16, 3

# Echo the configuration so it is recorded in the cell output.
print(
    f"\n‚öôÔ∏è  Configuraci√≥n:",
    f"   Max Length: {MAX_LENGTH}",
    f"   Batch Size: {BATCH_SIZE}",
    f"   Epochs: {EPOCHS}",
    sep="\n",
)

## 1) Carga de Datos y Preprocesamiento

In [None]:
# Load the datasets
try:
    # Train: the denoised training split, read from SPLITS_PATH
    df_train = pd.read_csv(SPLITS_PATH / 'train_denoised.csv')

    # Dev: rebuilt from the base frame by selecting the dev row ids
    df_base, _, dev_idx, _ = load_splits(SPLITS_PATH)
    df_dev = df_base.set_index('row_id').loc[dev_idx].reset_index()

    print(f"‚úÖ Train (Denoised): {len(df_train)} casos")
    print(f"‚úÖ Dev (Full): {len(df_dev)} casos")
except FileNotFoundError:
    print("[ERROR] No se encontraron los datasets. Ejecuta 03_rule_based_denoising.ipynb primero.")
    raise

# Label mapping. id2label is derived from label2id so the two cannot drift apart.
label2id = {'depresion': 0, 'ansiedad': 1}
id2label = {idx: name for name, idx in label2id.items()}

# Encode labels and fail fast on anything outside label2id. Series.map() turns
# unknown values into NaN, which silently makes the column float and only
# surfaces much later as an obscure error inside the training loop.
for _split_name, _frame in (("train", df_train), ("dev", df_dev)):
    _encoded = _frame['etiqueta'].map(label2id)
    _unknown = _frame.loc[_encoded.isna(), 'etiqueta']
    if not _unknown.empty:
        raise ValueError(
            f"[ERROR] Etiquetas no reconocidas en {_split_name} "
            f"({len(_unknown)} filas): {sorted({repr(v) for v in _unknown})}. "
            f"Valores esperados: {sorted(label2id)}"
        )
    # Explicit int64 so the class-index dtype does not depend on the platform.
    _frame['label'] = _encoded.astype('int64')

print(f"\nüìä Distribuci√≥n Train: {dict(df_train['etiqueta'].value_counts())}")
print(f"üìä Distribuci√≥n Dev: {dict(df_dev['etiqueta'].value_counts())}")

# Conservative cleanup (Transformers cope well with raw text).
# Matches a run of 3+ identical characters. Digits are excluded so numbers are
# never rewritten (a plain `(.)` would turn "1000" into "100"); "\n" is excluded
# to keep the same line-bound matching as `.`.
RE_MULTI = re.compile(r'([^\d\n])\1{2,}')

def clean_text_trf(s: str) -> str:
    """Lightly normalise a raw text for transformer tokenisation.

    Steps: strip, Unicode NFC normalisation, collapse runs of three or more
    identical non-digit characters down to two, collapse whitespace runs to a
    single space.

    Args:
        s: Raw text. Missing values (None / NaN) are accepted; other non-string
            scalars are converted with str().

    Returns:
        The cleaned text, or "" when the input is missing.
    """
    if pd.isna(s):
        return ""
    s = unicodedata.normalize("NFC", str(s).strip())
    s = RE_MULTI.sub(r'\1\1', s)
    # Collapsing whitespace last also removes the doubled spaces/newlines left over.
    return re.sub(r"\s+", " ", s).strip()

print("\nüßπ Limpiando textos...")
df_train['texto_trf'] = df_train['texto'].map(clean_text_trf)
df_dev['texto_trf'] = df_dev['texto'].map(clean_text_trf)

# Convertir a HuggingFace Datasets
ds_train = Dataset.from_pandas(df_train[['texto_trf', 'label']].rename(columns={'texto_trf': 'texto'}))
ds_dev = Dataset.from_pandas(df_dev[['texto_trf', 'label']].rename(columns={'texto_trf': 'texto'}))

print("‚úÖ Datasets preparados\n")

## 2) Entrenamiento y Evaluación (Loop Modelos)

In [None]:
# Models to fine-tune: short name -> Hugging Face Hub checkpoint id.
# The training loop uses the short name for the checkpoint/log sub-folders and
# for the exported `{model_name}_eval.csv` file.
MODELS = {
    "beto": "dccuchile/bert-base-spanish-wwm-cased",
    "roberta_biomedical": "PlanTL-GOB-ES/roberta-base-biomedical-es",
    "roberta_clinical": "PlanTL-GOB-ES/roberta-base-biomedical-clinical-es"
}

def compute_metrics(eval_pred):
    """Metric callback for Trainer, delegating to utils_shared.calculate_metrics.

    Args:
        eval_pred: Pair (model scores, gold label ids) as supplied by Trainer.

    Returns:
        Flat dict with the keys 'f1', 'precision', 'recall' and 'accuracy'.
    """
    scores, gold_ids = eval_pred
    # Predicted class = index of the highest score in each row.
    pred_ids = np.argmax(scores, axis=1)

    # The shared helper is called with label names, so map ids back to strings.
    gold_names = list(map(id2label.__getitem__, gold_ids))
    pred_names = list(map(id2label.__getitem__, pred_ids))

    shared = calculate_metrics(gold_names, pred_names)

    # Trainer prefixes these keys with "eval_", e.g. 'f1' -> 'eval_f1'.
    key_map = {
        'f1': 'f1_macro',
        'precision': 'precision_macro',
        'recall': 'recall_macro',
        'accuracy': 'accuracy',
    }
    return {out_key: shared[src_key] for out_key, src_key in key_map.items()}

In [None]:
# ===============================================================
# Training loop
# ===============================================================
# Fine-tunes every checkpoint in MODELS on ds_train, evaluates on ds_dev after
# each epoch, reloads the best epoch (by macro F1) and writes one metrics CSV
# per model to DATA_PATH.
SEED = 42  # used for the explicit RNG seeding below and for TrainingArguments

print(f"\n{'='*60}")
print(f"üöÄ INICIANDO ENTRENAMIENTO DE {len(MODELS)} MODELOS")
print(f"{'='*60}\n")

for i, (model_name, model_id) in enumerate(MODELS.items(), 1):
    print(f"\n{'='*60}")
    print(f"üì¶ [{i}/{len(MODELS)}] MODELO: {model_name}")
    print(f"üîó HuggingFace ID: {model_id}")
    print(f"{'='*60}\n")

    # Tokenizer
    print("‚è≥ Cargando tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    def tokenize_function(examples):
        """Tokenize a batch of texts, padded/truncated to MAX_LENGTH."""
        return tokenizer(
            examples["texto"],
            padding="max_length",
            truncation=True,
            max_length=MAX_LENGTH
        )

    print("‚è≥ Tokenizando datasets...")
    tokenized_train = ds_train.map(tokenize_function, batched=True)
    tokenized_dev = ds_dev.map(tokenize_function, batched=True)

    # Model
    # The classification head is newly initialised here, i.e. before Trainer
    # applies `seed`. Seed torch first so the initial head weights (and hence
    # the reported metrics) are reproducible across runs.
    torch.manual_seed(SEED)
    print("‚è≥ Cargando modelo...")
    model = AutoModelForSequenceClassification.from_pretrained(
        model_id,
        num_labels=2,
        id2label=id2label,
        label2id=label2id
    )

    # Move the model to the selected device
    model = model.to(device)

    # Training arguments.
    # - `use_mps_device` is intentionally not passed: it is deprecated and the
    #   Trainer picks MPS automatically when available, as it does for CUDA.
    # - `save_total_limit=1` stops one full checkpoint per epoch per model from
    #   piling up on disk; the best checkpoint is still kept for
    #   `load_best_model_at_end`.
    training_args = TrainingArguments(
        output_dir=str(DATA_PATH / "checkpoints" / model_name),
        learning_rate=2e-5,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        num_train_epochs=EPOCHS,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        logging_dir=str(DATA_PATH / "logs" / model_name),
        logging_steps=10,
        seed=SEED,
        report_to="none"  # disable wandb/tensorboard reporting
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_dev,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    # Train
    print(f"\nüèãÔ∏è  Entrenando {model_name}...")
    print(f"   Epochs: {EPOCHS} | Batch Size: {BATCH_SIZE} | Max Length: {MAX_LENGTH}")
    print(f"   Dispositivo: {device}\n")

    trainer.train()

    # Final evaluation (runs on the best checkpoint reloaded at the end of training)
    print(f"\nüìä Evaluando {model_name} en Dev Set...")
    eval_results = trainer.evaluate()

    print(f"\n{'='*60}")
    print(f"‚úÖ RESULTADOS {model_name.upper()}")
    print(f"{'='*60}")
    print(f"  F1 Macro:     {eval_results['eval_f1']:.4f}")
    print(f"  Precision:    {eval_results['eval_precision']:.4f}")
    print(f"  Recall:       {eval_results['eval_recall']:.4f}")
    print(f"  Accuracy:     {eval_results['eval_accuracy']:.4f}")
    print(f"{'='*60}\n")

    # Export metrics (one single-row CSV per model)
    metrics_df = pd.DataFrame([{
        'modelo': model_name,
        'f1_macro': eval_results['eval_f1'],
        'precision_macro': eval_results['eval_precision'],
        'recall_macro': eval_results['eval_recall'],
        'accuracy': eval_results['eval_accuracy'],
        'n_train': len(ds_train),
        'n_dev': len(ds_dev),
        'epochs': EPOCHS,
        'batch_size': BATCH_SIZE,
        'max_length': MAX_LENGTH,
        'device': str(device)
    }])

    output_path = DATA_PATH / f'{model_name}_eval.csv'
    metrics_df.to_csv(output_path, index=False)
    print(f"üíæ Guardado: {output_path}")

    # Release memory before loading the next model
    print(f"üßπ Liberando memoria...\n")
    del model, trainer, tokenized_train, tokenized_dev

    if device.type == "cuda":
        torch.cuda.empty_cache()
    elif device.type == "mps":
        torch.mps.empty_cache()

print(f"\n{'='*60}")
print(f"üéâ ENTRENAMIENTO COMPLETADO")
print(f"{'='*60}")
print(f"‚úÖ {len(MODELS)} modelos entrenados y evaluados")
print(f"üìÅ Resultados guardados en: {DATA_PATH}")
print(f"\nüí° Pr√≥ximo paso: Ejecuta 05_comparacion_resultados.ipynb")

## 3) Cross-Validation (5-Fold)

**Advertencia:** Esto puede tomar considerablemente más tiempo que el entrenamiento simple.

In [None]:
from utils_shared import get_cv_splitter
from transformers import Trainer, TrainingArguments

# Merge train + dev into one frame for cross-validation
df_full = pd.concat([df_train, df_dev]).reset_index(drop=True)

# Sanity-check the encoded labels before training
print(f"Etiquetas √∫nicas en df_full: {df_full['label'].unique()}")
print(f"Tipos de datos en label: {df_full['label'].dtype}")
if df_full['label'].isnull().any():
    print("‚ö†Ô∏è ADVERTENCIA: Hay valores nulos en la columna label!")
    # reset_index keeps the frame positionally contiguous for the .iloc splits below
    df_full = df_full.dropna(subset=['label']).reset_index(drop=True)
    print("‚úÖ Filas con label nulo eliminadas.")

# Class ids must be integers; explicit int64 avoids a platform-dependent width
df_full['label'] = df_full['label'].astype('int64')

# Grouping key handed to the splitter (taken after any row drop so it stays aligned)
groups_full = df_full['patient_id']

CV_SEED = 42
# Set to True only to debug on CPU. The previous hard-coded CPU override
# (`no_cuda=True` plus `.to("cpu")`) was a leftover debug setting that made every
# fold train on CPU even when a GPU was available.
CV_FORCE_CPU = False

cv = get_cv_splitter(n_splits=5)
cv_results = []

print("Iniciando Cross-Validation (BETO)...")

# Load the BETO tokenizer once; it is shared by all folds
model_id_cv = "dccuchile/bert-base-spanish-wwm-cased"
tokenizer_cv = AutoTokenizer.from_pretrained(model_id_cv)

def tokenize_function_cv(examples):
    """Tokenize a batch of texts with the CV tokenizer, padded/truncated to MAX_LENGTH."""
    return tokenizer_cv(
        examples["texto"],
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH
    )

for fold, (train_idx, val_idx) in enumerate(cv.split(df_full, df_full['etiqueta'], groups_full)):
    print(f"\n--- Fold {fold+1} ---")

    # Split the data (indices from the splitter are positional)
    train_fold = df_full.iloc[train_idx]
    val_fold = df_full.iloc[val_idx]

    # Build HF datasets from the cleaned text ('texto_trf' exposed as 'texto')
    ds_train_fold = Dataset.from_pandas(train_fold[['texto_trf', 'label']].rename(columns={'texto_trf': 'texto'}))
    ds_val_fold = Dataset.from_pandas(val_fold[['texto_trf', 'label']].rename(columns={'texto_trf': 'texto'}))

    # Tokenize the fold-specific datasets
    tokenized_train_fold = ds_train_fold.map(tokenize_function_cv, batched=True)
    tokenized_val_fold = ds_val_fold.map(tokenize_function_cv, batched=True)

    # Fresh BETO model per fold. The classification head is newly initialised
    # here, before Trainer applies `seed`, so seed torch first to make each
    # fold's starting weights reproducible.
    torch.manual_seed(CV_SEED)
    model_cv = AutoModelForSequenceClassification.from_pretrained(
        model_id_cv,
        num_labels=2,
        id2label=id2label,
        label2id=label2id
    )

    # Device placement is left to the Trainer (CUDA/MPS when available) unless
    # CV_FORCE_CPU is set. The deprecated `no_cuda` / `use_mps_device` arguments
    # are not used. `save_total_limit=1` keeps 5 folds x EPOCHS checkpoints from
    # accumulating on disk.
    training_args_cv = TrainingArguments(
        use_cpu=CV_FORCE_CPU,
        output_dir=f"./results_cv/fold_{fold}",
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=1,
        learning_rate=2e-5,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        num_train_epochs=EPOCHS,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        logging_dir=f'./logs_cv/fold_{fold}',
        logging_steps=10,
        seed=CV_SEED,
        report_to="none",  # disable wandb logging
    )

    trainer_cv = Trainer(
        model=model_cv,
        args=training_args_cv,
        train_dataset=tokenized_train_fold,
        eval_dataset=tokenized_val_fold,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer_cv
    )

    trainer_cv.train()

    # Evaluate on the held-out fold
    eval_result = trainer_cv.evaluate()

    # Same metric columns as the per-model export (accuracy included)
    cv_results.append({
        'fold': fold + 1,
        'model': 'BETO',
        'f1_macro': eval_result['eval_f1'],
        'precision_macro': eval_result['eval_precision'],
        'recall_macro': eval_result['eval_recall'],
        'accuracy': eval_result['eval_accuracy']
    })
    print(f"Fold {fold+1}: F1={eval_result['eval_f1']:.4f}")

    # Release memory after each fold
    print(f"üßπ Liberando memoria para Fold {fold+1}...")
    del model_cv, trainer_cv, tokenized_train_fold, tokenized_val_fold
    if device.type == "cuda":
        torch.cuda.empty_cache()
    elif device.type == "mps":
        torch.mps.empty_cache()

df_cv = pd.DataFrame(cv_results)
print("\nPromedio CV:")
print(df_cv.mean(numeric_only=True))

out_path = DATA_PATH / 'beto_cv_results.csv'
df_cv.to_csv(out_path, index=False)
print(f"\u2713 Exportado: {out_path}")

In [None]:
# Persist results to Drive, only when the setup cell detected Colab.
# The `'IN_COLAB' in locals()` guard avoids a NameError if the setup cell was not run.
# NOTE(review): results are written to DATA_PATH elsewhere in this notebook, while
# this cell copies from the relative `data/` folder — confirm both are the same directory.
if 'IN_COLAB' in locals() and IN_COLAB:
    print("‚è≥ Guardando resultados en Drive...")
    # Make sure the destination folder exists
    !mkdir -p "$DRIVE_BASE/data"
    
    # Copy the CSV files found directly under data/ to Drive
    !cp data/*.csv "$DRIVE_BASE/data/"
    
    print("‚úÖ Resultados guardados en: ", DRIVE_BASE + "/data/")