# 02_baseline_transformer_beto — Binario A/D

**Objetivo:** baseline con **transformer en español** (*BETO*, `dccuchile/bert-base-spanish-wwm-cased`; *roberta-bne* sería una alternativa equivalente) para contrastar con TF‑IDF y reglas.  
**Justificación:** los modelos preentrenados capturan semántica y contexto; con un preprocesamiento conservador suelen superar a los métodos clásicos cuando hay suficiente señal.


In [1]:
# === Paths / Globals (auto-detect) ===
from pathlib import Path
import pandas as pd
import re, unicodedata, os

# Paths and environment.
# When the kernel is started from the "notebooks" folder, the project root is one level up.
BASE_PATH = Path.cwd()
if BASE_PATH.name == "notebooks":
    BASE_PATH = BASE_PATH.parent

# These are always derived from BASE_PATH; the former `'DATA_PATH' in globals()` guards
# were evaluated right after the assignment, so they were always true and never
# selected the fallback branch.
DATA_PATH = BASE_PATH / "data"
FORK_PATH = BASE_PATH / "Spanish_Psych_Phenotyping_PY"
DATA_PATH.mkdir(exist_ok=True)

# Only ips_raw.csv is looked up, so the error message names only that file.
INPUT_FILE = DATA_PATH / 'ips_raw.csv'
if not INPUT_FILE.exists():
    raise FileNotFoundError("No se encontró ips_raw.csv en " + str(DATA_PATH))

print("📥 INPUT_FILE:", INPUT_FILE)

# Actual column names of the dataset; the _guess_* helpers try these first.
TEXT_COL = "texto"
LABEL_COL = "etiqueta"

def _guess_text_col(df):
    if TEXT_COL and TEXT_COL in df.columns: 
        return TEXT_COL
    for c in ['texto','Motivo Consulta','original_motivo_consulta','text']:
        if c in df.columns: return c
    for c in df.columns:
        if df[c].dtype == 'O': return c
    raise ValueError("No se encontrÃ³ columna de texto.")

def _guess_label_col(df):
    """Return the name of the label column of ``df``, or ``None`` if none is found.

    The module-level ``LABEL_COL`` wins when it is truthy and present in ``df``;
    otherwise the first matching name from a fixed candidate list is returned.
    """
    columns = df.columns
    if LABEL_COL and LABEL_COL in columns:
        return LABEL_COL
    candidates = ('etiqueta', 'Tipo', 'label', 'target', 'y', 'clase')
    return next((name for name in candidates if name in columns), None)

def _norm_label_bin(s):
    if pd.isna(s): return ""
    s = str(s).strip().lower()
    s = unicodedata.normalize("NFKD", s).encode("ascii","ignore").decode("ascii")
    return {'depresivo':'depresion'}.get(s, s)


ðŸ“¥ INPUT_FILE: /Users/manuelnunez/Projects/psych-phenotyping-paraguay/data/ips_raw.csv


## 1) Carga y preprocesamiento **conservador** (preserva tildes/casing)

In [2]:
import pandas as pd, re, unicodedata

# === LOAD THE SHARED SPLITS ===
# The base table and the row_id lists for train/val are read from DATA_PATH/splits.
SPLITS_PATH = DATA_PATH / "splits"
dataset_base = pd.read_csv(SPLITS_PATH / 'dataset_base.csv')
train_indices = pd.read_csv(SPLITS_PATH / 'train_indices.csv')['row_id'].values
val_indices = pd.read_csv(SPLITS_PATH / 'val_indices.csv')['row_id'].values

# Sanity checks: a row_id shared by both splits would leak validation data into
# training, and a row_id absent from the base table would be dropped silently by
# the later `isin` filter.
_train_ids, _val_ids = set(train_indices), set(val_indices)
_overlap = _train_ids & _val_ids
assert not _overlap, f"Fuga de datos: {len(_overlap)} row_id aparecen en train y en val"
_missing = (_train_ids | _val_ids) - set(dataset_base['row_id'])
assert not _missing, f"{len(_missing)} row_id de los splits no existen en dataset_base"

print(f"✅ Splits cargados: Train={len(train_indices)} | Val={len(val_indices)}")

text_col  = _guess_text_col(dataset_base)
label_col = _guess_label_col(dataset_base)

# Transformer preprocessing is conservative: accents and casing are kept; only
# character elongations and whitespace runs are normalized.
# The repeated character is matched with \D rather than "." so digit runs are
# left intact ("1000" must not be rewritten as "100").
RE_MULTI = re.compile(r'(\D)\1{2,}')
def clean_text_trf(s: str) -> str:
    """Lightly clean one text for the transformer tokenizer.

    Steps, in order: missing values become ``""``; the value is converted to
    ``str`` and trimmed; Unicode is normalized to NFC; any non-digit character
    repeated three or more times is reduced to two occurrences; every run of
    whitespace becomes a single space and the result is trimmed again.

    Args:
        s: raw text (any value accepted by ``str``), or a missing value.

    Returns:
        The cleaned text, with original casing and accents preserved.
    """
    if pd.isna(s):
        return ""
    s = unicodedata.normalize("NFC", str(s).strip())
    s = RE_MULTI.sub(r'\1\1', s)
    return re.sub(r"\s+", " ", s).strip()

dataset_base['texto_trf'] = dataset_base[text_col].map(clean_text_trf)
label2id = {'depresion':0, 'ansiedad':1}

# _guess_label_col returns None when nothing matches; fail with a clear message
# instead of the KeyError that indexing with None would produce.
if label_col is None:
    raise ValueError("No se encontró columna de etiqueta en dataset_base.")

# Labels are normalized (trim, lower-case, accents removed, 'depresivo' -> 'depresion')
# before the lookup so spelling variants are not turned into NaN by the mapping.
dataset_base['label'] = dataset_base[label_col].map(_norm_label_bin).map(label2id)

# Split train and val using the saved row_id lists.
in_train = dataset_base['row_id'].isin(train_indices)
in_val = dataset_base['row_id'].isin(val_indices)
train_df = dataset_base.loc[in_train, ['texto_trf','label']].copy()
val_df = dataset_base.loc[in_val, ['texto_trf','label']].copy()

# Only rows that are actually used must carry a known label. A single NaN anywhere
# in the column makes it float, so the labels are cast back to integers here.
for _split_name, _split_df in (("train", train_df), ("val", val_df)):
    _n_unmapped = int(_split_df['label'].isna().sum())
    if _n_unmapped:
        raise ValueError(
            f"{_n_unmapped} filas de {_split_name} tienen una etiqueta fuera de {sorted(label2id)}"
        )
train_df['label'] = train_df['label'].astype('int64')
val_df['label'] = val_df['label'].astype('int64')

print(f"Train: {len(train_df)} | Val: {len(val_df)}")
print(f"Train distribución:\n{train_df['label'].value_counts()}")
print(f"Val distribución:\n{val_df['label'].value_counts()}")

âœ… Splits cargados: Train=2518 | Val=630
Train: 2518 | Val: 630
Train distribuciÃ³n:
label
0    1778
1     740
Name: count, dtype: int64
Val distribuciÃ³n:
label
0    445
1    185
Name: count, dtype: int64
Train: 2518 | Val: 630
Train distribuciÃ³n:
label
0    1778
1     740
Name: count, dtype: int64
Val distribuciÃ³n:
label
0    445
1    185
Name: count, dtype: int64


## 2) Tokenización y datasets

In [3]:
from datasets import Dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

MODEL = "dccuchile/bert-base-spanish-wwm-cased"  # Spanish BERT checkpoint, cased
tok = AutoTokenizer.from_pretrained(MODEL)

def preprocess(batch):
    """Tokenize the ``texto_trf`` field of a batch, truncating to 256 tokens.

    No padding is applied here; padding is left to the data collator.
    """
    texts = batch["texto_trf"]
    return tok(texts, truncation=True, padding=False, max_length=256)

def _tokenized(frame):
    """Turn a pandas frame into a tokenized ``Dataset`` without the raw text column."""
    hf_dataset = Dataset.from_pandas(frame.reset_index(drop=True))
    return hf_dataset.map(preprocess, batched=True, remove_columns=["texto_trf"])

train_ds = _tokenized(train_df)
val_ds   = _tokenized(val_df)

collator = DataCollatorWithPadding(tokenizer=tok)

Map:   0%|          | 0/2518 [00:00<?, ? examples/s]

Map:   0%|          | 0/630 [00:00<?, ? examples/s]

## 3) Entrenamiento y evaluación

In [4]:
import evaluate, numpy as np
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Class id <-> class name maps passed to the model config.
id2label = {0:'depresion', 1:'ansiedad'}
label2id = {name: idx for idx, name in id2label.items()}

# Pretrained encoder with a two-class classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

# Metric objects used by compute_metrics.
metric_f1, metric_prec, metric_rec = (
    evaluate.load(metric_name) for metric_name in ("f1", "precision", "recall")
)

def compute_metrics(eval_pred):
    """Compute macro-averaged F1, precision and recall for an evaluation pass.

    Args:
        eval_pred: a ``(logits, labels)`` pair; predictions are the argmax of the
            logits over the last axis.

    Returns:
        dict with the keys ``macro_f1``, ``macro_precision`` and ``macro_recall``.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    # (output key, metric object, key inside the metric's result dict)
    scorers = (
        ("macro_f1", metric_f1, "f1"),
        ("macro_precision", metric_prec, "precision"),
        ("macro_recall", metric_rec, "recall"),
    )
    return {
        out_key: scorer.compute(predictions=preds, references=labels, average="macro")[result_key]
        for out_key, scorer, result_key in scorers
    }

# Training configuration: evaluate and checkpoint once per epoch, and reload the
# checkpoint with the highest validation macro_f1 when training ends.
args = TrainingArguments(
    output_dir="runs/beto_ad",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",  # key returned by compute_metrics
    greater_is_better=True,
    seed=42,
    logging_steps=50
)

# `processing_class` replaces the deprecated `tokenizer` argument of Trainer
# (the deprecation is what triggered the warning pointing at this call).
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    processing_class=tok,
    data_collator=collator,
    compute_metrics=compute_metrics
)

# Fine-tune, then evaluate on the validation set.
trainer.train()
eval_res = trainer.evaluate()

import pandas as pd
# Persist the evaluation metrics as a one-row CSV.
pd.DataFrame([eval_res]).to_csv(DATA_PATH/'beto_eval.csv', index=False, encoding='utf-8')
print("✅ Eval guardada:", DATA_PATH/'beto_eval.csv')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Macro F1,Macro Precision,Macro Recall
1,0.3678,0.311753,0.80747,0.865743,0.780443
2,0.2647,0.292184,0.852454,0.853017,0.851898
3,0.1366,0.315057,0.855857,0.869106,0.845339


âœ… Eval guardada: /Users/manuelnunez/Projects/psych-phenotyping-paraguay/data/beto_eval.csv


## 4) Reporte detallado y predicciones

In [5]:
import numpy as np, pandas as pd
from sklearn.metrics import classification_report, confusion_matrix

# Predicted class per validation example (argmax over the logits).
pred_logits = trainer.predict(val_ds).predictions
pred_ids = pred_logits.argmax(axis=-1)
y_true = val_df["label"].to_numpy()
assert len(pred_ids) == len(y_true), "predictions and labels differ in length"

# Output files
beto_pred_csv   = DATA_PATH/'beto_predictions.csv'
beto_report_csv = DATA_PATH/'beto_classification_report.csv'
beto_eval_csv   = DATA_PATH/'beto_eval.csv'  # already written by the training cell
beto_cm_csv     = DATA_PATH/'beto_confusion_matrix.csv'

# Class ids and names come from id2label so every export uses the same labels.
class_ids = [0, 1]
class_names = [id2label[i] for i in class_ids]

# `labels` is passed explicitly: with `target_names` alone, classification_report
# fails when a class is missing from both y_true and the predictions.
report = classification_report(
    y_true,
    pred_ids,
    labels=class_ids,
    target_names=class_names,
    output_dict=True,
    zero_division=0,
)
pd.DataFrame(report).transpose().to_csv(beto_report_csv, index=True, encoding='utf-8')

cm = confusion_matrix(y_true, pred_ids, labels=class_ids)
pd.DataFrame(
    cm,
    index=[f'true_{name}' for name in class_names],
    columns=[f'pred_{name}' for name in class_names],
).to_csv(beto_cm_csv)

# Predictions alongside the texts (useful for error analysis).
# Names are taken from id2label instead of repeating the id -> name dict inline.
val_out = val_df.copy()
val_out["y_true"] = val_out["label"].map(id2label)
val_out["y_pred"] = [id2label[int(i)] for i in pred_ids]
val_out.drop(columns=["label"]).to_csv(beto_pred_csv, index=False, encoding="utf-8")

print("✅ Exportados:")
print(" - Predicciones:", beto_pred_csv)
print(" - Reporte:", beto_report_csv)
print(" - Eval:", beto_eval_csv)
print(" - Matriz:", beto_cm_csv)

âœ… Exportados:
 - Predicciones: /Users/manuelnunez/Projects/psych-phenotyping-paraguay/data/beto_predictions.csv
 - Reporte: /Users/manuelnunez/Projects/psych-phenotyping-paraguay/data/beto_classification_report.csv
 - Eval: /Users/manuelnunez/Projects/psych-phenotyping-paraguay/data/beto_eval.csv
 - Matriz: /Users/manuelnunez/Projects/psych-phenotyping-paraguay/data/beto_confusion_matrix.csv
