# 02_baseline_tfidf — Binario A/D

**Objetivo:** baseline clásico **robusto a ruido** (typos/transcripción) con **char TF-IDF (3–5)** + **SVM (LinearSVC)**.  
**Justificación:** los n-gramas de caracteres capturan patrones ortográficos aun con errores; es un buen contrapunto al enfoque rule-based.


In [28]:
# === Paths / Globals (auto-detect) ===
from pathlib import Path
import pandas as pd
import re, unicodedata, os

# Paths and environment.
# When the notebook is launched from inside notebooks/, the project root is
# the parent directory; otherwise the current working directory is the root.
BASE_PATH = Path.cwd()
if BASE_PATH.name == "notebooks":
    BASE_PATH = BASE_PATH.parent

DATA_PATH = BASE_PATH / "data"
FORK_PATH = BASE_PATH / "Spanish_Psych_Phenotyping_PY"
# The former "reuse existing globals" conditionals were removed: the names are
# assigned unconditionally just above, so the `in globals()` test was always
# true and the fallback branch could never run.
DATA_PATH.mkdir(exist_ok=True)

# Only ips_raw.csv is looked up, so the error message names exactly that file.
INPUT_FILE = DATA_PATH / "ips_raw.csv"
if not INPUT_FILE.exists():
    raise FileNotFoundError(f"No se encontró ips_raw.csv en {DATA_PATH}")

print("📥 INPUT_FILE:", INPUT_FILE)

# Preferred column names; the _guess_* helpers fall back to known aliases
# when these are not present in the loaded frame.
TEXT_COL = "texto"
LABEL_COL = "etiqueta"

def _guess_text_col(df):
    """Return the name of the column that holds the free-text field.

    Resolution order:
      1. ``TEXT_COL`` when it is set and present in ``df``.
      2. The first known alias present in ``df``.
      3. The first column whose dtype is object or a string dtype.

    Args:
        df: DataFrame to inspect.

    Returns:
        The selected column name.

    Raises:
        ValueError: if no candidate column is found.
    """
    if TEXT_COL and TEXT_COL in df.columns:
        return TEXT_COL
    for c in ('texto', 'Motivo Consulta', 'original_motivo_consulta', 'text'):
        if c in df.columns:
            return c
    for c in df.columns:
        col = df[c]
        # Object dtype keeps the original behaviour; string dtypes are also
        # accepted so text columns stored with pandas' dedicated string dtype
        # are not skipped.
        if pd.api.types.is_object_dtype(col) or pd.api.types.is_string_dtype(col):
            return c
    raise ValueError("No se encontró columna de texto.")

def _guess_label_col(df):
    """Return the name of the label column, or ``None`` when none is found.

    ``LABEL_COL`` wins when it is set and present in ``df``; otherwise the
    first known alias present in ``df`` is returned.
    """
    available = df.columns
    if LABEL_COL and LABEL_COL in available:
        return LABEL_COL
    aliases = ['etiqueta', 'Tipo', 'label', 'target', 'y', 'clase']
    return next((alias for alias in aliases if alias in available), None)

def _norm_label_bin(s):
    """Normalise a raw label to a lowercase, accent-free ASCII string.

    Missing values become ``""``. The value is stripped, lowercased,
    decomposed with NFKD and reduced to ASCII (non-ASCII code points are
    dropped). The variant ``'depresivo'`` is mapped to ``'depresion'``; any
    other value is returned as normalised.
    """
    if pd.isna(s):
        return ""
    decomposed = unicodedata.normalize("NFKD", str(s).strip().lower())
    ascii_label = decomposed.encode("ascii", "ignore").decode("ascii")
    if ascii_label == 'depresivo':
        return 'depresion'
    return ascii_label


📥 INPUT_FILE: /Users/manuelnunez/Projects/psych-phenotyping-paraguay/data/ips_raw.csv


## 1) Carga y preprocesamiento **agresivo** (pensado para ML clásico)

In [None]:
# Load the raw CSV and resolve which columns hold the text and the labels.
df_base = pd.read_csv(INPUT_FILE)
text_col  = _guess_text_col(df_base)
label_col = _guess_label_col(df_base)
if label_col is None:
    raise ValueError("Se requiere columna de etiquetas para entrenar TF-IDF baseline.")

# Drop rows missing text or label, normalise the labels, and keep only the
# two target classes of the binary task.
df = df_base.dropna(subset=[text_col, label_col]).copy()
df[label_col] = df[label_col].map(_norm_label_bin)
df = df[df[label_col].isin(['ansiedad','depresion'])].copy()

# Fail here with a clear message rather than later inside the train/validation
# split, where an empty frame produces an error unrelated to the real cause.
if df.empty:
    raise ValueError(
        f"No quedan filas con etiqueta 'ansiedad' o 'depresion' en la columna '{label_col}'."
    )

# TF-IDF preprocessing: aggressive (lowercase, symbol stripping, elongation
# collapsing, and a "no_X" negation proxy).
RE_MULTI = re.compile(r'(.)\1{2,}')

# Accented Spanish letters are spelled as \u escapes (a-acute, e-acute,
# i-acute, o-acute, u-acute, u-diaeresis, n-tilde) so the patterns stay correct
# even if the source file is re-encoded or mis-decoded.
_ES_LETTERS = r'a-z\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1'
_RE_DISALLOWED = re.compile(r"[^" + _ES_LETTERS + r"0-9\s.,!?:/\-]")
_RE_SPACES = re.compile(r"\s+")
_RE_NEGATION = re.compile(r"\bno\s+([" + _ES_LETTERS + r"]{2,})")


def clean_text_ml(s: str) -> str:
    """Normalise free text for the character TF-IDF model.

    Steps, in order: lowercase and strip; NFC-normalise so decomposed accents
    become single code points; collapse any character repeated three or more
    times down to two; replace every character outside letters (including
    accented Spanish letters), digits, whitespace and ``. , ! ? : / -`` with a
    space; collapse whitespace; join ``no <word>`` into ``no_<word>`` as a
    simple negation marker.

    Args:
        s: Raw text; missing values (``None``/NaN) are accepted.

    Returns:
        The cleaned string, or ``""`` for missing input.
    """
    if pd.isna(s):
        return ""
    s = str(s).lower().strip()
    # Compose accents before filtering so "o" + combining acute is kept as one
    # allowed letter instead of losing the combining mark.
    s = unicodedata.normalize("NFC", s)
    s = RE_MULTI.sub(r'\1\1', s)
    s = _RE_DISALLOWED.sub(" ", s)
    s = _RE_SPACES.sub(" ", s).strip()
    # Runs after whitespace collapsing, so "no" and the next word are
    # separated by exactly one space at this point.
    s = _RE_NEGATION.sub(r"no_\1", s)
    return s

# Derive the model-ready text column and report the class balance.
df['texto_ml'] = df[text_col].apply(clean_text_ml)
label_counts = df[label_col].value_counts()
print("Etiquetas:", label_counts)

Etiquetas: Tipo
depresion    2223
ansiedad      925
Name: count, dtype: int64


## 2) Split estratificado y entrenamiento (char TF-IDF + LinearSVC)

In [30]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# Features are the cleaned text; targets are the normalised labels.
X = df['texto_ml']
y = df[label_col]

# Stratified 80/20 split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Character n-grams of length 3 to 5 using the 'char_wb' analyzer; min_df and
# max_df bound the document frequency of the n-grams that are kept.
tfidf_char = TfidfVectorizer(analyzer='char_wb', ngram_range=(3,5), min_df=2, max_df=0.95)

# Linear SVM with balanced class weights and a fixed seed.
svm = LinearSVC(class_weight='balanced', random_state=42)
clf = Pipeline(steps=[('tfidf', tfidf_char), ('svm', svm)])

# Fit on the training split only, then predict the held-out validation split.
pred = clf.fit(X_train, y_train).predict(X_val)
print("Entrenamiento OK.")

Entrenamiento OK.


## 3) Métricas y exportables

In [31]:
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, confusion_matrix

# Output locations for the evaluation artefacts, all under DATA_PATH.
tfidf_pred_csv   = DATA_PATH/'tfidf_predictions.csv'
tfidf_report_csv = DATA_PATH/'tfidf_classification_report.csv'
tfidf_eval_csv   = DATA_PATH/'tfidf_eval.csv'
tfidf_cm_csv     = DATA_PATH/'tfidf_confusion_matrix.csv'

# Fixed label order shared by the report, the macro metrics and the matrix.
classes = ['depresion','ansiedad']

# Per-class report: one row per report entry after transposing.
report = classification_report(y_val, pred, labels=classes, output_dict=True, zero_division=0)
pd.DataFrame(report).transpose().to_csv(tfidf_report_csv, index=True, encoding='utf-8')

# Macro-averaged summary, pinned to the same label set as the report and the
# confusion matrix so all three artefacts describe the same classes.
macro_kwargs = {'labels': classes, 'average': 'macro', 'zero_division': 0}
pd.DataFrame([{
    'macro_f1': f1_score(y_val, pred, **macro_kwargs),
    'macro_precision': precision_score(y_val, pred, **macro_kwargs),
    'macro_recall': recall_score(y_val, pred, **macro_kwargs),
    'n': int(len(y_val))
}]).to_csv(tfidf_eval_csv, index=False, encoding='utf-8')

# Confusion matrix with rows = true class and columns = predicted class.
cm = confusion_matrix(y_val, pred, labels=classes)
pd.DataFrame(
    cm,
    index=[f'true_{c}' for c in classes],
    columns=[f'pred_{c}' for c in classes],
).to_csv(tfidf_cm_csv, encoding='utf-8')

# Row-level predictions; 'texto' holds the cleaned validation text (X_val).
pd.DataFrame({'texto': X_val, 'y_true': y_val, 'y_pred': pred}).to_csv(tfidf_pred_csv, index=False, encoding='utf-8')

print("✅ Exportados:")
print(" - Predicciones:", tfidf_pred_csv)
print(" - Reporte:", tfidf_report_csv)
print(" - Métricas:", tfidf_eval_csv)
print(" - Matriz:", tfidf_cm_csv)

✅ Exportados:
 - Predicciones: /Users/manuelnunez/Projects/psych-phenotyping-paraguay/data/tfidf_predictions.csv
 - Reporte: /Users/manuelnunez/Projects/psych-phenotyping-paraguay/data/tfidf_classification_report.csv
 - Métricas: /Users/manuelnunez/Projects/psych-phenotyping-paraguay/data/tfidf_eval.csv
 - Matriz: /Users/manuelnunez/Projects/psych-phenotyping-paraguay/data/tfidf_confusion_matrix.csv
