# 02_baseline_rule_based — Binario A/D

**Objetivo:** baseline **rule-based** usando el fork del proyecto colombiano (solo **Ansiedad/Depresión**) para obtener una primera línea de referencia.  
**Justificaci√≥n:** las reglas permiten:
- establecer un punto de partida interpretable (trazabilidad por JSON/patrones),
- detectar fallos sistemáticos del dataset (typos, negación, expresiones locales),
- guiar el diseño del *cleaning* y la selección de fenotipos relevantes para A/D.

> Nota: mantenemos **preprocesamiento ligero** para no romper *ConText* ni *TargetMatcher*.


In [9]:
# === Paths / Globals (auto-detect) ===
from pathlib import Path
import pandas as pd
import re, unicodedata, os

# Paths and environment: the repository root is the working directory, or its
# parent when the notebook is launched from inside notebooks/.
BASE_PATH = Path.cwd()
if BASE_PATH.name == "notebooks":
    BASE_PATH = BASE_PATH.parent

# Data folder and checkout of the rule-based fork, both relative to the root.
# These are assigned unconditionally: a check such as
# `'DATA_PATH' in globals()` placed after the assignment is always true, so
# it cannot be used to "reuse" a value defined by another notebook.
DATA_PATH = BASE_PATH / "data"
FORK_PATH = BASE_PATH / "Spanish_Psych_Phenotyping_PY"
DATA_PATH.mkdir(exist_ok=True)

print("📁 DATA_PATH:", DATA_PATH)
print("📁 FORK_PATH:", FORK_PATH)
print("ℹ️  Este baseline carga datos desde data/splits/ (generados por 02_create_splits.ipynb)")

# Preferred text/label column names for dataset_base.csv. The _guess_*_col
# helpers try these first and fall back to other known names when absent.
TEXT_COL = "texto"
LABEL_COL = "etiqueta"

def _guess_text_col(df):
    """Return the name of the column of *df* that holds the free text.

    Resolution order:
      1. the module-level ``TEXT_COL`` preference, when set and present;
      2. the first match in a fixed list of known text column names;
      3. the first column whose dtype is text-like.

    Raises:
        ValueError: if no column qualifies.
    """
    if TEXT_COL and TEXT_COL in df.columns:
        return TEXT_COL
    for candidate in ('texto', 'Motivo Consulta', 'original_motivo_consulta', 'text'):
        if candidate in df.columns:
            return candidate
    # Last resort: first text-like column. Object columns are accepted as
    # before; columns using pandas' dedicated string dtype are accepted too,
    # since a bare `dtype == 'O'` comparison skips them.
    for name in df.columns:
        column = df[name]
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            return name
    raise ValueError("No se encontró columna de texto.")

def _guess_label_col(df):
    """Return the label column name of *df*, or ``None`` when none matches.

    The module-level ``LABEL_COL`` preference wins when it is set and present
    in the frame; otherwise the first hit in a fixed list of known label
    column names is returned.
    """
    preferred = LABEL_COL
    if preferred and preferred in df.columns:
        return preferred
    known_names = ('etiqueta', 'Tipo', 'label', 'target', 'y', 'clase')
    return next((name for name in known_names if name in df.columns), None)

def _norm_label_bin(s):
    """Normalise a raw label to a lowercase, accent-free ASCII string.

    Missing values become the empty string. The value is stringified,
    stripped, lowercased, NFKD-decomposed and reduced to ASCII; the alias
    'depresivo' is then folded into 'depresion'.
    """
    if pd.isna(s):
        return ""
    decomposed = unicodedata.normalize("NFKD", str(s).strip().lower())
    ascii_label = decomposed.encode("ascii", "ignore").decode("ascii")
    if ascii_label == 'depresivo':
        return 'depresion'
    return ascii_label


📁 DATA_PATH: /Users/manuelnunez/Projects/psych-phenotyping-paraguay/data
📁 FORK_PATH: /Users/manuelnunez/Projects/psych-phenotyping-paraguay/Spanish_Psych_Phenotyping_PY
ℹ️  Este baseline carga datos desde data/splits/ (generados por 02_create_splits.ipynb)


## 1) Carga y preprocesamiento **ligero** (conserva tildes y casing, colapsa alargamientos)

In [10]:
import pandas as pd, re, unicodedata

# === Load the unified splits ===
# dataset_base.csv is read in full; each *_indices.csv contributes the
# 'row_id' values that belong to its partition.
SPLITS_PATH = DATA_PATH / "splits"
dataset_base = pd.read_csv(SPLITS_PATH / 'dataset_base.csv')
train_indices = pd.read_csv(SPLITS_PATH / 'train_indices.csv')['row_id'].values
val_indices = pd.read_csv(SPLITS_PATH / 'val_indices.csv')['row_id'].values

print(f"✅ Splits cargados: Train={len(train_indices)} | Val={len(val_indices)}")

text_col  = _guess_text_col(dataset_base)
label_col = _guess_label_col(dataset_base)
# _guess_label_col returns None when no known label column exists. Fail here
# with a clear message rather than with a KeyError on `df[None]` later on.
if label_col is None:
    raise ValueError("No se encontró columna de etiqueta en dataset_base.csv.")

# Lightweight preprocessing for the rule-based pipeline: accents and casing
# are preserved; only character elongations and whitespace are collapsed.
# Digits are excluded from the elongation rule so that numbers keep their
# value ("1000" must not turn into "100").
_RE_MULTI = re.compile(r'(\D)\1{2,}')


def clean_text_rb(s: str) -> str:
    """Return *s* lightly normalised for rule-based matching.

    Steps, in order: strip, NFC-compose, shorten any run of three or more
    identical non-digit characters to two, collapse whitespace runs to a
    single space. Missing values (anything ``pd.isna`` accepts) become "".
    """
    if pd.isna(s):
        return ""
    s = unicodedata.normalize("NFC", str(s).strip())
    s = _RE_MULTI.sub(r'\1\1', s)
    # Whitespace is collapsed last, so runs shortened above end up as one space.
    return re.sub(r"\s+", " ", s).strip()

# Add the lightly cleaned text column that is later exported for the fork.
dataset_base['texto_rb'] = dataset_base[text_col].map(clean_text_rb)

# Partition by the saved row_id lists. Rows keep dataset_base order, and
# .copy() makes each partition independent of dataset_base.
df_train = dataset_base[dataset_base['row_id'].isin(train_indices)].copy()
df_val = dataset_base[dataset_base['row_id'].isin(val_indices)].copy()

# Class balance of each partition.
print(f"Train distribución:\n{df_train[label_col].value_counts()}")
print(f"Val distribución:\n{df_val[label_col].value_counts()}")

✅ Splits cargados: Train=2500 | Val=625
Train distribución:
etiqueta
depresion    1760
ansiedad      740
Name: count, dtype: int64
Val distribución:
etiqueta
depresion    440
ansiedad     185
Name: count, dtype: int64
Train distribución:
etiqueta
depresion    1760
ansiedad      740
Name: count, dtype: int64
Val distribución:
etiqueta
depresion    440
ansiedad     185
Name: count, dtype: int64


## 2) Ejecutar fork (perfil `col`) con solo Ansiedad/Depresión

In [11]:
import sys, subprocess, yaml
from pathlib import Path

# Locate the fork's entry point before touching anything on disk, so a
# missing checkout does not leave a freshly created configs/ tree behind.
# An explicit raise is used because `assert` is stripped under `python -O`.
cli_py = FORK_PATH/'cli.py'
main_py = FORK_PATH/'main.py'
runner = cli_py if cli_py.exists() else main_py
if not runner.exists():
    raise FileNotFoundError("No se encontró cli.py ni main.py en el fork.")

cfg_dir = FORK_PATH/'configs'
cfg_dir.mkdir(parents=True, exist_ok=True)
col_cfg = cfg_dir/'col_config.yml'
fenos_yml = cfg_dir/'fenotipos.yml'

# Restrict the fork to Anxiety/Depression. Existing YAML content is kept and
# only the overridden keys change; both files are rewritten in place.
# NOTE(review): presumably the fork reads 'text_column' and 'active_concepts'
# from these files; verify against the fork's config loader.
cfg = {}
if col_cfg.exists():
    cfg = yaml.safe_load(col_cfg.read_text(encoding='utf-8')) or {}
cfg['text_column'] = 'texto_rb'
col_cfg.write_text(yaml.safe_dump(cfg, allow_unicode=True), encoding='utf-8')

fen = {}
if fenos_yml.exists():
    fen = yaml.safe_load(fenos_yml.read_text(encoding='utf-8')) or {}
fen['active_concepts'] = ['Ansiedad','Depresion']
fenos_yml.write_text(yaml.safe_dump(fen, allow_unicode=True), encoding='utf-8')

# Temporary input holding only the validation set (the split being evaluated).
tmp_in = DATA_PATH/'ips_clean_tmp.csv'
df_val[['texto_rb', label_col]].to_csv(tmp_in, index=False, encoding='utf-8')

# Standardised output locations, so results are comparable across baselines.
rule_pred_csv   = DATA_PATH/'rule_based_predictions.csv'
rule_report_csv = DATA_PATH/'rule_based_classification_report.csv'
rule_eval_csv   = DATA_PATH/'rule_based_eval.csv'
rule_cm_csv     = DATA_PATH/'rule_based_confusion_matrix.csv'

# Run the fork with the current interpreter. The command is an argument
# list (no shell), and output is captured so it can be shown in the notebook.
cmd = [sys.executable, str(runner), '--profile','col', '--config', str(col_cfg),
       '--input', str(tmp_in), '--output', str(rule_pred_csv)]
print("CMD:", " ".join(map(str,cmd)))
ret = subprocess.run(cmd, check=False, capture_output=True, text=True)
print(ret.stdout)
if ret.returncode != 0:
    # Show stderr only on failure, then stop the notebook.
    print(ret.stderr)
    raise RuntimeError(f"CLI terminó con código {ret.returncode}")

CMD: /opt/anaconda3/bin/python /Users/manuelnunez/Projects/psych-phenotyping-paraguay/Spanish_Psych_Phenotyping_PY/cli.py --profile col --config /Users/manuelnunez/Projects/psych-phenotyping-paraguay/Spanish_Psych_Phenotyping_PY/configs/col_config.yml --input /Users/manuelnunez/Projects/psych-phenotyping-paraguay/data/ips_clean_tmp.csv --output /Users/manuelnunez/Projects/psych-phenotyping-paraguay/data/rule_based_predictions.csv
Components in NLP pipeline:
	- tok2vec
	- morphologizer
	- attribute_ruler
	- lemmatizer
	- medspacy_pyrush
	- medspacy_target_matcher
	- medspacy_context
Concepts included (by folder): Ansiedad, Depresion
Rule categories loaded: Abulia, Abusodesustancias, Agitacinpsicomotora, Alteracindelapercepcindepesoofiguracorporal, AngustiaMiedoTemor, Anhedonia, Animodeprimido, Animoexpansivo, Ansiedad, Apata, Apetitoaumentode, Apetitodisminucinde, Autolesin, Bajaconcentracin, Bajaenerga, Culpa, Desesperanza, Efectosadversos, Fatiga, Ideacinsuicida, Ideasdemuerte, Intent

## 3) Evaluación **binaria** y exportables

In [12]:
import pandas as pd, unicodedata as _ud
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, confusion_matrix

# Load the fork's predictions and validate their shape up front.
preds = pd.read_csv(rule_pred_csv)
if 'pred_label' not in preds.columns:
    raise ValueError("El output del fork no contiene 'pred_label'.")
# Predictions and gold labels are paired by position below, so a row-count
# mismatch (e.g. rows dropped by the fork) would make every metric meaningless.
if len(preds) != len(df_val):
    raise ValueError(
        f"El output del fork tiene {len(preds)} filas pero el set de validación tiene {len(df_val)}."
    )

def _norm_txt(s):
    """Return *s* stripped, lowercased and reduced to accent-free ASCII ("" if missing)."""
    if pd.isna(s): return ""
    s = str(s).strip().lower()
    s = _ud.normalize("NFKD", s).encode("ascii","ignore").decode("ascii")
    return s

y_pred = preds['pred_label'].map(_norm_txt)
y_true = df_val[label_col].map(_norm_txt)

# Anything the fork returns outside A/D is mapped to the majority class so the
# evaluation stays binary.
# NOTE(review): the majority class is computed from the gold validation labels,
# which favours the baseline; the number of remapped rows is printed so the
# effect is visible.
allowed = {'ansiedad','depresion'}
majority = y_true.value_counts().idxmax()
n_remapped = int((~y_pred.isin(allowed)).sum())
y_pred = y_pred.where(y_pred.isin(allowed), majority)
print(f"Predicciones fuera de A/D reasignadas a '{majority}': {n_remapped}")

classes = ['depresion','ansiedad']

# Per-class report.
pd.DataFrame(
    classification_report(y_true, y_pred, labels=classes, output_dict=True, zero_division=0)
).transpose().to_csv(rule_report_csv, index=True, encoding='utf-8')

# Macro metrics use the same explicit label set as the report and the
# confusion matrix, so all exported files describe the same two classes.
pd.DataFrame([{
    'macro_f1': f1_score(y_true, y_pred, labels=classes, average='macro', zero_division=0),
    'macro_precision': precision_score(y_true, y_pred, labels=classes, average='macro', zero_division=0),
    'macro_recall': recall_score(y_true, y_pred, labels=classes, average='macro', zero_division=0),
    'n': int(len(y_true))
}]).to_csv(rule_eval_csv, index=False, encoding='utf-8')

# Confusion matrix: rows are gold labels, columns are predictions.
cm = confusion_matrix(y_true, y_pred, labels=classes)
pd.DataFrame(cm, index=[f'true_{c}' for c in classes], columns=[f'pred_{c}' for c in classes]).to_csv(rule_cm_csv)

print("✅ Exportados:")
print(" - Predicciones:", rule_pred_csv)
print(" - Reporte:", rule_report_csv)
print(" - Métricas:", rule_eval_csv)
print(" - Matriz:", rule_cm_csv)

✅ Exportados:
 - Predicciones: /Users/manuelnunez/Projects/psych-phenotyping-paraguay/data/rule_based_predictions.csv
 - Reporte: /Users/manuelnunez/Projects/psych-phenotyping-paraguay/data/rule_based_classification_report.csv
 - Métricas: /Users/manuelnunez/Projects/psych-phenotyping-paraguay/data/rule_based_eval.csv
 - Matriz: /Users/manuelnunez/Projects/psych-phenotyping-paraguay/data/rule_based_confusion_matrix.csv
