# 03 — Model Training (Task 1 & Task 2)

In [None]:
from pathlib import Path
import sys, os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Locate the project root: the first of the working directory, its parent and
# its grandparent that contains a `src` directory, so the notebook can be run
# from the repository root or from a (nested) notebooks folder.
CWD = Path.cwd()
_root_candidates = (CWD, CWD.parent, CWD.parent.parent)
# When no candidate contains `src`, fall back to the parent directory.
ROOT = next((p for p in _root_candidates if (p / 'src').exists()), CWD.parent)

# Put ROOT first on sys.path so `import src...` resolves to this checkout.
# Skip the insert when it is already first, so re-running the cell does not
# keep growing sys.path with duplicate entries.
_root_str = str(ROOT)
if sys.path[:1] != [_root_str]:
    sys.path.insert(0, _root_str)
print(f'Using project ROOT: {ROOT}')
from joblib import dump
import yaml
from lightgbm import LGBMClassifier
from pathlib import Path
# Filesystem layout used by the notebook, all anchored at the project root.
CONFIG_DIR = ROOT.joinpath('config')
DATA_PATH = ROOT.joinpath('data', 'raw', 'Participant_Selection_Final.csv')
ARTIFACTS_DIR = ROOT.joinpath('artifacts', 'models')
# Make sure the model output directory exists before anything is dumped to it.
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)
from src.data.preprocess import load, preprocess, drop_non_numeric_noise
from src.features.build_features import engineer_features
from src.models.stacking import fit_base_models, fit_meta
from src.labels.build_task2_labels import compute_percentiles, build_multilabel, write_thresholds_yaml, collapse_priority
from src.eval.metrics import auc_pr, auc_roc, brier, find_best_threshold
# Load the run configuration; later cells read `seed`, `cv_folds`,
# `eligibility.base_models` and `aid.lgbm` from it.
# The encoding is pinned so parsing does not depend on the platform locale.
with open(CONFIG_DIR / 'params.yaml', 'r', encoding='utf-8') as f:
    CFG = yaml.safe_load(f)
# Read the raw CSV and pass it through preprocessing and feature engineering.
df = engineer_features(preprocess(load(str(DATA_PATH))))
# `display` is provided by the IPython/Jupyter runtime.
display(df.head())

## Task 1 — Eligibility (Stacked)

In [None]:
# Binary target: 1 where the participant was selected for aid, 0 otherwise.
y = (df['Participant_Selected_For_AID'] == 'Yes').astype(int)

# Feature matrix: numeric/bool columns only, with the target column excluded.
numeric_df = drop_non_numeric_noise(df).select_dtypes(include=['number','bool'])
X = numeric_df.drop(columns=['Participant_Selected_For_AID'], errors='ignore')

# Cross-validated base learners, then the meta-learner on their outputs.
# (`oof` is presumably the out-of-fold prediction matrix, one column per
# base model -- confirm against src.models.stacking.)
oof, base_models = fit_base_models(
    X,
    y,
    CFG['eligibility']['base_models'],
    seed=CFG['seed'],
    n_splits=CFG['cv_folds'],
)
meta = fit_meta(oof, y)

# All reported metrics score the plain row-wise average of the `oof` columns,
# not the meta-learner's output.
oof_avg = oof.mean(axis=1)
print('AUC-PR:', auc_pr(y, oof_avg))
print('AUC-ROC:', auc_roc(y, oof_avg))
thr, val = find_best_threshold(y, oof_avg, target='f1')
print('Best F1 threshold:', thr, 'score:', val)
print('Brier:', brier(y, oof_avg))

# Refit one estimator per base learner on the full training set.
# NOTE(review): this refits the last estimator of each list in place, so the
# object held in `base_models` is mutated as well.
final_base = {}
for k, ms in base_models.items():
    m = ms[-1]
    m.random_state = CFG['seed']
    m.fit(X, y)
    final_base[k] = m
import numpy as np
# Positive-class probabilities of each refitted base model, one column per
# model. The column order below defines the meta-learner's input layout.
stack_full = np.column_stack([
    final_base[name].predict_proba(X)[:, 1]
    for name in ('lgbm', 'xgb', 'rf', 'logreg')
])
# NOTE(review): this replaces the fit from `fit_meta(oof, y)` with one on
# in-sample base predictions (the base models have just been trained on the
# same X). Standard stacking keeps the meta-learner trained on
# cross-validated predictions -- confirm this refit is intended.
meta.fit(stack_full, y)
dump(list(X.columns), ARTIFACTS_DIR / 'eligibility_features.joblib')
dump(final_base, ARTIFACTS_DIR / 'eligibility_base_models.joblib')
dump(meta, ARTIFACTS_DIR / 'eligibility_meta.joblib')

# Record the tuned decision threshold next to the other thresholds.
# A missing or empty thresholds.yaml is treated as "no thresholds yet", so
# this cell still succeeds before anything else has written the file.
# NOTE(review): `thr` was tuned on the averaged base predictions, not on the
# meta-learner's output -- confirm which score the consumer thresholds.
thresholds_path = CONFIG_DIR / 'thresholds.yaml'
if thresholds_path.exists():
    with open(thresholds_path, 'r', encoding='utf-8') as f:
        TH = yaml.safe_load(f) or {}
else:
    TH = {}
TH['eligibility_threshold'] = float(thr)
with open(thresholds_path, 'w', encoding='utf-8') as f:
    yaml.safe_dump(TH, f, sort_keys=False)
print('✅ Saved eligibility artifacts to', ARTIFACTS_DIR)

## Task 2 — Aid Recommendation (priority-collapsed multiclass)

In [None]:
# Derive percentile thresholds from the data and persist them, then read the
# file back so labelling uses exactly what is stored on disk.
# NOTE(review): this rebinds `thr`, which the Task 1 cell uses for the
# eligibility threshold; re-running Task 1's save step afterwards would see
# this value instead.
# NOTE(review): if `write_thresholds_yaml` rewrites the whole file, the
# `eligibility_threshold` key saved by Task 1 may be lost -- confirm.
thr = compute_percentiles(df)
write_thresholds_yaml(str(CONFIG_DIR / 'thresholds.yaml'), thr)
import yaml as _y
with open(CONFIG_DIR / 'thresholds.yaml', 'r', encoding='utf-8') as f:
    T = _y.safe_load(f)

# Build the per-aid labels, then collapse them to a single target per row
# following the configured priority order.
df_lab = build_multilabel(df.copy(), thresholds=T, k_triggers=T.get('cash_grant_k_triggers',2))
order = T.get('priority_order', ['health_support','cash_grant','livelihood_asset','training'])
df_lab['Aid_Target'] = df_lab.apply(lambda r: collapse_priority(r, order), axis=1)
y2 = df_lab['Aid_Target']

# Every column present in df_lab but not in df was produced by the labelling
# steps above (build_multilabel output and Aid_Target). Those columns encode
# the target, so they must not be used as features.
label_cols = [c for c in df_lab.columns if c not in df.columns]
X2 = (
    drop_non_numeric_noise(df_lab)
    .select_dtypes(include=['number','bool'])
    .drop(columns=['Participant_Selected_For_AID', *label_cols], errors='ignore')
)

aid = LGBMClassifier(objective='multiclass', random_state=CFG['seed'], **CFG['aid']['lgbm'])
aid.fit(X2, y2)
from joblib import dump
dump(aid, ARTIFACTS_DIR / 'aid_recommendation.joblib')
dump(list(X2.columns), ARTIFACTS_DIR / 'aid_features.joblib')
print('✅ Saved aid artifacts to', ARTIFACTS_DIR)