In [18]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support, classification_report

# --- Notebook configuration -------------------------------------------------
# Seed reused by the train/test split and by the random forest.
RANDOM_STATE = 42
# Dataset location (relative path to the CSV).
DATA_PATH = "../data/exercise_2_shap.csv"
# Binary target (0/1).
LABEL_COL = "adherence_predicted"

# Curated predictor columns. thin_to_features() keeps whichever of these are
# actually present in the loaded CSV and reports the rest as missing.
FEATURE_COLS = [
    "num_past_iits",
    "prev_iit_status",
    "past_encounters",
    "CD4_Count",
    "Viral_Load",
    "Current_WHO_HIV_Stage",
    "time_since_diagnosis_at_scheduled_appointment",
    "age_at_encounter",
    "gender",
    "Current Regimen Line",
    "TPT Outcome",
    "NCDs",
    "Establishment",
]


In [19]:
def load_data(path: str) -> pd.DataFrame:
    """Read the CSV at ``path`` and confirm that it carries the label column.

    Parameters
    ----------
    path : str
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pd.DataFrame
        The full, unmodified frame as read from disk.

    Raises
    ------
    ValueError
        If ``LABEL_COL`` is not among the columns of the loaded frame.
    """
    frame = pd.read_csv(path)
    # Happy path first: the label is there, hand the frame back untouched.
    if LABEL_COL in frame.columns:
        return frame
    raise ValueError(f"Label column '{LABEL_COL}' not found in dataset.")

def thin_to_features(df: pd.DataFrame, min_features: int = 8) -> pd.DataFrame:
    """Reduce ``df`` to the curated feature columns plus the label column.

    Columns listed in ``FEATURE_COLS`` that are absent from ``df`` are skipped
    and reported with an ``[info]`` line rather than treated as an error.

    Parameters
    ----------
    df : pd.DataFrame
        Frame that contains ``LABEL_COL`` and some or all of ``FEATURE_COLS``.
    min_features : int, default 8
        Minimum number of curated features that must be present in ``df``.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` holding the present curated features (in
        ``FEATURE_COLS`` order) followed by ``LABEL_COL``.

    Raises
    ------
    ValueError
        If fewer than ``min_features`` curated features are present.
    """
    # Keep only curated features + label; ignore any that might be missing.
    present = [c for c in FEATURE_COLS if c in df.columns]
    missing = [c for c in FEATURE_COLS if c not in df.columns]
    if missing:
        print(f"[info] missing features not in dataset (skipped): {missing}")
    if len(present) < min_features:
        # Report the real size of the curated list instead of a hard-coded
        # number that can drift out of sync with FEATURE_COLS.
        raise ValueError(
            f"Too few selected features present ({len(present)}). "
            f"Expected at least {min_features} of {len(FEATURE_COLS)}."
        )
    # .copy() so later edits to the thinned frame never touch the caller's frame.
    return df[present + [LABEL_COL]].copy()

def prepare_xy(df: pd.DataFrame):
    """Split ``df`` into features/target and classify feature columns by dtype.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing ``LABEL_COL`` plus the feature columns.

    Returns
    -------
    X : pd.DataFrame
        ``df`` without the label column.
    y : pd.Series
        The label column cast to ``int``.
    num_cols : list
        Feature columns with a numeric dtype (integer, float, bool, ...).
    cat_cols : list
        Every other feature column (object, string, category, ...); these are
        the ones routed to one-hot encoding by ``build_model``.
    """
    X = df.drop(columns=[LABEL_COL], errors="ignore")
    y = df[LABEL_COL].astype(int)

    # Classify by "is it numeric?" rather than "is it object?": text columns
    # are not always ``object`` (pandas ``string``/``str`` and ``category``
    # dtypes exist), and any of those landing in num_cols would be handed to
    # the classifier as raw, un-encoded values.
    num_cols, cat_cols = [], []
    for col in X.columns:
        if pd.api.types.is_numeric_dtype(X[col]):
            num_cols.append(col)
        else:
            cat_cols.append(col)
    return X, y, num_cols, cat_cols

def build_model(num_cols, cat_cols) -> Pipeline:
    """Assemble the (unfitted) preprocessing + random-forest pipeline.

    Parameters
    ----------
    num_cols : list
        Columns that only get missing values imputed.
    cat_cols : list
        Columns that get imputed and then one-hot encoded.

    Returns
    -------
    Pipeline
        Two-step pipeline: ``"preproc"`` (a ColumnTransformer) followed by
        ``"rf"`` (a RandomForestClassifier).
    """
    # Numeric branch: fill gaps with the most frequent value of each column.
    numeric_imputer = SimpleImputer(strategy="most_frequent")

    # Categorical branch: same imputation, then dense one-hot encoding.
    # handle_unknown="ignore" keeps categories unseen during fit from raising
    # at predict time.
    categorical_branch = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ])

    # Columns in neither list are dropped; output feature names are prefixed
    # with the branch name ("num__" / "cat__").
    column_transformer = ColumnTransformer(
        transformers=[
            ("num", numeric_imputer, num_cols),
            ("cat", categorical_branch, cat_cols),
        ],
        remainder="drop",
        verbose_feature_names_out=True,
    )

    forest = RandomForestClassifier(
        n_estimators=300,
        class_weight="balanced",
        n_jobs=-1,
        random_state=RANDOM_STATE,
    )

    return Pipeline(steps=[("preproc", column_transformer), ("rf", forest)])

def print_metrics(y_true, y_pred, header="Test Metrics"):
    """Print macro- and weighted-averaged P/R/F1 and a classification report.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.
    header : str, default "Test Metrics"
        Title shown in the banner line above the metrics.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    print(f"\n=== {header} ===")

    # The fourth element (support) is None when an average is requested, so
    # only the first three values are kept.
    macro_p, macro_r, macro_f1 = precision_recall_fscore_support(
        y_true, y_pred, average="macro", zero_division=0
    )[:3]
    print(f"Macro  P: {macro_p:.3f} | R: {macro_r:.3f} | F1: {macro_f1:.3f}")

    weighted_p, weighted_r, weighted_f1 = precision_recall_fscore_support(
        y_true, y_pred, average="weighted", zero_division=0
    )[:3]
    print(f"Weight P: {weighted_p:.3f} | R: {weighted_r:.3f} | F1: {weighted_f1:.3f}")

    print("\nClassification report:\n")
    report = classification_report(y_true, y_pred, zero_division=0)
    print(report)

def train_and_eval(df: pd.DataFrame):
    """Fit the pipeline on a stratified split of ``df`` and print test metrics.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the feature columns and ``LABEL_COL``.

    Returns
    -------
    tuple
        ``(pipeline, X_train, X_test, y_train, y_test, feature_names)`` where
        ``pipeline`` is the fitted model and ``feature_names`` is an array of
        the column names produced by the preprocessing step.
    """
    X, y, num_cols, cat_cols = prepare_xy(df)

    # 75/25 split, stratified on the label so both parts keep the class ratio.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=RANDOM_STATE, stratify=y
    )

    pipeline = build_model(num_cols, cat_cols)
    pipeline.fit(X_train, y_train)

    # Evaluate on the held-out TEST set only.
    y_pred = pipeline.predict(X_test)
    print_metrics(y_test, y_pred, header="Test Metrics")

    preproc = pipeline.named_steps["preproc"]
    try:
        feature_names = preproc.get_feature_names_out()
    except Exception as exc:
        # Best-effort fallback, kept so the notebook can continue. It emits
        # ONE placeholder per categorical column, while one-hot encoding
        # yields one column per category, so these names will generally not
        # line up with the transformed matrix. Say so loudly instead of
        # failing silently.
        print(
            f"[warn] get_feature_names_out() failed ({exc!r}); using approximate "
            "feature names that may not match the transformed columns."
        )
        feature_names = np.array(num_cols + [f"{c}__OHE" for c in cat_cols])

    return pipeline, X_train, X_test, y_train, y_test, feature_names


In [20]:
# Read the raw CSV; load_data raises ValueError if the label column is absent.
df_raw = load_data(DATA_PATH)
# Keep only the curated feature columns that are present, plus the label.
df = thin_to_features(df_raw)

# Fit the model and print held-out metrics. The fitted pipeline, the four
# split pieces and the transformed feature names stay as notebook-level
# variables (presumably for the SHAP analysis that follows).
pipeline, X_train, X_test, y_train, y_test, feature_names = train_and_eval(df)




=== Test Metrics ===
Macro  P: 0.995 | R: 0.985 | F1: 0.990
Weight P: 0.992 | R: 0.992 | F1: 0.992

Classification report:

              precision    recall  f1-score   support

           0       0.99      1.00      0.99        91
           1       1.00      0.97      0.99        34

    accuracy                           0.99       125
   macro avg       0.99      0.99      0.99       125
weighted avg       0.99      0.99      0.99       125



### Continue SHAP analysis from here ###