In [None]:
# -------------------------------------------------
# CELL 1 – Load cleaned data
# -------------------------------------------------
import pandas as pd
import numpy as np
import os

# Roles of the two non-feature columns in the cleaned table.
ID_COL = "NACCID"
TARGET_COL = "DEMENTIA"

DATA_PATH = "../data/processed_clean_non_medical.csv"
df = pd.read_csv(DATA_PATH)

print(f"Rows: {df.shape[0]:,},  Columns: {df.shape[1]}")
print("Target column → DEMENTIA (0/1)")

# Separate the label and the identifier from the predictors. The identifier is
# copied aside so it can be carried through the train/test split.
y = df[TARGET_COL]
ids = df[ID_COL].copy()
X_raw = df.drop(columns=[ID_COL, TARGET_COL])

print(f"Dementia prevalence: {y.mean():.2%}")

In [None]:
# -------------------------------------------------
# CELL 2 – Split, impute & scale
# -------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Stratified 80/20 hold-out. The ID series is split together with X and y so
# each partition keeps the identifiers of its rows.
# NOTE(review): this is a row-level split. If the same NACCID can appear on
# several rows, one subject could end up in both partitions — confirm that IDs
# are unique in the cleaned file.
split_options = dict(test_size=0.20, random_state=42, stratify=y)
(X_train, X_test,
 y_train, y_test,
 idx_train, idx_test) = train_test_split(X_raw, y, ids, **split_options)

# Pipeline: impute median → standard-scale. It is fitted on the training
# partition only and then applied unchanged to the test partition.
preprocess_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
preprocess = Pipeline(preprocess_steps)

X_train_prep = preprocess.fit_transform(X_train)
X_test_prep = preprocess.transform(X_test)

print(f"Train shape: {X_train_prep.shape}, Test shape: {X_test_prep.shape}")

In [None]:
# -------------------------------------------------
# CELL 3 – Train & compare AUC
# -------------------------------------------------
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt

# Candidate classifiers, each built separately and then collected in `models`.
# Insertion order of the dict is the order used for fitting, printing and
# plotting further down.
logistic_clf = LogisticRegression(max_iter=2000, random_state=42)

forest_clf = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    random_state=42,
    n_jobs=-1,
)

# NOTE(review): `use_label_encoder` is a legacy XGBoost argument — confirm it
# is still accepted silently by the installed xgboost version.
xgb_params = dict(
    use_label_encoder=False,
    eval_metric="logloss",
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
)
xgb_clf = XGBClassifier(**xgb_params)

models = {
    "Logistic": logistic_clf,
    "RandomForest": forest_clf,
    "XGBoost": xgb_clf,
}

# Fit each candidate on the training partition and record its ROC AUC on the
# hold-out partition, using the predicted probability of class 1 as the score.
aucs = {}
for name in models:
    estimator = models[name]
    estimator.fit(X_train_prep, y_train)
    positive_scores = estimator.predict_proba(X_test_prep)[:, 1]
    auc = roc_auc_score(y_test, positive_scores)
    aucs[name] = auc
    print(f"{name:12} → AUC = {auc:.4f}")

# Bar chart of hold-out AUC per model.
fig, ax = plt.subplots()
ax.bar(list(aucs.keys()), list(aucs.values()),
       color=["#4C72B0", "#55A868", "#C44E52"])

# AUC is bounded by 1.0, so the axis must reach 1.0 or any model scoring above
# a lower cap would be drawn clipped. The axis starts at chance level (0.5)
# unless a model scored below it, in which case the floor is lowered so that
# bar is still visible.
y_floor = min(0.5, min(aucs.values(), default=0.5))
ax.set_ylim(y_floor, 1.0)

ax.set_title("Model AUC Comparison")
ax.set_ylabel("AUC")
plt.show()

In [None]:
# -------------------------------------------------
# CELL 4 – Calibrate XGBoost (or whichever is best)
# -------------------------------------------------
from sklearn.calibration import CalibratedClassifierCV

best_raw = models["XGBoost"]          # change name if another model wins

# Calibration has to be learned on rows the base model did not train on;
# calibrating an already-fitted model on its own training rows fits the sigmoid
# to over-confident in-sample scores. With an integer `cv`,
# CalibratedClassifierCV fits a fresh copy of `best_raw` on each training fold
# and calibrates it on the matching held-out fold, then averages the fold
# models at prediction time. `best_raw` itself (already fitted on the full
# training partition) is left untouched.
calibrated = CalibratedClassifierCV(best_raw, method="sigmoid", cv=5)
calibrated.fit(X_train_prep, y_train)

# Hold-out AUC of the calibrated probabilities for the positive class.
cal_proba = calibrated.predict_proba(X_test_prep)[:, 1]
cal_auc   = roc_auc_score(y_test, cal_proba)
print(f"Calibrated XGBoost AUC = {cal_auc:.4f}")

In [None]:
# -------------------------------------------------
# CELL 5 – Prediction function (use it in the report)
# -------------------------------------------------
import json   # optional – save column order for later

def dementia_risk_pct(person_dict: dict) -> float:
    """Estimate one person's dementia risk as a percentage.

    Parameters
    ----------
    person_dict : dict
        Feature name → value, using the same column names as the training
        data. Features that are left out are treated as missing (NaN) and are
        filled by the fitted ``preprocess`` pipeline. Keys that are not
        training columns are ignored, and a warning lists them.

    Returns
    -------
    float
        Probability of the positive class from ``calibrated``, expressed as a
        percentage (0-100) rounded to one decimal place.
    """
    import warnings

    feature_cols = X_train.columns

    # A misspelled feature name would otherwise be dropped without a trace and
    # the person scored as if that value had never been supplied.
    unknown_keys = [key for key in person_dict if key not in feature_cols]
    if unknown_keys:
        warnings.warn(
            f"Ignoring keys that are not model features: {unknown_keys}",
            stacklevel=2,
        )

    # Align to the training column order. Absent features are left as NaN so
    # the imputer in `preprocess` supplies its learned fill value; hard-coding
    # 0 would present a literal zero to the model for every unspecified
    # feature.
    person_aligned = pd.DataFrame([person_dict]).reindex(columns=feature_cols)
    person_prep = preprocess.transform(person_aligned)

    prob = calibrated.predict_proba(person_prep)[0, 1]
    # Cast so the function returns a built-in float, as annotated.
    return round(float(prob) * 100, 1)

# ---- Example -------------------------------------------------
# A single hypothetical person, keyed by training column name. The inline
# notes give the meaning of the coded values where the source states it.
example = dict(
    AGE=78,
    SEX=2,               # 2 = Female
    EDUC_YEARS=10,
    EVER_SMOKER=1,
    BMI=31.2,
    ALCOHOL_FREQ=1,      # occasional
    INDEPEND=1,
    MARISTAT=1,          # married
)
print(f"Example risk → {dementia_risk_pct(example)}%")

In [None]:
# -------------------------------------------------
# CELL 6 – SHAP (install once: pip install shap)
# -------------------------------------------------
!pip install -q shap   # run only once

import shap
import matplotlib.pyplot as plt

# Explain the fitted base model (`best_raw`) rather than the calibrated
# wrapper: shap.Explainer dispatches on model type (tree / linear / callable)
# and a CalibratedClassifierCV object is none of these.
# NOTE(review): the indexing below assumes a single-output explanation, as
# produced for a binary XGBoost model — adjust if `best_raw` is swapped.
explainer = shap.Explainer(best_raw, X_train_prep,
                           feature_names=list(X_train.columns))
shap_vals = explainer(X_test_prep[:200])   # first 200 for speed

# Summary bar
# show=False keeps the figure open; otherwise summary_plot displays it
# immediately and the title below would be drawn on a new, empty figure.
shap.summary_plot(shap_vals, X_test.iloc[:200], plot_type="bar",
                  max_display=10, show=False)
plt.title("Top 10 Features Driving Dementia Risk")
plt.show()

# Force plot for the example person
# Features the example does not specify stay NaN so the imputer in
# `preprocess` fills them, instead of being forced to a literal 0.
person_raw = pd.DataFrame([example]).reindex(columns=X_train.columns)
person_prep = preprocess.transform(person_raw)
person_expl = explainer(person_prep)

shap.initjs()
# The displayed feature values are the example person's own (unscaled) inputs,
# so the labels in the plot match the SHAP values they are paired with.
shap.force_plot(person_expl.base_values[0],
                person_expl.values[0],
                person_raw.iloc[0])

In [None]:
# -------------------------------------------------
# CELL 7 – Persist artefacts
# -------------------------------------------------
import joblib, json, os

# Write the calibrated model, the preprocessing pipeline and the training
# column order to ./models/ so they can be reloaded together.
os.makedirs("models", exist_ok=True)

joblib.dump(calibrated, "models/dementia_risk_model.pkl")
joblib.dump(preprocess, "models/preprocess_pipeline.pkl")

# The context manager guarantees the JSON file is flushed and closed even if
# serialisation fails part-way.
with open("models/feature_columns.json", "w", encoding="utf-8") as columns_file:
    json.dump(X_train.columns.tolist(), columns_file)

print("All artefacts saved in ./models/")

In [None]:
# -------------------------------------------------
# CELL 8 – Git (run in VS Code terminal, NOT notebook)
# -------------------------------------------------
# Shell commands — run them in a terminal. They are kept as comments because,
# as bare lines in a Python cell, they are a SyntaxError and would stop
# "Restart Kernel → Run All" at this cell.
#
#   git add .
#   git commit -m "feat: final calibrated XGBoost + SHAP + prediction function"
#   git push