In [1]:
# CELL 1: Load processed data
import pandas as pd
import numpy as np

df = pd.read_csv("../data/processed_clean_non_medical.csv")
print(f"Loaded: {df.shape}")
print("Target: DEMENTIA (0 = No, 1 = Yes)")
df.head()

Loaded: (195196, 32)
Target: DEMENTIA (0 = No, 1 = Yes)


Unnamed: 0,NACCID,VISITMO,VISITDAY,VISITYR,BIRTHMO,BIRTHYR,SEX,HISPANIC,RACE,PRIMLANG,...,WEIGHT,NACCADC,FORMVER,NACCDAYS,NACCFDYS,AGE,BMI,EVER_SMOKER,EDUC_YEARS,DEMENTIA
0,NACC002909,12,28,2022,5,1952,1,0,1,1,...,232.0,186,3.0,391.0,0.0,70,32.353898,0,16,0
1,NACC002909,1,23,2024,5,1952,1,0,1,1,...,220.0,186,3.0,391.0,391.0,71,30.680421,0,16,0
2,NACC003487,11,15,2023,12,1956,1,0,1,1,...,175.0,186,3.0,0.0,0.0,66,23.731674,0,16,0
3,NACC004352,10,5,2021,1,1958,2,1,1,2,...,888.0,186,3.0,0.0,0.0,63,79.166667,0,16,8
4,NACC004687,11,14,2022,2,1945,1,1,1,1,...,114.0,186,3.0,0.0,0.0,77,18.968521,0,12,0


In [3]:
# CELL 2: Split + Impute
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# Features & target
X = df.drop(columns=['NACCID', 'DEMENTIA'])
y = df['DEMENTIA']

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Impute missing with median (robust)
imputer = SimpleImputer(strategy='median')
X_train_imp = imputer.fit_transform(X_train)
X_test_imp = imputer.transform(X_test)

print(f"Train: {X_train_imp.shape}, Test: {X_test_imp.shape}")
print(f"Dementia rate: {y.mean():.1%}")

Train: (156156, 30), Test: (39040, 30)
Dementia rate: 203.7%


In [4]:
# CELL 3: Train Logistic, Random Forest, XGBoost
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, classification_report
import matplotlib.pyplot as plt

models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

results = {}
for name, model in models.items():
    model.fit(X_train_imp, y_train)
    y_pred_proba = model.predict_proba(X_test_imp)[:, 1]
    auc = roc_auc_score(y_test, y_pred_proba)
    results[name] = auc
    print(f"{name}: AUC = {auc:.4f}")

# Plot AUC
plt.bar(results.keys(), results.values())
plt.ylim(0.5, 0.9)
plt.title("Model Comparison (AUC)")
plt.ylabel("AUC Score")
plt.xticks(rotation=45)
plt.show()

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


ValueError: multi_class must be in ('ovo', 'ovr')

In [None]:
# CELL 4: Use XGBoost (usually best) + Calibrate
best_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
best_model.fit(X_train_imp, y_train)

# Calibrate probabilities (more accurate %)
from sklearn.calibration import CalibratedClassifierCV
calibrated = CalibratedClassifierCV(best_model, method='sigmoid', cv='prefit')
calibrated.fit(X_train_imp, y_train)

y_pred_proba = calibrated.predict_proba(X_test_imp)[:, 1]
final_auc = roc_auc_score(y_test, y_pred_proba)
print(f"Calibrated XGBoost AUC: {final_auc:.4f}")

In [None]:
# CELL 5: Predict function
def predict_dementia_risk(person_dict):
    # Convert to DataFrame
    person_df = pd.DataFrame([person_dict])
    
    # Align columns with training data
    person_aligned = person_df.reindex(columns=X.columns, fill_value=0)
    
    # Impute
    person_imp = imputer.transform(person_aligned)
    
    # Predict probability
    risk_prob = calibrated.predict_proba(person_imp)[0, 1]
    risk_percent = risk_prob * 100
    
    print(f"Dementia Risk: {risk_percent:.1f}%")
    return risk_percent

# Example: 75-year-old female, low education, smoker, high BMI
example = {
    'AGE': 75,
    'SEX': 2,           # 2 = Female
    'EDUC_YEARS': 8,
    'EVER_SMOKER': 1,
    'BMI': 30,
    'ALCOHOL_FREQ': 0,
    'INDEPEND': 1,
    'MARISTAT': 1       # Married
}
predict_dementia_risk(example)

In [None]:
# CELL 6: SHAP (install once: pip install shap)
import shap
import matplotlib.pyplot as plt

explainer = shap.Explainer(calibrated)
shap_values = explainer(X_test_imp)

# Summary plot
shap.summary_plot(shap_values, X_test, plot_type="bar", max_display=10)
plt.title("Top 10 Features Driving Dementia Risk")
plt.show()

# For one person
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values.values[0,:], X_test.iloc[0])

In [None]:
# CELL 7: Save model
import joblib
import os

os.makedirs("models", exist_ok=True)
joblib.dump(calibrated, "models/dementia_risk_model.pkl")
joblib.dump(imputer, "models/imputer.pkl")
joblib.dump(X.columns.tolist(), "models/feature_columns.pkl")

print("Model saved: models/dementia_risk_model.pkl")

# Git commit
!git add .
!git commit -m "feat: train XGBoost + calibrate + SHAP + predict function"
!git push

In [None]:
# CELL 8: Print results for report
print("=== HACKATHON RESULTS ===")
print(f"Best Model: Calibrated XGBoost")
print(f"Test AUC: {final_auc:.4f}")
print(f"Top Features: AGE, EDUC_YEARS, EVER_SMOKER, BMI")
print(f"Model saved and versioned in Git")
print(f"GitHub: https://github.com/yourusername/dementia-risk-hackathon")

In [None]:
# -------------------------------------------------
# CELL 1 – Load cleaned data
# -------------------------------------------------
import pandas as pd
import numpy as np
import os

DATA_PATH = "../data/processed_clean_non_medical.csv"
df = pd.read_csv(DATA_PATH)

print(f"Rows: {df.shape[0]:,},  Columns: {df.shape[1]}")
print("Target column → DEMENTIA (0/1)")

# Keep a copy of the ID column (optional, will be dropped later)
ids = df["NACCID"].copy()
X_raw = df.drop(columns=["NACCID", "DEMENTIA"])
y = df["DEMENTIA"]

print(f"Dementia prevalence: {y.mean():.2%}")

In [None]:
# -------------------------------------------------
# CELL 2 – Split, impute & scale
# -------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
    X_raw, y, ids, test_size=0.20, random_state=42, stratify=y
)

# Pipeline: impute median → standard-scale
preprocess = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler",  StandardScaler())
])

X_train_prep = preprocess.fit_transform(X_train)
X_test_prep  = preprocess.transform(X_test)

print(f"Train shape: {X_train_prep.shape}, Test shape: {X_test_prep.shape}")

In [None]:
# -------------------------------------------------
# CELL 3 – Train & compare AUC
# -------------------------------------------------
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt

models = {
    "Logistic": LogisticRegression(max_iter=2000, random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=200,
                                          max_depth=None,
                                          random_state=42,
                                          n_jobs=-1),
    "XGBoost": XGBClassifier(use_label_encoder=False,
                             eval_metric="logloss",
                             n_estimators=300,
                             learning_rate=0.05,
                             max_depth=6,
                             subsample=0.8,
                             colsample_bytree=0.8,
                             random_state=42,
                             n_jobs=-1)
}

aucs = {}
for name, clf in models.items():
    clf.fit(X_train_prep, y_train)
    proba = clf.predict_proba(X_test_prep)[:, 1]
    auc = roc_auc_score(y_test, proba)
    aucs[name] = auc
    print(f"{name:12} → AUC = {auc:.4f}")

# Bar chart
plt.bar(aucs.keys(), aucs.values(), color=["#4C72B0","#55A868","#C44E52"])
plt.ylim(0.5, 0.9)
plt.title("Model AUC Comparison")
plt.ylabel("AUC")
plt.show()

In [None]:
# -------------------------------------------------
# CELL 4 – Calibrate XGBoost (or whichever is best)
# -------------------------------------------------
from sklearn.calibration import CalibratedClassifierCV

best_raw = models["XGBoost"]          # change name if another model wins
calibrated = CalibratedClassifierCV(best_raw, method="sigmoid", cv="prefit")
calibrated.fit(X_train_prep, y_train)

cal_proba = calibrated.predict_proba(X_test_prep)[:, 1]
cal_auc   = roc_auc_score(y_test, cal_proba)
print(f"Calibrated XGBoost AUC = {cal_auc:.4f}")

In [None]:
# -------------------------------------------------
# CELL 5 – Prediction function (use it in the report)
# -------------------------------------------------
import json   # optional – save column order for later

def dementia_risk_pct(person_dict: dict) -> float:
    """Input: dict with same column names as training data.
       Output: risk % (0-100)."""
    person = pd.DataFrame([person_dict])
    person_aligned = person.reindex(columns=X_train.columns, fill_value=0)
    person_prep = preprocess.transform(person_aligned)
    prob = calibrated.predict_proba(person_prep)[0, 1]
    return round(prob * 100, 1)

# ---- Example -------------------------------------------------
example = {
    "AGE": 78,
    "SEX": 2,               # 2 = Female
    "EDUC_YEARS": 10,
    "EVER_SMOKER": 1,
    "BMI": 31.2,
    "ALCOHOL_FREQ": 1,      # occasional
    "INDEPEND": 1,
    "MARISTAT": 1           # married
}
print(f"Example risk → {dementia_risk_pct(example)}%")

In [None]:
# -------------------------------------------------
# CELL 6 – SHAP (install once: pip install shap)
# -------------------------------------------------
!pip install -q shap   # run only once

import shap
import matplotlib.pyplot as plt

explainer = shap.Explainer(calibrated, X_train_prep, feature_names=X_train.columns)
shap_vals = explainer(X_test_prep[:200])   # first 200 for speed

# Summary bar
shap.summary_plot(shap_vals, X_test.iloc[:200], plot_type="bar", max_display=10)
plt.title("Top 10 Features Driving Dementia Risk")
plt.show()

# Force plot for the example person
person_prep = preprocess.transform(
    pd.DataFrame([example]).reindex(columns=X_train.columns, fill_value=0)
)
shap.initjs()
shap.force_plot(explainer.expected_value, 
                explainer(person_prep).values[0],
                X_test.iloc[0:1])   # just to show layout

In [None]:
# -------------------------------------------------
# CELL 7 – Persist artefacts
# -------------------------------------------------
import joblib, json, os

os.makedirs("models", exist_ok=True)

joblib.dump(calibrated, "models/dementia_risk_model.pkl")
joblib.dump(preprocess, "models/preprocess_pipeline.pkl")
json.dump(X_train.columns.tolist(), open("models/feature_columns.json", "w"))

print("All artefacts saved in ./models/")