# 🩺 Diabetes Risk Prediction Model Training

This notebook documents the process of training the XGBoost classifier for the Diabetes Risk Assessment module.

## 1. Environment Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import joblib
import shap

# Use seaborn's white-grid theme for every figure drawn in this notebook.
PLOT_STYLE = "whitegrid"
sns.set_theme(style=PLOT_STYLE)

## 2. Data Loading & Preprocessing

In this section, we build the clinical dataset (synthetically generated here, following Pima-style patterns) and derive the binary risk label used as the training target.

In [None]:
# Build a reproducible synthetic cohort loosely following Pima-style patterns.
# NOTE: every draw below consumes the same seeded global random stream, so the
# order of the dictionary entries must not change or the data will differ.
np.random.seed(42)
n = 2000


def _clipped_normal(mean, std, low, high):
    """Draw `n` normally distributed samples and clamp them to [low, high]."""
    return np.random.normal(mean, std, n).clip(low, high)


data = {
    'age': _clipped_normal(45, 12, 18, 90),
    'gender': np.random.choice([0, 1], n),  # 0: Male, 1: Female
    'bmi': _clipped_normal(28, 6, 15, 60),
    'glucose': _clipped_normal(110, 35, 60, 300),
    'blood_pressure': _clipped_normal(80, 15, 40, 180),
    'insulin': _clipped_normal(100, 50, 0, 400),
    'family_history': np.random.choice([0, 1], n, p=[0.7, 0.3]),
    'physical_activity': np.random.choice([0, 1, 2], n),  # 0: Low, 1: Med, 2: High
    'smoking': np.random.choice([0, 1, 2], n),
}
df = pd.DataFrame(data)

# Target definition: a linear risk score over glucose, BMI, age and family
# history is squashed through a sigmoid, then thresholded at 0.5.
glucose_term = 0.05 * df['glucose']
bmi_term = 0.08 * df['bmi']
age_term = 0.02 * df['age']
history_term = 0.5 * df['family_history']
logit = glucose_term + bmi_term + age_term + history_term - 10
prob = 1 / (1 + np.exp(-logit))
df['target'] = prob.gt(0.5).astype(int)

print(f"Database Shape: {df.shape}")
df.head()

## 3. Exploratory Data Analysis (EDA)

We inspect the pairwise correlations between all features and the target to see which variables move together.

In [None]:
# Annotated heatmap of pairwise correlations over every column in df
# (the features and the target).
fig, ax = plt.subplots(figsize=(10, 8))
correlations = df.corr()
sns.heatmap(correlations, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Feature Correlation Matrix")
plt.show()

## 4. Model Training

In [None]:
# Separate the predictors from the binary label.
X = df.drop("target", axis=1)
y = df["target"]

# Hold out 20% of the rows for evaluation. Stratifying on y keeps the
# positive/negative ratio the same in both partitions, so the reported metrics
# are not skewed by an unlucky draw of the less frequent class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training partition only; the test partition is
# transformed with the training statistics so nothing leaks from the hold-out.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Gradient-boosted trees with a fixed seed so the fit is repeatable.
model = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
model.fit(X_train_scaled, y_train)

print("Model Training Complete.")

## 5. Evaluation

In [None]:
# Hard class predictions feed the per-class precision / recall / F1 report.
y_pred = model.predict(X_test_scaled)
print(classification_report(y_test, y_pred))

# ROC AUC is scored on the second probability column (index 1) rather than
# on the hard labels.
positive_proba = model.predict_proba(X_test_scaled)[:, 1]
auc = roc_auc_score(y_test, positive_proba)
print(f"ROC AUC Score: {auc:.4f}")

## 6. Model Explainability (SHAP)

In [None]:
# Explain the fitted tree model on the same scaled matrix it is evaluated on.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test_scaled)

# Draw first with show=False so the figure stays open, then title it and show
# it. Setting the title before the plot call depends on SHAP reusing whatever
# axes happen to be current. The unscaled X_test DataFrame is passed as the
# features argument; presumably it supplies the column names for the bar labels.
shap.summary_plot(shap_values, X_test, plot_type="bar", show=False)
plt.title("Global Feature Importance (SHAP)")
plt.show()