
# 🩺 Diabetes Prediction from Health Data — End‑to‑End Notebook

This notebook builds a **binary classification** model to predict the onset of **diabetes** from diagnostic measurements (Pima Indians Diabetes dataset).  
It covers data loading, EDA, preprocessing, training multiple models, evaluation (ROC/PR curves), and feature importance.

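> **Note:** The notebook first looks for a local `data/diabetes.csv`; only if that file is missing does it attempt to **download the dataset automatically** (a copy is then saved to `data/diabetes.csv`). If you are offline, place a file named `diabetes.csv` under a local `data/` folder and re‑run the notebook.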


In [None]:

# === Imports & Config ===
import os
import io
import sys
import json
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path
from urllib.request import urlopen

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import (classification_report, confusion_matrix, roc_auc_score, roc_curve,
                             precision_recall_curve, average_precision_score)
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Never truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Single seed: used for NumPy's global RNG here and passed as `random_state`
# to the splitters and estimators further down.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Report the interpreter and library versions this run is using.
import matplotlib
import sklearn

print('Versions:')
for label, version in (
    ('  Python  :', sys.version.split()[0]),
    ('  pandas  :', pd.__version__),
    ('  numpy   :', np.__version__),
    ('  sklearn :', sklearn.__version__),
    ('  matplotlib:', matplotlib.__version__),
):
    print(label, version)


## 📥 Load Dataset

In [None]:

# Dataset resolution: a local copy at 'data/diabetes.csv' is used when present;
# otherwise the public mirrors below are tried in order and the first one that
# works is saved to that path.
SOURCES = [
    # Has headers
    "https://raw.githubusercontent.com/plotly/datasets/master/diabetes.csv",
    # No headers: we'll add them
    "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv",
]

# Canonical column order; the last entry is the binary target.
COLUMNS = [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
    "Outcome",
]

# Local cache location; make sure its folder exists before anything is written.
data_path = Path('data') / 'diabetes.csv'
data_path.parent.mkdir(parents=True, exist_ok=True)

def try_download():
    """Download the dataset from the first usable URL in ``SOURCES``.

    Each URL is tried in order. A file whose header row matches ``COLUMNS``
    is used as is. Otherwise the text is re-parsed as a headerless file and
    accepted only if it has exactly ``len(COLUMNS)`` all-numeric columns,
    which are then named from ``COLUMNS``. Anything else is reported as a
    failure and the next URL is tried.

    On success the frame (columns ordered as ``COLUMNS``) is written to
    ``data_path`` and returned.

    Returns:
        pandas.DataFrame | None: the downloaded data, or ``None`` when every
        source failed.
    """
    expected = set(COLUMNS)
    for url in SOURCES:
        try:
            print(f'Trying {url} ...')
            with urlopen(url, timeout=15) as resp:
                text = resp.read().decode('utf-8')
            df = pd.read_csv(io.StringIO(text))
            if set(df.columns) != expected:
                # read_csv defaults to header=0, so for a headerless file the
                # first data record was just consumed as column names.
                # Re-parse without a header so that record is kept as data.
                df = pd.read_csv(io.StringIO(text), header=None)
                all_numeric = all(
                    pd.api.types.is_numeric_dtype(dtype) for dtype in df.dtypes
                )
                if df.shape[1] != len(COLUMNS) or not all_numeric:
                    # Either the wrong width, or a real (but unknown) header
                    # row that turned the columns into strings.
                    raise ValueError(
                        f'unexpected layout: {df.shape[1]} columns, '
                        f'expected {len(COLUMNS)} numeric columns'
                    )
                df.columns = COLUMNS
            # Normalise the column order regardless of how the file stored it.
            df = df[COLUMNS]
            # Save a local copy for reproducibility
            df.to_csv(data_path, index=False)
            print('Downloaded and saved to', data_path.resolve())
            return df
        except Exception as e:
            # Deliberately broad: this is a best-effort loop, so any failure
            # (network, decoding, parsing, layout) just moves on to the next URL.
            print('  Failed:', e)
    return None

# Prefer the cached file; go to the network only when there is no local copy.
if not data_path.exists():
    df = try_download()
    if df is None:
        # Every source failed and there is nothing on disk: stop with guidance.
        raise RuntimeError(
            "Could not download dataset. Please place a file 'data/diabetes.csv' locally "
            "with columns: " + ", ".join(COLUMNS)
        )
else:
    df = pd.read_csv(data_path)
    print('Loaded local data from', data_path.resolve())

# Preview the first rows.
df.head()


## 🔎 Exploratory Data Analysis

In [None]:

# Print a structural summary of the frame (columns, dtypes, non-null counts).
df.info()


In [None]:

# Summary statistics, transposed so that each column of `df` is shown as a row.
df.describe().transpose()


In [None]:

# Class distribution: one bar per Outcome value, ordered by the value itself.
outcome_counts = df['Outcome'].value_counts().sort_index()
ax = outcome_counts.plot(kind='bar', color=['#6baed6', '#fd8d3c'])
ax.set_title('Target Distribution (0 = No Diabetes, 1 = Diabetes)')
ax.set_xlabel('Outcome')
ax.set_ylabel('Count')
plt.show()

# Mean of the target column, rounded to three decimals.
positive_rate = df['Outcome'].mean().round(3)
print('Positive rate:', positive_rate)



## 🧼 Data Cleaning & Preprocessing

The Pima dataset encodes missing values as **0** in some medical fields.  
We'll replace zeros with `NaN` for these columns, then impute with the **median**:
- `Glucose`, `BloodPressure`, `SkinThickness`, `Insulin`, `BMI`


In [None]:

# Columns in which a stored 0 is treated as "missing" rather than a measurement.
cols_with_missing_zero = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]

# assign() returns a new frame, so the raw `df` is left untouched; each listed
# column is replaced by a version with its zeros turned into NaN.
df_clean = df.assign(**{
    col: df[col].replace(0, np.nan) for col in cols_with_missing_zero
})

# Check missing counts
df_clean[cols_with_missing_zero].isna().sum()


## ✂️ Train / Test Split

In [None]:

# Separate the target column from the feature columns.
y = df_clean['Outcome']
X = df_clean.drop('Outcome', axis=1)

# 80/20 hold-out split, stratified on the target and seeded for repeatability.
split_parts = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)
X_train, X_test, y_train, y_test = split_parts

# Shapes of both parts and the mean of the target in each.
X_train.shape, X_test.shape, y_train.mean().round(3), y_test.mean().round(3)


## 🧪 Models & Pipelines

In [None]:

numeric_features = X_train.columns.tolist()


def _make_preprocessor(features):
    """Return a new, unfitted transformer: median imputation then standard scaling of *features*."""
    numeric_steps = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])
    return ColumnTransformer(transformers=[("num", numeric_steps, features)])


# Standalone instance, kept so the name `preprocess` stays available.
preprocess = _make_preprocessor(numeric_features)

models = {
    "LogisticRegression": LogisticRegression(max_iter=1000, class_weight='balanced', random_state=RANDOM_STATE),
    "RandomForest"     : RandomForestClassifier(n_estimators=300, max_depth=None, random_state=RANDOM_STATE, class_weight='balanced'),
    "GradientBoosting" : GradientBoostingClassifier(random_state=RANDOM_STATE)
}

# Every pipeline gets its OWN preprocessor. Pipeline.fit fits its step objects
# in place, so a single shared ColumnTransformer would be (re)fitted and
# mutated by whichever pipeline happened to be fitted last.
pipelines = {
    name: Pipeline(steps=[("prep", _make_preprocessor(numeric_features)), ("clf", model)])
    for name, model in models.items()
}

# Cross-validated ROC AUC scores (training split only; the test split is not touched here).
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
cv_scores = {}
for name, pipe in pipelines.items():
    scores = cross_val_score(pipe, X_train, y_train, cv=cv, scoring="roc_auc", n_jobs=None)
    # Stored as (mean, std); the mean is what the model selection step compares.
    cv_scores[name] = (scores.mean(), scores.std())
    print(f"{name}: ROC AUC {scores.mean():.3f} ± {scores.std():.3f}")


## 🏁 Train Best Model & Evaluate on Test Set

In [None]:

# Select the pipeline whose mean CV ROC AUC (first element of each cv_scores
# entry) is highest, then fit it on the whole training split.
best_name = max(cv_scores, key=lambda model_name: cv_scores[model_name][0])
best_pipe = pipelines[best_name]
print('Best model by CV ROC AUC:', best_name)
best_pipe.fit(X_train, y_train)

# Predictions & metrics
# Column 1 of predict_proba is used as the score (presumably the Outcome == 1
# class); hard labels come from a fixed 0.5 cut-off.
proba = best_pipe.predict_proba(X_test)[:, 1]
preds = np.where(proba >= 0.5, 1, 0)

roc_auc = roc_auc_score(y_test, proba)
ap = average_precision_score(y_test, proba)
print(f"Test ROC AUC: {roc_auc:.3f}")
print(f"Test Avg Precision (PR AUC): {ap:.3f}")
report = classification_report(y_test, preds, digits=3)
print("\nClassification Report:\n", report)

# Confusion matrix
cm = confusion_matrix(y_test, preds)
heat_ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
heat_ax.set_title(f'Confusion Matrix — {best_name}')
heat_ax.set_xlabel('Predicted')
heat_ax.set_ylabel('True')
plt.show()


## 📈 ROC & Precision‑Recall Curves

In [None]:

# ROC Curve: true-positive rate against false-positive rate, with the
# diagonal drawn as a dashed reference line.
fpr, tpr, thr = roc_curve(y_test, proba)
roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr, tpr, label=f'{best_name} (AUC={roc_auc:.3f})')
roc_ax.plot([0, 1], [0, 1], 'k--', alpha=0.5)
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve')
roc_ax.legend()
plt.show()

# PR Curve: precision against recall for the same scores.
prec, rec, thr_pr = precision_recall_curve(y_test, proba)
pr_fig, pr_ax = plt.subplots()
pr_ax.plot(rec, prec, label=f'{best_name} (AP={ap:.3f})')
pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.set_title('Precision‑Recall Curve')
pr_ax.legend()
plt.show()


## 🔍 Feature Importance (Permutation)

In [None]:

r = permutation_importance(best_pipe, X_test, y_test, n_repeats=10, random_state=RANDOM_STATE, scoring='roc_auc')

# The whole pipeline is passed in, so the columns being shuffled are the raw
# columns of X_test. The scores therefore line up one-to-one with
# X_test.columns; no 'prep__num__' prefix or name mapping is involved.
importances = pd.Series(r.importances_mean, index=X_test.columns)
imp = importances.sort_values(ascending=False)

plt.figure(figsize=(8,4))
# `palette` without `hue` is deprecated in seaborn 0.13, so the feature name is
# also passed as `hue`; dodge=False keeps one full-width bar per feature.
bar_ax = sns.barplot(x=imp.values, y=imp.index, hue=imp.index, palette='viridis', dodge=False)
# Some seaborn versions add a legend for `hue`; it only repeats the y labels.
if bar_ax.get_legend() is not None:
    bar_ax.get_legend().remove()
plt.title('Permutation Importance (decrease in ROC AUC)')
plt.xlabel('Mean Importance'); plt.ylabel('Feature')
plt.tight_layout(); plt.show()

# Table of the (at most ten) highest-ranked features.
imp.to_frame('importance').head(10)


## 💾 Save Trained Pipeline

In [None]:

import joblib

# Output folder for serialized models (created on first use).
model_path = Path('artifacts')
model_path.mkdir(exist_ok=True)

# File name carries the winning model's name, lower-cased with spaces replaced.
model_slug = best_name.replace(" ", "_").lower()
outfile = model_path / f'diabetes_model_{model_slug}.joblib'

# Persist the fitted pipeline (preprocessing + classifier) as one object.
joblib.dump(best_pipe, outfile)
print('Saved pipeline to:', outfile.resolve())


## 🔮 Inference Helper

In [None]:

# Example: predict for a single patient
# One value per feature column; the target column is not part of the input.
sample = dict(
    Pregnancies=2,
    Glucose=120,
    BloodPressure=70,
    SkinThickness=25,
    Insulin=79,
    BMI=28.5,
    DiabetesPedigreeFunction=0.45,
    Age=35,
)

# Wrap the record in a one-row frame so the pipeline can pick columns by name.
sample_df = pd.DataFrame([sample])

# Score taken from column 1 of predict_proba; label from a 0.5 cut-off.
prob = best_pipe.predict_proba(sample_df)[0, 1]
pred = 1 if prob >= 0.5 else 0
print('Predicted probability of diabetes:', round(prob, 3), '=> class', pred)
sample_df



## 📚 Data Dictionary

- **Pregnancies**: Number of times pregnant  
- **Glucose**: Plasma glucose concentration (2 hours in an oral glucose tolerance test)  
- **BloodPressure**: Diastolic blood pressure (mm Hg)  
- **SkinThickness**: Triceps skinfold thickness (mm)  
- **Insulin**: 2-hour serum insulin (μU/mL)  
- **BMI**: Body mass index (weight in kg/(height in m)^2)  
- **DiabetesPedigreeFunction**: Diabetes pedigree function (a function which scores likelihood of diabetes based on family history)  
- **Age**: Age (years)  
- **Outcome**: Class variable (0 or 1) — 1 indicates diabetes

**Acknowledgements:** Pima Indians Diabetes dataset (originally from the National Institute of Diabetes and Digestive and Kidney Diseases).
