
# Diabetes Prediction Project – Corrected, Interview-Ready Notebook

This notebook walks through:

- Exploratory Data Analysis (EDA)
- Visualizations and plots
- Data cleaning and preprocessing
- Feature engineering
- Training and evaluating multiple ML models
- Hyperparameter tuning
- Probability calibration
- Exporting a final model and scaler for deployment (e.g., Streamlit app)

The goal is **not just accuracy** but also to show the **thinking process** clearly to interviewers.


In [1]:

# Imports and setup

from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.simplefilter(action="ignore")

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import LocalOutlierFactor, KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV

from xgboost import XGBClassifier
import joblib

# Plot styling: seaborn's defaults first, then matplotlib's "ggplot" style sheet
sns.set()
plt.style.use("ggplot")
# IPython magic: render figures inline in the notebook output
%matplotlib inline


## 1. Load and inspect the dataset


In [None]:

# Load the dataset (the path is relative to the notebook's working directory)
df = pd.read_csv("../data/diabetes.csv")

# Quick look at the first rows
df.head()


In [None]:

# Column dtypes and non-null counts per column
df.info()


In [None]:

# Summary statistics for the numeric columns
df.describe()


In [None]:

# Dataset dimensions as (rows, columns)
df.shape


In [None]:

# Class balance: share of each Outcome value as a percentage of all rows
df['Outcome'].value_counts() * 100 / len(df)


## 2. Exploratory Data Analysis (EDA) and Plots


In [None]:

# Histogram of patient ages, drawn on an explicit Axes object
fig, ax = plt.subplots(figsize=(8, 7))
df["Age"].hist(edgecolor="black", ax=ax)
ax.set_xlabel("Age", fontsize=10)
ax.set_ylabel("Count", fontsize=10)
ax.set_title("Age Distribution")
plt.show()


In [None]:

# Youngest and oldest ages in the dataset, as a (min, max) tuple
df["Age"].min(), df["Age"].max()


In [None]:

# Density plots for numeric features
# sns.distplot is deprecated; histplot with kde=True and stat="density" draws the
# same histogram-plus-KDE view through the supported API.
# Each entry is (column name, panel title); panels fill the 4x2 grid row by row.
density_panels = [
    ("Pregnancies", "Pregnancies"),
    ("Glucose", "Glucose"),
    ("BloodPressure", "Blood Pressure"),
    ("SkinThickness", "Skin Thickness"),
    ("Insulin", "Insulin"),
    ("BMI", "BMI"),
    ("DiabetesPedigreeFunction", "Diabetes Pedigree Function"),
    ("Age", "Age"),
]

fig, ax = plt.subplots(4, 2, figsize=(20, 20))

for panel_ax, (panel_col, panel_title) in zip(ax.flat, density_panels):
    sns.histplot(
        df[panel_col],
        bins=20,
        kde=True,
        stat="density",
        color="red",
        ax=panel_ax,
    )
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()


In [None]:

# Per-class average of every feature (Outcome 0 = no diabetes, 1 = diabetes)
df.groupby("Outcome")[
    [
        "Pregnancies",
        "Glucose",
        "BloodPressure",
        "SkinThickness",
        "Insulin",
        "BMI",
        "DiabetesPedigreeFunction",
        "Age",
    ]
].mean()


In [None]:

# Outcome distribution: pie chart & countplot, side by side
f, ax = plt.subplots(1, 2, figsize=(18, 8))

# Left panel: class share as percentages; explode offsets the second wedge
df['Outcome'].value_counts().plot.pie(
    explode=[0, 0.1],
    autopct="%1.1f%%",
    shadow=True,
    ax=ax[0]
)
ax[0].set_title("Outcome (Pie Chart)")
ax[0].set_ylabel("")  # blank out the y-axis label on the pie

# Right panel: absolute row count per class
sns.countplot(x="Outcome", data=df, ax=ax[1])
ax[1].set_title("Outcome (Countplot)")

plt.show()


In [None]:

# Pairwise correlation matrix over every column, Outcome included
df.corr()


In [None]:

# Correlation heatmap; each cell is annotated with its coefficient to two decimals
f, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(df.corr(), annot=True, fmt=".2f", cmap="magma", ax=ax)
ax.set_title("Correlation Matrix", fontsize=20)
plt.show()


## 3. Data Cleaning: Replace Zeros, Impute Missing Values, Handle Outliers


In [None]:

# Treat zeros as missing values, but only in columns where zero is not a plausible reading.
# Pregnancies is deliberately NOT in this list: zero pregnancies is a legitimate value,
# and marking it as missing would later overwrite it with a median.
zero_as_na_cols = [
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
]

df[zero_as_na_cols] = df[zero_as_na_cols].replace(0, np.nan)

# Missing-value count per column after the replacement
df.isnull().sum()


In [None]:

# Function to compute median per Outcome
def median_target(var, data=None):
    """Return the median of one feature for each Outcome class.

    Parameters
    ----------
    var : str
        Name of the feature column to summarise.
    data : pandas.DataFrame, optional
        Frame containing ``var`` and an ``"Outcome"`` column. When omitted, the
        notebook-level ``df`` is used, which keeps existing one-argument calls
        working. Passing a frame explicitly lets the medians be computed on a
        subset such as the training split.

    Returns
    -------
    pandas.DataFrame
        One row per Outcome value, with the columns ``["Outcome", var]``.
    """
    frame = df if data is None else data
    # Rows where the feature is missing must not influence the median.
    observed = frame.loc[frame[var].notnull(), [var, "Outcome"]]
    return observed.groupby("Outcome")[[var]].median().reset_index()

# Median imputation (class-agnostic)
# Outcome is deliberately not used to choose the fill value: imputing from the label
# leaks the target into the features and cannot be reproduced at prediction time,
# when Outcome is unknown.
# NOTE(review): for a strictly leak-free evaluation, compute these medians on the
# training split only and reuse them for the test split and for deployment.
columns = df.columns.drop("Outcome")
df[columns] = df[columns].fillna(df[columns].median())

# Every count should now be zero
df.isnull().sum()


In [None]:

# Spot-check the first rows after imputation
df.head()


In [None]:

# Pairplot to visually inspect relationships between features;
# hue="Outcome" colours the points by class
sns.pairplot(df, hue="Outcome")
plt.show()


In [None]:

# Outlier detection with the IQR rule: flag features that have values above Q3 + 1.5 * IQR.
# Only the high end is checked, and the target column is skipped because it is a label,
# not a measurement.
for feature in df.columns.drop("Outcome"):
    Q1 = df[feature].quantile(0.25)
    Q3 = df[feature].quantile(0.75)
    IQR = Q3 - Q1
    upper = Q3 + 1.5 * IQR
    # Ask directly whether any value exceeds the fence, instead of testing the
    # truthiness of the contents of the matching rows.
    if (df[feature] > upper).any():
        print(feature, "has potential high-end outliers")
    else:
        print(feature, "no major high-end outliers flagged")


In [None]:

# Boxplot for Insulin before capping (baseline to compare with the capped version)
plt.figure(figsize=(8, 7))
sns.boxplot(x=df["Insulin"], color="red")
plt.title("Insulin Before Capping")
plt.show()


In [None]:

# Cap Insulin at the upper IQR fence (Q3 + 1.5 * IQR); values below the fence are untouched
Q1, Q3 = df["Insulin"].quantile([0.25, 0.75])
IQR = Q3 - Q1
upper = Q3 + 1.5 * IQR

df["Insulin"] = df["Insulin"].clip(upper=upper)

# Same boxplot as before the cap, to confirm the long upper tail is gone
fig, ax = plt.subplots(figsize=(8, 7))
sns.boxplot(x=df["Insulin"], color="red", ax=ax)
ax.set_title("Insulin After Capping")
plt.show()


In [None]:

# Optional: boxplot for Pregnancies to inspect its distribution (no capping is applied here)
plt.figure(figsize=(8, 7))
sns.boxplot(x=df["Pregnancies"], color="red")
plt.title("Pregnancies Distribution")
plt.show()


In [None]:

# Remove multivariate outliers using Local Outlier Factor (LOF) on features only (exclude Outcome).
# LOF compares each row's density with that of its 10 nearest neighbours.
# NOTE(review): the features are not scaled at this point, so the distances are presumably
# dominated by wide-range columns such as Insulin — consider scaling before LOF.
lof = LocalOutlierFactor(n_neighbors=10)
# Despite the variable name, fit_predict returns labels rather than scores.
lof_scores = lof.fit_predict(df.drop(columns=["Outcome"]))

# -1 = outlier, 1 = inlier
mask_inliers = lof_scores == 1
# Keep inliers only and renumber the index so later positional logic sees 0..n-1
df = df[mask_inliers].reset_index(drop=True)

# Remaining (rows, columns) after outlier removal
df.shape


## 4. Feature Engineering: BMI, Insulin Score, Glucose Categories


In [None]:

# BMI categories using the standard WHO cut points.
# Bins are left-closed (right=False): [0, 18.5), [18.5, 25), [25, 30), [30, 35), [35, 40), [40, inf).
# That way a BMI of exactly 18.5 counts as "Normal", and values between the one-decimal
# cut-offs (e.g. 24.95 or 34.95) land in the lower class, as the WHO definition expects.
df["NewBMI"] = pd.cut(
    df["BMI"],
    bins=[0, 18.5, 25, 30, 35, 40, np.inf],
    right=False,
    labels=[
        "Underweight",
        "Normal",
        "Overweight",
        "Obesity 1",
        "Obesity 2",
        "Obesity 3",
    ],
)

# Number of rows per BMI category
df["NewBMI"].value_counts()


In [None]:

# Insulin score: Normal vs Abnormal
def set_insulin(row):
    """Classify one record's insulin reading.

    Returns "Normal" when ``row["Insulin"]`` lies in the inclusive range 16-166,
    and "Abnormal" for anything else.
    """
    within_range = 16 <= row["Insulin"] <= 166
    return "Normal" if within_range else "Abnormal"

# Apply the row-wise classifier to every record, then tally the two labels
df["NewInsulinScore"] = df.apply(set_insulin, axis=1)
df["NewInsulinScore"].value_counts()


In [None]:

# Glucose categories with meaningful medical cutoffs
# - Low:        <= 70
# - Normal:      71–99
# - Prediabetic: 100–125
# - High:       >= 126
# pd.cut's default right-closed bins give (0, 70], (70, 99], (99, 125], (125, inf),
# so a non-integer value such as 99.5 falls into "Prediabetic".
df["NewGlucose"] = pd.cut(
    df["Glucose"],
    bins=[0, 70, 99, 125, np.inf],
    labels=["Low", "Normal", "Prediabetic", "High"],
)
# Number of rows per glucose category
df["NewGlucose"].value_counts()


In [None]:

# Preview the engineered categorical columns before encoding
df.head()


In [None]:

# One-hot encode engineered categorical features.
# drop_first=True omits the first level of each feature, which becomes the implicit baseline;
# dtype=int produces 0/1 integer columns.
df = pd.get_dummies(
    df,
    columns=["NewBMI", "NewInsulinScore", "NewGlucose"],
    drop_first=True,
    dtype=int,
)

# Preview the encoded frame
df.head()


## 5. Train/Test Split and Scaling


In [None]:

# Separate features and target
y = df["Outcome"]
X = df.drop(columns=["Outcome"])

# Column order of the feature matrix, captured before scaling discards the column labels
feature_names = X.columns.tolist()

# Train/test split with stratification on Outcome (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

# Standard scaling (note: this scaler will be saved for deployment)
# The scaler is fitted on the training split only and then applied to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Sanity check: shapes of the two splits
X_train.shape, X_test.shape


## 6. Model Training and Evaluation


In [None]:

# Logistic Regression baseline (max_iter=1000 gives the solver more iterations to converge)
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

# Evaluate on the held-out test split
y_pred_log = log_reg.predict(X_test)
log_reg_acc = accuracy_score(y_test, y_pred_log)  # reused in the model comparison table

print("Logistic Regression Accuracy (test):", log_reg_acc)
print(confusion_matrix(y_test, y_pred_log))
print(classification_report(y_test, y_pred_log))


In [None]:

# K-Nearest Neighbors (KNN) with library-default hyperparameters (no tuning in this cell)
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

# Evaluate on the held-out test split
y_pred_knn = knn.predict(X_test)
knn_acc = accuracy_score(y_test, y_pred_knn)  # reused in the model comparison table

print("KNN Accuracy (test):", knn_acc)
print(confusion_matrix(y_test, y_pred_knn))
print(classification_report(y_test, y_pred_knn))


In [None]:

# Support Vector Machine (SVM) with GridSearchCV
# NOTE(review): probability=True adds an internal cross-validation to every fit, but
# predict_proba is never called on this model in the notebook — confirm it is needed.
svc = SVC(probability=True)

# Exhaustive grid over kernel, regularisation strength C and kernel width gamma.
# gamma has no effect on the linear kernel, so those combinations repeat the same model.
svc_param_grid = {
    "gamma": [0.0001, 0.001, 0.01, 0.1],
    "C": [0.01, 0.05, 0.5, 1, 10, 15, 20, 50, 100],
    "kernel": ["rbf", "linear"],
}

# 5-fold cross-validation on the training split, using all available cores
grid_search_svc = GridSearchCV(svc, svc_param_grid, cv=5, n_jobs=-1)
grid_search_svc.fit(X_train, y_train)

print("Best SVC Params:", grid_search_svc.best_params_)
print("Best CV Score (SVC):", grid_search_svc.best_score_)

# Evaluate the best configuration on the held-out test split
svc_best = grid_search_svc.best_estimator_
y_pred_svc = svc_best.predict(X_test)
svc_acc = accuracy_score(y_test, y_pred_svc)  # reused in the model comparison table

print("SVC Accuracy (test):", svc_acc)
print(confusion_matrix(y_test, y_pred_svc))
print(classification_report(y_test, y_pred_svc))


In [None]:

# Decision Tree Classifier + GridSearchCV
dt = DecisionTreeClassifier(random_state=42)

# max_features: "auto" is no longer accepted by scikit-learn (deprecated in 1.1, removed
# in 1.3), and before that it was only an alias for "sqrt" on classifiers. It therefore
# either made a third of the candidates fail to fit or duplicated the "sqrt" candidates.
# None (consider every feature at each split) takes its place.
dt_param_grid = {
    "criterion": ["gini", "entropy"],
    "max_depth": [3, 5, 7, 10],
    "splitter": ["best", "random"],
    "min_samples_leaf": [1, 2, 3, 5, 7],
    "min_samples_split": [2, 3, 5, 7],
    "max_features": [None, "sqrt", "log2"],
}

# 5-fold cross-validation on the training split, using all available cores
grid_search_dt = GridSearchCV(dt, dt_param_grid, cv=5, n_jobs=-1, verbose=1)
grid_search_dt.fit(X_train, y_train)

print("Best DT Params:", grid_search_dt.best_params_)
print("Best CV Score (DT):", grid_search_dt.best_score_)

# Evaluate the best configuration on the held-out test split
dt_best = grid_search_dt.best_estimator_
y_pred_dt = dt_best.predict(X_test)
dt_acc = accuracy_score(y_test, y_pred_dt)  # reused in the model comparison table

print("Decision Tree Accuracy (test):", dt_acc)
print(confusion_matrix(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))


In [None]:

# Random Forest Classifier with hand-picked hyperparameters (no search in this cell)
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=7,
    min_samples_leaf=3,
    min_samples_split=5,
    max_features="sqrt",
    class_weight="balanced",  # weight classes inversely to their frequency
    random_state=42,
)
rf.fit(X_train, y_train)

# Evaluate on the held-out test split
y_pred_rf = rf.predict(X_test)
rf_acc = accuracy_score(y_test, y_pred_rf)  # reused in the model comparison table

print("Random Forest Accuracy (test):", rf_acc)
print(confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))


In [None]:

# XGBoost Classifier with RandomizedSearchCV
xgb = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    use_label_encoder=False,  # NOTE(review): deprecated in recent xgboost releases — confirm it is still needed
    random_state=42,
)

# Candidate values for the random search; each iteration draws one value per key
param_dist = {
    "n_estimators": [100, 200, 300, 400, 500],
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "max_depth": [3, 4, 5, 6],
    "min_child_weight": [1, 3, 5],
    "gamma": [0, 0.1, 0.2, 0.4],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "reg_alpha": [0, 0.01, 0.1, 1],
    "reg_lambda": [1, 1.5, 2, 3],
}

# 50 sampled configurations, each scored by accuracy with 5-fold cross-validation;
# random_state fixes which configurations are drawn
random_search_xgb = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=50,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

random_search_xgb.fit(X_train, y_train)

print("Best XGBoost Params:", random_search_xgb.best_params_)
print("Best CV Score (XGBoost):", random_search_xgb.best_score_)

# Best configuration found by the search; also the base model for calibration later
xgb_best = random_search_xgb.best_estimator_

# Evaluate on the held-out test split
y_pred_xgb = xgb_best.predict(X_test)
xgb_acc = accuracy_score(y_test, y_pred_xgb)  # reused in the model comparison table

print("XGBoost Accuracy (test):", xgb_acc)
print(confusion_matrix(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))


## 7. Probability Calibration (for Better Probabilities)


In [None]:

# Calibrate the best XGBoost model using isotonic regression.
# With cv=5 the wrapper fits the estimator and a calibrator on 5 folds of the training split.
# The `estimator=` keyword needs scikit-learn >= 1.2 (older releases call it `base_estimator`).
# NOTE(review): isotonic calibration can overfit on small datasets — consider comparing
# against method="sigmoid".
calibrated_model = CalibratedClassifierCV(
    estimator=xgb_best,
    cv=5,
    method="isotonic",
)

calibrated_model.fit(X_train, y_train)

# Evaluate on the held-out test split
y_pred_cal = calibrated_model.predict(X_test)
cal_acc = accuracy_score(y_test, y_pred_cal)  # reused in the model comparison table

print("Calibrated XGBoost Accuracy (test):", cal_acc)
print(confusion_matrix(y_test, y_pred_cal))
print(classification_report(y_test, y_pred_cal))


## 8. Model Comparison


In [None]:

# Gather each model's test accuracy into one table and show it best-first
accuracy_by_model = {
    "Logistic Regression": log_reg_acc,
    "KNN": knn_acc,
    "SVC (Best GridSearch)": svc_acc,
    "Decision Tree (Best GridSearch)": dt_acc,
    "Random Forest": rf_acc,
    "XGBoost (Best)": xgb_acc,
    "Calibrated XGBoost": cal_acc,
}
models_df = pd.DataFrame(
    list(accuracy_by_model.items()),
    columns=["Model", "Accuracy"],
)

models_df.sort_values(by="Accuracy", ascending=False)


## 9. Save Final Scaler and Model for Deployment


In [None]:

# Save the StandardScaler and calibrated XGBoost model
# joblib.dump does not create missing folders, so make sure the target directory exists
# first; otherwise a fresh checkout without ../models fails on the last cell.
models_dir = Path("../models")
models_dir.mkdir(parents=True, exist_ok=True)

joblib.dump(scaler, models_dir / "scaler.pkl")
joblib.dump(calibrated_model, models_dir / "model.pkl")

print("Scaler and calibrated model saved to ../models/")
