In [26]:
# In a Jupyter cell, prefix with ! 
!pip install --upgrade 'numpy<2' 'scipy<1.14' 'scikit-learn<1.4'


Collecting numpy<2
  Using cached numpy-1.26.4-cp311-cp311-macosx_10_9_x86_64.whl.metadata (61 kB)
Collecting scipy<1.14
  Using cached scipy-1.13.1-cp311-cp311-macosx_10_9_x86_64.whl.metadata (60 kB)
Collecting scikit-learn<1.4
  Downloading scikit_learn-1.3.2-cp311-cp311-macosx_10_9_x86_64.whl.metadata (11 kB)
Using cached numpy-1.26.4-cp311-cp311-macosx_10_9_x86_64.whl (20.6 MB)
Using cached scipy-1.13.1-cp311-cp311-macosx_10_9_x86_64.whl (39.3 MB)
Downloading scikit_learn-1.3.2-cp311-cp311-macosx_10_9_x86_64.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
Installing collected packages: numpy, scipy, scikit-learn
[2K  Attempting uninstall: numpy
[2K    Found existing installation: numpy 2.2.5
[2K    Uninstalling numpy-2.2.5:━━━━━━━━━━━━━━━━━━━[0m [32m0/3[0m [numpy]
[2K      Successfully uninstalled numpy-2.2.5━━━━━━[0m [32m0/3[0m [numpy]
[2K  Attempting uninstall: scipy━━━

In [3]:
# Cell 1: Imports and Load Data
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc, classification_report, confusion_matrix
from sklearn.utils import resample



# Load your data (adjust path as needed)
data_path = "/Users/hamidahmad/Desktop/Diabetes.csv"
df = pd.read_csv(data_path)


In [5]:
# Cell 2: Quick Data Inspection
print("Shape:", df.shape)
print("Dtypes:\n", df.dtypes)
print("Missing values per column:\n", df.isna().sum())
print("Diabetes distribution (normalized):\n", df['diabetes'].value_counts(normalize=True))


Shape: (100000, 9)
Dtypes:
 gender                  object
age                    float64
hypertension             int64
heart_disease            int64
smoking_history         object
bmi                    float64
HbA1c_level            float64
blood_glucose_level      int64
diabetes                 int64
dtype: object
Missing values per column:
 gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64
Diabetes distribution (normalized):
 diabetes
0    0.915
1    0.085
Name: proportion, dtype: float64


In [7]:
# Cell 3: Feature Lists & Optional Outlier Check
numeric_feats = [
    "age", "hypertension", "heart_disease", "bmi",
    "HbA1c_level", "blood_glucose_level"
]
categorical_feats = ["gender", "smoking_history"]
target = "diabetes"

# Optional: boxplots for numeric features
# import matplotlib.pyplot as plt
# for col in numeric_feats:
#     plt.figure()
#     df.boxplot(column=col)
#     plt.title(f"Boxplot of {col}")


In [9]:
# Cell 4: Preprocessing Pipeline
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_feats),
    ('cat', categorical_transformer, categorical_feats),
])


In [11]:
# Cell 5: Stratified Train/Val/Test Split
X = df[numeric_feats + categorical_feats]
y = df[target]

# 70% train, 30% temp
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=42
)
# Split temp into 15% val, 15% test
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, stratify=y_temp, test_size=0.5, random_state=42
)

print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")


Train: (70000, 8), Val: (15000, 8), Test: (15000, 8)


In [13]:
# Cell 6: Baseline Dummy Classifier
baseline_pipe = Pipeline([
    ('prep', preprocessor),
    ('clf', DummyClassifier(strategy='most_frequent'))
])

baseline_pipe.fit(X_train, y_train)
y_base_proba = baseline_pipe.predict_proba(X_test)[:, 1]
print("Baseline ROC-AUC:", roc_auc_score(y_test, y_base_proba))


Baseline ROC-AUC: 0.5


In [15]:
# Cell 7: RandomForest with Class Weights (GridSearchCV)
rf_pipe = Pipeline([
    ('prep', preprocessor),
    ('clf', RandomForestClassifier(random_state=42, class_weight='balanced'))
])

param_grid_rf = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [None, 5, 10],
    'clf__min_samples_leaf': [1, 5],
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
gs_rf = GridSearchCV(
    rf_pipe, param_grid_rf,
    cv=cv, scoring='roc_auc', n_jobs=-1, verbose=1
)
gs_rf.fit(X_train, y_train)

print("Best RF params:", gs_rf.best_params_)
print("Validation ROC-AUC:", gs_rf.best_score_)
y_rf_proba = gs_rf.predict_proba(X_test)[:, 1]
print("Test ROC-AUC:", roc_auc_score(y_test, y_rf_proba))


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best RF params: {'clf__max_depth': 10, 'clf__min_samples_leaf': 5, 'clf__n_estimators': 200}
Validation ROC-AUC: 0.9751471736235476
Test ROC-AUC: 0.9767013393335475


In [17]:
# Cell 8: Manual Upsampling of Minority Class
# Combine train data
df_train = pd.concat([X_train, y_train.rename(target)], axis=1)
# Separate classes
df_majority = df_train[df_train[target] == 0]
df_minority = df_train[df_train[target] == 1]
# Upsample minority
df_minority_upsampled = resample(
    df_minority, replace=True,
    n_samples=len(df_majority), random_state=42
)
# Recombine
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

X_train_up = df_upsampled[numeric_feats + categorical_feats]
y_train_up = df_upsampled[target]

print("After upsampling, counts:")
print(y_train_up.value_counts())


After upsampling, counts:
diabetes
0    64050
1    64050
Name: count, dtype: int64


In [19]:
# Cell 9: RandomForest on Upsampled Data
rf_up_pipe = Pipeline([
    ('prep', preprocessor),
    ('clf', RandomForestClassifier(random_state=42))
])

param_grid_up = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [None, 5, 10],
}

gs_up = GridSearchCV(
    rf_up_pipe, param_grid_up,
    cv=cv, scoring='roc_auc', n_jobs=-1, verbose=1
)
gs_up.fit(X_train_up, y_train_up)

print("Best Upsampled RF params:", gs_up.best_params_)
print("Validation ROC-AUC (upsampled):", gs_up.best_score_)
y_up_proba = gs_up.predict_proba(X_test)[:, 1]
print("Test ROC-AUC (upsampled):", roc_auc_score(y_test, y_up_proba))


Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Upsampled RF params: {'clf__max_depth': None, 'clf__n_estimators': 200}
Validation ROC-AUC (upsampled): 0.999670683824905
Test ROC-AUC (upsampled): 0.96675629843923


In [25]:
# Cell 12: Threshold Tuning Table
from sklearn.metrics import precision_recall_curve
import pandas as pd
import numpy as np

# Get prediction probabilities for the positive class
probs = best_model.predict_proba(X_test)[:, 1]

# Compute precision, recall for many thresholds
precision, recall, thresholds = precision_recall_curve(y_test, probs)

# Align thresholds with the corresponding precision & recall
# (precision/recall arrays are one element longer than thresholds)
df_thresholds = pd.DataFrame({
    'threshold': thresholds,
    'precision': precision[1:],   # skip the first element (threshold=-inf)
    'recall':    recall[1:]
})

# Display the full table (or use .head() to show top 10)
df_thresholds


Unnamed: 0,threshold,precision,recall
0,0.001688,0.085006,1.000000
1,0.002901,0.085011,1.000000
2,0.003369,0.085068,1.000000
3,0.003471,0.085079,1.000000
4,0.003531,0.085170,1.000000
...,...,...,...
14061,0.994586,1.000000,0.003137
14062,0.994653,1.000000,0.002353
14063,0.995626,1.000000,0.001569
14064,0.996309,1.000000,0.000784
