# Diabetes Dataset - Binary Classification Notebook with 5 Models

In [None]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time

from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# Build a DataFrame from the scikit-learn diabetes data and derive a binary
# label from the continuous progression score: 1 when the score is above 140,
# otherwise 0.
diabetes = load_diabetes()
df = pd.DataFrame(
    data=diabetes.data,
    columns=diabetes.feature_names,
).assign(target=(diabetes.target > 140).astype(int))
df.head()


In [None]:

# Report the share of each target class, then plot the raw count per class.
print("Class Distribution (Proportion):")
class_share = df['target'].value_counts(normalize=True)
print(class_share)

ax = sns.countplot(data=df, x='target')
ax.set_title("Target Class Distribution (High vs Low Disease Progression)")
plt.show()

# Summary statistics for every column; kept as the final expression of the cell.
df.describe()


In [None]:

# Separate the feature matrix from the binary label.
y = df["target"]
X = df.drop(columns="target")

# Every feature column goes through the same numeric treatment:
# median imputation followed by standardisation.
num_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler()),
])

preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipe, list(X.columns)),
])

# Hold out 20% of the rows for testing, stratified on the label so both
# splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [None]:

# Candidate classifiers, keyed by display name. Each value is a pair of
# (unfitted estimator, GridSearchCV parameter grid). Grid keys use the
# 'clf__' prefix because the estimator is installed as the 'clf' step of the
# pipeline built in the training loop.
#
# Every estimator with internal randomness is seeded with 42 so repeated runs
# of the notebook report the same metrics.
models = {
    'Logistic Regression': (
        LogisticRegression(max_iter=1000, class_weight='balanced'),
        {'clf__C': [0.1]}
    ),
    'Random Forest': (
        RandomForestClassifier(random_state=42),
        {'clf__n_estimators': [100], 'clf__max_depth': [None]}
    ),
    'Gradient Boosting': (
        # Seeded for reproducibility, consistent with the Random Forest entry.
        GradientBoostingClassifier(random_state=42),
        {'clf__n_estimators': [100]}
    ),
    'SVC': (
        # probability=True is needed for predict_proba; the probability
        # calibration shuffles the data internally, so seed it as well.
        SVC(probability=True, class_weight='balanced', random_state=42),
        {'clf__C': [1]}
    ),
    'KNN': (
        KNeighborsClassifier(),
        {'clf__n_neighbors': [5]}
    )
}


In [None]:

# Tune and evaluate every candidate model on the same train/test split.
# Each entry appended to `results` is a tuple of
# (model name, best params, accuracy, precision, recall, F1, ROC AUC).
results = []

for model_name, (model, param_grid) in models.items():
    print(f"Training: {model_name}")
    # perf_counter() is a monotonic, high-resolution clock, so the measured
    # duration cannot be distorted by system clock adjustments the way a
    # time.time() difference can.
    start = time.perf_counter()

    # Preprocess -> oversample with SMOTE -> classify. The imblearn pipeline
    # is used because a plain scikit-learn Pipeline cannot hold a sampler step.
    pipe = ImbPipeline([
        ('preprocess', preprocessor),
        ('smote', SMOTE(random_state=42)),
        ('clf', model)
    ])

    # 2-fold cross-validated search over the model's grid, selecting on F1.
    grid = GridSearchCV(pipe, param_grid, cv=2, scoring='f1', n_jobs=-1)
    grid.fit(X_train, y_train)

    # Score the refit best pipeline on the held-out test set.
    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test)
    y_prob = best_model.predict_proba(X_test)[:, 1]  # probability of class 1

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)

    results.append((model_name, grid.best_params_, acc, prec, rec, f1, auc))

    # The reported time covers the grid search, test-set prediction and
    # metric computation for this model.
    elapsed = time.perf_counter() - start
    print(f"Time: {elapsed:.2f} sec")
    print("Best Params:", grid.best_params_)
    print("Accuracy:", acc)
    print("Precision:", prec)
    print("Recall:", rec)
    print("F1 Score:", f1)
    print("ROC AUC:", auc)
    print()


In [None]:

# Tabulate the per-model metrics and show them with the best F1 score first.
metric_columns = [
    'Model',
    'Best Params',
    'Accuracy',
    'Precision',
    'Recall',
    'F1 Score',
    'ROC AUC',
]
results_df = pd.DataFrame(data=results, columns=metric_columns)
results_df.sort_values(by="F1 Score", ascending=False)
