# Diabetes Prediction - Model Comparison
This notebook compares several ML models to predict the likelihood of having diabetes.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings("ignore")

df = pd.read_csv("diabetes.csv")
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


## Prepare the data

In [None]:
X = df.drop(columns=["Outcome"])
y = df["Outcome"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## Train and evaluate models

In [None]:
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(probability=True),
    "KNN": KNeighborsClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    proba = model.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, preds)
    auc = roc_auc_score(y_test, proba)

    results.append((name, acc, auc))
    print(f"Model: {name}")
    print(f"Accuracy: {acc:.4f}, AUC: {auc:.4f}")
    print(classification_report(y_test, preds))
    print("-" * 40)

best_model_name, best_acc, best_auc = max(results, key=lambda x: x[2])
print(f"Best Model: {best_model_name} (AUC = {best_auc:.4f})")

best_model = models[best_model_name]


Model: Random Forest
Accuracy: 0.7208, AUC: 0.8120
              precision    recall  f1-score   support

           0       0.79      0.78      0.78        99
           1       0.61      0.62      0.61        55

    accuracy                           0.72       154
   macro avg       0.70      0.70      0.70       154
weighted avg       0.72      0.72      0.72       154

----------------------------------------
Model: Logistic Regression
Accuracy: 0.7468, AUC: 0.8129
              precision    recall  f1-score   support

           0       0.81      0.79      0.80        99
           1       0.64      0.67      0.65        55

    accuracy                           0.75       154
   macro avg       0.73      0.73      0.73       154
weighted avg       0.75      0.75      0.75       154

----------------------------------------
Model: SVM
Accuracy: 0.7662, AUC: 0.8066
              precision    recall  f1-score   support

           0       0.78      0.88      0.83        99
      

## Make a Prediction with Best Model

In [None]:
input_data = {
    "Pregnancies": 2,
    "Glucose": 120,
    "BloodPressure": 70,
    "SkinThickness": 20,
    "Insulin": 85,
    "BMI": 28.0,
    "DiabetesPedigreeFunction": 0.5,
    "Age": 35,
    "FamilyHistory": "Yes"  # We are not using this in model b/c it's not in the dataset, just going to drop it
}

input_df = pd.DataFrame([input_data])
input_features = input_df.drop(columns=["FamilyHistory"])

proba = best_model.predict_proba(input_features)[0][1]
print(f"Chance of having diabetes: {proba * 100:.2f}%")

prediction = best_model.predict(input_features)[0]
print("Prediction:", "Has diabetes" if prediction == 1 else "No diabetes")


Chance of having diabetes: 20.41%
Prediction: No diabetes


In [None]:
import joblib
joblib.dump(best_model, "best_diabetes_model.pkl")

['best_diabetes_model.pkl']