In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
df = pd.read_csv("diabetes.csv")
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
X = df.drop("Outcome", axis=1)
y = df["Outcome"]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
def evaluate_model(name, model):
    y_pred = model.predict(X_test)
    print(f"\n{name} Evaluation:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"Precision: {precision_score(y_test, y_pred):.4f}")
    print(f"Recall: {recall_score(y_test, y_pred):.4f}")
    print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")

In [6]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC()
}

print("Base model training")
for name, model in models.items():
    model.fit(X_train, y_train)
    evaluate_model(name, model)

Base model training

Logistic Regression Evaluation:
Accuracy: 0.7468
Precision: 0.6379
Recall: 0.6727
F1 Score: 0.6549

Random Forest Evaluation:
Accuracy: 0.7403
Precision: 0.6415
Recall: 0.6182
F1 Score: 0.6296

SVM Evaluation:
Accuracy: 0.7662
Precision: 0.7209
Recall: 0.5636
F1 Score: 0.6327


In [7]:
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10]
}
grid_rf = GridSearchCV(RandomForestClassifier(), param_grid_rf, cv=5, scoring='f1')
grid_rf.fit(X_train, y_train)
best_rf = grid_rf.best_estimator_
evaluate_model("Tuned Random Forest", best_rf)


Tuned Random Forest Evaluation:
Accuracy: 0.7597
Precision: 0.6607
Recall: 0.6727
F1 Score: 0.6667


In [8]:
from scipy.stats import loguniform
param_dist_svc = {
    'C': loguniform(0.1, 100),
    'gamma': loguniform(0.001, 0.1),
    'kernel': ['rbf']
}
random_svc = RandomizedSearchCV(SVC(), param_distributions=param_dist_svc, n_iter=10,
                                cv=5, scoring='f1', random_state=42)
random_svc.fit(X_train, y_train)
best_svc = random_svc.best_estimator_
evaluate_model("Tuned SVM", best_svc)


Tuned SVM Evaluation:
Accuracy: 0.6429
Precision: 0.5000
Recall: 0.4545
F1 Score: 0.4762


In [9]:
print("\nFinal Comparison:")
models_final = {
    "Logistic Regression": models["Logistic Regression"],
    "Tuned Random Forest": best_rf,
    "Tuned SVM": best_svc
}
for name, model in models_final.items():
    evaluate_model(name, model)


Final Comparison:

Logistic Regression Evaluation:
Accuracy: 0.7468
Precision: 0.6379
Recall: 0.6727
F1 Score: 0.6549

Tuned Random Forest Evaluation:
Accuracy: 0.7597
Precision: 0.6607
Recall: 0.6727
F1 Score: 0.6667

Tuned SVM Evaluation:
Accuracy: 0.6429
Precision: 0.5000
Recall: 0.4545
F1 Score: 0.4762
