In [None]:
import pandas as pd

In [None]:
import numpy as np

In [None]:
import matplotlib.pyplot as plt

In [None]:
import seaborn as sns

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve

Load Dataset

In [None]:
# Read the dataset from a CSV file located in the notebook's working directory.
df = pd.read_csv("diabetes.csv")

In [None]:
# Preview the first rows of the loaded frame.
df.head()

Prepare Data

In [None]:
# Columns in which a literal 0 is treated as a missing measurement.
cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# For each column: turn zeros into NaN, then fill every gap with that column's median.
# NOTE(review): the medians are computed over all rows of df -- confirm whether they
# should be derived from training rows only.
for c in cols:
    without_zeros = df[c].replace(0, np.nan)
    df[c] = without_zeros.fillna(without_zeros.median())

In [None]:
# Remove fully duplicated rows, keeping the first occurrence (row labels are left as-is).
df = df.drop_duplicates(keep='first')

In [None]:
# Display the frame's dimensions as (rows, columns).
df.shape

Prepare Features and Target

In [None]:
# Feature matrix: every column except the target.
X = df.drop(columns='Outcome')

In [None]:
# Target vector: the 'Outcome' column.
y = df.loc[:, 'Outcome']

Apply Feature Scaling

In [None]:
# Standardising scaler with default settings (not fitted yet).
scaler = StandardScaler()

In [None]:
# Learn the scaling statistics from X and apply them to X; the result is a NumPy array.
# NOTE(review): the scaler is fitted on every row of X (no train/test separation here).
X_scaled = scaler.fit(X).transform(X)

In [None]:
# Re-wrap the scaled array as a DataFrame that keeps X's column names *and* X's row labels.
# Without index=X.index the frame would get a fresh 0..n-1 index, which stops matching
# y's labels as soon as drop_duplicates() has removed a row.
X_scaled = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

Split Data

In [None]:
# Split the *unscaled* features first so the hold-out rows stay unseen during preprocessing.
# The split depends only on the row count and the seed, so the same rows land in
# train/test as when splitting the pre-scaled matrix.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaling statistics on the training rows only, then apply them to both splits.
# A dedicated scaler is used so the `scaler` object fitted earlier is left untouched.
split_scaler = StandardScaler().fit(X_train_raw)
X_train = pd.DataFrame(
    split_scaler.transform(X_train_raw), columns=X.columns, index=X_train_raw.index
)
X_test = pd.DataFrame(
    split_scaler.transform(X_test_raw), columns=X.columns, index=X_test_raw.index
)

In [None]:
# Display the (rows, columns) of the training and test feature matrices.
X_train.shape,X_test.shape

Train All Models

In [None]:
# Logistic regression with library-default hyper-parameters.
lr = LogisticRegression()

In [None]:
# Train on the training split; the fitted estimator is the cell's output.
lr.fit(X=X_train, y=y_train)

In [None]:
# k-nearest-neighbours classifier voting over the 11 closest training points.
knn = KNeighborsClassifier(n_neighbors=11)

In [None]:
# Train on the training split; the fitted estimator is the cell's output.
knn.fit(X=X_train, y=y_train)

In [None]:
# Decision tree with a fixed seed so repeated runs build the same tree.
dt = DecisionTreeClassifier(random_state=42)

In [None]:
# Train on the training split; the fitted estimator is the cell's output.
dt.fit(X=X_train, y=y_train)

Make Predictions

In [None]:
# Hard class predictions of the logistic regression on the test split.
y_pred_lr = lr.predict(X_test)

In [None]:
# Hard class predictions of the KNN model on the test split.
y_pred_knn = knn.predict(X_test)

In [None]:
# Hard class predictions of the decision tree on the test split.
y_pred_dt = dt.predict(X_test)

Task 1: Compare Models Using Evaluation Metrics

In [None]:
# Test-set accuracy for each model, in the order LR, KNN, decision tree.
accuracy_lr, accuracy_knn, accuracy_dt = (
    accuracy_score(y_test, preds) for preds in (y_pred_lr, y_pred_knn, y_pred_dt)
)

In [None]:
# Report the three accuracies, one per line, rounded to four decimals.
print(
    f"Logistic Regression Accuracy: {accuracy_lr:.4f}",
    f"KNN Accuracy: {accuracy_knn:.4f}",
    f"Decision Tree Accuracy: {accuracy_dt:.4f}",
    sep="\n",
)

In [None]:
# Test-set precision for each model, in the order LR, KNN, decision tree.
precision_lr, precision_knn, precision_dt = (
    precision_score(y_test, preds) for preds in (y_pred_lr, y_pred_knn, y_pred_dt)
)

In [None]:
# Report the three precision values, one per line, rounded to four decimals.
print(
    f"Logistic Regression Precision: {precision_lr:.4f}",
    f"KNN Precision: {precision_knn:.4f}",
    f"Decision Tree Precision: {precision_dt:.4f}",
    sep="\n",
)

In [None]:
# Test-set recall for each model, in the order LR, KNN, decision tree.
recall_lr, recall_knn, recall_dt = (
    recall_score(y_test, preds) for preds in (y_pred_lr, y_pred_knn, y_pred_dt)
)

In [None]:
# Report the three recall values, one per line, rounded to four decimals.
print(
    f"Logistic Regression Recall: {recall_lr:.4f}",
    f"KNN Recall: {recall_knn:.4f}",
    f"Decision Tree Recall: {recall_dt:.4f}",
    sep="\n",
)

In [None]:
# Test-set F1 score for each model, in the order LR, KNN, decision tree.
f1_lr, f1_knn, f1_dt = (
    f1_score(y_test, preds) for preds in (y_pred_lr, y_pred_knn, y_pred_dt)
)

In [None]:
# Report the three F1 scores, one per line, rounded to four decimals.
print(
    f"Logistic Regression F1-Score: {f1_lr:.4f}",
    f"KNN F1-Score: {f1_knn:.4f}",
    f"Decision Tree F1-Score: {f1_dt:.4f}",
    sep="\n",
)

In [None]:
# One row per model, one column per test-set metric.
comparison_df = pd.DataFrame(
    [
        ['Logistic Regression', accuracy_lr, precision_lr, recall_lr, f1_lr],
        ['KNN', accuracy_knn, precision_knn, recall_knn, f1_knn],
        ['Decision Tree', accuracy_dt, precision_dt, recall_dt, f1_dt],
    ],
    columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score'],
)

In [None]:
# Display the per-model metric table.
comparison_df

In [None]:
# Grouped bar chart: one cluster per model, one bar per metric.
ax = comparison_df.set_index('Model').plot(kind='bar', figsize=(12, 6))
ax.set_ylabel('Score')
ax.set_title('Model Comparison - All Metrics')
ax.tick_params(axis='x', labelrotation=45)
# All four metrics live in [0, 1]; show the full range so a score below 0.5 is not
# clipped out of view and bar heights stay proportional to the values.
ax.set_ylim(0.0, 1.0)
# Bars now start at zero, so place the legend outside the axes instead of on top of them.
ax.legend(loc='upper left', bbox_to_anchor=(1.02, 1.0))
plt.tight_layout()
# Render explicitly so the cell shows only the figure, not a return-value repr.
plt.show()

Compare Training vs Testing Performance

In [None]:
# Accuracy of each fitted model on the training split (LR, KNN, decision tree).
train_acc_lr, train_acc_knn, train_acc_dt = (
    model.score(X_train, y_train) for model in (lr, knn, dt)
)

In [None]:
# One row per model: accuracy on the training split next to accuracy on the test split.
train_test_df = pd.DataFrame(
    [
        ['Logistic Regression', train_acc_lr, accuracy_lr],
        ['KNN', train_acc_knn, accuracy_knn],
        ['Decision Tree', train_acc_dt, accuracy_dt],
    ],
    columns=['Model', 'Train Accuracy', 'Test Accuracy'],
)

In [None]:
# Display the train-vs-test accuracy table.
train_test_df

In [None]:
# Grouped bar chart: training vs. test accuracy for each model.
ax = train_test_df.set_index('Model').plot(kind='bar', figsize=(10, 6))
ax.set_ylabel('Accuracy')
ax.set_title('Training vs Testing Accuracy')
ax.tick_params(axis='x', labelrotation=45)
# Accuracy lives in [0, 1]; use the full range so a value below 0.6 is not clipped
# and the train/test gap is not visually exaggerated by a truncated axis.
ax.set_ylim(0.0, 1.0)
# Bars now start at zero, so place the legend outside the axes instead of on top of them.
ax.legend(loc='upper left', bbox_to_anchor=(1.02, 1.0))
plt.tight_layout()
# Render explicitly so the cell shows only the figure, not a return-value repr.
plt.show()

Cross-Validation Scores

In [None]:
# 5-fold cross-validation for logistic regression.
# The classifier is wrapped in a pipeline and fed the unscaled X so the scaler is
# re-fitted on each fold's training part only; passing a matrix scaled on all rows
# would let held-out-fold statistics leak into the preprocessing.
cv_lr = cross_val_score(make_pipeline(StandardScaler(), lr), X, y, cv=5)

In [None]:
# Display the per-fold scores for logistic regression.
cv_lr

In [None]:
# Mean of the per-fold scores for logistic regression.
cv_lr.mean()

In [None]:
# 5-fold cross-validation for KNN.
# The classifier is wrapped in a pipeline and fed the unscaled X so the scaler is
# re-fitted on each fold's training part only; passing a matrix scaled on all rows
# would let held-out-fold statistics leak into the distance computation.
cv_knn = cross_val_score(make_pipeline(StandardScaler(), knn), X, y, cv=5)

In [None]:
# Display the per-fold scores for KNN.
cv_knn

In [None]:
# Mean of the per-fold scores for KNN.
cv_knn.mean()

In [None]:
# 5-fold cross-validation for the decision tree.
# The same scaler+model pipeline on the unscaled X is used as for the other models,
# so all three are evaluated with per-fold preprocessing and no cross-fold leakage.
cv_dt = cross_val_score(make_pipeline(StandardScaler(), dt), X, y, cv=5)

In [None]:
# Display the per-fold scores for the decision tree.
cv_dt

In [None]:
# Mean of the per-fold scores for the decision tree.
cv_dt.mean()

In [None]:
# Summarise the cross-validation runs: mean and standard deviation of the fold scores.
cv_scores = {'Logistic Regression': cv_lr, 'KNN': cv_knn, 'Decision Tree': cv_dt}
cv_df = pd.DataFrame({
    'Model': list(cv_scores),
    'CV Mean': [scores.mean() for scores in cv_scores.values()],
    'CV Std': [scores.std() for scores in cv_scores.values()],
})

In [None]:
# Display the cross-validation summary table.
cv_df

Confusion Matrix Comparison

In [None]:
# One confusion-matrix heatmap per model, laid out side by side.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# (panel title, test-set predictions, colour map) for each model, left to right.
panels = (
    ('Logistic Regression', y_pred_lr, 'Blues'),
    ('KNN', y_pred_knn, 'Greens'),
    ('Decision Tree', y_pred_dt, 'Oranges'),
)
for panel_ax, (panel_title, panel_pred, panel_cmap) in zip(axes, panels):
    sns.heatmap(
        confusion_matrix(y_test, panel_pred),
        annot=True,
        fmt='d',
        ax=panel_ax,
        cmap=panel_cmap,
    )
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Predicted')
    panel_ax.set_ylabel('Actual')

plt.tight_layout()

Classification Reports

In [None]:
# Heading followed by the per-class precision/recall/F1 text report.
print("Logistic Regression:", classification_report(y_test, y_pred_lr), sep="\n")

In [None]:
# Heading followed by the per-class precision/recall/F1 text report.
print("KNN:", classification_report(y_test, y_pred_knn), sep="\n")

In [None]:
# Heading followed by the per-class precision/recall/F1 text report.
print("Decision Tree:", classification_report(y_test, y_pred_dt), sep="\n")

ROC Curve and AUC Score

In [None]:
# Second column of predict_proba for each model on the test split
# (presumably the probability of the positive class, given a 0/1 target -- verify via classes_).
y_pred_proba_lr, y_pred_proba_knn, y_pred_proba_dt = (
    model.predict_proba(X_test)[:, 1] for model in (lr, knn, dt)
)

In [None]:
# Area under the ROC curve for each model, computed from the predicted probabilities.
auc_lr, auc_knn, auc_dt = (
    roc_auc_score(y_test, proba)
    for proba in (y_pred_proba_lr, y_pred_proba_knn, y_pred_proba_dt)
)

In [None]:
# Report the three AUC values, one per line, rounded to four decimals.
print(
    f"Logistic Regression AUC: {auc_lr:.4f}",
    f"KNN AUC: {auc_knn:.4f}",
    f"Decision Tree AUC: {auc_dt:.4f}",
    sep="\n",
)

In [None]:
# ROC curve coordinates (false/true positive rates) per model; the third return value
# of roc_curve (the thresholds) is not needed and is discarded into `_`.
fpr_lr, tpr_lr, _ = roc_curve(y_true=y_test, y_score=y_pred_proba_lr)
fpr_knn, tpr_knn, _ = roc_curve(y_true=y_test, y_score=y_pred_proba_knn)
fpr_dt, tpr_dt, _ = roc_curve(y_true=y_test, y_score=y_pred_proba_dt)

In [None]:
# Overlay the three ROC curves on one set of axes, with the chance diagonal for reference.
roc_fig, roc_ax = plt.subplots(figsize=(10, 6))

# (false positive rates, true positive rates, legend label) per model.
roc_curves = (
    (fpr_lr, tpr_lr, f'Logistic Regression (AUC={auc_lr:.3f})'),
    (fpr_knn, tpr_knn, f'KNN (AUC={auc_knn:.3f})'),
    (fpr_dt, tpr_dt, f'Decision Tree (AUC={auc_dt:.3f})'),
)
for fpr_vals, tpr_vals, curve_label in roc_curves:
    roc_ax.plot(fpr_vals, tpr_vals, label=curve_label)

roc_ax.plot([0, 1], [0, 1], 'k--', label='Random')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curves - Model Comparison')
roc_ax.legend()
roc_ax.grid(True)

Task 2: Select Best Model

In [None]:
# Name of the model with the highest test accuracy (first one wins on ties).
best_accuracy = comparison_df.set_index('Model')['Accuracy'].idxmax()

In [None]:
# Report which model scored the highest test accuracy.
print(f"Best Model by Accuracy: {best_accuracy}")

In [None]:
# Name of the model with the highest test F1 score (first one wins on ties).
best_f1 = comparison_df.set_index('Model')['F1-Score'].idxmax()

In [None]:
# Report which model scored the highest test F1 score.
print(f"Best Model by F1-Score: {best_f1}")

In [None]:
# Name of the model with the largest AUC (np.argmax returns the first maximum on ties).
auc_by_model = {'Logistic Regression': auc_lr, 'KNN': auc_knn, 'Decision Tree': auc_dt}
best_auc_model = list(auc_by_model)[np.argmax(list(auc_by_model.values()))]

In [None]:
# Report which model achieved the largest AUC.
print(f"Best Model by AUC: {best_auc_model}")

In [None]:
# Display the metric table ranked from highest to lowest test accuracy.
comparison_df.sort_values(by='Accuracy', ascending=False)

Task 3: Summarize Results and Observations

In [None]:
# One row per model, gathering the headline numbers computed in the earlier cells.
summary_df = pd.DataFrame(
    [
        ['Logistic Regression', accuracy_lr, train_acc_lr, cv_lr.mean(), f1_lr, auc_lr],
        ['KNN', accuracy_knn, train_acc_knn, cv_knn.mean(), f1_knn, auc_knn],
        ['Decision Tree', accuracy_dt, train_acc_dt, cv_dt.mean(), f1_dt, auc_dt],
    ],
    columns=['Model', 'Test Accuracy', 'Train Accuracy', 'CV Mean', 'F1-Score', 'AUC'],
)

In [None]:
# Display the summary table.
summary_df

In [None]:
# Train-minus-test accuracy gap per model, stored as a new 'Overfitting' column.
summary_df['Overfitting'] = summary_df['Train Accuracy'].sub(summary_df['Test Accuracy'])

In [None]:
# Re-display the summary table, which now includes the 'Overfitting' column.
summary_df

In [None]:
# Display the summary ranked from highest to lowest test accuracy.
summary_df.sort_values(by='Test Accuracy', ascending=False)