In [None]:
import pandas as pd

In [None]:
import numpy as np

In [None]:
import matplotlib.pyplot as plt

In [None]:
import seaborn as sns

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
from sklearn.tree import plot_tree

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

Load Dataset

In [None]:
# Read the diabetes dataset from a CSV file located in the working directory.
DATA_PATH = "diabetes.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Prepare Data (Cleaning Steps from Day 4)

In [None]:
# Columns in which a literal 0 is treated as a missing value.
cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for col in cols:
    # Convert zeros to NaN first so they do not take part in the median,
    # then fill the gaps with that column's median.
    with_gaps = df[col].replace(0, np.nan)
    df[col] = with_gaps.fillna(with_gaps.median())

In [None]:
# Remove fully duplicated rows, keeping the first occurrence of each.
# The surviving rows keep their original index labels (the index is not reset).
df = df.drop_duplicates(keep='first')

In [None]:
# Preview the first five rows after imputation and de-duplication.
df.head(n=5)

Prepare Features and Target

In [None]:
# Feature matrix: every column except the target.
X = df.drop(columns='Outcome')

In [None]:
# Target vector: the 'Outcome' column.
y = df.loc[:, 'Outcome']

In [None]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

Apply Feature Scaling

In [None]:
# Standardiser with centring and unit-variance scaling spelled out
# (these are the estimator's defaults).
scaler = StandardScaler(with_mean=True, with_std=True)

In [None]:
# Learn per-column mean/std from X, then apply the transformation to X.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# further down, so test rows contribute to the scaling statistics. Consider
# fitting on the training split only.
X_scaled = scaler.fit(X).transform(X)

In [None]:
# Wrap the scaled values in a DataFrame that reuses X's column names AND X's
# row index. Without `index=X.index` the frame would get a fresh 0..n-1 index,
# while `y` keeps the labels left after `drop_duplicates`; the two would then
# disagree for any label-based operation if rows had been dropped.
X_scaled = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

In [None]:
# Preview the first five rows of the standardised features.
X_scaled.head(n=5)

Split Data into Training and Testing Sets

In [None]:
# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
# `stratify=y` keeps the class proportions of the target the same in both
# splits, so the test metrics are not skewed by a split that happens to over-
# or under-represent one class.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# (rows, columns) of the training feature split.
X_train.shape

In [None]:
# (rows, columns) of the held-out test feature split.
X_test.shape

Task 1: Train K-Nearest Neighbors Model

In [None]:
# Baseline KNN classifier; n_neighbors=5 is the library default, made explicit.
knn = KNeighborsClassifier(n_neighbors=5)

In [None]:
# Fit the baseline KNN model on the training split.
knn.fit(X=X_train, y=y_train)

In [None]:
# Predict labels for the held-out test rows with the baseline KNN model.
y_pred_knn = knn.predict(X=X_test)

In [None]:
# Display the labels the baseline KNN model predicted for X_test.
y_pred_knn

In [None]:
# Test-set accuracy of the baseline KNN model.
accuracy_score(y_true=y_test, y_pred=y_pred_knn)

Task 2: Tune the Value of K

In [None]:
# Candidate neighbour counts to evaluate: 1 through MAX_K inclusive.
MAX_K = 30
k_values = range(1, MAX_K + 1)

In [None]:
# For every candidate k, fit a KNN model on the training split and record its
# accuracy on both the training and the test split (same order as k_values).
train_scores = []
test_scores = []
for k in k_values:
    # Use a loop-local name so the baseline `knn` model fitted earlier is not
    # silently replaced by the last model of this sweep.
    knn_k = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    train_scores.append(knn_k.score(X_train, y_train))
    test_scores.append(knn_k.score(X_test, y_test))

In [None]:
# Training-split accuracy for each k, in the order of k_values.
train_scores

In [None]:
# Test-split accuracy for each k, in the order of k_values.
test_scores

In [None]:
# Plot training and test accuracy against k to show how the number of
# neighbours affects fit on seen versus unseen data.
plt.plot(k_values, train_scores, label='Train')
plt.plot(k_values, test_scores, label='Test')
plt.xlabel('K Value')
plt.ylabel('Accuracy')
plt.legend()
plt.title('KNN Performance vs K Value')
# Render explicitly so the cell does not echo the Text(...) object returned by
# plt.title, matching the tree-plot cells that already call plt.show().
plt.show()

In [None]:
# Pick the k whose test-split accuracy is highest (first such k on ties).
# NOTE(review): k is selected using test accuracy, so the test score later
# reported for this k is optimistically biased. Consider choosing k with
# cross-validation on the training split instead.
best_index = int(np.argmax(test_scores))
best_k = k_values[best_index]

In [None]:
# Display the selected number of neighbours.
best_k

In [None]:
# Highest test-split accuracy observed across all candidate k values.
test_scores[int(np.argmax(test_scores))]

Task 3: Train KNN with Best K and Evaluate

In [None]:
# KNN classifier configured with the neighbour count selected above.
knn_best_params = {'n_neighbors': best_k}
knn_best = KNeighborsClassifier(**knn_best_params)

In [None]:
# Fit the tuned KNN model on the training split.
knn_best.fit(X=X_train, y=y_train)

In [None]:
# Predict labels for the test rows with the tuned KNN model.
y_pred_knn_best = knn_best.predict(X=X_test)

In [None]:
# Test-set accuracy of the tuned KNN model.
accuracy_score(y_true=y_test, y_pred=y_pred_knn_best)

In [None]:
# Confusion matrix of the tuned KNN model (rows: true class, columns: predicted).
confusion_matrix(y_true=y_test, y_pred=y_pred_knn_best)

In [None]:
# Per-class precision/recall/F1 for the tuned KNN model; printed because the
# report is a pre-formatted multi-line string.
knn_report = classification_report(y_true=y_test, y_pred=y_pred_knn_best)
print(knn_report)

In [None]:
# Accuracy of the tuned KNN model on the data it was trained on.
knn_best.score(X=X_train, y=y_train)

In [None]:
# Accuracy of the tuned KNN model on the held-out test split.
knn_best.score(X=X_test, y=y_test)

Task 4: Train Decision Tree Classifier

In [None]:
# Decision tree with default hyperparameters. The seed is fixed (same value
# as the train/test split) because tree construction shuffles candidate
# features at each split, so an unseeded tree can differ between runs and
# change the reported metrics, feature importances and plots.
dt = DecisionTreeClassifier(random_state=42)

In [None]:
# Fit the decision tree on the training split.
dt.fit(X=X_train, y=y_train)

In [None]:
# Predict labels for the test rows with the fitted decision tree.
y_pred_dt = dt.predict(X=X_test)

In [None]:
# Display the labels the decision tree predicted for X_test.
y_pred_dt

In [None]:
# Importance score of each feature in the fitted tree, one value per training column.
dt.feature_importances_

In [None]:
# Pair each feature name with its importance and list the most important first.
(
    pd.DataFrame(
        {
            'Feature': X.columns,
            'Importance': dt.feature_importances_,
        }
    )
    .sort_values(by='Importance', ascending=False)
)

Task 5: Visualize and Analyze Tree Structure

In [None]:
# Draw the complete fitted tree, colouring nodes by majority class.
plt.figure(figsize=(20, 10))
plot_tree(
    dt,
    # Pass a plain list: some scikit-learn releases validate `feature_names`
    # strictly and reject a pandas Index.
    feature_names=list(X.columns),
    class_names=['No Diabetes', 'Diabetes'],
    filled=True,
)
plt.show()

In [None]:
# Depth of the fitted decision tree.
dt.get_depth()

In [None]:
# Number of leaf nodes in the fitted decision tree.
dt.get_n_leaves()

In [None]:
# Draw only the top levels of the tree (max_depth=3) so the first splits are
# readable.
plt.figure(figsize=(20, 10))
plot_tree(
    dt,
    # Pass a plain list: some scikit-learn releases validate `feature_names`
    # strictly and reject a pandas Index.
    feature_names=list(X.columns),
    class_names=['No Diabetes', 'Diabetes'],
    filled=True,
    max_depth=3,
)
plt.show()

In [None]:
# Horizontal bar chart of the decision tree's feature importances, one bar per
# feature column.
plt.figure(figsize=(10, 6))
plt.barh(X.columns, dt.feature_importances_)
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importance - Decision Tree')
# Render explicitly so the cell does not echo the Text(...) object returned by
# plt.title, matching the tree-plot cells that already call plt.show().
plt.show()

Task 6: Evaluate Decision Tree Performance

In [None]:
# Test-set accuracy of the decision tree.
accuracy_score(y_true=y_test, y_pred=y_pred_dt)

In [None]:
# Confusion matrix of the decision tree (rows: true class, columns: predicted).
confusion_matrix(y_true=y_test, y_pred=y_pred_dt)

In [None]:
# Per-class precision/recall/F1 for the decision tree; printed because the
# report is a pre-formatted multi-line string.
dt_report = classification_report(y_true=y_test, y_pred=y_pred_dt)
print(dt_report)

In [None]:
# Accuracy of the decision tree on the data it was trained on.
dt.score(X=X_train, y=y_train)

In [None]:
# Accuracy of the decision tree on the held-out test split.
dt.score(X=X_test, y=y_test)

Compare All Models

In [None]:
# Collect each model's test-split accuracy under a display name.
models = {}
models['KNN'] = knn_best.score(X_test, y_test)
models['Decision Tree'] = dt.score(X_test, y_test)

In [None]:
# Display the model name -> test accuracy mapping.
models

In [None]:
# Bar chart comparing the test accuracy of the two models.
plt.figure(figsize=(8, 5))
# Materialise the dict views as lists so the call does not depend on how a
# given matplotlib version converts dict_keys / dict_values objects.
plt.bar(list(models.keys()), list(models.values()))
plt.ylabel('Accuracy')
plt.title('Model Comparison')
# NOTE(review): the y-axis starts at 0.6, which visually exaggerates the gap
# between bars and would hide any bar whose accuracy is below 0.6.
plt.ylim(0.6, 1.0)
# Render explicitly so the cell does not echo the tuple returned by plt.ylim.
plt.show()