In [58]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [59]:
data=pd.read_csv("diabetes.csv")
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [60]:
data.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [61]:
X=data.drop("Outcome",axis=1)
y=data["Outcome"]

In [62]:
columns_with_zero = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
for col in columns_with_zero:
    X[col] = X[col].replace(0, X[col].mean())

In [63]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [64]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
X_train.shape,y_train.shape

((614, 8), (614,))

In [65]:
def euclidean_distance(x1 , x2):
    return np.sqrt(np.sum((x1 - x2)**2))

In [66]:
class KNN:
    def __init__(self, k=5):
        self.k = k
    
    def fit(self, X, y):
        self.X_train = X
        self.y_train = np.array(y)
    
    def predict(self, X):
        return np.array([self._predict(x) for x in X])

    def _predict(self, x):
     
        distances = [euclidean_distance(x, x_train) for x_train in self.X_train]
        
        k_indices = np.argsort(distances)[:self.k]
        k_labels = [self.y_train[i] for i in k_indices]
        
        most_common = Counter(k_labels).most_common(1)
        return most_common[0][0]

In [67]:
model = KNN(k=7)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [73]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7597402597402597

Confusion Matrix:
 [[84 15]
 [22 33]]

Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.85      0.82        99
           1       0.69      0.60      0.64        55

    accuracy                           0.76       154
   macro avg       0.74      0.72      0.73       154
weighted avg       0.75      0.76      0.76       154



# The KNN model achieved an accuracy of about 75â€“76%, showing good predictive performance on the diabetes dataset. Proper data preprocessing and feature normalization significantly improved the results. Some diabetic cases were misclassified, indicating limitations in recall. Overall, KNN serves as a simple and effective baseline model.