In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [25]:
# Read the diabetes dataset from a CSV file in the working directory
data = pd.read_csv('diabetes.csv')

# Compute the three inspection views first, then print them in order:
# the leading rows, the per-column NaN counts, and descriptive statistics.
# Note: isnull() only counts real NaN/None entries; a later cell treats zeros
# in several measurement columns as missing, and those are not counted here.
first_rows = data.head()
null_counts = data.isnull().sum()
summary_stats = data.describe()

print(first_rows)
print("Missing Values:\n", null_counts)
print(summary_stats)


   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  
Missing Values:
 Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64
   

In [13]:
# Zeros in these measurement columns are treated as missing readings:
# convert them to NaN so they can be imputed below.
columns_to_replace = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
data[columns_to_replace] = data[columns_to_replace].replace(0, np.nan)

# Fill missing feature values with the column mean.
# The target column is excluded from the means so that a missing label can
# never be silently replaced by a fractional value; any such gap stays NaN
# and shows up in the check printed below.
# NOTE(review): the means are computed over every row, i.e. before the
# train/test split performed in a later cell, so test rows influence the
# imputed values. Consider imputing with training-set statistics only.
feature_means = data.drop(columns=["Outcome"]).mean()
data = data.fillna(feature_means)  # plain assignment instead of inplace=True

# Confirm no missing values
print("Missing values after handling:\n", data.isnull().sum())


Missing values after handling:
 Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [17]:
# Target vector: the 'Outcome' column; feature matrix: every other column
y = data['Outcome']
X = data.drop(columns=['Outcome'])

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize the features (KNN is distance based, so scale matters).
# The scaler is fitted on the training rows only and then applied to both
# partitions, so the test rows do not contribute to the scaling statistics.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [19]:
# Fit a k-nearest-neighbours classifier (k = 5) on the standardized training
# features; fit() returns the estimator itself, so the call can be chained.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Class predictions for the held-out test rows
y_pred_knn = knn.predict(X_test)

# Report accuracy, the per-class report and the confusion matrix, in that order
for heading, metric in (
    ("KNN Accuracy:", accuracy_score),
    ("KNN Classification Report:\n", classification_report),
    ("KNN Confusion Matrix:\n", confusion_matrix),
):
    print(heading, metric(y_test, y_pred_knn))


KNN Accuracy: 0.6948051948051948
KNN Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.80      0.77        99
           1       0.58      0.51      0.54        55

    accuracy                           0.69       154
   macro avg       0.66      0.65      0.66       154
weighted avg       0.69      0.69      0.69       154

KNN Confusion Matrix:
 [[79 20]
 [27 28]]


In [21]:
# Fit a Gaussian Naïve Bayes classifier on the same standardized training
# features; fit() returns the estimator itself, so the call can be chained.
nb = GaussianNB().fit(X_train, y_train)

# Class predictions for the held-out test rows
y_pred_nb = nb.predict(X_test)

# Report accuracy, the per-class report and the confusion matrix, in that order
for heading, metric in (
    ("Naïve Bayes Accuracy:", accuracy_score),
    ("Naïve Bayes Classification Report:\n", classification_report),
    ("Naïve Bayes Confusion Matrix:\n", confusion_matrix),
):
    print(heading, metric(y_test, y_pred_nb))


Naïve Bayes Accuracy: 0.7662337662337663
Naïve Bayes Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.80      0.81        99
           1       0.66      0.71      0.68        55

    accuracy                           0.77       154
   macro avg       0.75      0.75      0.75       154
weighted avg       0.77      0.77      0.77       154

Naïve Bayes Confusion Matrix:
 [[79 20]
 [16 39]]


In [23]:
# Side-by-side test accuracy of both classifiers, rounded to four decimals
for summary_line in (
    f"KNN Accuracy: {accuracy_score(y_test, y_pred_knn):.4f}",
    f"Naïve Bayes Accuracy: {accuracy_score(y_test, y_pred_nb):.4f}",
):
    print(summary_line)


KNN Accuracy: 0.6948
Naïve Bayes Accuracy: 0.7662
