# In [None]:
# ------------------------------------------------------------
# DIABETES PREDICTION USING K-NEAREST NEIGHBORS (KNN)
# ------------------------------------------------------------
# Aim: Classify patients as Diabetic (1) or Not Diabetic (0)
# Steps: Load → Preprocess → Train-Test Split → Scale → Train KNN → Evaluate
# Metrics: Confusion Matrix, Accuracy, Error Rate, Precision, Recall + Report
# ------------------------------------------------------------

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score, recall_score,
    classification_report
)

# Read the CSV into a DataFrame, then report its dimensions and preview
# the first five rows as a quick sanity check.
df = pd.read_csv(filepath_or_buffer="diabetes.csv")
preview = df.head(5)
print("Dataset Shape:", df.shape)
print(preview)


# -----------------------------
# 1. PRE-PROCESSING
# -----------------------------
# Zeros in these medical columns are treated as invalid readings, i.e. as
# missing values (NaN). Only the zero -> NaN marking happens before the split:
# the medians used for imputation are computed after the split, from the
# training rows alone, so no test-set statistic leaks into the features.
# Outcome is the binary label (1=diabetic, 0=non-diabetic).

zero_as_nan_cols = ['Glucose','BloodPressure','SkinThickness','Insulin','BMI']
df[zero_as_nan_cols] = df[zero_as_nan_cols].replace(0, np.nan)

# Features and target
X = df.drop(columns=['Outcome'])
y = df['Outcome']

# -----------------------------
# 2. TRAIN-TEST SPLIT
# -----------------------------
# train_test_split() holds out 20% of the data for evaluation.
# random_state=42 makes the split reproducible (same split each run) and
# stratify=y splits in a stratified fashion using the Outcome labels.
# X_train_raw/X_test_raw stay unscaled so sample outputs remain readable later.

X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Impute missing values with medians learned from the training rows only, and
# apply those same medians to the test rows (same rule as for the scaler:
# fit on train, apply to test).
train_medians = X_train_raw[zero_as_nan_cols].median()
X_train_raw = X_train_raw.fillna(train_medians)
X_test_raw = X_test_raw.fillna(train_medians)

# Keep the full frames NaN-free as well, so any later code that reads `X` or
# `df` still sees imputed values in these columns.
X = X.fillna(train_medians)
df[zero_as_nan_cols] = df[zero_as_nan_cols].fillna(train_medians)


# -----------------------------
# 3. FEATURE SCALING
# -----------------------------
# KNN is distance-based, so features must be on comparable scales.
# StandardScaler standardizes each feature (mean=0, std=1). The scaler is
# fitted on the training split only and then applied to both splits, which
# keeps test-set statistics out of the transformation.

scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# -----------------------------
# 4. MODEL TRAINING (KNN)
# -----------------------------
# Build and train a K-Nearest Neighbors classifier with n_neighbors=5.
# Training stores the scaled training samples; prediction is a majority vote
# among the 5 closest neighbors. fit() returns the estimator itself, so the
# construction and training are chained into one statement.

knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# -----------------------------
# 5. MODEL EVALUATION
# -----------------------------
# Predict on the held-out test set and compute the required metrics:
#   - Confusion Matrix (TP, TN, FP, FN)
#   - Accuracy and Error Rate (1 - accuracy)
#   - Precision and Recall (positive class = 1)
# A classification report (precision/recall/F1/support) is printed as well.

y_pred = knn.predict(X_test)

# Scalar metrics first, then the matrix and the text report.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
err = 1 - acc
prec = precision_score(y_true=y_test, y_pred=y_pred, zero_division=0)
rec = recall_score(y_true=y_test, y_pred=y_pred, zero_division=0)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred, zero_division=0)

print("\n--- MODEL PERFORMANCE (KNN) ---")
print("Confusion Matrix:\n", cm)
print(f"Accuracy     : {acc:.4f}")
print(f"Error Rate   : {err:.4f}")
print(f"Precision    : {prec:.4f}")
print(f"Recall       : {rec:.4f}")
print("\nClassification Report:\n", report)

# -----------------------------
# 6. SHOW SAMPLE PREDICTIONS
# -----------------------------
# Show a few test patients with Actual vs Predicted outcomes. The unscaled
# X_test_raw is used for readability, and the 0/1 labels are mapped to text.

label_map = {0: "Not Diabetic", 1: "Diabetic"}

# Build both label columns on the test frame's own index so they line up
# row-for-row when attached below.
actual_text = pd.Series(y_test.values, index=X_test_raw.index).map(label_map)
predicted_text = pd.Series(y_pred, index=X_test_raw.index).map(label_map)

# assign() returns a new frame, so X_test_raw itself is left untouched.
sample = X_test_raw.assign(Actual=actual_text, Predicted=predicted_text)

print("\nSample Predictions (first 10):")
print(sample.head(10))