<a href="https://colab.research.google.com/github/muajnstu/Comparative-Analysis-of-K-Nearest-Neighbors-Variants-for-Diabetes-Prediction-Using-Administrative-He/blob/main/10_fold_cross_validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Installing collected packages: imblearn
Successfully installed imblearn-0.0


In [None]:
%pip install shap



In [None]:
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import shap
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, roc_auc_score
from sklearn.preprocessing import label_binarize
from imblearn.over_sampling import SMOTE

In [None]:
# Load data
df = pd.read_csv(
    'https://media.githubusercontent.com/media/shahriariit/opendataset/refs/heads/master/DBSCAN_DATA.csv'
)
# 'Cluster' is the prediction target; all remaining columns are the features.
y = df['Cluster']
X = df.drop('Cluster', axis=1)

# Handle imbalanced data with SMOTE
# NOTE(review): the resampling is fitted on the full dataset, i.e. before any
# train/test split is made — confirm this is intended.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [None]:
# KNN variant definitions
def get_knn_variants(X_train, n_neighbors=3):
    """Build one unfitted ``KNeighborsClassifier`` per KNN variant.

    The Mahalanobis and standardized-Euclidean variants need statistics of the
    training data (inverse covariance / per-feature variance), so the variants
    have to be rebuilt for every training set.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Numeric training features; rows are samples, columns are features.
    n_neighbors : int, default=3
        Number of neighbours used by every variant.

    Returns
    -------
    dict
        Maps the variant name to its (not yet fitted) classifier.
    """
    features = np.asarray(X_train, dtype=float)
    n_features = features.shape[1]

    # np.cov collapses to a 0-d array when there is a single feature;
    # atleast_2d keeps it a (n_features, n_features) matrix in every case.
    covariance_matrix = np.atleast_2d(np.cov(features, rowvar=False))
    # A small ridge on the diagonal keeps the matrix invertible when features
    # are collinear or constant.
    stabilized_covariance_matrix = covariance_matrix + np.eye(n_features) * 1e-6
    inv_covariance_matrix = np.linalg.inv(stabilized_covariance_matrix)

    # Population variance per feature (ddof=0). A constant feature has zero
    # variance, which would make the seuclidean metric divide by zero, so such
    # entries are replaced by 1.0 (the feature is then left unscaled).
    variance_vector = features.var(axis=0)
    variance_vector = np.where(variance_vector == 0, 1.0, variance_vector)

    knn_variants = {
        "KNN": KNeighborsClassifier(n_neighbors=n_neighbors),
        "DistanceKNN": KNeighborsClassifier(n_neighbors=n_neighbors, weights='distance'),
        "GeneralizedKNN": KNeighborsClassifier(n_neighbors=n_neighbors, metric='minkowski', p=3),
        "EuclideanKNN": KNeighborsClassifier(n_neighbors=n_neighbors, metric='euclidean'),
        "ManhattanKNN": KNeighborsClassifier(n_neighbors=n_neighbors, metric='manhattan'),
        "ChebyshevKNN": KNeighborsClassifier(n_neighbors=n_neighbors, metric='chebyshev'),
        "MahalanobisKNN": KNeighborsClassifier(
            n_neighbors=n_neighbors,
            metric='mahalanobis',
            metric_params={'VI': inv_covariance_matrix},
        ),
        "SeuclideanKNN": KNeighborsClassifier(
            n_neighbors=n_neighbors,
            metric='seuclidean',
            metric_params={'V': variance_vector},
        ),
        # NOTE(review): the weights are all ones, so this presumably ranks
        # neighbours exactly like "GeneralizedKNN" — confirm that is intended.
        "WminkowskiKNN": KNeighborsClassifier(
            n_neighbors=n_neighbors,
            metric='minkowski',
            p=3,
            metric_params={'w': np.ones(n_features)},
        ),
    }
    return knn_variants

In [None]:
# Metrics calculation
def _per_class_rate(numerator, denominator):
    """Element-wise ``numerator / denominator``, using 0 where the denominator is not positive."""
    numerator = np.asarray(numerator, dtype=float)
    denominator = np.asarray(denominator, dtype=float)
    return np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator > 0,
    )


def get_metrics(y_true, y_pred, y_prob=None):
    """Compute macro-averaged classification metrics for one set of predictions.

    Per-class rates are derived one-vs-rest from the confusion matrix and then
    averaged with equal weight per class. A class whose denominator is zero
    contributes a rate of 0.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground-truth labels.
    y_pred : array-like of shape (n_samples,)
        Predicted labels.
    y_prob : array-like, optional
        Class scores: either ``(n_samples, n_classes)`` (e.g. the output of
        ``predict_proba``) or a 1-D vector of positive-class scores.

    Returns
    -------
    dict
        Keys: ``accuracy``, ``sensitivity``, ``specificity``, ``gmean``,
        ``type1 error``, ``type2 error``, ``f1``, ``auroc``. ``auroc`` is 0
        when no usable scores were given or when it could not be computed
        (a warning is emitted in the latter case).
    """
    cm = confusion_matrix(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)

    # One-vs-rest counts per class (rows of cm = true class, columns = predicted).
    TP = np.diag(cm)
    FP = np.sum(cm, axis=0) - TP
    FN = np.sum(cm, axis=1) - TP
    TN = np.sum(cm) - (FP + FN + TP)

    specificity = np.mean(_per_class_rate(TN, TN + FP))
    sensitivity = np.mean(_per_class_rate(TP, TP + FN))
    gmean = np.sqrt(specificity * sensitivity)
    type1 = np.mean(_per_class_rate(FP, FP + TN))
    type2 = np.mean(_per_class_rate(FN, TP + FN))
    fmeasure = f1_score(y_true, y_pred, average='macro')

    auc = 0
    if y_prob is not None:
        scores = np.asarray(y_prob)
        # roc_auc_score wants a 1-D vector of positive-class scores for a
        # two-class problem; passing the (n, 2) matrix makes it raise, which
        # previously ended up as a silent AUROC of 0.
        if scores.ndim == 2 and scores.shape[1] == 2:
            scores = scores[:, 1]
        is_binary = scores.ndim == 1
        is_multiclass = scores.ndim == 2 and scores.shape[1] > 2
        if is_binary or is_multiclass:
            try:
                if is_binary:
                    auc = roc_auc_score(y_true, scores)
                else:
                    auc = roc_auc_score(y_true, scores, multi_class='ovr', average='macro')
            except Exception as exc:
                # Best-effort metric: keep the 0 fallback, but make the failure
                # visible instead of silently reporting a misleading score.
                warnings.warn(
                    f"AUROC could not be computed and is reported as 0: {exc}",
                    RuntimeWarning,
                    stacklevel=2,
                )
                auc = 0

    return {
        "accuracy": accuracy,
        "sensitivity": sensitivity,
        "specificity": specificity,
        "gmean": gmean,
        "type1 error": type1,
        "type2 error": type2,
        "f1": fmeasure,
        "auroc": auc,
    }

In [None]:
# 10-fold cross-validation
# Every KNN variant is fitted and scored on each stratified fold; one metrics
# row per (fold, variant) pair is collected in `results`.
# NOTE(review): X_resampled / y_resampled were produced by SMOTE on the whole
# dataset before this split, so synthetic rows in a training fold may derive
# from rows that end up in the matching test fold — confirm this is intended,
# as it can inflate the scores reported below.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results = []
for fold, (train_idx, test_idx) in enumerate(skf.split(X_resampled, y_resampled)):
    # Positional slices of the resampled data for this fold.
    X_train_cv = X_resampled.iloc[train_idx]
    X_test_cv = X_resampled.iloc[test_idx]
    y_train_cv = y_resampled.iloc[train_idx]
    y_test_cv = y_resampled.iloc[test_idx]

    # Variants are rebuilt per fold from this fold's training split.
    knn_variants = get_knn_variants(X_train_cv)
    for name, model in knn_variants.items():
        model.fit(X_train_cv, y_train_cv)
        y_pred = model.predict(X_test_cv)
        if hasattr(model, "predict_proba"):
            y_prob = model.predict_proba(X_test_cv)
        else:
            y_prob = None

        metrics = get_metrics(y_test_cv, y_pred, y_prob)
        metrics["fold"] = fold+1
        metrics["variant"] = name
        results.append(metrics)

        # Log the metric values only; the bookkeeping keys are already in the prefix.
        summary = ", ".join(
            f"{k}: {v:.4f}" if isinstance(v, float) else f"{k}: {v}"
            for k, v in metrics.items()
            if k not in ["fold","variant"]
        )
        print(f"Fold {fold+1} - {name}: " + summary)


Fold 1 - KNN: accuracy: 0.9855, sensitivity: 0.9855, specificity: 0.9995, gmean: 0.9925, type1 error: 0.0005, type2 error: 0.0145, f1: 0.9848, auroc: 0.9956
Fold 1 - DistanceKNN: accuracy: 0.9871, sensitivity: 0.9871, specificity: 0.9995, gmean: 0.9933, type1 error: 0.0005, type2 error: 0.0129, f1: 0.9864, auroc: 0.9956


In [None]:
# Results DataFrame
# One row per (fold, variant); averaged per variant and ranked by AUROC.
results_df = pd.DataFrame(results)
mean_results = (
    results_df
    # "fold" is a bookkeeping index, not a metric: averaging it would add a
    # meaningless column to the summary table.
    .drop(columns=["fold"], errors="ignore")
    .groupby("variant")
    .mean()
    .sort_values("auroc", ascending=False)
)
print("\nMean metrics across folds:\n", mean_results)