<a href="https://colab.research.google.com/github/muajnstu/Comparative-Analysis-of-KNN-Variants-for-Diabetes-Prediction-Using-Administrative-Health-data/blob/main/cross_validation_primary_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Installing collected packages: imblearn
Successfully installed imblearn-0.0


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, roc_auc_score
from sklearn.preprocessing import label_binarize
from imblearn.over_sampling import SMOTE

In [None]:
# Pull the raw dataset straight from the project repository.
df = pd.read_csv(
    'https://raw.githubusercontent.com/muajnstu/Comparative-Analysis-of-K-Nearest-Neighbors-Variants-for-Diabetes-Prediction-Using-Administrative-He/refs/heads/main/raw%20data.csv'
)

# Separate the label column from the predictors (every remaining column).
y = df['Outcome']
X = df.drop('Outcome', axis=1)


In [None]:
# KNN variant definitions
def get_knn_variants(X_train, n_neighbors=3, ridge=1e-6):
    """Build one unfitted KNN classifier per distance/weighting variant.

    The Mahalanobis and standardized-Euclidean variants need statistics of the
    training features (inverse covariance and per-feature variance), which are
    estimated here from ``X_train``.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Numeric training features (NumPy array or pandas DataFrame).
    n_neighbors : int, default=3
        Number of neighbours used by every variant.
    ridge : float, default=1e-6
        Value added to the diagonal of the covariance matrix before it is
        inverted, so a (near-)singular covariance can still be inverted.

    Returns
    -------
    dict
        Maps variant name to an unfitted ``KNeighborsClassifier``.
    """
    # Work on a plain float array so the statistics handed to metric_params
    # are ndarrays regardless of whether a DataFrame or an array was passed.
    features = np.asarray(X_train, dtype=float)
    n_features = features.shape[1]

    # np.cov collapses a single-feature input to a 0-d array; atleast_2d keeps
    # it a 1x1 matrix so the ridge term and the inversion below still work.
    covariance_matrix = np.atleast_2d(np.cov(features, rowvar=False))
    stabilized_covariance_matrix = covariance_matrix + np.eye(n_features) * ridge
    inv_covariance_matrix = np.linalg.inv(stabilized_covariance_matrix)

    # Population variance (ddof=0) per feature, as used by 'seuclidean'.
    variance_vector = features.var(axis=0)

    knn_variants = {
        "KNN": KNeighborsClassifier(n_neighbors=n_neighbors),
        "DistanceKNN": KNeighborsClassifier(n_neighbors=n_neighbors, weights='distance'),
        "GeneralizedKNN": KNeighborsClassifier(n_neighbors=n_neighbors, metric='minkowski', p=3),
        "EuclideanKNN": KNeighborsClassifier(n_neighbors=n_neighbors, metric='euclidean'),
        "ManhattanKNN": KNeighborsClassifier(n_neighbors=n_neighbors, metric='manhattan'),
        "ChebyshevKNN": KNeighborsClassifier(n_neighbors=n_neighbors, metric='chebyshev'),
        "MahalanobisKNN": KNeighborsClassifier(
            n_neighbors=n_neighbors,
            metric='mahalanobis',
            metric_params={'VI': inv_covariance_matrix},
        ),
        "SeuclideanKNN": KNeighborsClassifier(
            n_neighbors=n_neighbors,
            metric='seuclidean',
            metric_params={'V': variance_vector},
        ),
        # Uniform weights: the weighted Minkowski variant with all features equal.
        "WminkowskiKNN": KNeighborsClassifier(
            n_neighbors=n_neighbors,
            metric='minkowski',
            p=3,
            metric_params={'w': np.ones(n_features)},
        ),
    }
    return knn_variants

In [None]:
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0 when the denominator is not positive."""
    return numerator / denominator if denominator > 0 else 0


def _auroc_or_zero(y_true, y_prob, multiclass):
    """Compute AUROC from class scores, returning 0 whenever it cannot be computed.

    ``y_prob`` may be a 2-D array with one column per class, or (binary only)
    a 1-D array holding the positive-class scores.
    """
    import warnings

    if y_prob is None or not hasattr(y_prob, "shape"):
        return 0

    scores = np.asarray(y_prob)
    if scores.ndim == 2 and scores.shape[1] > 1:
        if not multiclass:
            # Binary AUROC is computed from the second column (class 1).
            scores = scores[:, 1]
    elif scores.ndim == 1 and not multiclass:
        # Already a flat vector of positive-class scores.
        pass
    else:
        return 0

    kwargs = {"multi_class": "ovr", "average": "macro"} if multiclass else {}
    try:
        return roc_auc_score(y_true, scores, **kwargs)
    except Exception as err:
        # Deliberately broad: AUROC is best-effort and must not abort a whole
        # evaluation run, but the failure is surfaced instead of hidden.
        warnings.warn(
            f"AUROC could not be computed ({err}); reporting 0.",
            RuntimeWarning,
        )
        return 0


def print_metrics(y_true, y_pred, y_prob=None):
    """Compute classification metrics and return them as a dict.

    Despite its name, this function prints nothing; the caller formats the
    returned values.

    For two classes the rates are taken from the 2x2 confusion matrix
    (second label treated as positive). Otherwise each rate is computed
    one-vs-rest per class and averaged over classes, and F1 is macro-averaged.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    y_prob : array-like with a ``shape`` attribute, optional
        Class scores: 2-D with one column per class, or 1-D positive-class
        scores in the binary case. AUROC is reported as 0 when this is
        missing, has an unusable shape, or the score cannot be computed.

    Returns
    -------
    dict
        Keys: "Accuracy", "Sensitivity", "Specificity", "G-Mean",
        "Type I Error", "Type II Error", "F1 Score", "AUROC".
    """
    cm = confusion_matrix(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    is_binary = cm.shape[0] == 2

    if is_binary:
        TN, FP, FN, TP = cm.ravel()
        specificity = _safe_ratio(TN, TN + FP)
        sensitivity = _safe_ratio(TP, TP + FN)
        type1 = _safe_ratio(FP, FP + TN)
        type2 = _safe_ratio(FN, TP + FN)
        fmeasure = f1_score(y_true, y_pred, pos_label=1)
    else:
        # One-vs-rest counts per class, derived from the confusion matrix.
        TP = np.diag(cm)
        FP = np.sum(cm, axis=0) - TP
        FN = np.sum(cm, axis=1) - TP
        TN = np.sum(cm) - (FP + FN + TP)

        specificity = np.mean([_safe_ratio(tn, tn + fp) for tn, fp in zip(TN, FP)])
        sensitivity = np.mean([_safe_ratio(tp, tp + fn) for tp, fn in zip(TP, FN)])
        type1 = np.mean([_safe_ratio(fp, fp + tn) for fp, tn in zip(FP, TN)])
        type2 = np.mean([_safe_ratio(fn, tp + fn) for fn, tp in zip(FN, TP)])
        fmeasure = f1_score(y_true, y_pred, average='macro')

    gmean = np.sqrt(specificity * sensitivity)
    auc = _auroc_or_zero(y_true, y_prob, multiclass=not is_binary)

    metrics = {
        "Accuracy": accuracy,
        "Sensitivity": sensitivity,
        "Specificity": specificity,
        "G-Mean": gmean,
        "Type I Error": type1,
        "Type II Error": type2,
        "F1 Score": fmeasure,
        "AUROC": auc
    }

    return metrics

In [None]:
# Stratified 10-fold cross-validation of every KNN variant.
all_results = []
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
knn_variants = get_knn_variants(X)

# The splitter is seeded, so it yields the same folds on every call; compute
# them once and reuse them so all variants are scored on identical splits.
cv_folds = list(skf.split(X, y))

for name in knn_variants:
    model_results = []
    print(f"\n==== {name} ====")

    for fold, (train_idx, test_idx) in enumerate(cv_folds):
        X_train_cv, X_test_cv = X.iloc[train_idx], X.iloc[test_idx]
        y_train_cv, y_test_cv = y.iloc[train_idx], y.iloc[test_idx]

        # Build a fresh estimator from this fold's training split only, so the
        # covariance/variance used by the Mahalanobis and seuclidean variants
        # never sees the held-out rows. Reusing get_knn_variants keeps the
        # variant definitions in a single place.
        model = get_knn_variants(X_train_cv)[name]

        model.fit(X_train_cv, y_train_cv)
        y_pred = model.predict(X_test_cv)
        y_prob = model.predict_proba(X_test_cv)

        metrics = print_metrics(y_test_cv, y_pred, y_prob)
        metrics["fold"] = fold + 1
        metrics["variant"] = name
        model_results.append(metrics)

        # One summary line per fold; bookkeeping keys are left out.
        print(f"Fold {fold+1}: " +
              ", ".join([f"{k}: {v:.4f}" for k, v in metrics.items() if k not in ["fold", "variant"]]))

    all_results.extend(model_results)



==== KNN ====
Fold 1: Accuracy: 0.7920, Sensitivity: 0.1256, Specificity: 0.9158, G-Mean: 0.3391, Type I Error: 0.0842, Type II Error: 0.8744, F1 Score: 0.1591, AUROC: 0.5425
Fold 2: Accuracy: 0.7864, Sensitivity: 0.1659, Specificity: 0.9017, G-Mean: 0.3868, Type I Error: 0.0983, Type II Error: 0.8341, F1 Score: 0.1958, AUROC: 0.5981
Fold 3: Accuracy: 0.7848, Sensitivity: 0.1937, Specificity: 0.8942, G-Mean: 0.4162, Type I Error: 0.1058, Type II Error: 0.8063, F1 Score: 0.2194, AUROC: 0.5795
Fold 4: Accuracy: 0.7996, Sensitivity: 0.1441, Specificity: 0.9208, G-Mean: 0.3643, Type I Error: 0.0792, Type II Error: 0.8559, F1 Score: 0.1834, AUROC: 0.5735
Fold 5: Accuracy: 0.7996, Sensitivity: 0.1982, Specificity: 0.9108, G-Mean: 0.4249, Type I Error: 0.0892, Type II Error: 0.8018, F1 Score: 0.2359, AUROC: 0.5868
Fold 6: Accuracy: 0.8031, Sensitivity: 0.1351, Specificity: 0.9267, G-Mean: 0.3539, Type I Error: 0.0733, Type II Error: 0.8649, F1 Score: 0.1765, AUROC: 0.5525
Fold 7: Accuracy: 0