<a href="https://colab.research.google.com/github/muajnstu/Comparative-Analysis-of-K-Nearest-Neighbors-Variants-for-Diabetes-Prediction-Using-Administrative-He/blob/main/diabetics_prediction_with_z_transformed_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Diabetes Prediction Model

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, confusion_matrix, roc_auc_score,
    f1_score
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn import neighbors
from imblearn.over_sampling import SMOTE
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans

# --- Custom Hybrid KNN+SVM (with memberships) ---
class KNNSVM(BaseEstimator, ClassifierMixin):
    """Hybrid KNN + linear SVM classifier.

    ``fit`` looks up the nearest training neighbours of every training sample,
    pools those neighbour rows (with repetition) into one training set and fits
    a ``LinearSVC`` on it. Fuzzy class memberships derived from the ``k``
    nearest neighbours are computed and stored on the instance, but neither
    ``predict`` nor ``predict_proba`` reads them.

    Parameters
    ----------
    k : int, default=3
        Odd neighbourhood size used for the membership computation.
    plot : bool, default=True
        Only validated in ``fit``; nothing else in this class reads it.
    """

    # Neighbours pooled per training sample when building the SVM training set.
    _N_POOL_NEIGHBORS = 14

    def __init__(self, k=3, plot=True):
        self.k = k
        self.plot = plot

    def fit(self, X, y=None):
        """Fit the neighbour index, the memberships and the pooled LinearSVC.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)
            Class labels; numpy arrays, lists and pandas Series are accepted.

        Returns
        -------
        self

        Raises
        ------
        TypeError
            If ``y`` is missing, or ``k`` / ``plot`` have the wrong type.
        ValueError
            If ``k`` is not a positive odd integer smaller than ``len(y)``.
        """
        if y is None:
            raise TypeError('"y" is required to fit this classifier')
        X = np.asarray(X)
        labels = np.asarray(y)
        # Validate before doing any work so bad parameters fail fast.
        self._check_params(X, labels)
        self.X = X
        self.y = y
        self.xdim = X.shape[1]
        self.n = len(labels)
        self.classes = [0, 1]
        self.df = pd.DataFrame(X)
        self.df['y'] = labels
        self.memberships = self._compute_memberships()
        self.df['membership'] = self.memberships

        # Clamp the pool size so training sets smaller than 14 rows still fit.
        self.neigh = neighbors.NearestNeighbors(
            n_neighbors=min(self._N_POOL_NEIGHBORS, self.n)
        )
        self.neigh.fit(X, y)
        self.result = self.neigh.kneighbors(X)
        self.label_index = self.result[1]

        # Row-major flattening keeps the order "sample by sample, neighbour by neighbour".
        pooled = self.label_index.ravel()
        self.np_label = labels[pooled]
        self.np_train = X[pooled]
        self.label = list(self.np_label)
        self.train = list(self.np_train)

        self.clf = LinearSVC()
        self.clf.fit(self.np_train, self.np_label)
        self.fitted_ = True
        return self

    def predict(self, r):
        """Return one predicted label per row of ``r``."""
        self._require_fitted('predict')
        if len(set(self.label)) == 1:
            # Only one class was seen: answer with it, one entry per query row.
            return np.full(len(r), self.label[0])
        return self.clf.predict(r)

    def predict_proba(self, X):
        """Return score-based pseudo-probabilities of shape (n_samples, n_classes).

        LinearSVC has no ``predict_proba``; the signed decision values are
        squashed with a logistic function (binary) or a softmax (multiclass).
        The mapping is monotone in the decision value, does not depend on the
        other rows of the batch, and puts a decision value of 0 at 0.5.
        """
        self._require_fitted('predict_proba')
        if not hasattr(self.clf, "decision_function"):
            return np.full((np.asarray(X).shape[0], 2), 0.5)
        decision = self.clf.decision_function(X)
        if decision.ndim == 1:
            # tanh form of the logistic function: no overflow for large margins.
            positive = 0.5 * (1.0 + np.tanh(0.5 * decision))
            return np.column_stack((1.0 - positive, positive))
        # Multiclass: subtract the row maximum before exponentiating for stability.
        exp_decision = np.exp(decision - decision.max(axis=1, keepdims=True))
        return exp_decision / exp_decision.sum(axis=1, keepdims=True)

    def score(self, X, y):
        """Return the accuracy of ``predict(X)`` against ``y``."""
        self._require_fitted('score')
        predictions = np.asarray(self.predict(X))
        return accuracy_score(y_pred=predictions, y_true=y)

    def _require_fitted(self, caller):
        """Raise if ``fit`` has not completed; ``caller`` names the public method."""
        if not getattr(self, "fitted_", False):
            raise Exception(f'{caller}() called before fit()')

    def _find_k_nearest_neighbors(self, df, x):
        """Return the ``k`` rows of ``df`` closest (Euclidean) to ``x``.

        The result carries an extra ``'distances'`` column and is ordered by
        ascending distance. ``df`` itself is left unmodified.
        """
        features = df.iloc[:, 0:self.xdim].values
        distances = np.linalg.norm(features - x, axis=1)
        order = np.argsort(distances, kind='stable')[:self.k]
        nearest = df.iloc[order].copy()
        nearest['distances'] = distances[order]
        return nearest

    def _get_counts(self, neighbors):
        """Return ``{label: number of rows}`` for the given neighbour frame."""
        return {label: int(count) for label, count in neighbors.groupby('y').size().items()}

    def _compute_memberships(self):
        """Return one ``{class: membership}`` dict per training sample.

        Each class gets ``0.49 * (share of the k nearest neighbours)``; the
        sample's own class gets an additional ``0.51``.
        """
        labels = np.asarray(self.y)
        memberships = []
        for i in range(self.n):
            counts = self._get_counts(self._find_k_nearest_neighbors(self.df, self.X[i]))
            membership = {}
            for c in self.classes:
                uci = 0.49 * (counts.get(c, 0) / self.k)
                if c == labels[i]:
                    uci += 0.51
                membership[c] = uci
            memberships.append(membership)
        return memberships

    def _check_params(self, X, y):
        """Validate ``k`` and ``plot`` against the training targets ``y``."""
        if isinstance(self.k, bool) or not isinstance(self.k, (int, np.integer)):
            raise TypeError('"k" should have type int')
        if self.k < 1:
            raise ValueError('"k" should be a positive integer')
        if self.k >= len(y):
            raise ValueError('"k" should be less than no of feature sets')
        if self.k % 2 == 0:
            raise ValueError('"k" should be odd')
        if not isinstance(self.plot, bool):
            raise TypeError('"plot" should have type bool')


# --- Custom Hybrid KNN+Bayes (with memberships) ---
class KNNBayes(BaseEstimator, ClassifierMixin):
    """Hybrid KNN + Gaussian naive Bayes classifier.

    ``fit`` looks up the nearest training neighbours of every training sample,
    pools those neighbour rows (with repetition) into one training set and fits
    a ``GaussianNB`` on it. Fuzzy class memberships derived from the ``k``
    nearest neighbours are computed and stored on the instance, but neither
    ``predict`` nor ``predict_proba`` reads them.

    Parameters
    ----------
    k : int, default=3
        Odd neighbourhood size used for the membership computation.
    plot : bool, default=True
        Only validated in ``fit``; nothing else in this class reads it.
    """

    # Neighbours pooled per training sample when building the NB training set.
    _N_POOL_NEIGHBORS = 14

    def __init__(self, k=3, plot=True):
        self.k = k
        self.plot = plot

    def fit(self, X, y=None):
        """Fit the neighbour index, the memberships and the pooled GaussianNB.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)
            Class labels; numpy arrays, lists and pandas Series are accepted.

        Returns
        -------
        self

        Raises
        ------
        TypeError
            If ``y`` is missing, or ``k`` / ``plot`` have the wrong type.
        ValueError
            If ``k`` is not a positive odd integer smaller than ``len(y)``.
        """
        if y is None:
            raise TypeError('"y" is required to fit this classifier')
        X = np.asarray(X)
        labels = np.asarray(y)
        # Validate before doing any work so bad parameters fail fast.
        self._check_params(X, labels)
        self.X = X
        self.y = y
        self.xdim = X.shape[1]
        self.n = len(labels)
        self.classes = [0, 1]
        self.df = pd.DataFrame(X)
        self.df['y'] = labels
        self.memberships = self._compute_memberships()
        self.df['membership'] = self.memberships

        # Clamp the pool size so training sets smaller than 14 rows still fit.
        self.neigh = neighbors.NearestNeighbors(
            n_neighbors=min(self._N_POOL_NEIGHBORS, self.n)
        )
        self.neigh.fit(X, y)
        self.result = self.neigh.kneighbors(X)
        self.label_index = self.result[1]

        # Row-major flattening keeps the order "sample by sample, neighbour by neighbour".
        pooled = self.label_index.ravel()
        self.np_label = labels[pooled]
        self.np_train = X[pooled]
        self.label = list(self.np_label)
        self.train = list(self.np_train)

        self.clf = GaussianNB()
        self.clf.fit(self.np_train, self.np_label)
        self.fitted_ = True
        return self

    def predict(self, r):
        """Return one predicted label per row of ``r``."""
        self._require_fitted('predict')
        if len(set(self.label)) == 1:
            # Only one class was seen: answer with it, one entry per query row.
            return np.full(len(r), self.label[0])
        return self.clf.predict(r)

    def predict_proba(self, X):
        """Return the class probabilities reported by the fitted GaussianNB."""
        self._require_fitted('predict_proba')
        return self.clf.predict_proba(X)

    def score(self, X, y):
        """Return the accuracy of ``predict(X)`` against ``y``."""
        self._require_fitted('score')
        predictions = np.asarray(self.predict(X))
        return accuracy_score(y_pred=predictions, y_true=y)

    def _require_fitted(self, caller):
        """Raise if ``fit`` has not completed; ``caller`` names the public method."""
        if not getattr(self, "fitted_", False):
            raise Exception(f'{caller}() called before fit()')

    def _find_k_nearest_neighbors(self, df, x):
        """Return the ``k`` rows of ``df`` closest (Euclidean) to ``x``.

        The result carries an extra ``'distances'`` column and is ordered by
        ascending distance. ``df`` itself is left unmodified.
        """
        features = df.iloc[:, 0:self.xdim].values
        distances = np.linalg.norm(features - x, axis=1)
        order = np.argsort(distances, kind='stable')[:self.k]
        nearest = df.iloc[order].copy()
        nearest['distances'] = distances[order]
        return nearest

    def _get_counts(self, neighbors):
        """Return ``{label: number of rows}`` for the given neighbour frame."""
        return {label: int(count) for label, count in neighbors.groupby('y').size().items()}

    def _compute_memberships(self):
        """Return one ``{class: membership}`` dict per training sample.

        Each class gets ``0.49 * (share of the k nearest neighbours)``; the
        sample's own class gets an additional ``0.51``.
        """
        labels = np.asarray(self.y)
        memberships = []
        for i in range(self.n):
            counts = self._get_counts(self._find_k_nearest_neighbors(self.df, self.X[i]))
            membership = {}
            for c in self.classes:
                uci = 0.49 * (counts.get(c, 0) / self.k)
                if c == labels[i]:
                    uci += 0.51
                membership[c] = uci
            memberships.append(membership)
        return memberships

    def _check_params(self, X, y):
        """Validate ``k`` and ``plot`` against the training targets ``y``."""
        if isinstance(self.k, bool) or not isinstance(self.k, (int, np.integer)):
            raise TypeError('"k" should have type int')
        if self.k < 1:
            raise ValueError('"k" should be a positive integer')
        if self.k >= len(y):
            raise ValueError('"k" should be less than no of feature sets')
        if self.k % 2 == 0:
            raise ValueError('"k" should be odd')
        if not isinstance(self.plot, bool):
            raise TypeError('"plot" should have type bool')


class KmeansKNN():
    """KNN classifier whose features are extended with a KMeans cluster label.

    ``fit`` clusters the training rows with KMeans (one cluster per distinct
    target value) and trains a ``KNeighborsClassifier`` on features that are
    either augmented with the cluster label or replaced by it.

    Parameters
    ----------
    n_neighbors : int, default=3
        Forwarded to ``KNeighborsClassifier``.
    output : {'add', 'replace'}, default='add'
        ``'add'`` appends the cluster label as an extra column; ``'replace'``
        uses the cluster label as the only feature.
    n_jobs : int or None, default=None
        Forwarded to ``KNeighborsClassifier``.
    random_state : int, default=0
        Forwarded to ``KMeans``.
    """

    def __init__(self, n_neighbors=3, output='add', n_jobs=None, random_state=0):
        self.output = output
        self._random_state = random_state
        self._cluster = None
        self._kclass = KNeighborsClassifier(n_neighbors=n_neighbors, n_jobs=n_jobs)

    def _with_cluster_feature(self, X, cluster_labels):
        """Combine ``X`` with the cluster labels according to ``self.output``.

        Raises
        ------
        ValueError
            If ``self.output`` is neither ``'add'`` nor ``'replace'``.
        """
        column = np.reshape(cluster_labels, (-1, 1))
        if self.output == 'add':
            return np.hstack((X, column))
        if self.output == 'replace':
            return column
        raise ValueError('output should be either add or replace')

    def _prepare(self, X):
        """Convert ``X`` to an array and attach the predicted cluster labels."""
        X = np.asarray(X)
        return self._with_cluster_feature(X, self._cluster.predict(X))

    def fit(self, X_train, y_train):
        """Fit KMeans and then the KNN classifier; returns ``self``."""
        # np.asarray accepts ndarrays, DataFrames and plain nested lists alike.
        X_train = np.asarray(X_train)
        self._cluster = KMeans(
            n_clusters=len(np.unique(y_train)), random_state=self._random_state
        ).fit(X_train)
        features = self._with_cluster_feature(X_train, self._cluster.labels_)
        self._kclass.fit(features, y_train)
        return self

    def predict(self, X_test):
        """Return the KNN class predictions for ``X_test``."""
        return self._kclass.predict(self._prepare(X_test))

    def predict_proba(self, X_test):
        """Return the KNN class probabilities for ``X_test``."""
        return self._kclass.predict_proba(self._prepare(X_test))

# --- Evaluation Metrics ---
def print_metrics(y_true, y_pred, y_prob=None):
    """Print binary-classification metrics and the confusion matrix.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; label ``1`` is treated as the positive class.
    y_pred : array-like
        Predicted labels.
    y_prob : array-like or None, default=None
        Either an ``(n_samples, n_classes)`` array whose column 1 scores the
        positive class, or a 1-D array of positive-class scores. When it is
        missing, has another shape, or AUC cannot be computed, AUC is printed
        as ``nan`` rather than as a fake score of 0.
    """
    cm = confusion_matrix(y_true, y_pred)
    if cm.shape == (2, 2):
        TN, FP, FN, TP = cm.ravel()
        specificity = TN / (TN + FP) if (TN + FP) > 0 else 0
        sensitivity = TP / (TP + FN) if (TP + FN) > 0 else 0
        gmean = np.sqrt(sensitivity * specificity)
        type1 = FP / (FP + TN) if (FP + TN) > 0 else 0
        type2 = FN / (TP + FN) if (TP + FN) > 0 else 0
    else:
        # Not a 2x2 matrix (e.g. a single class present): rates are undefined.
        specificity = sensitivity = gmean = type1 = type2 = np.nan
    accuracy = accuracy_score(y_true, y_pred)
    fmeasure = f1_score(y_true, y_pred, pos_label=1)

    auc = np.nan
    if y_prob is not None:
        scores = np.asarray(y_prob)
        if scores.ndim == 2 and scores.shape[1] > 1:
            scores = scores[:, 1]
        elif scores.ndim != 1:
            scores = None
        if scores is not None:
            try:
                auc = roc_auc_score(y_true, scores)
            except (ValueError, TypeError):
                # e.g. only one class in y_true: AUC is undefined.
                auc = np.nan

    print(f"Accuracy   : {accuracy:.4f}")
    print(f"Specificity: {specificity:.4f}")
    print(f"Sensitivity: {sensitivity:.4f}")
    print(f"G-Mean     : {gmean:.4f}")
    print(f"F-measure  : {fmeasure:.4f}")
    print(f"AUC        : {auc:.4f}")
    print(f"Type-1 error (FPR): {type1:.4f}")
    print(f"Type-2 error (FNR): {type2:.4f}")
    print("Confusion Matrix:\n", cm)

# --- Data Loading ---
# The CSV is fetched over the network; 'Outcome' is the target column and
# every other column is used as a feature.
data_url = "https://raw.githubusercontent.com/muajnstu/Comparative-Analysis-of-K-Nearest-Neighbors-Variants-for-Diabetes-Prediction-Using-Administrative-He/refs/heads/main/update_dataframe%20(1).csv"
df = pd.read_csv(data_url)
X = df.drop(columns=['Outcome'])
y = df['Outcome']
print("Class distribution:\n", y.value_counts())

# --- Handle Imbalanced Data ---
# NOTE(review): SMOTE is applied to the full dataset before the train/test
# split below, so the test split is drawn from the oversampled data as well.
# Metrics computed on it may be optimistic; X_resampled / y_resampled are
# also reused by the cross-validation cell further down.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
print("Balanced class distribution:\n", pd.Series(y_resampled).value_counts())

# --- Train/Test Split ---
# 80/20 split, stratified on the (resampled) target.
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=46, stratify=y_resampled)

# --- Feature Scaling ---
# The scaler is fitted on the training split only and then applied to the
# test split; both results replace the original X_train / X_test names.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# --- KNN Variant Models ---
def run_knn_variant(name, knn_clf):
    """Fit ``knn_clf`` on the module-level training split and print its test metrics.

    Reads ``X_train``, ``y_train``, ``X_test`` and ``y_test`` from module scope.
    Probabilities are optional: a classifier without ``predict_proba``, or one
    whose ``predict_proba`` raises, is reported with ``y_prob=None``.
    """
    print(f"\n==== {name} ====")
    knn_clf.fit(X_train, y_train)
    predicted = knn_clf.predict(X_test)

    probabilities = None
    proba_fn = getattr(knn_clf, "predict_proba", None)
    if proba_fn is not None:
        try:
            probabilities = proba_fn(X_test)
        except Exception:
            probabilities = None

    print_metrics(y_test, predicted, probabilities)

# Inverse covariance of the scaled training features for the Mahalanobis
# metric. A small ridge (1e-6 on the diagonal) is added before inversion.
covariance_matrix = np.cov(X_train.T)
stabilized_covariance_matrix = covariance_matrix + np.eye(covariance_matrix.shape[0]) * 1e-6
inv_covariance_matrix = np.linalg.inv(stabilized_covariance_matrix)

# Model table: display name -> unfitted estimator. Every entry uses 3 neighbours.
knn_variants = {
    "EuclideanKNN": KNeighborsClassifier(n_neighbors=3, metric='euclidean'),
    "ManhattanKNN": KNeighborsClassifier(n_neighbors=3, metric='manhattan'),
    "ChebyshevKNN": KNeighborsClassifier(n_neighbors=3, metric='chebyshev'),
    "MahalanobisKNN": KNeighborsClassifier(n_neighbors=3, metric='mahalanobis', metric_params={'VI': inv_covariance_matrix}),
    "SeuclideanKNN": KNeighborsClassifier(n_neighbors=3, metric='seuclidean', metric_params={'V': np.var(X_train, axis=0)}),
    # NOTE(review): the weights are all ones, so this is presumably the same
    # distance as "GeneralizedKNN" below (minkowski, p=3) — confirm intent.
    "WminkowskiKNN": KNeighborsClassifier(n_neighbors=3, metric='minkowski', p=3, metric_params={'w': np.ones(X_train.shape[1])}),
    "DistanceKNN": KNeighborsClassifier(n_neighbors=3, weights='distance'),
    "GeneralizedKNN": KNeighborsClassifier(n_neighbors=3, metric='minkowski', p=3),
    "KNN": KNeighborsClassifier(n_neighbors=3),
    "KNNBayes": KNNBayes(k=3, plot=False),
    "KNNSVM": KNNSVM(k=3, plot=False),
}

# Fit each model on the training split and print its metrics on the test split.
for name, model in knn_variants.items():
    run_knn_variant(name, model)

# KmeansKNN is evaluated separately from the table above, with the same
# fit / predict / predict_proba / print_metrics sequence.
print("\n==== KMeansKNN ====")
kmeansknn = KmeansKNN(n_neighbors=3, output='add')
kmeansknn.fit(X_train, y_train)
y_pred = kmeansknn.predict(X_test)
y_prob = kmeansknn.predict_proba(X_test)
print_metrics(y_test, y_pred, y_prob)

print("\nProject complete! Check the above output for performance of each KNN variant.")

Class distribution:
 Outcome
0    5405
1    1231
Name: count, dtype: int64
Balanced class distribution:
 Outcome
1    5405
0    5405
Name: count, dtype: int64

==== EuclideanKNN ====
Accuracy   : 0.7438
Specificity: 0.6466
Sensitivity: 0.8409
G-Mean     : 0.7374
F-measure  : 0.7664
AUC        : 0.7973
Type-1 error (FPR): 0.3534
Type-2 error (FNR): 0.1591
Confusion Matrix:
 [[699 382]
 [172 909]]

==== ManhattanKNN ====
Accuracy   : 0.7414
Specificity: 0.6392
Sensitivity: 0.8437
G-Mean     : 0.7344
F-measure  : 0.7654
AUC        : 0.7952
Type-1 error (FPR): 0.3608
Type-2 error (FNR): 0.1563
Confusion Matrix:
 [[691 390]
 [169 912]]

==== ChebyshevKNN ====
Accuracy   : 0.7155
Specificity: 0.6244
Sensitivity: 0.8067
G-Mean     : 0.7097
F-measure  : 0.7393
AUC        : 0.7736
Type-1 error (FPR): 0.3756
Type-2 error (FNR): 0.1933
Confusion Matrix:
 [[675 406]
 [209 872]]

==== MahalanobisKNN ====
Accuracy   : 0.7502
Specificity: 0.6512
Sensitivity: 0.8492
G-Mean     : 0.7437
F-measure  : 0.

# Cross Validation

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import (
    accuracy_score, confusion_matrix, roc_auc_score,
    f1_score
)
from sklearn.neighbors import KNeighborsClassifier

# --- Wrapper for KmeansKNN for scikit-learn compatibility ---
class KmeansKNNWrapper(BaseEstimator, ClassifierMixin):
    """scikit-learn style adapter around ``KmeansKNN``.

    The constructor only records its arguments; a fresh ``KmeansKNN`` is built
    from them on every ``fit`` call and stored as ``self.model``.
    """

    def __init__(self, n_neighbors=5, output='add', random_state=0):
        self.n_neighbors = n_neighbors
        self.output = output
        self.random_state = random_state

    def fit(self, X, y):
        """Build a new ``KmeansKNN`` from the stored settings, fit it, return ``self``."""
        settings = {
            "n_neighbors": self.n_neighbors,
            "output": self.output,
            "random_state": self.random_state,
        }
        self.model = KmeansKNN(**settings)
        self.model.fit(X, y)
        return self

    def predict(self, X):
        """Delegate to the wrapped model's ``predict``."""
        wrapped = self.model
        return wrapped.predict(X)

    def predict_proba(self, X):
        """Delegate to the wrapped model's ``predict_proba``."""
        wrapped = self.model
        return wrapped.predict_proba(X)

def fold_metrics(y_true, y_pred, y_prob=None):
    """Compute binary-classification metrics for one cross-validation fold.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; label ``1`` is treated as the positive class.
    y_pred : array-like
        Predicted labels.
    y_prob : array-like or None, default=None
        Either an ``(n_samples, n_classes)`` array whose column 1 scores the
        positive class, or a 1-D array of positive-class scores.

    Returns
    -------
    dict
        Keys ``accuracy``, ``specificity``, ``sensitivity``, ``gmean``,
        ``f1_score``, ``auc``, ``type1_error`` and ``type2_error``. ``auc`` is
        ``nan`` when no usable scores are given or AUC cannot be computed, so
        that a missing value is not mistaken for a real score of 0.
    """
    cm = confusion_matrix(y_true, y_pred)
    if cm.shape == (2, 2):
        TN, FP, FN, TP = cm.ravel()
        specificity = TN / (TN + FP) if (TN + FP) > 0 else 0
        sensitivity = TP / (TP + FN) if (TP + FN) > 0 else 0
        gmean = np.sqrt(sensitivity * specificity)
        type1 = FP / (FP + TN) if (FP + TN) > 0 else 0
        type2 = FN / (TP + FN) if (TP + FN) > 0 else 0
    else:
        # Not a 2x2 matrix (e.g. a single class present): rates are undefined.
        specificity = sensitivity = gmean = type1 = type2 = np.nan
    accuracy = accuracy_score(y_true, y_pred)
    fmeasure = f1_score(y_true, y_pred, pos_label=1)

    auc = np.nan
    if y_prob is not None:
        scores = np.asarray(y_prob)
        if scores.ndim == 2 and scores.shape[1] > 1:
            scores = scores[:, 1]
        elif scores.ndim != 1:
            scores = None
        if scores is not None:
            try:
                auc = roc_auc_score(y_true, scores)
            except (ValueError, TypeError):
                # e.g. only one class in y_true: AUC is undefined.
                auc = np.nan

    return {
        "accuracy": accuracy,
        "specificity": specificity,
        "sensitivity": sensitivity,
        "gmean": gmean,
        "f1_score": fmeasure,
        "auc": auc,
        "type1_error": type1,
        "type2_error": type2
    }

def crossval_foldwise_results(models, X, y, n_splits=10, save_path='knn_crossval_results.csv'):
    """Run stratified k-fold CV for every model and save the per-fold metrics.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator exposing ``fit`` / ``predict`` and,
        optionally, ``predict_proba``. Each estimator is re-fitted on every fold.
    X : array-like of shape (n_samples, n_features)
    y : array-like of shape (n_samples,)
        Both are converted to numpy arrays so that fold indices are always
        applied positionally, including for pandas input.
    n_splits : int, default=10
        Number of stratified folds (shuffled with a fixed seed of 42).
    save_path : str
        CSV file the per-fold results are written to.

    Returns
    -------
    pandas.DataFrame
        One row per successfully evaluated (model, fold) pair.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    # The seed is fixed, so the folds are identical for every model: build them once.
    folds = list(skf.split(X, y))
    results = []
    for model_name, model in models.items():
        print(f'\n==== {model_name} ====')
        for fold, (train_idx, test_idx) in enumerate(folds, 1):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]
            clf = model
            # Best effort by design: a failing model/fold is reported and skipped
            # so the remaining models still run.
            try:
                clf.fit(X_train, y_train)
                y_pred = clf.predict(X_test)
                y_prob = None
                if hasattr(clf, "predict_proba"):
                    try:
                        y_prob = clf.predict_proba(X_test)
                    except Exception:
                        # Probabilities are optional; metrics are still computed.
                        y_prob = None
                metrics = fold_metrics(y_test, y_pred, y_prob)
                metrics.update({
                    "model": model_name,
                    "fold": fold,
                })
                results.append(metrics)
                print(f"Fold {fold} : acc={metrics['accuracy']:.4f}, spec={metrics['specificity']:.4f}, sens={metrics['sensitivity']:.4f}, gmean={metrics['gmean']:.4f}, f1={metrics['f1_score']:.4f}, auc={metrics['auc']:.4f}, t1e={metrics['type1_error']:.4f}, t2e={metrics['type2_error']:.4f}")
            except Exception as e:
                print(f"Error in {model_name} fold {fold}: {e}")
    df = pd.DataFrame(results)
    df.to_csv(save_path, index=False)
    print(f"\nAll fold results saved to {save_path}")
    return df

# --- Prepare data for CV (fit scaler on all resampled data for fair comparison) ---
# NOTE(review): the scaler is fitted on the complete resampled dataset before
# the folds are created, so every test fold has contributed to the scaling
# statistics. X_resampled / y_resampled come from the SMOTE step of the first
# cell and StandardScaler from that cell's imports.
scaler_cv = StandardScaler()
X_res_cv = scaler_cv.fit_transform(X_resampled)
y_res_cv = np.asarray(y_resampled)

# --- Covariance for Mahalanobis ---
# Computed once on all scaled rows (not per fold), with a 1e-6 ridge on the
# diagonal before inversion.
covariance_matrix_cv = np.cov(X_res_cv.T)
stabilized_covariance_matrix_cv = covariance_matrix_cv + np.eye(covariance_matrix_cv.shape[0]) * 1e-6
inv_covariance_matrix_cv = np.linalg.inv(stabilized_covariance_matrix_cv)

# --- Cross-validation models ---
# Display name -> unfitted estimator. Every entry uses 5 neighbours here
# (the single-split run in the first cell used 3).
crossval_models = {
    "EuclideanKNN": KNeighborsClassifier(n_neighbors=5, metric='euclidean'),
    "ManhattanKNN": KNeighborsClassifier(n_neighbors=5, metric='manhattan'),
    "ChebyshevKNN": KNeighborsClassifier(n_neighbors=5, metric='chebyshev'),
    "MahalanobisKNN": KNeighborsClassifier(n_neighbors=5, metric='mahalanobis', metric_params={'VI': inv_covariance_matrix_cv}),
    "SeuclideanKNN": KNeighborsClassifier(n_neighbors=5, metric='seuclidean', metric_params={'V': np.var(X_res_cv, axis=0)}),
    "WminkowskiKNN": KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=3, metric_params={'w': np.ones(X_res_cv.shape[1])}),
    "DistanceKNN": KNeighborsClassifier(n_neighbors=5, weights='distance'),
    "GeneralizedKNN": KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=3),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    # NOTE(review): y_res_cv is a numpy array, while the KNNBayes / KNNSVM
    # definitions of the first cell call `.values` / `.iloc` on y — verify
    # these two entries do not just hit the per-fold error handler.
    "KNNBayes": KNNBayes(k=5, plot=False),
    "KNNSVM": KNNSVM(k=5, plot=False),
    "KMeansKNN": KmeansKNNWrapper(n_neighbors=5, output='add', random_state=0)
}

# --- Run and save results ---
crossval_foldwise_results(crossval_models, X_res_cv, y_res_cv, n_splits=10, save_path='knn_crossval_results.csv')


==== EuclideanKNN ====
Fold 1 : acc=0.7419, spec=0.6519, sens=0.8318, gmean=0.7363, f1=0.7634, auc=0.8077, t1e=0.3481, t2e=0.1682
Fold 2 : acc=0.7317, spec=0.6370, sens=0.8262, gmean=0.7255, f1=0.7551, auc=0.7875, t1e=0.3630, t2e=0.1738
Fold 3 : acc=0.7465, spec=0.6481, sens=0.8447, gmean=0.7399, f1=0.7694, auc=0.8222, t1e=0.3519, t2e=0.1553
Fold 4 : acc=0.7373, spec=0.6593, sens=0.8152, gmean=0.7331, f1=0.7564, auc=0.7996, t1e=0.3407, t2e=0.1848
Fold 5 : acc=0.7299, spec=0.6204, sens=0.8392, gmean=0.7215, f1=0.7567, auc=0.7917, t1e=0.3796, t2e=0.1608
Fold 6 : acc=0.7308, spec=0.6470, sens=0.8148, gmean=0.7260, f1=0.7515, auc=0.7882, t1e=0.3530, t2e=0.1852
Fold 7 : acc=0.7558, spec=0.6691, sens=0.8426, gmean=0.7509, f1=0.7751, auc=0.7998, t1e=0.3309, t2e=0.1574
Fold 8 : acc=0.7243, spec=0.6506, sens=0.7981, gmean=0.7206, f1=0.7431, auc=0.7937, t1e=0.3494, t2e=0.2019
Fold 9 : acc=0.7345, spec=0.6118, sens=0.8574, gmean=0.7243, f1=0.7634, auc=0.7799, t1e=0.3882, t2e=0.1426
Fold 10 : acc

  return self._fit(X, y)


Fold 2 : acc=0.7308, spec=0.6296, sens=0.8318, gmean=0.7237, f1=0.7557, auc=0.7857, t1e=0.3704, t2e=0.1682


  return self._fit(X, y)


Fold 3 : acc=0.7447, spec=0.6463, sens=0.8429, gmean=0.7381, f1=0.7677, auc=0.8181, t1e=0.3537, t2e=0.1571


  return self._fit(X, y)


Fold 4 : acc=0.7345, spec=0.6537, sens=0.8152, gmean=0.7300, f1=0.7545, auc=0.7951, t1e=0.3463, t2e=0.1848


  return self._fit(X, y)


Fold 5 : acc=0.7253, spec=0.6111, sens=0.8392, gmean=0.7161, f1=0.7535, auc=0.7883, t1e=0.3889, t2e=0.1608


  return self._fit(X, y)


Fold 6 : acc=0.7188, spec=0.6359, sens=0.8019, gmean=0.7140, f1=0.7402, auc=0.7793, t1e=0.3641, t2e=0.1981


  return self._fit(X, y)


Fold 7 : acc=0.7530, spec=0.6599, sens=0.8463, gmean=0.7473, f1=0.7739, auc=0.7949, t1e=0.3401, t2e=0.1537


  return self._fit(X, y)


Fold 8 : acc=0.7225, spec=0.6470, sens=0.7981, gmean=0.7186, f1=0.7418, auc=0.7919, t1e=0.3530, t2e=0.2019


  return self._fit(X, y)


Fold 9 : acc=0.7290, spec=0.6211, sens=0.8370, gmean=0.7210, f1=0.7552, auc=0.7791, t1e=0.3789, t2e=0.1630


  return self._fit(X, y)


Fold 10 : acc=0.7234, spec=0.6266, sens=0.8204, gmean=0.7170, f1=0.7477, auc=0.7824, t1e=0.3734, t2e=0.1796

==== DistanceKNN ====
Fold 1 : acc=0.7539, spec=0.6500, sens=0.8577, gmean=0.7466, f1=0.7772, auc=0.8121, t1e=0.3500, t2e=0.1423
Fold 2 : acc=0.7456, spec=0.6352, sens=0.8558, gmean=0.7373, f1=0.7710, auc=0.7949, t1e=0.3648, t2e=0.1442
Fold 3 : acc=0.7521, spec=0.6444, sens=0.8595, gmean=0.7443, f1=0.7763, auc=0.8238, t1e=0.3556, t2e=0.1405
Fold 4 : acc=0.7475, spec=0.6537, sens=0.8410, gmean=0.7415, f1=0.7692, auc=0.8026, t1e=0.3463, t2e=0.1590
Fold 5 : acc=0.7280, spec=0.6167, sens=0.8392, gmean=0.7194, f1=0.7554, auc=0.7942, t1e=0.3833, t2e=0.1608
Fold 6 : acc=0.7428, spec=0.6414, sens=0.8444, gmean=0.7360, f1=0.7664, auc=0.7985, t1e=0.3586, t2e=0.1556
Fold 7 : acc=0.7576, spec=0.6691, sens=0.8463, gmean=0.7525, f1=0.7772, auc=0.8007, t1e=0.3309, t2e=0.1537
Fold 8 : acc=0.7382, spec=0.6580, sens=0.8185, gmean=0.7339, f1=0.7575, auc=0.7986, t1e=0.3420, t2e=0.1815
Fold 9 : acc=

Unnamed: 0,accuracy,specificity,sensitivity,gmean,f1_score,auc,type1_error,type2_error,model,fold
0,0.741906,0.651852,0.831793,0.736346,0.763359,0.807676,0.348148,0.168207,EuclideanKNN,1
1,0.731730,0.637037,0.826248,0.725500,0.755068,0.787530,0.362963,0.173752,EuclideanKNN,2
2,0.746531,0.648148,0.844732,0.739940,0.769360,0.822226,0.351852,0.155268,EuclideanKNN,3
3,0.737280,0.659259,0.815157,0.733076,0.756432,0.799601,0.340741,0.184843,EuclideanKNN,4
4,0.729880,0.620370,0.839187,0.721531,0.756667,0.791739,0.379630,0.160813,EuclideanKNN,5
...,...,...,...,...,...,...,...,...,...,...
95,0.725254,0.639556,0.811111,0.720244,0.746803,0.783525,0.360444,0.188889,KMeansKNN,6
96,0.755782,0.669131,0.842593,0.750870,0.775128,0.799810,0.330869,0.157407,KMeansKNN,7
97,0.724329,0.650647,0.798148,0.720634,0.743103,0.793698,0.349353,0.201852,KMeansKNN,8
98,0.734505,0.611830,0.857407,0.724284,0.763397,0.779881,0.388170,0.142593,KMeansKNN,9


## Cross-Validation of the Hybrid Models

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import (
    accuracy_score, confusion_matrix, roc_auc_score,
    f1_score
)

# --- Fixed KNNSVM ---
class KNNSVM(BaseEstimator, ClassifierMixin):
    """Hybrid KNN + linear SVM classifier (accepts numpy and pandas targets).

    ``fit`` looks up the nearest training neighbours of every training sample,
    pools those neighbour rows (with repetition) into one training set and fits
    a ``LinearSVC`` on it. Fuzzy class memberships derived from the ``k``
    nearest neighbours are computed and stored on the instance, but neither
    ``predict`` nor ``predict_proba`` reads them.

    Parameters
    ----------
    k : int, default=3
        Odd neighbourhood size used for the membership computation.
    plot : bool, default=True
        Only validated in ``fit``; nothing else in this class reads it.
    """

    # Neighbours pooled per training sample when building the SVM training set.
    _N_POOL_NEIGHBORS = 14

    def __init__(self, k=3, plot=True):
        self.k = k
        self.plot = plot

    def fit(self, X, y=None):
        """Fit the neighbour index, the memberships and the pooled LinearSVC.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)
            Class labels; numpy arrays, lists and pandas Series are accepted.

        Returns
        -------
        self

        Raises
        ------
        TypeError
            If ``y`` is missing, or ``k`` / ``plot`` have the wrong type.
        ValueError
            If ``k`` is not a positive odd integer smaller than ``len(y)``.
        """
        from sklearn import neighbors
        from sklearn.svm import LinearSVC
        if y is None:
            raise TypeError('"y" is required to fit this classifier')
        X = np.asarray(X)
        # Convert the targets once instead of once per pooled neighbour.
        labels = np.asarray(y)
        # Validate before doing any work so bad parameters fail fast.
        self._check_params(X, labels)
        self.X = X
        self.y = y
        self.xdim = X.shape[1]
        self.n = len(labels)
        self.classes = [0, 1]
        self.df = pd.DataFrame(X)
        self.df['y'] = labels
        self.memberships = self._compute_memberships()
        self.df['membership'] = self.memberships

        # Clamp the pool size so training sets smaller than 14 rows still fit.
        self.neigh = neighbors.NearestNeighbors(
            n_neighbors=min(self._N_POOL_NEIGHBORS, self.n)
        )
        self.neigh.fit(X, y)
        self.result = self.neigh.kneighbors(X)
        self.label_index = self.result[1]

        # Row-major flattening keeps the order "sample by sample, neighbour by neighbour".
        pooled = self.label_index.ravel()
        self.np_label = labels[pooled]
        self.np_train = X[pooled]
        self.label = list(self.np_label)
        self.train = list(self.np_train)

        self.clf = LinearSVC()
        self.clf.fit(self.np_train, self.np_label)
        self.fitted_ = True
        return self

    def predict(self, r):
        """Return one predicted label per row of ``r``."""
        self._require_fitted('predict')
        if len(set(self.label)) == 1:
            # Only one class was seen: answer with it, one entry per query row.
            return np.full(len(r), self.label[0])
        return self.clf.predict(r)

    def predict_proba(self, X):
        """Return score-based pseudo-probabilities of shape (n_samples, n_classes).

        LinearSVC has no ``predict_proba``; the signed decision values are
        squashed with a logistic function (binary) or a softmax (multiclass).
        The mapping is monotone in the decision value, does not depend on the
        other rows of the batch, and puts a decision value of 0 at 0.5.
        """
        self._require_fitted('predict_proba')
        if not hasattr(self.clf, "decision_function"):
            return np.full((np.asarray(X).shape[0], 2), 0.5)
        decision = self.clf.decision_function(X)
        if decision.ndim == 1:
            # tanh form of the logistic function: no overflow for large margins.
            positive = 0.5 * (1.0 + np.tanh(0.5 * decision))
            return np.column_stack((1.0 - positive, positive))
        # Multiclass: subtract the row maximum before exponentiating for stability.
        exp_decision = np.exp(decision - decision.max(axis=1, keepdims=True))
        return exp_decision / exp_decision.sum(axis=1, keepdims=True)

    def score(self, X, y):
        """Return the accuracy of ``predict(X)`` against ``y``."""
        self._require_fitted('score')
        predictions = np.asarray(self.predict(X))
        return accuracy_score(y_pred=predictions, y_true=y)

    def _require_fitted(self, caller):
        """Raise if ``fit`` has not completed; ``caller`` names the public method."""
        if not getattr(self, "fitted_", False):
            raise Exception(f'{caller}() called before fit()')

    def _find_k_nearest_neighbors(self, df, x):
        """Return the ``k`` rows of ``df`` closest (Euclidean) to ``x``.

        The result carries an extra ``'distances'`` column and is ordered by
        ascending distance. ``df`` itself is left unmodified.
        """
        features = df.iloc[:, 0:self.xdim].values
        distances = np.linalg.norm(features - x, axis=1)
        order = np.argsort(distances, kind='stable')[:self.k]
        nearest = df.iloc[order].copy()
        nearest['distances'] = distances[order]
        return nearest

    def _get_counts(self, neighbors):
        """Return ``{label: number of rows}`` for the given neighbour frame."""
        return {label: int(count) for label, count in neighbors.groupby('y').size().items()}

    def _compute_memberships(self):
        """Return one ``{class: membership}`` dict per training sample.

        Each class gets ``0.49 * (share of the k nearest neighbours)``; the
        sample's own class gets an additional ``0.51``.
        """
        labels = np.asarray(self.y)
        memberships = []
        for i in range(self.n):
            counts = self._get_counts(self._find_k_nearest_neighbors(self.df, self.X[i]))
            membership = {}
            for c in self.classes:
                uci = 0.49 * (counts.get(c, 0) / self.k)
                if c == labels[i]:
                    uci += 0.51
                membership[c] = uci
            memberships.append(membership)
        return memberships

    def _check_params(self, X, y):
        """Validate ``k`` and ``plot`` against the training targets ``y``."""
        if isinstance(self.k, bool) or not isinstance(self.k, (int, np.integer)):
            raise TypeError('"k" should have type int')
        if self.k < 1:
            raise ValueError('"k" should be a positive integer')
        if self.k >= len(y):
            raise ValueError('"k" should be less than no of feature sets')
        if self.k % 2 == 0:
            raise ValueError('"k" should be odd')
        if not isinstance(self.plot, bool):
            raise TypeError('"plot" should have type bool')

# --- Fixed KNNBayes ---
class KNNBayes(BaseEstimator, ClassifierMixin):
    """Hybrid KNN + Gaussian naive Bayes classifier (accepts numpy and pandas targets).

    ``fit`` looks up the nearest training neighbours of every training sample,
    pools those neighbour rows (with repetition) into one training set and fits
    a ``GaussianNB`` on it. Fuzzy class memberships derived from the ``k``
    nearest neighbours are computed and stored on the instance, but neither
    ``predict`` nor ``predict_proba`` reads them.

    Parameters
    ----------
    k : int, default=3
        Odd neighbourhood size used for the membership computation.
    plot : bool, default=True
        Only validated in ``fit``; nothing else in this class reads it.
    """

    # Neighbours pooled per training sample when building the NB training set.
    _N_POOL_NEIGHBORS = 14

    def __init__(self, k=3, plot=True):
        self.k = k
        self.plot = plot

    def fit(self, X, y=None):
        """Fit the neighbour index, the memberships and the pooled GaussianNB.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)
            Class labels; numpy arrays, lists and pandas Series are accepted.

        Returns
        -------
        self

        Raises
        ------
        TypeError
            If ``y`` is missing, or ``k`` / ``plot`` have the wrong type.
        ValueError
            If ``k`` is not a positive odd integer smaller than ``len(y)``.
        """
        from sklearn import neighbors
        from sklearn.naive_bayes import GaussianNB
        if y is None:
            raise TypeError('"y" is required to fit this classifier')
        X = np.asarray(X)
        # Convert the targets once instead of once per pooled neighbour.
        labels = np.asarray(y)
        # Validate before doing any work so bad parameters fail fast.
        self._check_params(X, labels)
        self.X = X
        self.y = y
        self.xdim = X.shape[1]
        self.n = len(labels)
        self.classes = [0, 1]
        self.df = pd.DataFrame(X)
        self.df['y'] = labels
        self.memberships = self._compute_memberships()
        self.df['membership'] = self.memberships

        # Clamp the pool size so training sets smaller than 14 rows still fit.
        self.neigh = neighbors.NearestNeighbors(
            n_neighbors=min(self._N_POOL_NEIGHBORS, self.n)
        )
        self.neigh.fit(X, y)
        self.result = self.neigh.kneighbors(X)
        self.label_index = self.result[1]

        # Row-major flattening keeps the order "sample by sample, neighbour by neighbour".
        pooled = self.label_index.ravel()
        self.np_label = labels[pooled]
        self.np_train = X[pooled]
        self.label = list(self.np_label)
        self.train = list(self.np_train)

        self.clf = GaussianNB()
        self.clf.fit(self.np_train, self.np_label)
        self.fitted_ = True
        return self

    def predict(self, r):
        """Return one predicted label per row of ``r``."""
        self._require_fitted('predict')
        if len(set(self.label)) == 1:
            # Only one class was seen: answer with it, one entry per query row.
            return np.full(len(r), self.label[0])
        return self.clf.predict(r)

    def predict_proba(self, X):
        """Return the class probabilities reported by the fitted GaussianNB."""
        self._require_fitted('predict_proba')
        return self.clf.predict_proba(X)

    def score(self, X, y):
        """Return the accuracy of ``predict(X)`` against ``y``."""
        self._require_fitted('score')
        predictions = np.asarray(self.predict(X))
        return accuracy_score(y_pred=predictions, y_true=y)

    def _require_fitted(self, caller):
        """Raise if ``fit`` has not completed; ``caller`` names the public method."""
        if not getattr(self, "fitted_", False):
            raise Exception(f'{caller}() called before fit()')

    def _find_k_nearest_neighbors(self, df, x):
        """Return the ``k`` rows of ``df`` closest (Euclidean) to ``x``.

        The result carries an extra ``'distances'`` column and is ordered by
        ascending distance. ``df`` itself is left unmodified.
        """
        features = df.iloc[:, 0:self.xdim].values
        distances = np.linalg.norm(features - x, axis=1)
        order = np.argsort(distances, kind='stable')[:self.k]
        nearest = df.iloc[order].copy()
        nearest['distances'] = distances[order]
        return nearest

    def _get_counts(self, neighbors):
        """Return ``{label: number of rows}`` for the given neighbour frame."""
        return {label: int(count) for label, count in neighbors.groupby('y').size().items()}

    def _compute_memberships(self):
        """Return one ``{class: membership}`` dict per training sample.

        Each class gets ``0.49 * (share of the k nearest neighbours)``; the
        sample's own class gets an additional ``0.51``.
        """
        labels = np.asarray(self.y)
        memberships = []
        for i in range(self.n):
            counts = self._get_counts(self._find_k_nearest_neighbors(self.df, self.X[i]))
            membership = {}
            for c in self.classes:
                uci = 0.49 * (counts.get(c, 0) / self.k)
                if c == labels[i]:
                    uci += 0.51
                membership[c] = uci
            memberships.append(membership)
        return memberships

    def _check_params(self, X, y):
        """Validate ``k`` and ``plot`` against the training targets ``y``."""
        if isinstance(self.k, bool) or not isinstance(self.k, (int, np.integer)):
            raise TypeError('"k" should have type int')
        if self.k < 1:
            raise ValueError('"k" should be a positive integer')
        if self.k >= len(y):
            raise ValueError('"k" should be less than no of feature sets')
        if self.k % 2 == 0:
            raise ValueError('"k" should be odd')
        if not isinstance(self.plot, bool):
            raise TypeError('"plot" should have type bool')

# --- Helper for metrics ---
def fold_metrics(y_true, y_pred, y_prob=None):
    """Compute classification metrics for one evaluation fold.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels.
        y_prob: optional 2-D array of class probabilities; column 1 is used as
            the score for the AUC. Anything else (``None``, no ``shape``
            attribute, 1-D, or a single column) leaves the AUC at 0.

    Returns:
        dict with keys ``accuracy``, ``specificity``, ``sensitivity``,
        ``gmean``, ``f1_score``, ``auc``, ``type1_error`` and ``type2_error``.
        The confusion-matrix based entries are ``nan`` when the confusion
        matrix is not 2x2.
    """
    cm = confusion_matrix(y_true, y_pred)
    if cm.shape == (2, 2):
        TN, FP, FN, TP = cm.ravel()
        # Each ratio falls back to 0 when its denominator is empty.
        specificity = TN / (TN + FP) if (TN + FP) > 0 else 0
        sensitivity = TP / (TP + FN) if (TP + FN) > 0 else 0
        gmean = np.sqrt(sensitivity * specificity)
        type1 = FP / (FP + TN) if (FP + TN) > 0 else 0
        type2 = FN / (TP + FN) if (TP + FN) > 0 else 0
    else:
        specificity = sensitivity = gmean = type1 = type2 = np.nan
    accuracy = accuracy_score(y_true, y_pred)
    fmeasure = f1_score(y_true, y_pred, pos_label=1)

    auc = 0
    # Check the rank before reading shape[1]: a 1-D probability vector has no
    # second axis, and indexing it would raise IndexError outside the try.
    has_positive_column = (
        y_prob is not None
        and hasattr(y_prob, "shape")
        and len(y_prob.shape) == 2
        and y_prob.shape[1] > 1
    )
    if has_positive_column:
        try:
            auc = roc_auc_score(y_true, y_prob[:, 1])
        except Exception:
            # Deliberate best effort: if the AUC cannot be computed for this
            # fold, report 0 rather than dropping the other metrics.
            auc = 0
    return {
        "accuracy": accuracy,
        "specificity": specificity,
        "sensitivity": sensitivity,
        "gmean": gmean,
        "f1_score": fmeasure,
        "auc": auc,
        "type1_error": type1,
        "type2_error": type2
    }

# --- 10-fold CV for KNNBayes and KNNSVM, print and save to CSV ---
def crossval_foldwise_results(models, X, y, n_splits=10, save_path='knnbayes_knnsvm_crossval.csv'):
    """Run stratified k-fold evaluation for each model and save per-fold metrics.

    Args:
        models: dict mapping a display name to an estimator instance exposing
            ``fit`` and ``predict`` (and optionally ``predict_proba``). The
            same instance is refit on every fold.
        X: feature matrix; a NumPy array or a pandas DataFrame.
        y: labels; a NumPy array or a pandas Series.
        n_splits: number of stratified folds.
        save_path: CSV file the per-fold results are written to.

    Returns:
        pandas.DataFrame with one row per successfully evaluated
        (model, fold) pair. A fold whose fit/predict/metrics step raises is
        reported on stdout and left out of the frame.
    """
    # Imported here so the helper does not depend on some other notebook cell
    # having imported it; the file's first import cell does not.
    from sklearn.model_selection import StratifiedKFold

    def take_rows(data, idx):
        # Fold indices are positional. Plain [] on a DataFrame selects columns
        # and on a Series looks up labels, so use .iloc when it exists.
        return data.iloc[idx] if hasattr(data, "iloc") else data[idx]

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    results = []
    for model_name, estimator in models.items():
        print(f'\n==== {model_name} ====')
        for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), 1):
            X_train, X_test = take_rows(X, train_idx), take_rows(X, test_idx)
            y_train, y_test = take_rows(y, train_idx), take_rows(y, test_idx)
            try:
                estimator.fit(X_train, y_train)
                y_pred = estimator.predict(X_test)
                # Probabilities are optional: without them the AUC is skipped.
                y_prob = None
                if hasattr(estimator, "predict_proba"):
                    try:
                        y_prob = estimator.predict_proba(X_test)
                    except Exception:
                        y_prob = None
                metrics = fold_metrics(y_test, y_pred, y_prob)
                metrics.update({
                    "model": model_name,
                    "fold": fold,
                })
                results.append(metrics)
                print(f"Fold {fold} : acc={metrics['accuracy']:.4f}, spec={metrics['specificity']:.4f}, sens={metrics['sensitivity']:.4f}, gmean={metrics['gmean']:.4f}, f1={metrics['f1_score']:.4f}, auc={metrics['auc']:.4f}, t1e={metrics['type1_error']:.4f}, t2e={metrics['type2_error']:.4f}")
            except Exception as e:
                # Best effort per fold: report the failure and keep going so
                # one bad fold does not discard the remaining results.
                print(f"Error in {model_name} fold {fold}: {e}")
    df = pd.DataFrame(results)
    df.to_csv(save_path, index=False)
    print(f"\nAll fold results saved to {save_path}")
    return df



In [None]:
# Standardise the resampled features, then run the 10-fold evaluation for KNNBayes.
# X_resampled / y_resampled are not defined in this cell; presumably they are the
# SMOTE output of an earlier cell (TODO confirm).
# NOTE(review): the scaler is fit on every row before the folds are split, so the
# rows later held out as test folds also contribute to the scaling statistics.
scaler_cv = StandardScaler()
X_res_cv = scaler_cv.fit_transform(X_resampled)
# Plain ndarray so the fold indices can be applied positionally.
y_res_cv = np.asarray(y_resampled)

# Display name -> estimator instance handed to crossval_foldwise_results.
crossval_models = {
"KNNBayes": KNNBayes(k=5, plot=False)
}
# NOTE(review): the KNNSVM cell passes the same save_path, so whichever cell runs
# last overwrites this CSV.
crossval_foldwise_results(crossval_models, X_res_cv, y_res_cv, n_splits=10, save_path='knnbayes_knnsvm_crossval.csv')


==== KNNBayes ====
Fold 1 : acc=0.5365, spec=0.1537, sens=0.9187, gmean=0.3758, f1=0.6649, auc=0.6376, t1e=0.8463, t2e=0.0813
Fold 2 : acc=0.5180, spec=0.1167, sens=0.9187, gmean=0.3274, f1=0.6561, auc=0.6307, t1e=0.8833, t2e=0.0813
Fold 3 : acc=0.5190, spec=0.1241, sens=0.9131, gmean=0.3366, f1=0.6552, auc=0.6256, t1e=0.8759, t2e=0.0869
Fold 4 : acc=0.5282, spec=0.1481, sens=0.9076, gmean=0.3667, f1=0.6582, auc=0.6327, t1e=0.8519, t2e=0.0924
Fold 5 : acc=0.5282, spec=0.1352, sens=0.9205, gmean=0.3528, f1=0.6614, auc=0.6325, t1e=0.8648, t2e=0.0795
Fold 6 : acc=0.5356, spec=0.1553, sens=0.9167, gmean=0.3773, f1=0.6635, auc=0.6342, t1e=0.8447, t2e=0.0833
Fold 7 : acc=0.5217, spec=0.1165, sens=0.9278, gmean=0.3287, f1=0.6596, auc=0.6307, t1e=0.8835, t2e=0.0722
Fold 8 : acc=0.5356, spec=0.1479, sens=0.9241, gmean=0.3697, f1=0.6653, auc=0.6284, t1e=0.8521, t2e=0.0759
Fold 9 : acc=0.5356, spec=0.1479, sens=0.9241, gmean=0.3697, f1=0.6653, auc=0.6209, t1e=0.8521, t2e=0.0759
Fold 10 : acc=0.5

In [None]:
# Standardise the resampled features, then run the 10-fold evaluation for KNNSVM.
# X_resampled / y_resampled are not defined in this cell; presumably they are the
# SMOTE output of an earlier cell (TODO confirm).
# NOTE(review): the scaler is fit on every row before the folds are split, so the
# rows later held out as test folds also contribute to the scaling statistics.
scaler_cv = StandardScaler()
X_res_cv = scaler_cv.fit_transform(X_resampled)
# Plain ndarray so the fold indices can be applied positionally.
y_res_cv = np.asarray(y_resampled)

# Display name -> estimator instance handed to crossval_foldwise_results.
crossval_models = {
"KNNSVM": KNNSVM(k=5, plot=False)
}
# NOTE(review): this writes to the same save_path as the KNNBayes cell, so the
# CSV only ever holds the results of whichever cell ran last.
crossval_foldwise_results(crossval_models, X_res_cv, y_res_cv, n_splits=10, save_path='knnbayes_knnsvm_crossval.csv')