<a href="https://colab.research.google.com/github/muajnstu/DSK-Chain-to-predict-diabeties-/blob/main/Exploring_Voting_%26_Stacking_KNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE

In [None]:
def print_metrics(y_true, y_pred, y_prob=None):
    cm = confusion_matrix(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)

    num_classes = cm.shape[0]

    if num_classes == 2:
        # Binary classification
        TN, FP, FN, TP = cm.ravel()
        specificity = TN / (TN + FP) if (TN + FP) > 0 else 0
        sensitivity = TP / (TP + FN) if (TP + FN) > 0 else 0
        gmean = np.sqrt(specificity * sensitivity)
        type1 = FP / (FP + TN) if (FP + TN) > 0 else 0
        type2 = FN / (TP + FN) if (TP + FN) > 0 else 0
        fmeasure = f1_score(y_true, y_pred, pos_label=1)
        auc = 0
        if y_prob is not None and hasattr(y_prob, "shape") and y_prob.shape[1] > 1:
            try:
                auc = roc_auc_score(y_true, y_prob[:, 1])
            except Exception:
                auc = 0

    else:
        # Multiclass classification
        TP = np.diag(cm)
        FP = np.sum(cm, axis=0) - TP
        FN = np.sum(cm, axis=1) - TP
        TN = np.sum(cm) - (FP + FN + TP)

        specificity = np.mean([
            TN[i] / (TN[i] + FP[i]) if (TN[i] + FP[i]) > 0 else 0 for i in range(num_classes)
        ])
        sensitivity = np.mean([
            TP[i] / (TP[i] + FN[i]) if (TP[i] + FN[i]) > 0 else 0 for i in range(num_classes)
        ])
        gmean = np.sqrt(specificity * sensitivity)
        type1 = np.mean([
            FP[i] / (FP[i] + TN[i]) if (FP[i] + TN[i]) > 0 else 0 for i in range(num_classes)
        ])
        type2 = np.mean([
            FN[i] / (TP[i] + FN[i]) if (TP[i] + FN[i]) > 0 else 0 for i in range(num_classes)
        ])
        fmeasure = f1_score(y_true, y_pred, average='macro')

        auc = 0
        if y_prob is not None and hasattr(y_prob, "shape") and y_prob.shape[1] > 1:
            try:
                auc = roc_auc_score(y_true, y_prob, multi_class='ovr', average='macro')
            except Exception:
                auc = 0

    # Print or return results
    print(f"Accuracy      : {accuracy:.4f}")
    print(f"Sensitivity   : {sensitivity:.4f}")
    print(f"Specificity   : {specificity:.4f}")
    print(f"G-Mean        : {gmean:.4f}")
    print(f"Type I Error  : {type1:.4f}")
    print(f"Type II Error : {type2:.4f}")
    print(f"F1 Score      : {fmeasure:.4f}")
    print(f"AUROC         : {auc:.4f}")

def run_knn_variant(name, knn_clf):
    print(f"\n==== {name} ====")
    knn_clf.fit(X_train, y_train)
    y_pred = knn_clf.predict(X_test)
    if hasattr(knn_clf, "predict_proba"):
        try:
            y_prob = knn_clf.predict_proba(X_test)
        except Exception:
            y_prob = None
    else:
        y_prob = None
    print_metrics(y_test, y_pred, y_prob)

In [None]:
# Load data
df = pd.read_csv('https://raw.githubusercontent.com/muajnstu/ML-Datasets/refs/heads/main/filtered_df.csv')
X = df.drop(columns=['Cluster'])
y = df['Cluster']

# Handle Imbalanced Data
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=46, stratify=y_resampled
)

In [None]:
# Recalculate covariance and variance with the current X_train after SMOTE
covariance_matrix = np.cov(X_train.T)
stabilized_covariance_matrix = covariance_matrix + np.eye(covariance_matrix.shape[0]) * 1e-6
inv_covariance_matrix = np.linalg.inv(stabilized_covariance_matrix)
variance_vector = np.var(X_train, axis=0)

knn_variants = {
    "KNN": KNeighborsClassifier(n_neighbors=3),
    "DistanceKNN": KNeighborsClassifier(n_neighbors=3, weights='distance'),
    "GeneralizedKNN": KNeighborsClassifier(n_neighbors=3, metric='minkowski', p=3),
    "EuclideanKNN": KNeighborsClassifier(n_neighbors=3, metric='euclidean'),
    "ManhattanKNN": KNeighborsClassifier(n_neighbors=3, metric='manhattan'),
    "ChebyshevKNN": KNeighborsClassifier(n_neighbors=3, metric='chebyshev'),
    "MahalanobisKNN": KNeighborsClassifier(n_neighbors=3, metric='mahalanobis', metric_params={'VI': inv_covariance_matrix}),
    "SeuclideanKNN": KNeighborsClassifier(n_neighbors=3, metric='seuclidean', metric_params={'V': variance_vector}),
    "WminkowskiKNN": KNeighborsClassifier(n_neighbors=3, metric='minkowski', p=3, metric_params={'w': np.ones(X_train.shape[1])})
}

# Evaluate all KNN variants
for name, model in knn_variants.items():
    run_knn_variant(name, model)




# Prepare list of (name, estimator) tuples for ensemble
estimators = [(name, model) for name, model in knn_variants.items()]

# Voting Ensemble
voting_clf = VotingClassifier(estimators=estimators, voting='soft')
print("\n==== Voting Ensemble ====")
voting_clf.fit(X_train, y_train)
y_pred_voting = voting_clf.predict(X_test)
try:
    y_prob_voting = voting_clf.predict_proba(X_test)
except Exception:
    y_prob_voting = None
print_metrics(y_test, y_pred_voting, y_prob_voting)

# Stacking Ensemble (Logistic Regression as final estimator)
stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=KNeighborsClassifier(n_neighbors=3, metric='mahalanobis', metric_params={'VI': inv_covariance_matrix}),
    passthrough=False,
    cv=5
)
print("\n==== Stacking Ensemble ====")
stacking_clf.fit(X_train, y_train)
y_pred_stacking = stacking_clf.predict(X_test)
try:
    y_prob_stacking = stacking_clf.predict_proba(X_test)
except Exception:
    y_prob_stacking = None
print_metrics(y_test, y_pred_stacking, y_prob_stacking)

print("\nProject complete! Check the above output for performance of each KNN variant and ensemble models.")


==== KNN ====
Accuracy      : 0.9217
Sensitivity   : 0.9217
Specificity   : 0.9608
G-Mean        : 0.9411
Type I Error  : 0.0392
Type II Error : 0.0783
F1 Score      : 0.9205
AUROC         : 0.9648

==== DistanceKNN ====
Accuracy      : 0.9276
Sensitivity   : 0.9276
Specificity   : 0.9638
G-Mean        : 0.9455
Type I Error  : 0.0362
Type II Error : 0.0724
F1 Score      : 0.9264
AUROC         : 0.9667

==== GeneralizedKNN ====
Accuracy      : 0.9268
Sensitivity   : 0.9268
Specificity   : 0.9634
G-Mean        : 0.9449
Type I Error  : 0.0366
Type II Error : 0.0732
F1 Score      : 0.9260
AUROC         : 0.9673

==== EuclideanKNN ====
Accuracy      : 0.9217
Sensitivity   : 0.9217
Specificity   : 0.9608
G-Mean        : 0.9411
Type I Error  : 0.0392
Type II Error : 0.0783
F1 Score      : 0.9205
AUROC         : 0.9648

==== ManhattanKNN ====
Accuracy      : 0.8881
Sensitivity   : 0.8881
Specificity   : 0.9440
G-Mean        : 0.9156
Type I Error  : 0.0560
Type II Error : 0.1119
F1 Score      

  return self._fit(X, y)


Accuracy      : 0.9351
Sensitivity   : 0.9351
Specificity   : 0.9676
G-Mean        : 0.9512
Type I Error  : 0.0324
Type II Error : 0.0649
F1 Score      : 0.9339
AUROC         : 0.9906

==== Stacking Ensemble ====


  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


Accuracy      : 0.9703
Sensitivity   : 0.9703
Specificity   : 0.9852
G-Mean        : 0.9777
Type I Error  : 0.0148
Type II Error : 0.0297
F1 Score      : 0.9702
AUROC         : 0.9874

Project complete! Check the above output for performance of each KNN variant and ensemble models.
