<a href="https://colab.research.google.com/github/muajnstu/Comparative-Analysis-of-KNN-Variants-for-Diabetes-Prediction-Using-Administrative-Health-data/blob/main/SHAP_Summary_plot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import shap
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# 1. Load data
# 'Cluster' is the target column; every other column is used as a feature.
df = pd.read_csv('https://raw.githubusercontent.com/muajnstu/ML-Datasets/refs/heads/main/filtered_df.csv')
X = df.drop(columns=['Cluster'])
y = df['Cluster']

# 2. Train/Test split on the ORIGINAL (un-resampled) data.
# The split must happen before oversampling: SMOTE synthesises new rows by
# interpolating between neighbours, so resampling first would let information
# from test rows leak into the training set and would put synthetic rows into
# the test set.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=46,
    stratify=y
)

# 3. Handle imbalanced data with SMOTE -- fitted on the training portion only.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_raw, y_train_raw)

# Downstream cells read X_train / y_train; they now refer to the balanced
# training set, while X_test / y_test remain untouched real observations.
X_train, y_train = X_resampled, y_resampled


In [None]:
# Covariance / variance statistics derived from the SMOTE-balanced training split.
features_as_rows = X_train.T  # np.cov expects one variable per row
covariance_matrix = np.cov(features_as_rows)

# Add a small ridge (1e-6 on the diagonal) before inverting the matrix.
n_features = covariance_matrix.shape[0]
ridge = np.eye(n_features) * 1e-6
stabilized_covariance_matrix = covariance_matrix + ridge
inv_covariance_matrix = np.linalg.inv(stabilized_covariance_matrix)

# Per-feature variance, passed to the standardized Euclidean metric as V.
variance_vector = np.var(X_train, axis=0)

# Fit best performing model: 3-NN with the standardized Euclidean distance.
knn_settings = {
    'n_neighbors': 3,
    'metric': 'seuclidean',
    'metric_params': {'V': variance_vector},
}
knn = KNeighborsClassifier(**knn_settings)
knn.fit(X_train, y_train)

def predict_fn(X):
    """Return class-probability predictions of the fitted ``knn`` model for ``X``.

    This is the model function handed to ``shap.KernelExplainer``.

    Parameters
    ----------
    X : array-like or pandas.DataFrame
        Two-dimensional batch of samples, one row per sample.

    Returns
    -------
    numpy.ndarray
        Output of ``knn.predict_proba`` for the given rows.
    """
    # The explainer usually passes bare ndarrays; if the classifier was fitted
    # on a DataFrame, scikit-learn then warns about missing feature names on
    # every call. Re-attaching the fitted column names avoids that without
    # changing the predictions.
    fitted_names = getattr(knn, "feature_names_in_", None)
    if fitted_names is not None and not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(np.asarray(X), columns=fitted_names)
    return knn.predict_proba(X)

# Summarise the training data with 50 k-means centroids to use as the
# background set for the explainer.
background = shap.kmeans(X_train, 50)

# SHAP KernelExplainer
explainer = shap.KernelExplainer(predict_fn, background)
# NOTE(review): every row of X_test is explained; KernelExplainer is
# model-agnostic and slow, so consider a subsample if the run time is prohibitive.
X_shap = X_test

# Compute SHAP values
shap_values = explainer.shap_values(X_shap)

# Older SHAP releases return a list with one (n_samples, n_features) array per
# class; newer releases (0.45+) return a single array of shape
# (n_samples, n_features, n_classes). Normalise to the per-class list so that
# `shap_values[class_idx]` always selects a class rather than a sample.
if not isinstance(shap_values, list):
    _shap_array = np.asarray(shap_values)
    if _shap_array.ndim == 3:
        shap_values = [_shap_array[:, :, k] for k in range(_shap_array.shape[2])]
    else:
        shap_values = [_shap_array]

# For multiclass:
class_idx = 0  # Choose the class of interest, e.g., 0
shap.summary_plot(shap_values[class_idx], X_shap, feature_names=X.columns, show=True)

# Local feature importance: waterfall plot for a specific test sample
i = 5  # index of test sample (change as needed)

# Select the (n_samples, n_features) SHAP matrix for the class of interest.
# Handles both return layouts of `explainer.shap_values`: a list with one array
# per class, or a single (n_samples, n_features, n_classes) array.
if isinstance(shap_values, list):
    class_shap_values = np.asarray(shap_values[class_idx])
else:
    _shap_array = np.asarray(shap_values)
    class_shap_values = _shap_array[:, :, class_idx] if _shap_array.ndim == 3 else _shap_array

shap.waterfall_plot(shap.Explanation(
    values=class_shap_values[i],
    # atleast_1d keeps the indexing valid even if expected_value is a scalar.
    base_values=np.atleast_1d(explainer.expected_value)[class_idx],
    # Pass a plain ndarray: the plot looks features up by integer position,
    # which is unreliable on a label-indexed pandas Series.
    data=X_shap.iloc[i].to_numpy(),
    feature_names=X.columns.tolist()
))



  0%|          | 0/2528 [00:00<?, ?it/s]

