In [None]:
### XAI for Random Forest (second best)
import numpy as np
import pandas as pd
import joblib
import shap
import lime.lime_tabular
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.inspection import permutation_importance, PartialDependenceDisplay
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# -------------------------------
# 🔹 Load Data and Preprocess
# -------------------------------
# Keep only the numeric predictor columns (missing values are replaced by 0)
# and encode the target as 1 for 'Glaucoma' and 0 for any other label.
data = pd.read_csv("/kaggle/input/glaucoma/glaucoma_dataset.csv")
X = data.drop('Diagnosis', axis=1).select_dtypes(include=['number']).fillna(0)
y = (data['Diagnosis'] == 'Glaucoma').astype(int)

# Convert DataFrame to NumPy array for model compatibility
X_numpy = X.values

# Split Data (stratified on y, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X_numpy, y, test_size=0.2, random_state=42, stratify=y)

# Scale Data: the scaler is fitted on the training split only and then
# applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Load Pretrained Random Forest Model
# NOTE(review): joblib.load unpickles the file; only load model files you trust.
# NOTE(review): the model is accessed via named_steps below, so it looks like a
# Pipeline. If that pipeline contains its own scaler, feeding it pre-scaled
# data would scale twice - confirm against the training notebook.
rf_model = joblib.load("glaucoma_model_Random_Forest.joblib")

# Select a Sample for Explanation
sample_idx = 10  # Fixed test-row index (not random), so the explanation is reproducible
# The row must come from the *scaled* test split: the LIME explainer is built
# on X_train_scaled and every other model call in this cell uses scaled data,
# so a raw row would be explained far outside the distribution LIME samples from.
X_sample = X_test_scaled[sample_idx].reshape(1, -1)  # Single instance as a (1, n_features) array

# -------------------------------
# 🔹 1. Feature Importance (Random Forest)
# -------------------------------
# Importances are read from the pipeline's 'model' step, labelled with the
# predictor column names and ordered from most to least important.
feature_importance = pd.Series(
    rf_model.named_steps['model'].feature_importances_,
    index=X.columns,
).sort_values(ascending=False)

# Horizontal bar chart of the ten highest-ranked features.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(
    x=feature_importance.iloc[:10],
    y=feature_importance.iloc[:10].index,
    palette="coolwarm",
    ax=ax,
)
ax.set_xlabel("Feature Importance Score")
ax.set_title("Top 10 Feature Importance (Random Forest)")
plt.show()

# -------------------------------
# 🔹 2. Partial Dependence Plot (PDP)
# -------------------------------
# feature_importance is indexed by X.columns and sorted in descending order,
# so its first two labels are the two most important features.
valid_features = list(feature_importance.index[:2])

if valid_features:
    # from_estimator() computes the partial dependence AND draws the figure.
    # The previous extra .plot() call rendered a second, duplicate figure and
    # left the first one untitled.
    # The result is not stored as `display`, because that name would shadow
    # IPython's built-in display() for the rest of the kernel session.
    pdp_display = PartialDependenceDisplay.from_estimator(
        rf_model.named_steps['model'],
        X_train_scaled,
        features=[X.columns.get_loc(f) for f in valid_features],
        # The data is a plain NumPy array, so pass the column names explicitly
        # to get readable axis labels.
        feature_names=list(X.columns),
        grid_resolution=50,
    )
    pdp_display.figure_.suptitle("Partial Dependence Plots (Top 2 Features)")
    plt.show()
else:
    print("No valid features found for Partial Dependence Plot.")

# -------------------------------
# 🔹 3. Permutation Importance
# -------------------------------
# Change in accuracy on the scaled test split when each feature is shuffled
# (10 repeats, fixed seed), tabulated per feature and ranked by mean value.
perm_importance = permutation_importance(
    rf_model,
    X_test_scaled,
    y_test,
    scoring='accuracy',
    n_repeats=10,
    random_state=42,
)
perm_importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': perm_importance.importances_mean}
).sort_values(by='Importance', ascending=False)

# Bar chart of the ten highest-ranked rows.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(
    x=perm_importance_df["Importance"].iloc[:10],
    y=perm_importance_df["Feature"].iloc[:10],
    palette="coolwarm",
    ax=ax,
)
ax.set_xlabel("Permutation Importance Score")
ax.set_title("Top 10 Permutation Importance (Random Forest)")
plt.show()

# -------------------------------
# 🔹 4. LIME Explanation (for one sample)
# -------------------------------
# The explainer is built on the scaled training split; the selected sample is
# then explained with the model's predict_proba, reporting five features.
explainer = lime.lime_tabular.LimeTabularExplainer(
    X_train_scaled,
    feature_names=X.columns,
    class_names=['No Glaucoma', 'Glaucoma'],
    discretize_continuous=True,
)
exp = explainer.explain_instance(
    X_sample[0],
    rf_model.predict_proba,
    num_features=5,
)
exp.show_in_notebook()

# -------------------------------
# 🔹 5. SHAP Explanation (Limited Visualizations)
# -------------------------------
# Explainer is constructed from predict_proba together with the scaled
# training split, then evaluated on the scaled test split.
explainer = shap.Explainer(rf_model.predict_proba, X_train_scaled)
shap_values = explainer(X_test_scaled)

# Summary Plot (Top 10 features only)
# Index 1 on the last axis selects the output for class 1 (Glaucoma).
shap.summary_plot(
    shap_values[..., 1],
    X_test_scaled,
    feature_names=X.columns,
    max_display=10,
)


In [None]:
###XAI on SVM (second best)
import numpy as np
import pandas as pd
import joblib
import shap
import lime.lime_tabular
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.inspection import permutation_importance, PartialDependenceDisplay
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# -------------------------------
# 🔹 Load Data and Preprocess
# -------------------------------
# Keep only the numeric predictor columns (missing values are replaced by 0)
# and encode the target as 1 for 'Glaucoma' and 0 for any other label.
data = pd.read_csv("/kaggle/input/glaucoma/glaucoma_dataset.csv")
X = data.drop('Diagnosis', axis=1).select_dtypes(include=['number']).fillna(0)
y = (data['Diagnosis'] == 'Glaucoma').astype(int)

# Convert DataFrame to NumPy array for model compatibility
X_numpy = X.values

# Split Data (stratified on y, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X_numpy, y, test_size=0.2, random_state=42, stratify=y)

# Scale Data: the scaler is fitted on the training split only and then
# applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Load Pretrained SVM Model
# NOTE(review): joblib.load unpickles the file; only load model files you trust.
svm_model = joblib.load("glaucoma_model_SVM.joblib")

# Select a Sample for Explanation
sample_idx = 10  # Fixed test-row index (not random), so the explanation is reproducible
# The row must come from the *scaled* test split: the LIME explainer is built
# on X_train_scaled and every other model call in this cell uses scaled data,
# so a raw row would be explained far outside the distribution LIME samples from.
X_sample = X_test_scaled[sample_idx].reshape(1, -1)  # Single instance as a (1, n_features) array

# -------------------------------
# 🔹 1. Feature Importance via Permutation Importance (SVM has no built-in feature importance)
# -------------------------------
# Change in accuracy on the scaled test split when each feature is shuffled
# (10 repeats, fixed seed), tabulated per feature and ranked by mean value.
perm_importance = permutation_importance(
    svm_model,
    X_test_scaled,
    y_test,
    scoring='accuracy',
    n_repeats=10,
    random_state=42,
)
perm_importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': perm_importance.importances_mean}
).sort_values(by='Importance', ascending=False)

# Bar chart of the ten highest-ranked rows.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(
    x=perm_importance_df["Importance"].iloc[:10],
    y=perm_importance_df["Feature"].iloc[:10],
    palette="coolwarm",
    ax=ax,
)
ax.set_xlabel("Permutation Importance Score")
ax.set_title("Top 10 Feature Importance (SVM)")
plt.show()

# -------------------------------
# 🔹 2. Partial Dependence Plot (PDP)
# -------------------------------
# perm_importance_df is sorted by importance (descending), so its first two
# 'Feature' entries are the two highest-ranked features.
valid_features = perm_importance_df["Feature"][:2].tolist()

if valid_features:
    # from_estimator() computes the partial dependence AND draws the figure.
    # The previous extra .plot() call rendered a second, duplicate figure and
    # left the first one untitled.
    # The result is not stored as `display`, because that name would shadow
    # IPython's built-in display() for the rest of the kernel session.
    pdp_display = PartialDependenceDisplay.from_estimator(
        svm_model,
        X_train_scaled,
        features=[X.columns.get_loc(f) for f in valid_features],
        # The data is a plain NumPy array, so pass the column names explicitly
        # to get readable axis labels.
        feature_names=list(X.columns),
        grid_resolution=50,
    )
    pdp_display.figure_.suptitle("Partial Dependence Plots (Top 2 Features)")
    plt.show()
else:
    print("No valid features found for Partial Dependence Plot.")

# -------------------------------
# 🔹 3. Permutation Importance (Direct Interpretation)
# -------------------------------
# Re-plots the ten highest-ranked rows of perm_importance_df (computed in
# section 1 of this cell) under a separate title.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(
    x=perm_importance_df["Importance"].iloc[:10],
    y=perm_importance_df["Feature"].iloc[:10],
    palette="coolwarm",
    ax=ax,
)
ax.set_xlabel("Permutation Importance Score")
ax.set_title("Top 10 Permutation Importance (SVM Model)")
plt.show()

# -------------------------------
# 🔹 4. LIME Explanation (for one sample)
# -------------------------------
def svm_predict_proba(rows):
    """Return an (n_samples, 2) array of class scores for LIME.

    LIME's classification mode requires a 2-D matrix with one column per
    class; the 1-D output of ``decision_function`` makes ``explain_instance``
    raise NotImplementedError.

    Parameters
    ----------
    rows : array-like of shape (n_samples, n_features)
        Samples in the same (scaled) feature space the model is queried with.

    Returns
    -------
    numpy.ndarray of shape (n_samples, 2)
        Columns are ordered [class 0 ('No Glaucoma'), class 1 ('Glaucoma')].
        These are the model's own probabilities when it exposes
        ``predict_proba``; otherwise the signed margin squashed through a
        logistic sigmoid. The fallback values are NOT calibrated
        probabilities; they only preserve the ranking and sign of the margin.
    """
    if hasattr(svm_model, "predict_proba"):
        return svm_model.predict_proba(rows)
    margin = np.ravel(svm_model.decision_function(rows))
    # tanh form of the logistic sigmoid: avoids overflow for large |margin|.
    positive = 0.5 * (1.0 + np.tanh(0.5 * margin))
    return np.column_stack([1.0 - positive, positive])


explainer = lime.lime_tabular.LimeTabularExplainer(
    X_train_scaled,
    feature_names=list(X.columns),
    class_names=['No Glaucoma', 'Glaucoma'],
    discretize_continuous=True,
)
exp = explainer.explain_instance(X_sample[0], svm_predict_proba, num_features=5)
exp.show_in_notebook()

# -------------------------------
# 🔹 5. SHAP Explanation (Limited Visualizations)
# -------------------------------
# Explainer is constructed from decision_function together with the scaled
# training split, then evaluated on the scaled test split. The explained
# quantity is therefore the SVM margin, not a probability.
explainer = shap.Explainer(svm_model.decision_function, X_train_scaled)
shap_values = explainer(X_test_scaled)

# Summary Plot (Top 10 features only)
# Fixes the `X_test_scale` typo (NameError). Column names and max_display are
# passed so the plot is labelled and limited to ten features as intended.
shap.summary_plot(
    shap_values,
    X_test_scaled,
    feature_names=list(X.columns),
    max_display=10,
)

In [None]:
### XAI on knn model(best performing model)
import numpy as np
import pandas as pd
import joblib
import shap

import lime.lime_tabular
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance, PartialDependenceDisplay
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# -------------------------------
# 🔹 Load Data and Preprocess
# -------------------------------
# Keep only the numeric predictor columns (missing values are replaced by 0)
# and encode the target as 1 for 'Glaucoma' and 0 for any other label.
data = pd.read_csv("/kaggle/input/glaucoma/glaucoma_dataset.csv")
X = data.drop('Diagnosis', axis=1).select_dtypes(include=['number']).fillna(0)
y = (data['Diagnosis'] == 'Glaucoma').astype(int)

# Convert DataFrame to NumPy array for KNN model
X_numpy = X.values

# Split Data (stratified on y, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X_numpy, y, test_size=0.2, random_state=42, stratify=y)

# Scale Data: the scaler is fitted on the training split only and then
# applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Load Pretrained KNN Model
# NOTE(review): joblib.load unpickles the file; only load model files you trust.
knn_model = joblib.load("glaucoma_model_KNN.joblib")

# Train a Random Forest for Feature Importance (since KNN lacks built-in feature importance)
# NOTE: this forest is fitted on the true labels, not on the KNN's predictions,
# so its importances describe the forest/data rather than the KNN's behaviour.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)

# Select a Sample for Explanation
sample_idx = 10  # Fixed test-row index (not random), so the explanation is reproducible
# The row must come from the *scaled* test split: the LIME explainer is built
# on X_train_scaled and every other model call in this cell uses scaled data,
# so a raw row would be explained far outside the distribution LIME samples from.
X_sample = X_test_scaled[sample_idx].reshape(1, -1)  # Single instance as a (1, n_features) array

# -------------------------------
# 🔹 1. Feature Importance (Random Forest)
# -------------------------------
# Importances of the helper forest, labelled with the predictor column names
# and ordered from most to least important.
feature_importance = pd.Series(
    rf_model.feature_importances_,
    index=X.columns,
).sort_values(ascending=False)

# Horizontal bar chart of the ten highest-ranked features.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(
    x=feature_importance.iloc[:10],
    y=feature_importance.iloc[:10].index,
    palette="coolwarm",
    ax=ax,
)
ax.set_xlabel("Feature Importance Score")
ax.set_title("Top 10 Feature Importance (Random Forest)")
plt.show()

# -------------------------------
# 🔹 2. Partial Dependence Plot (PDP)
# -------------------------------
# This section previously appeared twice verbatim, recomputing and redrawing
# the same plots; it now runs once.
# feature_importance is indexed by X.columns and sorted in descending order,
# so its first two labels are the two most important features.
valid_features = list(feature_importance.index[:2])

if valid_features:
    # from_estimator() computes the partial dependence AND draws the figure.
    # The previous extra .plot() call rendered yet another duplicate figure
    # and left the first one untitled.
    # The result is not stored as `display`, because that name would shadow
    # IPython's built-in display() for the rest of the kernel session.
    # NOTE: the dependence shown is that of the helper Random Forest
    # (rf_model), not of the KNN model.
    pdp_display = PartialDependenceDisplay.from_estimator(
        rf_model,
        X_train_scaled,
        features=[X.columns.get_loc(f) for f in valid_features],
        # The data is a plain NumPy array, so pass the column names explicitly
        # to get readable axis labels.
        feature_names=list(X.columns),
        grid_resolution=50,
    )
    pdp_display.figure_.suptitle("Partial Dependence Plots (Top 2 Features)")
    plt.show()
else:
    print("No valid features found for Partial Dependence Plot.")
# -------------------------------
# 🔹 3. Permutation Importance
# -------------------------------
# Change in the KNN model's accuracy on the scaled test split when each
# feature is shuffled (10 repeats, fixed seed), ranked by mean value.
perm_importance = permutation_importance(
    knn_model,
    X_test_scaled,
    y_test,
    scoring='accuracy',
    n_repeats=10,
    random_state=42,
)
perm_importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': perm_importance.importances_mean}
).sort_values(by='Importance', ascending=False)

# Bar chart of the ten highest-ranked rows.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(
    x=perm_importance_df["Importance"].iloc[:10],
    y=perm_importance_df["Feature"].iloc[:10],
    palette="coolwarm",
    ax=ax,
)
ax.set_xlabel("Permutation Importance Score")
ax.set_title("Top 10 Permutation Importance (KNN Model)")
plt.show()
# -------------------------------
# 🔹 4. LIME Explanation (for one sample)
# -------------------------------
# The explainer is built on the scaled training split; the selected sample is
# then explained with the KNN model's predict_proba, reporting five features.
explainer = lime.lime_tabular.LimeTabularExplainer(
    X_train_scaled,
    feature_names=X.columns,
    class_names=['No Glaucoma', 'Glaucoma'],
    discretize_continuous=True,
)
exp = explainer.explain_instance(
    X_sample[0],
    knn_model.predict_proba,
    num_features=5,
)
exp.show_in_notebook()
# -------------------------------
# 🔹 5. SHAP Explanation (Limited Visualizations)
# -------------------------------
# Explainer is constructed from the KNN model's predict_proba together with
# the scaled training split, then evaluated on the scaled test split.
# NOTE(review): nothing visible here plots shap_values (the other model cells
# call shap.summary_plot) - confirm whether a plot follows or is missing.
explainer = shap.Explainer(
    knn_model.predict_proba,
    X_train_scaled,
)
shap_values = explainer(X_test_scaled)
