In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt

import json

import kagglehub


from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import ParameterGrid

from interpret.glassbox import ExplainableBoostingClassifier
from interpret import show

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Download latest version
path = kagglehub.dataset_download("rabieelkharoua/alzheimers-disease-dataset")
# os.listdir returns entries in arbitrary order, so sort for a stable listing
files = sorted(os.listdir(path))
print("Content of", files)

# Pick the CSV explicitly instead of trusting whatever happens to be listed first
csv_files = [name for name in files if name.lower().endswith(".csv")]
if not csv_files:
    raise FileNotFoundError(f"No CSV file found in downloaded dataset folder: {path}")
csv_file = csv_files[0]
csv_path = os.path.join(path, csv_file)

# Load DataFrame
df = pd.read_csv(csv_path)

Content of ['alzheimers_disease_data.csv']


In [3]:
# Preprocessing

# Drop the columns that are not used as model inputs
# (errors="ignore" skips any of them that are already absent)
df = df.drop(["PatientID", "DoctorInCharge"], axis=1, errors="ignore")

# Separate the target variable (Y) from the feature columns (X)
Y = df["Diagnosis"]
X = df.drop("Diagnosis", axis=1)


# Label encoding
def change_labels(X):
    """Replace integer category codes with human-readable labels.

    For every known categorical column that is present in ``X``, the code
    ``i`` is replaced by the ``i``-th entry of that column's label list.
    Values without an entry in the mapping, and columns that are not listed
    below, are left unchanged.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. Presumably holds the dataset's integer category
        codes (0, 1, ...) in the listed columns -- TODO confirm against the
        dataset description.

    Returns
    -------
    pandas.DataFrame
        A relabelled copy of ``X``. The frame passed in is not modified.
    """
    custom_labels = {
        "Gender": ["Male", "Female"],
        "Ethnicity": ["Caucasian", "African American", "Asian", "Other"],
        "EducationLevel": ["None", "High School", "Bachelor's", "Higher"],
        "Smoking": ["No", "Yes"],
        "FamilyHistoryAlzheimers": ["No", "Yes"],
        "CardiovascularDisease": ["No", "Yes"],
        "Diabetes": ["No", "Yes"],
        "Depression": ["No", "Yes"],
        "HeadInjury": ["No", "Yes"],
        "Hypertension": ["No", "Yes"],
        "MemoryComplaints": ["No", "Yes"],
        "BehavioralProblems": ["No", "Yes"],
        "Confusion": ["No", "Yes"],
        "Disorientation": ["No", "Yes"],
        "PersonalityChanges": ["No", "Yes"],
        "DifficultyCompletingTasks": ["No", "Yes"],
        "Forgetfulness": ["No", "Yes"],
    }

    # Work on a copy so the caller's DataFrame is never mutated as a side
    # effect; callers receive the relabelled frame through the return value.
    X = X.copy()

    for column, labels in custom_labels.items():
        if column in X.columns:
            # Position in the label list is the numeric code: {0: labels[0], 1: labels[1], ...}
            label_mapping = dict(enumerate(labels))
            # replace() (unlike map()) keeps values that have no mapping entry
            X[column] = X[column].replace(label_mapping)
    return X


# Swap the numeric category codes for their readable labels
X = change_labels(X)

# Hold out 20% of the rows as a test set; stratifying on Y keeps the
# class proportions the same in the training and test portions
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    stratify=Y,
    random_state=42,
    test_size=0.2,
)

In [63]:
# Train the baseline Explainable Boosting Classifier on the training split
# (fit() returns the estimator itself, so construction and fitting are chained)
ebm_model = ExplainableBoostingClassifier(random_state=42).fit(X_train, Y_train)

# Score the held-out test set: positive-class probabilities and hard labels
Y_pred_proba = ebm_model.predict_proba(X_test)[:, 1]
Y_pred = ebm_model.predict(X_test)

# Report per-class metrics followed by the ROC AUC
print("EBM - Classification Report:")
print(classification_report(Y_test, Y_pred))
print(f"EBM - AUC ROC: {roc_auc_score(Y_test, Y_pred_proba):.4f}")

EBM - Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       278
           1       0.93      0.93      0.93       152

    accuracy                           0.95       430
   macro avg       0.94      0.94      0.94       430
weighted avg       0.95      0.95      0.95       430

EBM - AUC ROC: 0.9404


In [None]:
# Define the (unfitted) template model for the search.
# It gets its own name on purpose: GridSearchCV only fits clones of the
# estimator it is given, so rebinding `ebm_model` here would replace the
# fitted baseline model with an unfitted one for every later cell.
ebm_base = ExplainableBoostingClassifier(random_state=42)

# Define the hyperparameter grid
# 6 parameters x 3 values each = 729 candidates; with cv=3 that is 2187 fits
# plus one final refit of the best candidate, so this cell is expensive.
param_grid = {
    "max_bins": [128, 256, 512],  # Number of bins for discretization
    "max_interaction_bins": [32, 64, 128],  # Number of bins for interactions
    "interactions": [0, 10, 50],  # Number of interactions
    "learning_rate": [0.01, 0.05, 0.1],  # Learning rate
    "min_samples_leaf": [2, 10, 20],  # Minimum number of samples per leaf
    "max_leaves": [3, 5, 10],  # Maximum number of leaves per tree
}

# Use GridSearchCV
grid_search = GridSearchCV(
    estimator=ebm_base,
    param_grid=param_grid,
    scoring="roc_auc",  # Metric for evaluation
    cv=3,  # 3-fold cross-validation
    verbose=1,  # Verbosity level
    n_jobs=-1,  # Use all available processors
)

# Fit GridSearchCV (cross-validation uses the training split only)
grid_search.fit(X_train, Y_train)

# Best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)

# Best AUC score (mean cross-validated score of the best candidate)
print("Best AUC score:", grid_search.best_score_)

# Evaluate the best model on the test set
# best_estimator_ is the best candidate refitted on the whole training split
best_ebm_model = grid_search.best_estimator_
Y_pred = best_ebm_model.predict(X_test)
Y_pred_proba = best_ebm_model.predict_proba(X_test)[:, 1]

print("EBM - Classification Report:")
print(classification_report(Y_test, Y_pred))
print(f"EBM - AUC ROC: {roc_auc_score(Y_test, Y_pred_proba):.4f}")

In [None]:
# Local (per-row) explanations for the first five test rows -- currently disabled
# ebm_local = ebm_model.explain_local(X_test[:5], Y_test[:5])
# show(ebm_local)

# Build the global explanation of whichever model `ebm_model` is bound to
# when this cell runs, then render it with interpret's `show`.
# NOTE(review): explain_global() presumably requires a fitted model -- confirm
# that `ebm_model` refers to a fitted estimator before running this cell.
ebm_global = ebm_model.explain_global()
show(ebm_global)