In [2]:
import os
import pandas as pd
import matplotlib.pyplot as plt

import json

import kagglehub


from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import ParameterGrid

from interpret.glassbox import ExplainableBoostingClassifier
from interpret import show

import optuna
from sklearn.model_selection import cross_val_score
from sklearn.metrics import recall_score

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Download latest version
path = kagglehub.dataset_download("rabieelkharoua/alzheimers-disease-dataset")
files = os.listdir(path)
print("Content of", files)

csv_file = files[0]
csv_path = os.path.join(path, csv_file)

# Load DataFrame
df = pd.read_csv(csv_path) 

Content of ['alzheimers_disease_data.csv']


In [4]:
# Preprocessing

# Remove unnecessary columns
df = df.drop(columns=["PatientID", "DoctorInCharge"], errors="ignore")

# Splitting the data into features (X) and the target variable (Y)
X = df.drop(columns=["Diagnosis"])
Y = df["Diagnosis"]

# Label encoding
def change_labels(X):
    custom_labels = {
        'Gender': ['Male', 'Female'],
        'Ethnicity': ['Caucasian', 'African American', 'Asian', 'Other'],
        'EducationLevel': ['None', 'High School', "Bachelor's", 'Higher'],
        'Smoking': ['No', 'Yes'],
        'FamilyHistoryAlzheimers': ['No', 'Yes'],
        'CardiovascularDisease': ['No', 'Yes'],
        'Diabetes': ['No', 'Yes'],
        'Depression': ['No', 'Yes'],
        'HeadInjury': ['No', 'Yes'],
        'Hypertension': ['No', 'Yes'],
        'MemoryComplaints': ['No', 'Yes'],
        'BehavioralProblems': ['No', 'Yes'],
        'Confusion': ['No', 'Yes'],
        'Disorientation': ['No', 'Yes'],
        'PersonalityChanges': ['No', 'Yes'],
        'DifficultyCompletingTasks': ['No', 'Yes'],
        'Forgetfulness': ['No', 'Yes']
    }

    for column, labels in custom_labels.items():
        if column in X.columns:
            # Create a mapping dictionary from numeric values to custom labels
            label_mapping = {i: label for i, label in enumerate(labels)}
            # Replace values in the column using the mapping
            X[column] = X[column].replace(label_mapping)
    return X

# Assigning custom labels
X = change_labels(X)

# Splitting the dataset into training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y
)

In [6]:
def objective(trial):
    # Przestrzeń wyszukiwania hiperparametrów
    max_bins = trial.suggest_categorical("max_bins", [128, 256, 512])
    max_interaction_bins = trial.suggest_categorical("max_interaction_bins", [32, 64, 128])
    interactions = trial.suggest_categorical("interactions", [0, 10, 50])
    learning_rate = trial.suggest_float("learning_rate", 0.01, 1, log=True)
    min_samples_leaf = trial.suggest_categorical("min_samples_leaf", [2, 10, 20])
    max_leaves = trial.suggest_categorical("max_leaves", [3, 5, 10])

    # Model EBM z sugerowanymi hiperparametrami
    ebm = ExplainableBoostingClassifier(
        max_bins=max_bins,
        max_interaction_bins=max_interaction_bins,
        interactions=interactions,
        learning_rate=learning_rate,
        min_samples_leaf=min_samples_leaf,
        max_leaves=max_leaves,
        random_state=42,
        n_jobs=-1
    )

    # Dopasowanie modelu na zestawie treningowym
    ebm.fit(X_train, Y_train)

    # Przewidywania na zbiorze walidacyjnym
    Y_pred = ebm.predict(X_test)

    # Obliczanie wyniku recall
    score = recall_score(Y_test, Y_pred, average="binary")
    return score

# Optuna
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

print("Best hyperparameters:", study.best_params)
print("Best recall score:", study.best_value)

[I 2025-01-27 22:38:01,465] A new study created in memory with name: no-name-099fac8e-9760-495b-827e-f06201d9c771
[I 2025-01-27 22:38:12,081] Trial 0 finished with value: 0.9078947368421053 and parameters: {'max_bins': 128, 'max_interaction_bins': 32, 'interactions': 0, 'learning_rate': 0.06338763840146938, 'min_samples_leaf': 2, 'max_leaves': 3}. Best is trial 0 with value: 0.9078947368421053.
[I 2025-01-27 22:38:17,407] Trial 1 finished with value: 0.9144736842105263 and parameters: {'max_bins': 256, 'max_interaction_bins': 64, 'interactions': 10, 'learning_rate': 0.0532979276120533, 'min_samples_leaf': 2, 'max_leaves': 5}. Best is trial 1 with value: 0.9144736842105263.
[I 2025-01-27 22:39:09,650] Trial 2 finished with value: 0.9144736842105263 and parameters: {'max_bins': 512, 'max_interaction_bins': 128, 'interactions': 50, 'learning_rate': 0.033378566186351585, 'min_samples_leaf': 20, 'max_leaves': 5}. Best is trial 1 with value: 0.9144736842105263.
[I 2025-01-27 22:39:26,734] Tr

Best hyperparameters: {'max_bins': 512, 'max_interaction_bins': 128, 'interactions': 10, 'learning_rate': 0.010226651476661184, 'min_samples_leaf': 2, 'max_leaves': 3}
Best recall score: 0.9276315789473685


In [None]:
# Fitting the Explainable Boosting Classifier model
ebm_model = ExplainableBoostingClassifier(random_state=42,
                                        max_bins=512,
                                        max_interaction_bins=128, 
                                        interactions=10, 
                                        learning_rate=0.010226651476661184, 
                                        min_samples_leaf=2, 
                                        max_leaves=3)
ebm_model.fit(X_train, Y_train)

# Generating predictions and probabilities on the test set
Y_pred = ebm_model.predict(X_test)
Y_pred_proba = ebm_model.predict_proba(X_test)[:, 1]

# Model evaluation
print("EBM - Classification Report:")
print(classification_report(Y_test, Y_pred))
print(f"EBM - AUC ROC: {roc_auc_score(Y_test, Y_pred_proba):.4f}")

In [37]:
ebm_local = ebm_model.explain_local(X_test[:10], Y_test[:10])
show(ebm_local)