In [25]:
import os
import pandas as pd
import matplotlib.pyplot as plt

import json

import kagglehub


from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import ParameterGrid

from interpret.glassbox import ExplainableBoostingClassifier
from interpret import show

from sklearn.model_selection import cross_val_score
import optuna


In [26]:
# Download latest version
path = kagglehub.dataset_download("rabieelkharoua/alzheimers-disease-dataset")
files = os.listdir(path)
print("Content of", files)

csv_file = files[0]
csv_path = os.path.join(path, csv_file)

# Load DataFrame
df = pd.read_csv(csv_path) 

Content of ['alzheimers_disease_data.csv']


In [27]:
# Preprocessing

# Remove unnecessary columns
df = df.drop(columns=["PatientID", "DoctorInCharge"], errors="ignore")

# Splitting the data into features (X) and the target variable (Y)
X = df.drop(columns=["Diagnosis"])
Y = df["Diagnosis"]

# Label encoding
def change_labels(X):
    custom_labels = {
        'Gender': ['Male', 'Female'],
        'Ethnicity': ['Caucasian', 'African American', 'Asian', 'Other'],
        'EducationLevel': ['None', 'High School', "Bachelor's", 'Higher'],
        'Smoking': ['No', 'Yes'],
        'FamilyHistoryAlzheimers': ['No', 'Yes'],
        'CardiovascularDisease': ['No', 'Yes'],
        'Diabetes': ['No', 'Yes'],
        'Depression': ['No', 'Yes'],
        'HeadInjury': ['No', 'Yes'],
        'Hypertension': ['No', 'Yes'],
        'MemoryComplaints': ['No', 'Yes'],
        'BehavioralProblems': ['No', 'Yes'],
        'Confusion': ['No', 'Yes'],
        'Disorientation': ['No', 'Yes'],
        'PersonalityChanges': ['No', 'Yes'],
        'DifficultyCompletingTasks': ['No', 'Yes'],
        'Forgetfulness': ['No', 'Yes']
    }

    for column, labels in custom_labels.items():
        if column in X.columns:
            # Create a mapping dictionary from numeric values to custom labels
            label_mapping = {i: label for i, label in enumerate(labels)}
            # Replace values in the column using the mapping
            X[column] = X[column].replace(label_mapping)
    return X

# Assigning custom labels
X = change_labels(X)

# Splitting the dataset into training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y
)

In [None]:
def objective(trial):
    # Przestrzeń wyszukiwania hiperparametrów
    max_bins = trial.suggest_categorical("max_bins", [128, 256, 512])
    max_interaction_bins = trial.suggest_categorical("max_interaction_bins", [32, 64, 128])
    interactions = trial.suggest_categorical("interactions", [0, 10, 50])
    learning_rate = trial.suggest_float("learning_rate", 0.01, 1, log=True)
    min_samples_leaf = trial.suggest_categorical("min_samples_leaf", [2, 10, 20])
    max_leaves = trial.suggest_categorical("max_leaves", [3, 5, 10])

    # Model EBM z sugerowanymi hiperparametrami
    ebm = ExplainableBoostingClassifier(
        max_bins=max_bins,
        max_interaction_bins=max_interaction_bins,
        interactions=interactions,
        learning_rate=learning_rate,
        min_samples_leaf=min_samples_leaf,
        max_leaves=max_leaves,
        random_state=42,
        n_jobs=-1
    )

    # Dopasowanie modelu na zestawie treningowym
    ebm.fit(X_train, Y_train)
    
    # Predykcja prawdopodobieństw na zestawie walidacyjnym
    Y_val_proba = ebm.predict_proba(X_test)[:, 1]

    # Obliczenie AUC na zestawie walidacyjnym
    score = roc_auc_score(Y_test, Y_val_proba)
    return score

# Optuna
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

print("Best hyperparameters:", study.best_params)
print("Best AUC score:", study.best_value)


[I 2025-01-27 21:16:50,107] A new study created in memory with name: no-name-a343bda0-5015-4e84-9e8e-87590e0ff7dd
[W 2025-01-27 21:16:53,058] Trial 0 failed with parameters: {'max_bins': 512, 'max_interaction_bins': 32, 'interactions': 0, 'learning_rate': 0.07637956650785142, 'min_samples_leaf': 10, 'max_leaves': 3} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/Users/franek/Library/Caches/pypoetry/virtualenvs/alzheimer-s-disease-prediction-oDGSe6K4-py3.13/lib/python3.13/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/k9/32v63fqs51vf03_4h_y2p0500000gp/T/ipykernel_29967/3582260132.py", line 23, in objective
    ebm.fit(X_train, Y_train)
    ~~~~~~~^^^^^^^^^^^^^^^^^^
  File "/Users/franek/Library/Caches/pypoetry/virtualenvs/alzheimer-s-disease-prediction-oDGSe6K4-py3.13/lib/python3.13/site-packages/interpret/glassbox/_ebm/_ebm.py", line 1119, in fit
    results = pro

KeyboardInterrupt: 

In [15]:
# Model z ulepszonymi hiperparametrami
ebm_model = ExplainableBoostingClassifier(
    max_bins=256,
    max_interaction_bins=32,
    interactions=50,
    learning_rate=0.014,  # Zaokrąglona wartość z Optuny
    min_samples_leaf=10,
    max_leaves=3,
    n_jobs=-1,
    random_state=42
)

# Dopasowanie modelu do zbioru treningowego
ebm_model.fit(X_train, Y_train)

# Ocena na zbiorze testowym
Y_pred = ebm_model.predict(X_test)
Y_pred_proba = ebm_model.predict_proba(X_test)[:, 1]

print("EBM - Classification Report:")
print(classification_report(Y_test, Y_pred))
print(f"EBM - AUC ROC: {roc_auc_score(Y_test, Y_pred_proba):.4f}")


EBM - Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.95      0.96       278
           1       0.92      0.93      0.92       152

    accuracy                           0.94       430
   macro avg       0.94      0.94      0.94       430
weighted avg       0.94      0.94      0.94       430

EBM - AUC ROC: 0.9432
