In [1]:
%reload_ext autoreload
%autoreload 3
from src.acnets.pipeline import Parcellation
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score
from sklearn.inspection import permutation_importance

from tqdm import tqdm
from joblib import delayed, Parallel

from IPython.display import clear_output

from src.acnets.pipeline import MultiScaleClassifier

from ray import tune
from ray.tune.search.hyperopt import HyperOptSearch

from ray.tune.sklearn import TuneSearchCV
from sklearn.model_selection import GridSearchCV

In [2]:
# Input/Output
# Only the subject coordinate of the parcellated data is used in this cell.
parcellation = Parcellation(atlas_name='dosenbach2010')
subjects = parcellation.fit_transform(None).coords['subject'].values

# The first four characters of every subject identifier serve as its class label.
subject_labels = [subject_id[:4] for subject_id in subjects]

# Inputs are the subject identifiers themselves, arranged as a single column.
X = np.reshape(subjects, (-1, 1))               # subjects, shape: (n_subjects, 1)

# Encode the string labels as integer classes.
y_encoder = LabelEncoder().fit(subject_labels)
y = y_encoder.transform(subject_labels)         # labels, shape: (n_subjects,)


# DEBUG (expected to overfit, i.e., score=1)
# Train and score on the very same data as a sanity check of the pipeline.
overfit_score = (
    MultiScaleClassifier()
    .fit(X, y)
    .score(X, y)
)
print(f'[DEBUG] overfit accuracy: {overfit_score:.3f}')

[DEBUG] overfit accuracy: 1.000


In [29]:

# Hyperparameter search space. Keys use scikit-learn's `<component>__<parameter>`
# syntax; the winning combination is applied with `set_params` further down.
param_space = {
    'clf__objective': ['binary:logistic'],
    'clf__max_depth': tune.randint(1, 20),
    'clf__min_child_weight': tune.randint(1, 10),
    'clf__subsample': tune.uniform(0.01, 1.0),
    # NOTE(review): in XGBoost `eta` is an alias of `learning_rate`. If `clf` is an
    # XGBoost model, the next two entries tune the same knob over different ranges;
    # confirm which one is intended and drop the other.
    'clf__eta': tune.loguniform(1e-4, 1e-1),
    'clf__learning_rate': (0.01, 1.0, 'log-uniform'),
    'clf__n_estimators': [25, 50, 100],
    # Candidate dimensions, currently disabled:
    # 'clf__max_depth': (0, 50),
    # 'clf__max_delta_step': (0, 20),
    # 'clf__reg_lambda': (1e-9, 1000, 'log-uniform'),
    # 'clf__reg_alpha': (1e-9, 1.0, 'log-uniform'),
    # 'clf__gamma': (1e-9, 0.5, 'log-uniform'),
    # 'clf__scale_pos_weight': (1e-6, 500, 'log-uniform')
}

tuner = TuneSearchCV(
    MultiScaleClassifier(),
    HyperOptSearch.convert_search_space(param_space),
    scoring='accuracy',
    # An integer random_state makes the splitter yield the same splits on every
    # call to split(). Without it, trials can be scored on different random
    # hold-out sets, which makes their accuracies hard to compare and the
    # search impossible to reproduce.
    cv=StratifiedShuffleSplit(n_splits=2, test_size=8, random_state=0),
    search_optimization='hyperopt',
    n_jobs=-1,
    refit=True,
    n_trials=100,
    verbose=1,
)

tuner.fit(X, y)

# Drop the verbose tuning log from the cell output.
clear_output()

# create a model with best params
model = MultiScaleClassifier().set_params(**tuner.best_params_)
model

In [30]:
# Evaluate the tuned model over 100 stratified hold-out splits (8 test samples each).
# The integer random_state fixes the splits, so the summary printed below is
# computed on the same partitions every time the notebook is re-run.
cv = StratifiedShuffleSplit(n_splits=100, test_size=8, random_state=1)

cv_scores = cross_val_score(model, X, y, cv=cv, verbose=3, n_jobs=-1)

# Replace the per-fold progress log with a one-line summary.
clear_output(wait=True)
print(f'[CV] accuracy: {cv_scores.mean():.3f} +/- {cv_scores.std():.3f}')

[CV] accuracy: 0.679 +/- 0.144


In [None]:
# Feature Importance

# 100 stratified hold-out splits (8 test samples each). The integer random_state
# makes the splitter produce the same partitions on every run.
permutation_cv = StratifiedShuffleSplit(n_splits=100, test_size=8, random_state=2)

def do_permutation_importance(estimator, X, y, train, test, scoring='accuracy',
                              random_state=None):
    """Fit on one train split and compute permutation importances on its test split.

    A clone of ``estimator`` is fitted on ``X[train], y[train]``. The test inputs
    are then passed through the fitted feature-extractor head, and the
    classification head is evaluated on those extracted features with
    ``sklearn.inspection.permutation_importance``.

    Parameters
    ----------
    estimator
        Estimator providing ``fit``, ``get_feature_extractor_head()`` and
        ``get_classification_head()``. It is cloned, so the object passed in is
        left untouched.
    X, y
        Full inputs and labels; indexed with ``train`` and ``test``.
    train, test
        Index arrays selecting the training and test samples.
    scoring
        Scorer forwarded to ``permutation_importance``.
    random_state
        Forwarded to ``permutation_importance`` to control the feature
        shuffling. ``None`` (the default) keeps the shuffling unseeded.

    Returns
    -------
    The ``importances_mean`` entry of the permutation-importance result, i.e.
    one mean importance per extracted feature.
    """
    # Imported locally so the function carries its own dependency.
    from sklearn.base import clone

    # Fit a fresh copy so repeated or concurrent calls never share fitted state
    # and the caller's estimator is not modified as a side effect.
    fold_estimator = clone(estimator)
    fold_estimator.fit(X[train], y[train])

    # Importance is measured on the extracted features, not on the raw inputs.
    X_features_test = fold_estimator.get_feature_extractor_head().transform(X[test])
    results = permutation_importance(fold_estimator.get_classification_head(),
                                     X_features_test, y[test],
                                     n_jobs=-1,
                                     scoring=scoring,
                                     random_state=random_state)
    return results['importances_mean']

# Schedule one permutation-importance job per CV split and run them in parallel.
fold_jobs = (
    delayed(do_permutation_importance)(model, X, y, train_idx, test_idx)
    for train_idx, test_idx in permutation_cv.split(X, y)
)
fold_importances = Parallel(n_jobs=-1, verbose=2)(fold_jobs)

# Fit on the full data set to obtain the names of the extracted features.
feature_names = model.fit(X, y).get_feature_names_out()

# One row per split, one column per feature; average over the splits and rank
# the features from most to least important.
importance_per_fold = pd.DataFrame(np.stack(fold_importances, axis=0),
                                   columns=feature_names)
mean_importance = importance_per_fold.mean()
importance_scores = mean_importance.sort_values(ascending=False).to_frame('importance')

importance_scores.head(20)  # top 20 features