# Cerebellum Connectivity Classifier

Steps:
1. Load the data
2. Extract the cerebellum features (from DiFuMo atlas)
3. Fit an SVM with hyperparameter optimization (HPO)


## Inputs

Cerebellum activities from the DiFuMo atlas.

## Outputs

- Classification output: Participant's label, either AVGP or NVGP.
- Results:
  - `models/cerebellum_classifier-*.nc5`


## Requirements

To run this notebook, you need to activate the `acnets` environment using `conda activate acnets`.

## TODO
- Add support for cerebellum in the ConnectivityPipeline


In [1]:
# 0. SETUP

%reload_ext autoreload
%autoreload 3

import numpy as np
import pandas as pd
from pathlib import Path
import scipy.stats as st
import xarray as xr
from src.acnets.pipeline import CerebellumPipeline, ConnectivityVectorizer
from sklearn.feature_selection import SelectFromModel, VarianceThreshold
from sklearn.inspection import permutation_importance
from sklearn.model_selection import (GridSearchCV, StratifiedShuffleSplit,
                                     cross_val_score, learning_curve,
                                     permutation_test_score)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import LinearSVC
from tqdm.auto import tqdm

import faulthandler


faulthandler.enable()

In [2]:
# 0.1. PARAMETERS

# Fixed seed for the cross-validation splitter. With an integer random_state,
# every use of `CV` in this notebook (cross-validation, grid search,
# permutation test, feature importance) draws the same train/test splits, and
# Restart & Run All reproduces the same scores.
RANDOM_STATE = 42

# 5 stratified random splits, each holding out 8 subjects for testing.
CV = StratifiedShuffleSplit(n_splits=5, test_size=8, random_state=RANDOM_STATE)
N_PERMUTATIONS = 10  # number of label permutations in the permutation test
N_TOP_MODELS = 5     # number of best-ranked grid-search candidates analysed further

# Directory the results file is written to.
MODELS_DIR = Path('models')

In [3]:
# 1. DATA

# Subject identifiers are read from the 'subject' coordinate of the transformed dataset.
cerebellum_data = CerebellumPipeline().transform('all')
subjects = cerebellum_data.coords['subject'].values

# The first four characters of each identifier give the group (AVGP or NVGP).
groups = [subject_id[:4] for subject_id in subjects]

# Model input: one row per subject, a single column holding the identifier.
X = subjects[:, np.newaxis]

# Integer-encode the group labels; the encoder is kept to map codes back to names.
y_encoder = LabelEncoder().fit(groups)
y = y_encoder.transform(groups)

In [4]:
# PREPARE OUTPUT

# Number of subjects divided by the CV test size (truncated); in this cell it
# is only used to label the output file.
n_cv_fold = int(len(X) / CV.test_size)

# The file name encodes the classifier, the scoring metric, how many top
# models are analysed, and the cross-validation scheme.
name_parts = [
    'cerebellum',
    '_classifier-SVML1',
    '_scoring-accuracy',
    f'_top-{N_TOP_MODELS}',
    f'_cv-{CV.get_n_splits()}x{n_cv_fold}fold.nc5',
]
model_output_name = ''.join(name_parts)

OUTPUT_PATH = MODELS_DIR.joinpath(model_output_name)

In [5]:
# 2. PIPELINE

from sklearn.decomposition import PCA
from sklearn.svm import SVC

from tqdm.auto import tqdm
from IPython.display import display


# Connectivity extraction -> vectorisation -> standardisation -> removal of
# constant features -> PCA (n_components=0.99) -> L1-penalised linear SVM.
pipeline_steps = [
    ('connectivity', CerebellumPipeline(kind='precision')),
    ('vectorize', ConnectivityVectorizer()),
    ('scale', StandardScaler()),
    ('zerovar', VarianceThreshold()),
    ('pca', PCA(n_components=0.99)),
    ('clf', LinearSVC(penalty='l1', dual=False, max_iter=10000)),
]
pipe = Pipeline(pipeline_steps)

# Sanity check: for each DiFuMo resolution, fit on the full dataset and show
# the training accuracy (expected to be 1.0).
# The inner loop has a single placeholder value; it is only recorded in the table.
feature_counts = range(1, 2)

for res in (res_bar := tqdm([64, 128, 256, 512, 1024])):
    res_bar.set_description(f"DiFuMo_{res}")

    rows = []
    for max_features in (feat_bar := tqdm(feature_counts)):
        feat_bar.set_description(f"[DiFuMo_{res}] {max_features} features")

        pipe.set_params(connectivity__atlas_dimension=res)

        # Train and score on the same data: this is a debugging check, not an
        # estimate of generalisation.
        train_accuracy = pipe.fit(X, y).score(X, y)
        rows.append([res, max_features, train_accuracy])

    # One single-resolution table is displayed per outer iteration.
    scores = pd.DataFrame(rows, columns=['resolution', 'max_features', 'train_accuracy'])
    display(scores)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,resolution,max_features,train_accuracy
0,64,1,0.65625


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,resolution,max_features,train_accuracy
0,128,1,1.0


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,resolution,max_features,train_accuracy
0,256,1,1.0


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,resolution,max_features,train_accuracy
0,512,1,1.0


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,resolution,max_features,train_accuracy
0,1024,1,1.0


In [6]:
# 2.1. verify the cv mean/ci of the model

# Configuration under test: 128-dimensional atlas, tangent connectivity,
# network aggregation switched off.
cv_check_params = {
    'connectivity__atlas_dimension': 128,
    'connectivity__kind': 'tangent',
    'connectivity__agg_networks': False,
}
pipe.set_params(**cv_check_params)

scores = cross_val_score(
    pipe, X, y,
    scoring='accuracy',
    cv=CV,
    n_jobs=-2,
    verbose=2,
)

# Bootstrap confidence interval of the mean accuracy over the CV splits; the
# scores are passed as one sample (a single row).
bootstrap_ci = st.bootstrap(scores[np.newaxis, :], np.mean)

# Cell output: mean accuracy, its standard deviation, and the bootstrap result.
scores.mean(), scores.std(), bootstrap_ci

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 11 concurrent workers.


[CV] END .................................................... total time=   8.2s
[CV] END .................................................... total time=   8.3s
[CV] END .................................................... total time=   8.3s
[CV] END .................................................... total time=   8.3s
[CV] END .................................................... total time=   8.3s


[Parallel(n_jobs=-2)]: Done   2 out of   5 | elapsed:   12.3s remaining:   18.5s
[Parallel(n_jobs=-2)]: Done   5 out of   5 | elapsed:   12.4s remaining:    0.0s
[Parallel(n_jobs=-2)]: Done   5 out of   5 | elapsed:   12.4s finished


(0.55,
 0.1274754878398196,
 BootstrapResult(confidence_interval=ConfidenceInterval(low=0.45, high=0.675), bootstrap_distribution=array([0.65 , 0.625, 0.575, ..., 0.65 , 0.5  , 0.575]), standard_error=0.05689838904357655))

In [7]:
# 3. HPO: GRID SEARCH

# Candidates: every combination of atlas dimension and connectivity kind.
param_grid = dict(
    connectivity__atlas_dimension=[128, 1024],
    connectivity__kind=['tangent', 'precision'],
)

# Exhaustive search over the grid, scored by accuracy on the shared CV splitter.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='accuracy',
    cv=CV,
    n_jobs=-2,
    verbose=2,
)
grid.fit(X, y)

print('best estimator:', grid.best_estimator_)


Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END connectivity__atlas_dimension=128, connectivity__kind=precision; total time=  23.5s
[CV] END connectivity__atlas_dimension=128, connectivity__kind=precision; total time=  23.5s
[CV] END connectivity__atlas_dimension=128, connectivity__kind=precision; total time=  23.5s
[CV] END connectivity__atlas_dimension=128, connectivity__kind=precision; total time=  23.6s
[CV] END connectivity__atlas_dimension=128, connectivity__kind=tangent; total time=  23.7s
[CV] END connectivity__atlas_dimension=128, connectivity__kind=tangent; total time=  38.2s
[CV] END connectivity__atlas_dimension=128, connectivity__kind=precision; total time=  38.3s
[CV] END connectivity__atlas_dimension=128, connectivity__kind=tangent; total time=  38.4s
[CV] END connectivity__atlas_dimension=128, connectivity__kind=tangent; total time=  38.4s
[CV] END connectivity__atlas_dimension=128, connectivity__kind=tangent; total time=  38.4s
[CV] END connectivit

In [8]:
# 3.1. STORE GRID SEARCH RESULTS

cv_results = pd.DataFrame(grid.cv_results_)

# Readable identifier per candidate: its parameter values joined by spaces.
candidate_names = [
    ' '.join(str(value) for value in params.values())
    for params in cv_results['params']
]

# Index the results by candidate name and drop the raw parameter dicts.
grid_results = (
    cv_results
    .assign(grid_model_name=candidate_names)
    .set_index('grid_model_name')
    .drop(columns=['params'])
)

# Convert to an xarray Dataset and attach the search settings as scalar variables.
ds_grid = grid_results.to_xarray().assign(
    scoring=grid.scoring,
    cv_test_size=CV.test_size,
    cv_n_splits=CV.n_splits,
    n_subjects=len(X),
)

In [None]:
# 4. PERMUTATION TEST (SHUFFLE Y)

# Parameter sets of the N_TOP_MODELS best-ranked grid-search candidates.
ranked_candidates = pd.DataFrame(grid.cv_results_).sort_values('rank_test_score')
top_models = ranked_candidates.iloc[:N_TOP_MODELS]['params'].to_list()

perm_scores_agg, cv_scores_agg, pvalues, model_names = [], [], [], []

for params in tqdm(top_models):
    pipe.set_params(**params)

    # Scores under shuffled labels and the resulting p-value. The first
    # returned value (the observed score) is not used; per-split scores are
    # computed separately below.
    permutation_result = permutation_test_score(
        pipe, X, y,
        cv=CV,
        scoring='accuracy',
        n_permutations=N_PERMUTATIONS,
        n_jobs=-2,
        verbose=2,
    )
    perm_scores, pvalue = permutation_result[1], permutation_result[2]

    # Per-split accuracy with the true labels.
    cv_scores = cross_val_score(pipe, X, y, scoring='accuracy', cv=CV, n_jobs=-2)

    # Model identifier: the parameter values joined by spaces.
    model_names.append(' '.join(map(str, params.values())))
    perm_scores_agg.append(perm_scores)
    cv_scores_agg.append(cv_scores)
    pvalues.append(pvalue)

# One row per model; permutation and CV scores get their own dimensions.
ds_perm = xr.Dataset(
    data_vars={
        'perm_scores': (('model_name', 'permutation_dim'), perm_scores_agg),
        'cv_scores': (('model_name', 'cv_dim'), cv_scores_agg),
        'pvalue': ('model_name', pvalues),
    },
    coords={'model_name': model_names},
)

In [None]:
# 5. FEATURE IMPORTANCE (SHUFFLE X)

# One xr.Dataset of permutation importances per top model; merged after the loop.
importances_agg = []

for p in top_models:
    # Model identifier: the parameter values joined by spaces.
    model_name = ' '.join([str(pp) for pp in p.values()])

    pipe.set_params(**p)

    # Run only the first two pipeline steps to obtain the connectivity features
    # and their names.
    # NOTE(review): this transform runs before `pipe.fit` in the loop below, so
    # it relies on whatever state those two steps already hold -- confirm that
    # they do not need refitting after `set_params`.
    X_conn = pipe[:2].transform(X)
    feature_names = pipe[:2].get_feature_names_out()

    # Importances collected across the CV splits for this model.
    importances = []

    for train, test in tqdm(CV.split(X,y), total=CV.get_n_splits(X,y)):
        # Fit the whole pipeline on the training subjects of this split.
        pipe.fit(X[train], y[train])

        # Permute the connectivity features of the held-out subjects and score
        # the remaining steps (from the third step onwards) on them.
        results = permutation_importance(pipe[2:], X_conn[test], y[test],
                                        scoring=grid.scoring,
                                        n_jobs=-1)
        # Transposed so that features lie along the second axis, matching the
        # dimension order used in the Dataset below.
        importances.append(results.importances.T)

    # The feature dimension is named after the first space-separated token of
    # the model name.
    feature_dim_name = f'{model_name.split(" ")[0]}_feature'

    # Stack the per-split importances row-wise into one array for this model.
    importances_ds = xr.Dataset({
        f'{model_name} importances': (('permutation_importance_num', feature_dim_name), np.vstack(importances))},
        coords={feature_dim_name: feature_names}
    )

    importances_agg.append(importances_ds)

# Combine the per-model datasets into a single dataset.
ds_imp = xr.merge(importances_agg)

In [None]:
# 6. STORE RESULTS

# Combine the inputs, the labels, the grid-search results, the feature
# importances and the permutation-test results into a single dataset.
results = xr.merge([
    {'X': xr.DataArray(X.flatten(), dims=['subject'])},
    {'y': xr.DataArray(y_encoder.inverse_transform(y), dims='subject')},
    {'y_classes': y_encoder.classes_},
    ds_grid, ds_imp, ds_perm])

# Make sure the output directory exists before writing.
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)

# Write by path rather than through an open file handle: xarray documents
# file-like targets as supported by the scipy engine only, so passing a handle
# together with engine='h5netcdf' does not reliably produce an HDF5 file.
results.to_netcdf(OUTPUT_PATH, engine='h5netcdf')
results.close()

# Read the file back with the same engine it was written with (the scipy
# engine cannot open HDF5-based netCDF files) and load it into memory.
results = xr.open_dataset(OUTPUT_PATH, engine='h5netcdf').load()
results