# Connectivity Classifier

Steps:
1. Load the data
2. Fit an SVM with hyperparameter optimization (HPO)
3. Permutation testing
4. Permutation Importance
5. SHAP

## Inputs

Connectivity matrices

## Outputs

- Classification output: Participant's label, either AVGP or NVGP.
- Results:
  - `models/connectivities_classifier-*.nc5`


## Requirements

To run this notebook, you need to have a few packages installed:

```bash
mamba create -n acnets python=3.9 jupyterlab ipykernel \
    matplotlib xarray netcdf4 shap nilearn=0.9.1
    # -c rapidsai -c nvidia -y \
    # rapids=22.04 cudatoolkit=11.5 \


mamba activate acnets

# pip install statannotations -U

# [Optional] Bayesian HPO
# pip install "ray[tune]" tune-sklearn scikit-optimize
```


## Analysis

In [1]:
# 0. SETUP

%reload_ext autoreload
%autoreload 3

import numpy as np
import pandas as pd
from pathlib import Path
import scipy.stats as st
import xarray as xr
from src.acnets.pipeline import ConnectivityPipeline, ConnectivityVectorizer
from sklearn.feature_selection import SelectFromModel, VarianceThreshold
from sklearn.inspection import permutation_importance
from sklearn.model_selection import (GridSearchCV, StratifiedShuffleSplit,
                                     cross_val_score, learning_curve,
                                     permutation_test_score)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import LinearSVC
from tqdm.auto import tqdm

In [2]:
# 0.1. PARAMETERS

# Seed for the cross-validation splitter. Without a seed every call to
# `CV.split` draws new random folds, so the grid search, permutation test,
# feature importances, SHAP values and learning curve would each be computed
# on different splits and could not be reproduced after a kernel restart.
RANDOM_STATE = 0

# 10 stratified random splits, each holding out 8 subjects for testing.
CV = StratifiedShuffleSplit(n_splits=10, test_size=8, random_state=RANDOM_STATE)

N_PERMUTATIONS = 10  # label permutations per model in the permutation test
N_TOP_MODELS = 10    # number of best-ranked grid-search candidates analysed further

MODELS_DIR = Path('models')  # directory the results file is written to

In [3]:
# 1. DATA

# Subject identifiers are read from the `subject` coordinate of the pipeline output.
subjects = ConnectivityPipeline().transform('all').coords['subject'].values

# The model input is a single column of subject identifiers.
X = subjects.reshape(-1, 1)

# The first four characters of an identifier are the group label (AVGP or NVGP).
groups = [subject_id[:4] for subject_id in subjects]

# Encode the group labels as integers; the encoder is kept to decode them later.
y_encoder = LabelEncoder()
y = y_encoder.fit_transform(groups)

In [4]:
# PREPARE OUTPUT

# Number of subjects divided by the CV test size, truncated to an integer;
# it only appears in the output file name.
n_cv_fold = int(len(X) / CV.test_size)

# File name parts, joined in order.
model_output_name = ''.join([
    'connectivities',
    '_classifier-SVML1',
    '_measure-accuracy',
    f'_top-{N_TOP_MODELS}',
    f'_cv-{CV.get_n_splits()}x{n_cv_fold}fold.nc5',
])

OUTPUT_PATH = MODELS_DIR / model_output_name

In [5]:
# 2. PIPELINE

from sklearn.svm import SVC

# Settings shared by the feature-selection SVM and the final classifier.
l1_svm_options = dict(penalty='l1', dual=False, max_iter=10000)

# Keep at most 10 features (or all of them when fewer are available).
feature_selector = SelectFromModel(
    LinearSVC(**l1_svm_options),
    max_features=lambda x: min(10, x.shape[1]),
)

pipe = Pipeline(steps=[
    ('connectivity', ConnectivityPipeline(atlas='friedman2020',
                                          kind='partial correlation',
                                          agg_networks=False)),
    ('vectorize', ConnectivityVectorizer()),
    ('scale', StandardScaler()),
    ('zerovar', VarianceThreshold()),
    ('select', feature_selector),
    ('clf', LinearSVC(**l1_svm_options)),
    # ('clf', SVC(kernel='linear', C=1))
])

# DEBUG: fit on all subjects and score on the same subjects. This is the
# training accuracy, not an estimate of generalisation.
pipe.fit(X, y).score(X, y)

1.0

In [6]:
# 2.1. VERIFY THE MODEL

# Configure the connectivity step for this sanity check.
pipe.set_params(**{
    'connectivity__atlas': 'friedman2020',
    'connectivity__kind': 'covariance',
    'connectivity__agg_networks': False,
})

# One accuracy value per CV split.
scores = cross_val_score(pipe, X, y, scoring='accuracy', cv=CV, n_jobs=-1)

# `st.bootstrap` expects a sequence of samples; the CV scores are the only one.
bootstrap_ci = st.bootstrap(scores.reshape(1, -1), np.mean)

scores.mean(), scores.std(), bootstrap_ci

(0.6,
 0.1920286436967152,
 BootstrapResult(confidence_interval=ConfidenceInterval(low=0.4875, high=0.725), bootstrap_distribution=array([0.6125, 0.5625, 0.5875, ..., 0.5875, 0.775 , 0.6625]), standard_error=0.06021235204520828))

In [7]:
# 3. HPO: GRID SEARCH

# Candidate settings of the connectivity step (2 x 1 x 5 = 10 combinations).
param_grid = dict(
    connectivity__agg_networks=[True, False],
    # connectivity__atlas=['gordon2014_2mm', 'dosenbach2010', 'difumo_64_2mm'],
    connectivity__atlas=['friedman2020'],
    connectivity__kind=['partial correlation', 'tangent', 'correlation', 'covariance', 'precision'],
)

# Exhaustive search over the grid, scored by cross-validated accuracy.
grid = GridSearchCV(estimator=pipe,
                    param_grid=param_grid,
                    scoring='accuracy',
                    cv=CV,
                    verbose=1)

grid.fit(X, y)

print('best estimator:', grid.best_estimator_)


Fitting 10 folds for each of 10 candidates, totalling 100 fits
best estimator: Pipeline(steps=[('connectivity',
                 ConnectivityPipeline(atlas='friedman2020',
                                      kind='covariance',
                                      agg_networks=False,
                                      mock=False,
                                      bids_dir='data/julia2018',
                                      parcellation_cache_dir='data/julia2018/derivatives/resting_timeseries/')),
                ('vectorize', ConnectivityVectorizer()),
                ('scale', StandardScaler()), ('zerovar', VarianceThreshold()),
                ('select',
                 SelectFromModel(estimator=LinearSVC(dual=False, max_iter=10000,
                                                     penalty='l1'),
                                 max_features=<function <lambda> at 0x7f6a20899ea0>)),
                ('clf', LinearSVC(dual=False, max_iter=10000, penalty='l1'))])


In [8]:
# 3.1. STORE GRID SEARCH RESULTS

# One row per grid candidate.
grid_results = pd.DataFrame(grid.cv_results_)

# Label each candidate by its parameter values joined with spaces, use that
# label as the index and drop the raw `params` dictionaries.
grid_results['grid_model_name'] = [
    ' '.join(str(value) for value in candidate.values())
    for candidate in grid_results['params']
]
grid_results = grid_results.set_index('grid_model_name').drop(columns=['params'])

# Convert to xarray and attach the search settings as scalar variables.
ds_grid = grid_results.to_xarray()
ds_grid = ds_grid.assign(
    scoring=grid.scoring,
    cv_test_size=CV.test_size,
    cv_n_splits=CV.n_splits,
    n_subjects=len(X),
)

In [9]:
# 4. PERMUTATION TEST (SHUFFLE Y)

# Parameter sets of the N_TOP_MODELS best-ranked grid-search candidates.
top_models = (
    pd.DataFrame(grid.cv_results_)
    .sort_values('rank_test_score')
    .head(N_TOP_MODELS)['params']
    .to_list()
)

# Per-model accumulators, filled in the order of `top_models`.
model_names = []
pvalues = []
cv_scores_agg = []
perm_scores_agg = []

for p in tqdm(top_models):
    # A model is named by its parameter values joined with spaces.
    model_name = ' '.join(str(pp) for pp in p.values())
    print(model_name)

    pipe.set_params(**p)

    # Scores obtained with permuted labels and the resulting p-value; the
    # first returned element (the score on the true labels) is not used.
    _, perm_scores, pvalue = permutation_test_score(
        pipe, X, y,
        cv=CV,
        scoring='accuracy',
        n_permutations=N_PERMUTATIONS,
        n_jobs=-2,
        verbose=2,
    )

    # Per-split accuracy on the true labels.
    cv_scores = cross_val_score(pipe, X, y, scoring='accuracy', cv=CV, n_jobs=-2)

    model_names.append(model_name)
    pvalues.append(pvalue)
    cv_scores_agg.append(cv_scores)
    perm_scores_agg.append(perm_scores)

ds_perm = xr.Dataset(
    data_vars={
        'perm_scores': (('model_name', 'permutation_dim'), perm_scores_agg),
        'cv_scores': (('model_name', 'cv_dim'), cv_scores_agg),
        'pvalue': ('model_name', pvalues),
    },
    coords={'model_name': model_names},
)

ds_perm

  0%|          | 0/10 [00:00<?, ?it/s]

False friedman2020 covariance


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done   3 out of  10 | elapsed:    1.4s remaining:    3.3s
[Parallel(n_jobs=-2)]: Done  10 out of  10 | elapsed:    2.2s finished


False friedman2020 tangent


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done   3 out of  10 | elapsed:    5.9s remaining:   13.7s
[Parallel(n_jobs=-2)]: Done  10 out of  10 | elapsed:   10.0s finished


True friedman2020 covariance


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done   3 out of  10 | elapsed:    2.2s remaining:    5.2s
[Parallel(n_jobs=-2)]: Done  10 out of  10 | elapsed:    3.5s finished


False friedman2020 precision


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done   3 out of  10 | elapsed:    1.3s remaining:    3.0s
[Parallel(n_jobs=-2)]: Done  10 out of  10 | elapsed:    2.1s finished


True friedman2020 tangent


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done   3 out of  10 | elapsed:    2.6s remaining:    6.2s
[Parallel(n_jobs=-2)]: Done  10 out of  10 | elapsed:    4.3s finished


False friedman2020 partial correlation


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done   3 out of  10 | elapsed:    1.4s remaining:    3.3s
[Parallel(n_jobs=-2)]: Done  10 out of  10 | elapsed:    2.2s finished


True friedman2020 correlation


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done   3 out of  10 | elapsed:    2.3s remaining:    5.4s
[Parallel(n_jobs=-2)]: Done  10 out of  10 | elapsed:    3.8s finished


True friedman2020 partial correlation


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done   3 out of  10 | elapsed:    2.3s remaining:    5.3s
[Parallel(n_jobs=-2)]: Done  10 out of  10 | elapsed:    3.6s finished


True friedman2020 precision


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done   3 out of  10 | elapsed:    2.3s remaining:    5.3s
[Parallel(n_jobs=-2)]: Done  10 out of  10 | elapsed:    3.6s finished


False friedman2020 correlation


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done   3 out of  10 | elapsed:    1.4s remaining:    3.4s
[Parallel(n_jobs=-2)]: Done  10 out of  10 | elapsed:    2.3s finished


In [10]:
# 5. FEATURE IMPORTANCE (SHUFFLE X)

# One dataset of permutation importances per top model, merged at the end.
importances_agg = []

for p in top_models:
    model_name = ' '.join([str(pp) for pp in p.values()])

    pipe.set_params(**p)

    # Fit the connectivity and vectorizer steps for *this* parameter set before
    # using them, and read the feature names only afterwards, so that neither
    # the features nor their names can come from a previously fitted model.
    X_conn = pipe[:2].fit_transform(X, y)
    feature_names = pipe[:2].get_feature_names_out()

    importances = []

    for train, test in tqdm(CV.split(X, y), total=CV.get_n_splits(X, y), desc=model_name):
        # Train only the steps after feature extraction on the precomputed
        # features. Training and test rows then come from the same `X_conn`,
        # and the connectivity step is not recomputed for every split.
        fold_model = pipe[2:].fit(X_conn[train], y[train])

        fold_result = permutation_importance(fold_model, X_conn[test], y[test],
                                             scoring=grid.scoring,
                                             n_jobs=-1)

        # `importances` is (n_features, n_repeats); store it as (n_repeats, n_features).
        importances.append(fold_result.importances.T)

    # The first token of the model name is the `agg_networks` value (e.g.
    # "False friedman2020 covariance"); models with the same value share one
    # feature dimension.
    # NOTE(review): assumes a single atlas in the grid -- confirm before adding more.
    feature_dim_name = f'{model_name.split(" ")[0]}_feature'

    importances_ds = xr.Dataset({
        f'{model_name} importances': (('permutation_importance_num', feature_dim_name), np.vstack(importances))},
        coords={feature_dim_name: feature_names}
    )

    importances_agg.append(importances_ds)

ds_imp = xr.merge(importances_agg)

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

In [13]:
# 6. SHAP

import shap

# One dataset of SHAP values per top model, merged at the end.
shap_agg = []

n_splits = CV.get_n_splits(X, y)

for p in top_models:
    model_name = ' '.join([str(pp) for pp in p.values()])

    shap_values_cv = []
    test_indices = []
    y_test_cv = []
    y_pred_cv = []

    pipe.set_params(**p)

    # Fit the feature-extraction steps for this parameter set first and read
    # the feature names afterwards, so the names match the fitted steps.
    X_conn = pipe[:2].fit_transform(X, y)
    feature_names = pipe[:2].get_feature_names_out()

    for train, test in tqdm(CV.split(X, y), total=n_splits, desc=model_name):

        # Train the steps after feature extraction on this split's training rows.
        shap_model = pipe[2:].fit(X_conn[train], y[train])

        y_pred = shap_model.predict(X_conn[test])

        test_indices.extend(test)
        y_test_cv.append(y[test])
        y_pred_cv.append(y_pred)

        # Explain the predicted label, using the training rows as background data.
        explainer = shap.Explainer(
            shap_model.predict, X_conn[train],
            feature_names=feature_names,
            # approximate=True,
            # model_output='raw',
            # feature_perturbation='interventional',
        )

        # NOTE(review): max_evals=1900 is presumably sized for the number of
        # features produced by this pipeline -- confirm if the atlas changes.
        shap_values = explainer(X_conn[test], max_evals=1900)#, check_additivity=True)

        shap_values_cv.append(shap_values)

    # Concatenate the per-split labels and predictions in split order.
    y_test = np.hstack(y_test_cv)
    y_pred = np.hstack(y_pred_cv)

    # Merge the per-split explanations into a single Explanation object.
    shap_values = shap.Explanation(
        values=np.vstack([sh.values for sh in shap_values_cv]),
        base_values=np.hstack([sh.base_values for sh in shap_values_cv]),
        data=np.vstack([sh.data for sh in shap_values_cv]),
        feature_names=feature_names,
        compute_time=np.sum([sh.compute_time for sh in shap_values_cv]),
        output_names=y_encoder.classes_,
        output_indexes=y_pred,
    )

    # Name the feature dimension after the first token of the model name (the
    # `agg_networks` value), as the permutation-importance step does. Using the
    # atlas token instead put aggregated and non-aggregated models on one
    # shared dimension, so `xr.merge` outer-joined two different feature sets
    # and padded every variable with NaN.
    # NOTE(review): assumes a single atlas in the grid -- confirm before adding more.
    feature_dim_name = f'{model_name.split(" ")[0]}_feature'

    shap_ds = xr.Dataset({
        f'{model_name} shap': (('shap_dim', feature_dim_name), shap_values.values),
        f'{model_name} shap data': (('shap_dim', feature_dim_name), shap_values.data),
        f'{model_name} shap y_test': (('shap_dim'), y_encoder.inverse_transform(y_test)),
        f'{model_name} shap y_pred': (('shap_dim'), y_encoder.inverse_transform(y_pred)),
        },
        coords={feature_dim_name: feature_names}
    )

    shap_agg.append(shap_ds)

ds_shap = xr.merge(shap_agg)

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

In [14]:
# 7. LEARNING CURVE ANALYSIS

# Absolute training-set sizes at which the best model is evaluated.
learning_curve_train_sizes = np.array([16, 18, 20, 22, 24])

train_sizes, train_scores, test_scores = learning_curve(
    grid.best_estimator_, X, y,
    train_sizes=learning_curve_train_sizes,
    cv=CV,
    scoring='accuracy',
    shuffle=True,
    n_jobs=-1,
)

# Average the scores over CV splits (axis 1), one row per training size.
learning_curve_results = pd.DataFrame({
    'learning_curve_train_size': train_sizes,
    'learning_curve_mean_train_score': np.mean(train_scores, axis=1),
    'learning_curve_mean_test_score': np.mean(test_scores, axis=1),
}).rename_axis('learning_curve_num')

ds_learning_curve = learning_curve_results.to_xarray()

learning_curve_results

Unnamed: 0_level_0,learning_curve_train_size,learning_curve_mean_train_score,learning_curve_mean_test_score
learning_curve_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,16,1.0,0.55
1,18,1.0,0.475
2,20,1.0,0.5375
3,22,1.0,0.6125
4,24,1.0,0.5375


In [15]:
# 8. STORE RESULTS

# Combine the inputs, labels and every analysis dataset into one Dataset.
results = xr.merge([
    {'X': xr.DataArray(X.flatten(), dims=['subject'])},
    {'y': xr.DataArray(y_encoder.inverse_transform(y), dims='subject')},
    {'y_classes': y_encoder.classes_},
    ds_grid, ds_learning_curve, ds_imp, ds_perm, ds_shap])

# Create the output directory if it does not exist yet (e.g. on a fresh checkout).
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)

# Write and read with the same engine. The file used to be written through an
# open file object with engine='h5netcdf' and read back with engine='scipy'.
# Those engines use different on-disk formats, so the round trip only worked
# when xarray fell back to the scipy writer for file objects.
# NOTE(review): 'scipy' is kept because the read-back already relied on it;
# switch both calls together if an HDF5-based file is wanted.
NETCDF_ENGINE = 'scipy'

results.to_netcdf(OUTPUT_PATH, engine=NETCDF_ENGINE)
results.close()

# Reload from disk to verify that the stored file can be read back.
results = xr.open_dataset(OUTPUT_PATH, engine=NETCDF_ENGINE).load()
results