# Connectivity Classifier


## Inputs

Connectivity matrices

## Outputs

Participant's label, either AVGP or NVGP.

## Requirements

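To run this notebook, you need the RAPIDS packages installed. The commands below create a dedicated environment with `mamba`, activate it, and then install the Ray Tune dependencies with `pip`: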

```bash
mamba create -n rapids-22.04 \
    -c rapidsai -c nvidia -y \
    rapids=22.04 python=3.9 cudatoolkit=11.5 \
    jupyterlab ipykernel nilearn xarray netcdf4 seaborn

mamba activate rapids-22.04

pip install "ray[tune]" tune-sklearn scikit-optimize
```

In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import scipy.stats as st
import seaborn as sns
from cuml.model_selection import GridSearchCV
from IPython.display import clear_output, display
from python.acnets.pipeline import ConnectivityPipeline, ConnectivityVectorizer
from ray import tune
from ray.tune.sklearn import TuneSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.feature_selection import SelectFromModel, VarianceThreshold
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score, learning_curve
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler

from cuml import PCA, SVC, UMAP, LinearSVC

# Expose only GPU 0 to CUDA; processes started from this kernel inherit the setting.
# NOTE(review): this runs after `cuml` has been imported. If that import already
# initializes CUDA, the variable may not take effect in this process — confirm, or
# move the assignment above the imports.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'


In [2]:
# Parameters
#
# WARNING: Ray Tune requires data paths to be absolute.
# Both datasets live under the same directory inside the user's home folder.
_DATA_ROOT = Path.home() / 'workspace' / 'acnets' / 'data'

BIDS_DIR = _DATA_ROOT / 'julia2018'
CACHE_DIR = _DATA_ROOT / 'julia2018_resting'

In [3]:
# Subject identifiers are read from the `subject` coordinate of the pipeline
# output for 'all'.
# NOTE(review): this pipeline instance is built with its default arguments, while
# the model pipeline further down is given BIDS_DIR/CACHE_DIR — confirm that both
# read the same data.
subjects = (
    ConnectivityPipeline()
    .transform('all')
    .coords['subject']
    .values
)

# The first four characters of each identifier are the group name (AVGP or NVGP).
groups = [subject_id[:4] for subject_id in subjects]

# Features are the identifiers themselves, one per row in a single column.
X = subjects.reshape(-1, 1)

# Encode the group names as integer class labels; the fitted encoder is kept so
# the labels can be mapped back to names.
y_encoder = LabelEncoder()
y_encoder.fit(groups)
y = y_encoder.transform(groups)

In [114]:
# Classification pipeline: connectivity step -> vectorizer -> feature selection
# -> linear SVM. Steps set to 'passthrough' are skipped by scikit-learn and act
# as placeholders that a parameter search can swap for real transformers.
pipe = Pipeline([
    ('connectivity', ConnectivityPipeline(bids_dir=BIDS_DIR, parcellation_cache_dir=CACHE_DIR)),
    ('vectorize', ConnectivityVectorizer()),
    ('scale', 'passthrough'),
    ('zerovar', 'passthrough'),
    ('select', SelectFromModel(LinearSVC(penalty='l2'))),
    ('reduce', 'passthrough'),
    ('clf', LinearSVC(probability=True, penalty='l2'))
])

# DEBUG: pin the connectivity kind for this run.
pipe.set_params(connectivity__kind='chatterjee')

# Fixed seed so the train/test splits and the bootstrap resamples are the same
# on every run; without it the reported scores change between executions.
RANDOM_STATE = 0

# 1000 stratified random splits, each holding out 8 samples for testing.
scores = cross_val_score(
    pipe, X, y,
    cv=StratifiedShuffleSplit(n_splits=1000, test_size=8, random_state=RANDOM_STATE),
    scoring='accuracy',
    n_jobs=-1)

# Bootstrap confidence interval of the mean accuracy; `data` is a sequence of
# samples, here a single 1-D sample holding all split scores.
bootstrap_ci = st.bootstrap((scores,), np.mean, random_state=RANDOM_STATE)

scores.mean(), scores.std(), bootstrap_ci

(0.66375,
 0.1408622997824471,
 BootstrapResult(confidence_interval=ConfidenceInterval(low=0.6548597658937988, high=0.672375), standard_error=0.004426939565528785))

In [None]:

(0.700875,
 0.14982025689138304,
 BootstrapResult(confidence_interval=ConfidenceInterval(low=0.691125, high=0.709625), standard_error=0.00469138130343869))

In [123]:
# Hyper-parameter search space; keys follow the `<pipeline step>__<parameter>` form.
# Alternative atlases listed previously but currently disabled:
#   'gordon2014_2mm', 'difumo_64_2mm', 'seitzman2018'
atlases = ['dosenbach2010']
connectivity_kinds = ['partial correlation', 'chatterjee', 'correlation', 'tangent']

# NOTE(review): tune.grid_search is used here while TuneSearchCV runs with its
# default search optimization — confirm TuneSearchCV accepts grid-search entries
# in this mode (tune.choice may be what is intended).
param_space = {
    'clf__C': tune.loguniform(1e-2, 1e3),
    'connectivity__atlas': tune.grid_search(atlases),
    'connectivity__kind': tune.grid_search(connectivity_kinds),
}
# Further dimensions that are currently disabled:
#   'scale': tune.choice([StandardScaler(), 'passthrough'])
#   'select': tune.choice([SelectFromModel(LinearSVC(penalty='l2')), 'passthrough'])

# Search settings. Disabled options: early_stopping=True, search_optimization='bayesian'.
# NOTE(review): early stopping is disabled — confirm max_iters=2 has any effect without it.
search_options = dict(
    cv=4,
    n_trials=100,
    max_iters=2,
    verbose=1,
    scoring='accuracy',
    use_gpu=True,
    n_jobs=-1,
)

opt = TuneSearchCV(pipe, param_space, **search_options)
opt.fit(X, y)

print('best estimator:', opt.best_estimator_)


In [118]:


# Fixed seed so the train/test splits and the bootstrap resamples are the same
# on every run; without it the reported scores change between executions.
RANDOM_STATE = 0

# 1000 stratified random splits, each holding out 25% of the samples for testing.
cv = StratifiedShuffleSplit(n_splits=1000, test_size=0.25, random_state=RANDOM_STATE)

# Model under evaluation. Alternatives for comparison:
#   model = pipe
#   model = DummyClassifier(strategy='stratified')
# NOTE(review): the hyper-parameters of `opt.best_estimator_` were tuned on the
# same X, y that are used for scoring here, so this estimate is likely optimistic;
# consider nested cross-validation or a held-out set.
model = opt.best_estimator_

scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy', n_jobs=-1)

# Bootstrap confidence interval of the mean accuracy; `data` is a sequence of
# samples, here a single 1-D sample holding all split scores.
bootstrap_ci = st.bootstrap((scores,), np.mean, random_state=RANDOM_STATE)

# To inspect the score distribution instead: sns.displot(scores)
scores.mean(), scores.std(), bootstrap_ci

(0.691875,
 0.14930960911810065,
 BootstrapResult(confidence_interval=ConfidenceInterval(low=0.68225, high=0.70075), standard_error=0.004733749131881141))

In [67]:
# Learning curve of `model`: accuracy on the training and test portions as a
# function of the number of training samples.
# `learning_curve` and `pd` are imported in the notebook's import cell.

# Fixed seed so the splits and the training-subset shuffling are reproducible.
RANDOM_STATE = 0

train_sizes, train_scores, test_scores = learning_curve(
    model, X, y,
    # 10 stratified random splits, each holding out 8 samples for testing.
    cv=StratifiedShuffleSplit(n_splits=10, test_size=8, random_state=RANDOM_STATE),
    scoring='accuracy',
    n_jobs=-1,
    # Shuffle the training data before taking the size-limited prefixes.
    shuffle=True,
    random_state=RANDOM_STATE,
    # Absolute numbers of training samples to evaluate.
    train_sizes=np.array([16, 18, 20, 22, 24]))

# Score arrays have one row per training size and one column per split;
# average across splits to get one value per training size.
learning_curve_results = pd.DataFrame({
    'train_size': train_sizes,
    'mean_train_score': train_scores.mean(axis=1),
    'mean_test_score': test_scores.mean(axis=1)
})

learning_curve_results

KeyboardInterrupt: 