
This notebook extracts 216 connectivity measures using the `pyspi` package and then trains an AVGP-vs-NVGP classifier on the connectivity features.

**:warning: Warning:** This notebook requires packages that are not compatible with the current version of Python. To run this notebook, you need to create a separate conda environment using the following command:

```bash
mamba create -n pyspi python=3.9 octave openjdk
mamba activate pyspi
pip install -U pyspi scikit-learn xgboost xarray h5netcdf
```


In [6]:
# (Re)load IPython's autoreload extension and select mode 2, so that edited
# modules are re-imported automatically before each cell runs.
%reload_ext autoreload
%autoreload 2

In [7]:
from IPython.display import clear_output
from tqdm.auto import tqdm
import xarray as xr
from pyspi.calculator import CalculatorFrame
from pyspi.data import Data

from src.multimodal.preprocessing import TimeseriesAggregator

# we only aggregate the region-wise timeseries into network-wise timeseries
preproc_pipe = TimeseriesAggregator(strategy='network')

atlas = 'dosenbach2010'

# Read the whole file into memory before the handle is closed, then aggregate.
# The raw handle gets its own name so the `with` target is not rebound inside
# its own block.
with xr.open_dataset(f'data/Julia2018/timeseries_{atlas}.nc5') as raw_ds:
    ds = preproc_pipe.fit_transform(raw_ds.load())

# The last dimension of the `timeseries` variable names the process axis
# (e.g. regions or networks); its coordinate values become the process names.
process_type = ds['timeseries'].dims[-1]
process_names = ds.coords[process_type].values

# Build one pyspi `Data` object per subject.
datasets = []
for subject in ds.coords['subject'].values:
    # Transposed so the process axis comes first.
    # NOTE(review): assumes `timeseries` is (time, process) per subject and that
    # pyspi's default ordering is process-by-time - confirm.
    ts = ds.sel(subject=subject)['timeseries'].values.T
    dataset = Data(ts, procnames=process_names, name=subject)
    datasets.append(dataset)

# One calculator per subject, restricted to the 'fabfour' SPI subset.
calc = CalculatorFrame(datasets=datasets, subset='fabfour',
                       name=f'Julia2018_{atlas}_{process_type}',
                       names=[d.name for d in datasets])
calc.compute()

# `clear_output` is imported at the top of this cell: the only other import of
# it lives in a later cell, so a fresh-kernel run used to fail here.
clear_output()

In [8]:

import pandas as pd

# Raw per-subject SPI tables keyed by calculator (subject) name; not used below
# in this cell, kept available for interactive inspection.
tables = {
    c.name: c.table
    for i, c in calc.calculators.itertuples()
}

# Convert every subject's SPI table into long format with one row per
# (spi, unordered process pair), then pivot to one row per (subject, spi).
spis = []

for i, c in tqdm(calc.calculators.itertuples(), total=calc.n_calculators):
    # rename_axis returns a new frame, so the calculator's own table is not
    # modified.
    feats = c.table.rename_axis(index='process_1').reset_index()
    feats.columns.names = ['spi', 'process_2']
    melted = pd.melt(feats, id_vars='process_1', var_name=['spi', 'process_2'], value_name='value')

    # Order-insensitive key for each pair. frozenset is hashable (a plain set
    # is not), which drop_duplicates requires.
    melted['pair'] = [
        frozenset(pair) for pair in zip(melted['process_1'], melted['process_2'])
    ]
    # Keep the first occurrence of every unordered pair within each SPI.
    melted = melted.drop_duplicates(['spi', 'pair'])

    melted = melted.assign(
        process=melted['process_1'] + '-' + melted['process_2'],
        subject=c.name,
        # The class label is the first four characters of the subject name.
        # NOTE(review): presumably 'AVGP' / 'NVGP' prefixes - confirm against the data.
        label=c.name[:4],
    )
    spis.append(
        melted[['spi', 'value', 'process', 'subject', 'label']].reset_index(drop=True)
    )

spi_df = pd.concat(spis)
spi_df_wide = spi_df.pivot_table(index=['subject', 'label', 'spi'], columns=['process'], values='value', aggfunc='mean').reset_index()
spi_df_wide.to_csv(f'data/Julia2018/spis_{atlas}_{process_type}.csv', index=False)

100%|██████████| 32/32 [00:02<00:00, 14.63it/s]


## SPI Classifiers

In [32]:
from IPython.display import clear_output
import pandas as pd
from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit, LeaveOneOut
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from xgboost import XGBClassifier

def score_spi(s, n_splits=1000, test_size=8, random_state=0):
    """Cross-validate a linear SVM on the features of a single SPI.

    Parameters
    ----------
    s : pandas.DataFrame
        Rows are subjects. Must contain a 'label' column; the 'subject', 'spi'
        and 'label' columns are dropped and every remaining column is used as
        a feature. When called through ``groupby(...).apply``, ``s.name`` holds
        the group key and is printed as a progress message.
    n_splits : int, default 1000
        Number of stratified shuffle-split repetitions.
    test_size : int or float, default 8
        Passed to ``StratifiedShuffleSplit`` as ``test_size``.
    random_state : int or None, default 0
        Seed for the splitter. A fixed seed makes the score reproducible and
        evaluates every SPI on the same sequence of splits; pass ``None`` for
        unseeded splits.

    Returns
    -------
    float
        Mean accuracy over all splits.
    """
    # A bare DataFrame has no `.name`; only groupby.apply sets it.
    print(getattr(s, 'name', None), '...', end=' ')

    # Features are fed to the SVM unscaled.
    estimator = Pipeline([
        ('clf', SVC(kernel='linear'))
    ])

    # errors='ignore': depending on the pandas version, the grouping column
    # ('spi') may be absent from the group passed in by groupby.apply.
    X = s.drop(columns=['subject', 'spi', 'label'], errors='ignore').values
    y = LabelEncoder().fit_transform(s['label'].values)

    CV = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size,
                                random_state=random_state)
    score = cross_val_score(estimator, X, y, cv=CV, n_jobs=-1, scoring='accuracy')

    mean_accuracy = score.mean()
    print(f'acc={mean_accuracy}')
    return mean_accuracy

# Load the wide SPI table (one row per subject and SPI) saved earlier.
atlas = 'dosenbach2010'
spi_df_wide = pd.read_csv(f'data/Julia2018/spis_{atlas}_{process_type}.csv')

# Count the missing cells of each SPI across all of its rows and columns.
s = spi_df_wide.isna().groupby(spi_df_wide['spi']).sum().sum(axis=1)

# Drop every SPI that has at least one missing value.
null_spis = s.index[s > 0]
spi_df_wide = spi_df_wide.query('spi not in @null_spis')

# Score each remaining SPI separately and rank them, best first.
scores = (
    spi_df_wide
    .groupby(['spi'])
    .apply(score_spi)
    .sort_values(ascending=False)
)
clear_output()
scores

spi
spearmanr-sq               0.639000
cov_EmpiricalCovariance    0.631250
di_gaussian                0.552125
pec                        0.543125
dtype: float64