In [7]:
import numpy as np
import xarray as xr
import pandas as pd

from tqdm import tqdm

from sklearn import preprocessing, model_selection, ensemble, svm, metrics, feature_selection
from sklearn import dummy, model_selection, decomposition, cross_decomposition
from sklearn import multioutput

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Connectivity measures evaluated in this notebook.
# 'covariance' and 'precision' are intentionally left out of the list.
connectivity_kinds = ['correlation', 'partial_correlation', 'tangent']

In [3]:
# Names of the datasets (atlases) to analyse; each one corresponds to a
# `connectivity_<name>.nc` file under data/julia2018_resting/.
DATASET_NAMES = [
    'dosenbach2007',
    # 'difumo_64_2',
    # 'difumo_128_2',
    # 'difumo_1024_2'
]

# Open every selected dataset with xarray, keyed by dataset name.
# DATASETS is bound exactly once (no intermediate placeholder dict of None
# values), so the name always refers to the opened datasets.
DATASETS = {
    ds_name: xr.open_dataset(f'data/julia2018_resting/connectivity_{ds_name}.nc')
    for ds_name in DATASET_NAMES
}

In [13]:
# Cross-validated evaluation of connectivity-based prediction.
#
# For every dataset and connectivity kind, each cross-validation fold:
#   1. fits a PLS projection on the training fold (connectivity features vs.
#      the encoded group label) and applies it to the held-out fold,
#   2. fits a gradient-boosting regressor mapping the PLS X-scores to the
#      PLS Y-scores,
#   3. stores the test mean squared error of the model and of a random
#      baseline (the latter under the '<dataset>_chance' dataset name).
# The result is the `scores` DataFrame with columns dataset / kind / score.

# Fixed seed so that re-running this cell reproduces the same scores.
RANDOM_STATE = 42

# MultiOutputRegressor fits one gradient-boosting regressor per target column.
model = multioutput.MultiOutputRegressor(
    ensemble.GradientBoostingRegressor(random_state=RANDOM_STATE))

# Baseline: every prediction is drawn uniformly at random from the distinct
# target values seen during fit. A RandomState *instance* (not an int) is
# passed on purpose: an int would re-seed the generator on every predict()
# call, so every single-sample leave-one-out fold would make the same draw.
# NOTE(review): this is a classifier fitted on continuous PLS scores; consider
# dummy.DummyRegressor if a mean/median baseline is what is wanted.
chance_model = dummy.DummyClassifier(
    strategy='uniform',
    random_state=np.random.RandomState(RANDOM_STATE))

# Univariate selection of features predictive of the group (AVGP/NVGP).
# NOTE(review): defined but not applied in the loop below.
selector = feature_selection.SelectKBest(score_func=feature_selection.f_classif, k=100)

# Supervised dimensionality reduction. Previously considered alternatives:
# decomposition.PCA(n_components=5), UMAP(n_components=5).
transformer = cross_decomposition.PLSRegression(n_components=5)

# Cross-validation scheme. Previously considered alternatives:
# model_selection.StratifiedShuffleSplit(n_splits=100, test_size=.2),
# model_selection.LeavePOut(p=2).
cv = model_selection.LeaveOneOut()

# (dataset, kind, score) tuples; converted to a DataFrame once at the end
score_records = []

for ds_name, dataset in DATASETS.items():

    # keep only the subjects that have a (non-NaN) behavioral score
    beh_scores = dataset['inverse_efficiency_score_ms'].values
    valid_beh_subjects_mask = ~np.isnan(beh_scores)

    # The targets do not depend on the connectivity kind, so they are built
    # once per dataset instead of once per kind.
    y = preprocessing.LabelEncoder().fit_transform(dataset['group'])
    y = y[valid_beh_subjects_mask]

    # NOTE(review): y_beh is computed but never used below; the PLS target is
    # the encoded group label `y`. Confirm which target is intended.
    y_beh = beh_scores[valid_beh_subjects_mask]

    for kind in connectivity_kinds:

        # vectorized connectivity matrix (strict upper triangle, no diagonal)
        X = np.array([subj_conn[np.triu_indices_from(subj_conn, k=1)]
                      for subj_conn in dataset[f'{kind}_connectivity'].values])
        X = X[valid_beh_subjects_mask]

        for train, test in tqdm(cv.split(X, y),
                                desc=f'{ds_name} ({kind})',
                                total=cv.get_n_splits(X)):

            # dimensionality reduction: fitted on the training fold only; when
            # a target is passed, both the X-scores and Y-scores are returned
            X_train, y_train = transformer.fit_transform(X[train], y[train])
            X_test, y_test = transformer.transform(X[test], y[test])

            # fit the model and the chance baseline on the same training data
            model.fit(X_train, y_train)
            chance_model.fit(X_train, y_train)

            # predict the held-out fold
            y_pred = model.predict(X_test)
            y_chance = chance_model.predict(X_test)

            # evaluate: mean squared error in the PLS score space
            score = metrics.mean_squared_error(y_test, y_pred)
            score_records.append((ds_name, kind, score))

            # chance level, stored as a separate '<dataset>_chance' dataset
            chance_score = metrics.mean_squared_error(y_test, y_chance)
            score_records.append((ds_name + '_chance', kind, chance_score))

scores = pd.DataFrame(score_records, columns=['dataset', 'kind', 'score'])

dosenbach2007 (correlation): 100%|██████████| 32/32 [00:05<00:00,  5.54it/s]
dosenbach2007 (partial_correlation): 100%|██████████| 32/32 [00:05<00:00,  5.57it/s]
dosenbach2007 (tangent): 100%|██████████| 32/32 [00:05<00:00,  5.43it/s]


In [14]:
# average score for every (dataset, connectivity kind) pair
scores.groupby(by=['dataset', 'kind']).agg('mean')

Unnamed: 0_level_0,Unnamed: 1_level_0,score
dataset,kind,Unnamed: 2_level_1
dosenbach2007,correlation,13.774378
dosenbach2007,partial_correlation,9.736999
dosenbach2007,tangent,11.038092
dosenbach2007_chance,correlation,11.475997
dosenbach2007_chance,partial_correlation,10.193122
dosenbach2007_chance,tangent,11.954167


In [None]:
# Visualize the cross-validated scores stored in `scores`
# (columns: dataset, kind, score). The scores are mean squared errors, so
# lower is better and there is no fixed upper bound or fixed chance value.
sns.set_theme()

fig, axes = plt.subplots(2, 1, figsize=(7, 10))

# box plot: one box per (dataset, connectivity kind)
g = sns.boxplot(data=scores,
                y='dataset',
                x='score',
                hue='kind',
                dodge=True,
                orient='horizontal', ax=axes[0])
g.set(ylabel='atlas', xlabel='mean squared error')
g.legend(loc='upper right', ncol=2, bbox_to_anchor=(1.6, 1.02), title='Connectivity kind')

# density plot: one curve per dataset. Only the lower bound is clipped,
# because a squared error cannot be negative but may well exceed 1.
g = sns.kdeplot(
    data=scores,
    x='score', hue='dataset',
    fill=True, clip=(0, None), alpha=.12, cut=0,
    ax=axes[1])
g.set(xlabel='mean squared error', ylabel='density')
# the hue of this panel is the dataset, so label the legend accordingly
axes[1].get_legend().set_title('atlas')
axes[1].get_legend().set_bbox_to_anchor((1.05, 1))

# Chance level: mean error of the rows stored under a '*_chance' dataset name.
# Drawn only when such rows exist.
is_chance = scores['dataset'].str.endswith('_chance')
if is_chance.any():
    chance_level = scores.loc[is_chance, 'score'].mean()
    for ax in axes:
        ax.axvline(chance_level, color='red', linestyle='--')

fig.suptitle('Cross-validated prediction error (mean squared error)'
             '\nRed lines mark the mean error of the chance model; both plots use the same data.')

plt.show()