Resting-state preprocessing results in five kinds of individual-level functional connectivity matrices, $W$, each of size $32\times39\times39$, where $W_{ijk}$ is the connectivity between the time series of nodes $j$ and $k$ in the control network of subject $i$.

The connectivity matrices are symmetric, so we only need to keep the upper triangular part of each matrix (excluding the diagonal). This results in a (32, 741) matrix, where each column is a connectivity measure between two nodes.

In [None]:

import numpy as np
import xarray as xr
import pandas as pd

from IPython.display import clear_output
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import plotly.express as px

from sklearn import preprocessing, model_selection, metrics, ensemble, multioutput
from sklearn import decomposition, cross_decomposition, feature_selection
from sklearn.pipeline import Pipeline
# from xgboost import XGBClassifier  # incompatible with umap's scipy

sns.set(style='dark')

from umap import UMAP

# Technical reproducibility
%reload_ext watermark
%watermark -iv -co -ituhmv

In [None]:
# 1. load the connectivity dataset
DATASET = xr.open_dataset('data/julia2018_resting/connectivities.nc').load()

# 2. input (connectivities)
#    Keep only the strict upper triangle (k=1 drops the diagonal) of each
#    subject's matrix: one row per subject, one column per node pair.
connectivity = DATASET['connectivity'].values
triu_rows, triu_cols = np.triu_indices(
  connectivity.shape[-2], k=1, m=connectivity.shape[-1])
X = connectivity[:, triu_rows, triu_cols]

# 3. classifier output (AVGP or NVGP)
y_encoder = preprocessing.LabelEncoder()
y = y_encoder.fit_transform(DATASET['group'])

# 4. behavioral outputs (inverse efficiency scores in millis)
y_beh = DATASET['inverse_efficiency_score_ms'].values

# 5. remove subjects with missing connectivity or behavioral data
#    A subject is dropped if its behavioral score is NaN or if any of its
#    connectivity features is NaN.
valid_subjects_mask = ~np.isnan(y_beh) & ~np.isnan(X).any(axis=1)
X = X[valid_subjects_mask]
y = y[valid_subjects_mask]
y_beh = y_beh[valid_subjects_mask]

subjects = DATASET['subject'][valid_subjects_mask]

# 6. Feature extraction (disabled alternatives kept for reference)
# X_thresholds = np.median(X, axis=1) + np.std(X, axis=1)
# X_binarized = np.where(np.abs(X) < X_thresholds.reshape(-1,1), 0., 1.)
# X_reduced = UMAP(n_neighbors=15, n_components=2).fit_transform(X)

In [None]:
# Distribution of the inverse efficiency scores, one hue per group.
ies_by_group = DATASET[['group', 'inverse_efficiency_score_ms']].to_dataframe()

g = sns.displot(
  data=ies_by_group,
  x='inverse_efficiency_score_ms',
  hue='group',
  binwidth=200,  # bin width is in the score's own unit (ms)
  kde=True,
  rug=True,
)

g.set(
  title='Inverse efficiency score (200 milliseconds bins)',
  xlabel='Inverse efficiency score (ms)',
)
plt.show()

In [None]:
# DEBUG: plot dimensionality-reduced inputs, colored by group
# NOTE: selection and projection are both fit on all subjects using their
# labels/scores, so this view is exploratory only.

# 1. select: keep the 10 connectivity features with the highest ANOVA F-score
#    against the group labels
X_selected = feature_selection.SelectKBest(
  feature_selection.f_classif, k=10).fit_transform(X, y)
# alternative: select against the behavioral scores instead
# X_selected = feature_selection.SelectKBest(
  # feature_selection.f_regression, k=10).fit_transform(X, y_beh)

# 2. reduce: project the selected features onto 3 PLS components fit against
#    the behavioral scores (fit_transform returns x-scores and y-scores; only
#    the x-scores are plotted)
X_reduced, _ = cross_decomposition.PLSRegression(
  n_components=3).fit_transform(X_selected, y_beh)
# alternatives:
# X_reduced = UMAP(n_neighbors=3, n_components=3).fit_transform(X_selected, y_beh)
# X_reduced = decomposition.PCA(n_components=3).fit_transform(X_selected)

# 3. plot
plotting_data = pd.DataFrame(X_reduced, columns=['dim1','dim2','dim3'])
plotting_data['group'] = y_encoder.inverse_transform(y)
# plain array so the values are assigned positionally, row by row
plotting_data['subject'] = np.asarray(subjects)

fig = px.scatter_3d(
  plotting_data,
  x='dim1', y='dim2', z='dim3', color='group', hover_name='subject',
  title='PLS projection of the 10 best (f_classif) upper-triangle '
        'connectivity features. Points represent subjects.')

fig.show()