In [1]:
from src.features.discriminability import discr_stat

import h5py
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline

from scipy import stats

from pathlib import Path
import os
import re
import pandas as pd

from tqdm import tqdm

from sklearn.metrics.pairwise import euclidean_distances
from itertools import combinations

from scipy.sparse.linalg import svds

In [18]:
# Name of the dataset read from each HDF5 file.
h5_key = 'latent'

# Directory layout, all relative to the parent of the working directory.
basedir = Path('..')
datadir = basedir.joinpath('data')
rawdir = datadir.joinpath('raw')
gccadir = datadir.joinpath('interim', 'gcca250')

In [19]:
def get_files(path,
              level='(e|n)',
              subject='([0-9]{3})',
              task='(.+?)',
              filetype='h5',
              flag=''):
    """List the files in ``path`` whose names follow the session-1 naming scheme.

    A file name matches when it has the form
    ``<level>_sub-<subject>_ses-1_task-<task><flag>.<filetype>``.

    Parameters
    ----------
    path : str or path-like
        Directory to scan (not recursive).
    level, subject, task : str
        Regular-expression fragments for the corresponding name parts.
        Parenthesised fragments become capture groups in the result.
    filetype : str
        File extension without the leading dot (inserted into the regex as is).
    flag : str
        Regular-expression fragment expected between the task and the extension.

    Returns
    -------
    list of (str, tuple)
        ``(filename, match.groups())`` for every matching entry, sorted by
        file name.
    """
    # Raw f-string: ``\.`` must reach the regex engine as an escaped dot, and
    # in a non-raw literal it is an invalid string escape. The trailing ``$``
    # keeps names such as ``*.h5.bak`` from matching.
    pattern = re.compile(
        rf'^{level}_sub-{subject}_ses-1_task-{task}{flag}\.{filetype}$'
    )

    files = []
    # os.listdir returns entries in arbitrary order; sorting makes the result
    # (and everything derived from it) reproducible across runs and machines.
    for fname in sorted(os.listdir(path)):
        match = pattern.search(fname)
        if match:
            files.append((fname, match.groups()))

    return files

In [20]:
# Task names as they appear in the file names.
tasks = [
    'restingstate',
    'openmonitoring',
    'compassion',
]
# Level prefixes as they appear in the file names.
levels = [
    'e',
    'n',
]

In [21]:
## Load one latent matrix per matching file, for every (level, task) pair,
## together with three parallel label arrays.

n_components = 3  # number of leading columns kept from each latent matrix

latents = []
labels_lt = []  # combined '<level>_<task>' label per loaded file
labels_l = []   # level label per loaded file
labels_t = []   # task label per loaded file

for level in levels:
    for task in tasks:
        paths = get_files(path=gccadir, level=level, task=task, flag='_gcca')

        for path, _subj in tqdm(paths):
            # Context manager closes the HDF5 handle even if the read fails.
            with h5py.File(gccadir / path, 'r') as h5f:
                # Read the full dataset, then keep the first n_components columns.
                latent = h5f[h5_key][:][:, :n_components]

            latents.append(latent)
            labels_lt.append(f'{level}_{task}')
            labels_l.append(level)
            labels_t.append(task)

labels_lt = np.array(labels_lt)
labels_t = np.array(labels_t)
labels_l = np.array(labels_l)
latents = np.array(latents)

100%|██████████| 29/29 [00:00<00:00, 75.11it/s]
100%|██████████| 29/29 [00:00<00:00, 81.78it/s]
100%|██████████| 29/29 [00:00<00:00, 96.69it/s]
100%|██████████| 47/47 [00:00<00:00, 80.52it/s]
100%|██████████| 47/47 [00:00<00:00, 77.78it/s]
100%|██████████| 47/47 [00:00<00:00, 68.72it/s]


## Save Distance Matrices for R

In [22]:
# Output directory for the distance matrices and label files exported for R.
distancedir = datadir / 'interim' / 'gcca_distances'
# DataFrame.to_csv does not create missing parent directories, so make sure
# the target exists before any cell tries to write into it.
distancedir.mkdir(parents=True, exist_ok=True)

In [23]:
def get_distance_matrix(X, metric):
    """Compute the symmetric pairwise distance matrix between the samples of ``X``.

    Parameters
    ----------
    X : numpy.ndarray
        Array whose first axis indexes samples; ``X[i]`` is one sample.
    metric : str
        ``'euclidean'``: each sample is flattened to a vector and the
        Euclidean distance between the vectors is used.
        ``'spectral'``: the distance between two samples is the largest
        singular value of their difference.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(X.shape[0], X.shape[0])``.

    Raises
    ------
    NotImplementedError
        For ``metric='pairwise'``, which has no implementation.
    ValueError
        For any other unrecognised ``metric``.
    """
    if metric == 'euclidean':
        # One row per sample, whatever the per-sample shape is.
        return euclidean_distances(X.reshape(X.shape[0], -1))

    if metric == 'spectral':
        n_samples = X.shape[0]
        dm = np.zeros((n_samples, n_samples))
        for i in range(n_samples):
            for j in range(i + 1, n_samples):
                diff = np.asarray(X[i] - X[j], dtype=float)
                # ord=2 on a 2-D array is the largest singular value. Unlike a
                # truncated sparse SVD it is deterministic and also works when
                # the smaller matrix dimension is 1.
                dm[i, j] = dm[j, i] = np.linalg.norm(diff, ord=2)
        return dm

    if metric == 'pairwise':
        # Previously fell through and returned None, which callers would
        # silently write out as an empty matrix.
        raise NotImplementedError("metric 'pairwise' is not implemented")

    raise ValueError(
        f"Unknown metric {metric!r}; expected 'euclidean' or 'spectral'"
    )
                 
def get_save_classes(labels, level1='',task1='',level2='',task2='',task3='',metric='euclidean'):
    """Select two (or three) groups of samples, compute their distance matrix and save it.

    Group membership is decided by substring search in ``labels``:
    group 1 uses the key ``f'{level1}_{task1}'``, group 2 uses
    ``f'{level2}_{task2}'`` and, when ``task3`` is non-empty, group 3 uses
    ``f'_{task3}'``. The size of every group is printed.

    The samples are taken from the module-level ``latents`` array, which must
    be aligned with ``labels``. Two CSV files (no header, no index) are
    written to ``distancedir``: ``<keys>_distances.csv`` with the distance
    matrix and ``<keys>_labels.csv`` with the group number ('1', '2', '3') of
    each row, where ``<keys>`` is the search keys joined by ``'_'``.

    Parameters
    ----------
    labels : sequence of str
        One label per entry of ``latents``.
    level1, task1, level2, task2, task3 : str
        Pieces of the search keys described above.
    metric : str
        Passed through to ``get_distance_matrix``.

    Raises
    ------
    ValueError
        If ``labels`` and ``latents`` differ in length.
    """
    # ``latents`` is only read here, so no ``global`` statement is needed.
    # Guard against it having been rebound to a different set of samples.
    if len(labels) != len(latents):
        raise ValueError(
            f'labels ({len(labels)}) and latents ({len(latents)}) are not aligned'
        )

    ## Create search keys and get indices
    keys = [f'{level1}_{task1}', f'{level2}_{task2}']
    if task3 != '':
        keys.append(f'_{task3}')

    groups = []
    for key in keys:
        idx = [i for i, label in enumerate(labels) if key in label]
        print(f'Len of {key}: {len(idx)}')
        groups.append(idx)

    # Explicit integer dtype: stacking an empty index list would otherwise
    # yield a float array, which cannot be used for indexing.
    idxs = np.array([i for idx in groups for i in idx], dtype=int)

    ## Get relevant stuff
    distances = get_distance_matrix(latents[idxs], metric=metric)
    labels2 = np.array(
        [str(number) for number, idx in enumerate(groups, start=1) for _ in idx],
        dtype=str,
    )

    ## Save relevant stuff
    stem = '_'.join(keys)
    pd.DataFrame(distances).to_csv(distancedir / f'{stem}_distances.csv', header=False, index=False)
    pd.DataFrame(labels2).to_csv(distancedir / f'{stem}_labels.csv', header=False, index=False)

In [24]:
# Metric name handed to get_save_classes / get_distance_matrix by the cells below.
distance_metric = 'euclidean'

In [25]:
## levels[0] vs. levels[1] within the same task: one comparison per task
for task in tasks:
    get_save_classes(
        labels=labels_lt,
        level1=levels[0],
        task1=task,
        level2=levels[1],
        task2=task,
        metric=distance_metric,
    )

Len of e_restingstate: 29
Len of n_restingstate: 47
Len of e_openmonitoring: 29
Len of n_openmonitoring: 47
Len of e_compassion: 29
Len of n_compassion: 47


In [26]:
## Task vs. task within the same level: every pair of tasks, for each level
for level in levels:
    for t1, t2 in combinations(tasks, 2):
        get_save_classes(
            labels=labels_lt,
            level1=level,
            task1=t1,
            level2=level,
            task2=t2,
            metric=distance_metric,
        )

Len of e_restingstate: 29
Len of e_openmonitoring: 29
Len of e_restingstate: 29
Len of e_compassion: 29
Len of e_openmonitoring: 29
Len of e_compassion: 29
Len of n_restingstate: 47
Len of n_openmonitoring: 47
Len of n_restingstate: 47
Len of n_compassion: 47
Len of n_openmonitoring: 47
Len of n_compassion: 47


In [27]:
## levels[0] on one task vs. levels[1] on a different task.
## Each pair of tasks is saved in both orders.
for t1, t2 in combinations(tasks, 2):
    for first_task, second_task in ((t1, t2), (t2, t1)):
        get_save_classes(
            labels=labels_lt,
            level1=levels[0],
            task1=first_task,
            level2=levels[1],
            task2=second_task,
            metric=distance_metric,
        )

Len of e_restingstate: 29
Len of n_openmonitoring: 47
Len of e_openmonitoring: 29
Len of n_restingstate: 47
Len of e_restingstate: 29
Len of n_compassion: 47
Len of e_compassion: 29
Len of n_restingstate: 47
Len of e_openmonitoring: 29
Len of n_compassion: 47
Len of e_compassion: 29
Len of n_openmonitoring: 47


In [28]:
## levels[0] vs. levels[1] with all tasks pooled (tasks left empty): a single comparison
get_save_classes(
    labels=labels_lt,
    level1=levels[0],
    level2=levels[1],
    metric=distance_metric,
)

Len of e_: 87
Len of n_: 141


In [29]:
## Task vs. task with both levels pooled (levels left empty): every pair of tasks
for t1, t2 in combinations(tasks, 2):
    get_save_classes(
        labels=labels_lt,
        task1=t1,
        task2=t2,
        metric=distance_metric,
    )

Len of _restingstate: 76
Len of _openmonitoring: 76
Len of _restingstate: 76
Len of _compassion: 76
Len of _openmonitoring: 76
Len of _compassion: 76


In [30]:
## Three-group comparison: all three tasks at once, both levels pooled
get_save_classes(
    labels=labels_lt,
    task1=tasks[0],
    task2=tasks[1],
    task3=tasks[2],
    metric=distance_metric,
)

Len of _restingstate: 76
Len of _openmonitoring: 76
Len of _compassion: 76


In [31]:
## Level 'e': tasks[1] or tasks[2] (class '1') vs. tasks[0] (class '2')
paths1 = get_files(path=gccadir, level='e', task=f'({tasks[1]}|{tasks[2]})', flag='_gcca')
paths2 = get_files(path=gccadir, level='e', task=tasks[0], flag='_gcca')

# Cell-local name on purpose: get_save_classes reads the module-level
# ``latents`` array, so rebinding ``latents`` here would break re-running
# the cells above.
group_latents = []
labels = []

for class_label, class_paths in (('1', paths1), ('2', paths2)):
    for path, _ in tqdm(class_paths):
        # Context manager closes the HDF5 handle even if the read fails.
        with h5py.File(gccadir / path, 'r') as h5f:
            group_latents.append(h5f[h5_key][:][:, :n_components])
        labels.append(class_label)

distances = get_distance_matrix(np.array(group_latents), metric=distance_metric)

## Save relevant stuff
pd.DataFrame(distances).to_csv(distancedir / 'e_meditating_e_resting_distances.csv', header=False, index=False)
pd.DataFrame(labels).to_csv(distancedir / 'e_meditating_e_resting_labels.csv', header=False, index=False)

100%|██████████| 58/58 [00:00<00:00, 139.16it/s]
100%|██████████| 29/29 [00:00<00:00, 255.50it/s]


In [32]:
## Level 'n': tasks[1] or tasks[2] (class '1') vs. tasks[0] (class '2')
paths1 = get_files(path=gccadir, level='n', task=f'({tasks[1]}|{tasks[2]})', flag='_gcca')
paths2 = get_files(path=gccadir, level='n', task=tasks[0], flag='_gcca')

# Cell-local name on purpose: get_save_classes reads the module-level
# ``latents`` array, so rebinding ``latents`` here would break re-running
# the cells above.
group_latents = []
labels = []

for class_label, class_paths in (('1', paths1), ('2', paths2)):
    for path, _ in tqdm(class_paths):
        # Context manager closes the HDF5 handle even if the read fails.
        with h5py.File(gccadir / path, 'r') as h5f:
            group_latents.append(h5f[h5_key][:][:, :n_components])
        labels.append(class_label)

distances = get_distance_matrix(np.array(group_latents), metric=distance_metric)

## Save relevant stuff
pd.DataFrame(distances).to_csv(distancedir / 'n_meditating_n_resting_distances.csv', header=False, index=False)
pd.DataFrame(labels).to_csv(distancedir / 'n_meditating_n_resting_labels.csv', header=False, index=False)

100%|██████████| 94/94 [00:00<00:00, 405.69it/s]
100%|██████████| 47/47 [00:00<00:00, 473.35it/s]


In [17]:
## Level 'e' on tasks[1] or tasks[2] (class '1') vs. level 'n' on tasks[0] (class '2')
paths1 = get_files(path=gccadir, level='e', task=f'({tasks[1]}|{tasks[2]})', flag='_gcca')
paths2 = get_files(path=gccadir, level='n', task=tasks[0], flag='_gcca')

# Cell-local name on purpose: get_save_classes reads the module-level
# ``latents`` array, so rebinding ``latents`` here would break re-running
# the cells above.
group_latents = []
labels = []

for class_label, class_paths in (('1', paths1), ('2', paths2)):
    for path, _ in tqdm(class_paths):
        # Context manager closes the HDF5 handle even if the read fails.
        with h5py.File(gccadir / path, 'r') as h5f:
            group_latents.append(h5f[h5_key][:][:, :n_components])
        labels.append(class_label)

distances = get_distance_matrix(np.array(group_latents), metric=distance_metric)

## Save relevant stuff
pd.DataFrame(distances).to_csv(distancedir / 'e_meditating_n_resting_distances.csv', header=False, index=False)
pd.DataFrame(labels).to_csv(distancedir / 'e_meditating_n_resting_labels.csv', header=False, index=False)

100%|██████████| 58/58 [00:00<00:00, 377.63it/s]
100%|██████████| 47/47 [00:00<00:00, 310.51it/s]


In [19]:
## Both levels pooled: tasks[1] or tasks[2] (class '1') vs. tasks[0] (class '2')
paths1 = get_files(path=gccadir, level='(e|n)', task=f'({tasks[1]}|{tasks[2]})', flag='_gcca')
paths2 = get_files(path=gccadir, level='(e|n)', task=tasks[0], flag='_gcca')

# Cell-local name on purpose: get_save_classes reads the module-level
# ``latents`` array, so rebinding ``latents`` here would break re-running
# the cells above.
group_latents = []
labels = []

for class_label, class_paths in (('1', paths1), ('2', paths2)):
    for path, _ in tqdm(class_paths):
        # Context manager closes the HDF5 handle even if the read fails.
        with h5py.File(gccadir / path, 'r') as h5f:
            group_latents.append(h5f[h5_key][:][:, :n_components])
        labels.append(class_label)

distances = get_distance_matrix(np.array(group_latents), metric=distance_metric)

## Save relevant stuff
pd.DataFrame(distances).to_csv(distancedir / '_meditating__resting_distances.csv', header=False, index=False)
pd.DataFrame(labels).to_csv(distancedir / '_meditating__resting_labels.csv', header=False, index=False)

100%|██████████| 152/152 [00:02<00:00, 64.78it/s]
100%|██████████| 76/76 [00:01<00:00, 64.31it/s]
