In [49]:
import pandas as pd
import os.path as osp
from pathlib import Path
import numpy as np
import os
from scipy.io import loadmat
from collections import defaultdict
from sklearn.model_selection import StratifiedGroupKFold
import shutil
from sklearn.model_selection import train_test_split

In [7]:
global_dir ="../datasets/hcp/fmri/raw/schaefer_old"

In [32]:
new_dir ="../datasets/hcp/fmri/raw/schaefer"

In [8]:
data = os.listdir(global_dir)

In [9]:
subj_id_col ='SUB_ID'
fmri_id_col ='FILE_ID'
target_col ='DX_GROUP'

In [10]:
def load_targets(path):
        """ Load and process *cobre* targets """

        path = Path(path)
        targets =[]
        for p in path.glob("*.csv"):
            if not p.stem.endswith("_embed"):
                array_splits =p.stem.split("_")
                sub_id = array_splits[0].replace("sub-", "")
                targets.append((p.stem,sub_id,array_splits[2]))
        
        target =pd.DataFrame.from_records(targets, columns=[fmri_id_col, subj_id_col, target_col])
        # label encoding
        label2idx: dict[str, int] = {x: i for i, x in enumerate(target[target_col].unique())}
        idx2label: dict[int, str] = {i: x for x, i in label2idx.items()}
        target[target_col] = target[target_col].map(label2idx)

        return target, label2idx, idx2label

In [11]:
def load_cms(
        path) :

        """Load connectivity matrices, fMRI time series
        and mapping node idx -> ROI name.

        Maps sibj_id to CM and ts
        """

        path = Path(path)

        data = {}
        time_series = {}
        # ROI names, extacted from CMs
        roi_map: dict[int, str] = {}
        
        for p in path.glob("*.csv"):
            name = p.stem
            x = pd.read_csv(p)

            values = x.values.astype(np.float32)
            if p.stem.endswith("_embed"):
                time_series[name] = values
            else:
                data[name] = values
                if not roi_map:
                    roi_map = dict(enumerate(x.columns))

        return data, time_series, roi_map

In [12]:
target, label2idx, idx2label =load_targets(global_dir)

In [93]:
unique_ids =target[:127]['FILE_ID'].to_list()

In [94]:
count =0
for elem in os.listdir(global_dir):
    name = Path(elem).stem
    name =name.replace("_embed", "")
    if name in unique_ids:
        print(Path(global_dir) / elem)
        print(Path(new_dir) / elem)
        shutil.copy(Path(global_dir) / elem, Path(new_dir) / elem )
        count+=1

../datasets/hcp/fmri/raw/schaefer_old/182739_tfMRI_MOTOR_LR.csv
../datasets/hcp/fmri/raw/schaefer/182739_tfMRI_MOTOR_LR.csv
../datasets/hcp/fmri/raw/schaefer_old/517239_tfMRI_LANGUAGE_LR.csv
../datasets/hcp/fmri/raw/schaefer/517239_tfMRI_LANGUAGE_LR.csv
../datasets/hcp/fmri/raw/schaefer_old/859671_tfMRI_SOCIAL_LR.csv
../datasets/hcp/fmri/raw/schaefer/859671_tfMRI_SOCIAL_LR.csv
../datasets/hcp/fmri/raw/schaefer_old/189450_tfMRI_RELATIONAL_LR.csv
../datasets/hcp/fmri/raw/schaefer/189450_tfMRI_RELATIONAL_LR.csv
../datasets/hcp/fmri/raw/schaefer_old/114217_tfMRI_EMOTION_LR.csv
../datasets/hcp/fmri/raw/schaefer/114217_tfMRI_EMOTION_LR.csv
../datasets/hcp/fmri/raw/schaefer_old/127933_tfMRI_RELATIONAL_LR.csv
../datasets/hcp/fmri/raw/schaefer/127933_tfMRI_RELATIONAL_LR.csv
../datasets/hcp/fmri/raw/schaefer_old/908860_tfMRI_EMOTION_LR.csv
../datasets/hcp/fmri/raw/schaefer/908860_tfMRI_EMOTION_LR.csv
../datasets/hcp/fmri/raw/schaefer_old/136732_tfMRI_RELATIONAL_LR.csv
../datasets/hcp/fmri/raw/sc

In [95]:
count

254

In [9]:
target['SUB_ID'].unique().shape

(936,)

In [None]:
data, ts, roi_map =load_cms(global_dir)

In [None]:
len(data)

In [66]:
target

Unnamed: 0,FILE_ID,SUB_ID,DX_GROUP
0,182739_tfMRI_MOTOR_LR,182739,0
1,517239_tfMRI_LANGUAGE_LR,517239,1
2,189450_tfMRI_RELATIONAL_LR,189450,2
3,114217_tfMRI_EMOTION_LR,114217,3
4,127933_tfMRI_RELATIONAL_LR,127933,2
...,...,...,...
6327,129331_tfMRI_LANGUAGE_LR,129331,1
6328,559457_tfMRI_WM_LR,559457,4
6329,198855_tfMRI_MOTOR_LR,198855,0
6330,308331_tfMRI_SOCIAL_LR,308331,6


In [84]:
def generate_splits(file_ids, subj_ids, y, seed: int = 1380):
    # split into train/test
    file_ids = np.array(file_ids)
    idx = np.arange(len(file_ids)) 
    
    train_idx, test_idx = train_test_split(
        idx, test_size=0.2, stratify=y, shuffle=True, random_state=seed,
    )

    train, y_train = subj_ids[train_idx], y[train_idx]
    test, _ = subj_ids[test_idx], y[test_idx]

    # split train into cv folds
    cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=seed)
    folds: dict[str, list] = {}
    for i, (train_fold, valid_fold) in enumerate(cv.split(train, y_train, subj_ids[train_idx])):
        folds[i]={
            'train': [str(k) for k in list(train.iloc[train_fold])],
            'valid': [str(k) for k in list(train.iloc[valid_fold])],
        }
    folds['test'] = [str(k) for k in list(test)]

    return folds

In [85]:
target[:320]['DX_GROUP']

0      0
1      1
2      2
3      3
4      4
      ..
315    3
316    4
317    1
318    1
319    0
Name: DX_GROUP, Length: 320, dtype: int64

In [86]:
target[:320]

Unnamed: 0,FILE_ID,SUB_ID,DX_GROUP
0,182739_tfMRI_MOTOR_LR,182739,0
1,517239_tfMRI_LANGUAGE_LR,517239,1
2,859671_tfMRI_SOCIAL_LR,859671,2
3,189450_tfMRI_RELATIONAL_LR,189450,3
4,114217_tfMRI_EMOTION_LR,114217,4
...,...,...,...
315,211316_tfMRI_RELATIONAL_LR,211316,3
316,899885_tfMRI_EMOTION_LR,899885,4
317,390645_tfMRI_LANGUAGE_LR,390645,1
318,107018_tfMRI_LANGUAGE_LR,107018,1


In [96]:
new_folds =generate_splits(target[:127]['FILE_ID'], target[:127]['FILE_ID'], target[:127]['DX_GROUP'])

In [97]:
import json
with open('../datasets/hcp/hcp_splits.json', 'w') as f:
    json.dump(new_folds, f)