In [17]:
import os
import shutil

import numpy as np
import pandas as pd
from tqdm import tqdm

In [19]:
## concat ABIDE1 and ABIDE2
# Input folders of the two cohorts, followed by the folder receiving the aggregate.
dataset1, dataset2 = (
    '/neurospin/dico/data/deep_folding/current/datasets/abide1/',
    '/neurospin/dico/data/deep_folding/current/datasets/abide2/',
)
savedir = '/neurospin/dico/data/deep_folding/current/datasets/aggregate_autism/'

In [20]:
# Region folders found under crops/2mm of each dataset; only the regions
# present in both datasets are kept.
regions1 = os.listdir(f'{dataset1}/crops/2mm/')
regions2 = os.listdir(f'{dataset2}/crops/2mm/')
regions = list(set(regions1) & set(regions2))
# the third count printed below is the size of the intersection
print(f'N regions : 1 : {len(regions1)}, 2 : {len(regions2)}, 3 : {len(regions)}')

N regions : 1 : 31, 2 : 31, 3 : 31


In [24]:
# For every common region and each hemisphere ('L' / 'R'): read the subject
# list and the skeleton array of both datasets, stack them (dataset 1 first,
# dataset 2 second) and write the result under `savedir`.
for region in tqdm(regions):
    for side in ['L', 'R']:

        # subject lists of dataset 1 and dataset 2
        subject_tables = [
            pd.read_csv(f'{dataset1}/crops/2mm/{region}/mask/{side}skeleton_subject.csv'),
            pd.read_csv(f'{dataset2}/crops/2mm/{region}/mask/{side}skeleton_subject.csv'),
        ]
        # skeleton arrays of dataset 1 and dataset 2
        skeleton_arrays = [
            np.load(f'{dataset1}/crops/2mm/{region}/mask/{side}skeleton.npy'),
            np.load(f'{dataset2}/crops/2mm/{region}/mask/{side}skeleton.npy'),
        ]

        # both are stacked along axis 0, in the same dataset order
        subs = pd.concat(subject_tables, axis=0)
        skels = np.concatenate(skeleton_arrays, axis=0)

        # make sure the output folder exists, then write both files
        os.makedirs(f'{savedir}/crops/2mm/{region}/mask/', exist_ok=True)
        subs.to_csv(f'{savedir}/crops/2mm/{region}/mask/{side}skeleton_subject.csv', index=False)
        np.save(f'{savedir}/crops/2mm/{region}/mask/{side}skeleton.npy', skels)

100%|██████████| 31/31 [06:44<00:00, 13.06s/it]


In [None]:
# copy the masks
# For every region and hemisphere, copy '<side>mask_cropped.nii.gz.minf' from
# the UkBioBank40 dataset into the matching folder under `savedir`.
# NOTE(review): only the '.nii.gz.minf' file is copied, not a '.nii.gz'
# volume -- confirm this is intended.
mask_source = '/neurospin/dico/data/deep_folding/current/datasets/UkBioBank40'
failed_copies = []
for region in tqdm(regions):
    for side in ['L', 'R']:
        mask_name = f'{side}mask_cropped.nii.gz.minf'
        src = f'{mask_source}/crops/2mm/{region}/mask/{mask_name}'
        dst = f'{savedir}/crops/2mm/{region}/mask/{mask_name}'
        # shutil.copy replaces a shell `cp` string run through os.system:
        # region names are never interpreted by a shell, and a failed copy is
        # recorded instead of being dropped with the ignored return code.
        try:
            shutil.copy(src, dst)
        except OSError as err:
            failed_copies.append((src, err))

# report every copy that did not succeed (nothing is printed when all went well)
for src, err in failed_copies:
    print(f'Could not copy {src}: {err}')

100%|██████████| 31/31 [00:03<00:00, 10.09it/s]


In [None]:
# TODO: symlinks of the Rcrops / Lcrops (not implemented in this cell)

Format the splits: write the subject lists of each cross-validation fold and of the test sets to CSV files.

In [25]:
# Split table of the autism aggregate, read into a DataFrame.
autism_splits_csv = '/neurospin/dico/data/deep_folding/current/datasets/aggregate_autism/asd_age_sex_diagnosis_site_stratified_10-fold.csv'
splits = pd.read_csv(autism_splits_csv)

In [None]:
## create the train / val
# One CSV per fold k (0..9): the participant_id of every row of `splits`
# whose 'fold-<k>' column equals 'validation', one id per line, without
# header or index.
# presumably the remaining subjects of the fold are the training ones --
# verify against the code that reads these files
os.makedirs(f'{savedir}/splits', exist_ok=True)  # to_csv does not create missing folders
for k in range(10):
    # single .loc call (row mask, column) instead of chained indexing
    subs = splits.loc[splits[f'fold-{k}'] == 'validation', 'participant_id']
    subs.to_csv(f'{savedir}/splits/train_val_split_{k}.csv', index=False, header=False)

In [None]:
# test inter
# participant_id of every row whose 'set' column equals 'internal_test',
# written one id per line, without header or index.
os.makedirs(f'{savedir}/splits', exist_ok=True)  # to_csv does not create missing folders
subs = splits.loc[splits['set'] == 'internal_test', 'participant_id']
subs.to_csv(f'{savedir}/splits/internal_test.csv', index=False, header=False)

In [44]:
# test_exter
# participant_id of every row whose 'set' column equals 'external_test',
# written one id per line, without header or index.
os.makedirs(f'{savedir}/splits', exist_ok=True)  # to_csv does not create missing folders
subs = splits.loc[splits['set'] == 'external_test', 'participant_id']
subs.to_csv(f'{savedir}/splits/external_test.csv', index=False, header=False)

# Same procedure, generalised to N datasets (bipolar / schizophrenia aggregates)

In [18]:
## bipolar
# NOTE: the schizophrenia cell below assigns the same three variables
# (datasets, savedir, splits_dir); whichever of the two cells ran last wins.
datasets = [
    '/neurospin/dico/data/deep_folding/current/datasets/' + name
    for name in ['cnp', 'bsnip1', 'biobd', 'candi', 'ACCpatterns']
]
# output folder of the aggregate, and the CSV holding its split table
savedir = '/neurospin/dico/data/deep_folding/current/datasets/aggregate_bipolar/'
splits_dir = '/neurospin/dico/data/deep_folding/current/datasets/aggregate_bipolar/bd_age_sex_diagnosis_site_stratified_10-fold.csv'

In [19]:
## schizophrenia
# NOTE: the bipolar cell above assigns the same three variables
# (datasets, savedir, splits_dir); whichever of the two cells ran last wins.
datasets = [
    '/neurospin/dico/data/deep_folding/current/datasets/' + name
    for name in ['cnp', 'bsnip1', 'candi', 'schizconnect-vip-prague']
]
# output folder of the aggregate, and the CSV holding its split table
savedir = '/neurospin/dico/data/deep_folding/current/datasets/aggregate_schizophrenia/'
splits_dir = '/neurospin/dico/data/deep_folding/current/datasets/aggregate_schizophrenia/scz_age_sex_diagnosis_site_stratified_10-fold.csv'

In [20]:
# display the dataset folders currently held in `datasets`
datasets

['/neurospin/dico/data/deep_folding/current/datasets/cnp',
 '/neurospin/dico/data/deep_folding/current/datasets/bsnip1',
 '/neurospin/dico/data/deep_folding/current/datasets/candi',
 '/neurospin/dico/data/deep_folding/current/datasets/schizconnect-vip-prague']

In [24]:
# List the region folders of every dataset (one list per dataset, same order
# as `datasets`) and report, for each, how many it has and how many it shares
# with the first dataset.
regions = [os.listdir(f'{dataset}/crops/2mm/') for dataset in datasets]
reference = set(regions[0])
for dataset, found in zip(datasets, regions):
    print(f'N regions in {dataset} : {len(found)}')
    print(f'Intersection with {datasets[0]} : {len(reference.intersection(found))}')

N regions in /neurospin/dico/data/deep_folding/current/datasets/cnp : 30
Intersection with /neurospin/dico/data/deep_folding/current/datasets/cnp : 30
N regions in /neurospin/dico/data/deep_folding/current/datasets/bsnip1 : 30
Intersection with /neurospin/dico/data/deep_folding/current/datasets/cnp : 30
N regions in /neurospin/dico/data/deep_folding/current/datasets/candi : 30
Intersection with /neurospin/dico/data/deep_folding/current/datasets/cnp : 30
N regions in /neurospin/dico/data/deep_folding/current/datasets/schizconnect-vip-prague : 30
Intersection with /neurospin/dico/data/deep_folding/current/datasets/cnp : 30


In [7]:
# Keep the region list of the first dataset, since all databases have the
# same regions.
# Guarded so that re-running this cell is harmless: once `regions` is already
# a flat list of names, `regions[0]` would be a single name (a string) and
# later loops would iterate over its characters.
if regions and not isinstance(regions[0], str):
    regions = regions[0]

In [37]:
# Restrict the processing to three regions; this replaces whatever `regions`
# held before this cell.
regions = [
    'S.T.s.',
    'F.Coll.-S.Rh.',
    'S.T.i.-S.T.s.-S.T.pol.',
]

In [38]:
# For every region and each hemisphere ('L' / 'R'):
#   1. read the subject list and the skeleton array of every dataset,
#   2. symlink each subject's cropped skeleton (and its '.minf' file) from the
#      dataset into the aggregate '<side>crops' folder,
#   3. stack subject lists and skeleton arrays (in `datasets` order) and write
#      them under `savedir`.
for region in tqdm(regions):
    for side in ['L', 'R']:
        subs_list = []
        skels_list = []

        # creating '<side>crops' also creates its parent 'mask' folder, which
        # receives the concatenated csv / npy files below
        out_mask_dir = f'{savedir}/crops/2mm/{region}/mask'
        out_crops_dir = f'{out_mask_dir}/{side}crops'
        os.makedirs(out_crops_dir, exist_ok=True)

        for dataset in datasets:
            # load the subject list and skeletons of this dataset
            in_mask_dir = f'{dataset}/crops/2mm/{region}/mask'
            subs = pd.read_csv(f'{in_mask_dir}/{side}skeleton_subject.csv')
            skels = np.load(f'{in_mask_dir}/{side}skeleton.npy')
            subs_list.append(subs)
            skels_list.append(skels)

            # create symlink of Lcrops / Rcrops for each sub
            for sub in subs.Subject:
                for ext in ('.nii.gz', '.nii.gz.minf'):
                    crop_name = f'{sub}_cropped_skeleton{ext}'
                    link = f'{out_crops_dir}/{crop_name}'
                    # Each link is checked on its own, with lexists rather
                    # than isfile: isfile is False for a dangling symlink
                    # (os.symlink would then raise FileExistsError), and a
                    # run interrupted between the two links can be resumed.
                    if not os.path.lexists(link):
                        os.symlink(f'{in_mask_dir}/{side}crops/{crop_name}', link)

        # concatenate all data
        subs = pd.concat(subs_list, axis=0)
        skels = np.concatenate(skels_list, axis=0)

        # save data (the output folder was created at the top of the loop)
        subs.to_csv(f'{out_mask_dir}/{side}skeleton_subject.csv', index=False)
        np.save(f'{out_mask_dir}/{side}skeleton.npy', skels)

  0%|          | 0/3 [00:00<?, ?it/s]


FileNotFoundError: [Errno 2] No such file or directory: '/neurospin/dico/data/deep_folding/current/datasets/cnp/crops/2mm/S.T.s/mask/Lskeleton_subject.csv'

In [30]:
# copy the masks
# For every region and hemisphere, copy '<side>mask_cropped.nii.gz.minf' from
# the UkBioBank40 dataset into the matching folder under `savedir`.
# NOTE(review): only the '.nii.gz.minf' file is copied, not a '.nii.gz'
# volume -- confirm this is intended.
mask_source = '/neurospin/dico/data/deep_folding/current/datasets/UkBioBank40'
failed_copies = []
for region in tqdm(regions):
    for side in ['L', 'R']:
        mask_name = f'{side}mask_cropped.nii.gz.minf'
        src = f'{mask_source}/crops/2mm/{region}/mask/{mask_name}'
        dst = f'{savedir}/crops/2mm/{region}/mask/{mask_name}'
        # shutil.copy replaces a shell `cp` string run through os.system:
        # region names are never interpreted by a shell, and a failed copy is
        # recorded instead of being dropped with the ignored return code.
        try:
            shutil.copy(src, dst)
        except OSError as err:
            failed_copies.append((src, err))

# report every copy that did not succeed (nothing is printed when all went well)
for src, err in failed_copies:
    print(f'Could not copy {src}: {err}')

100%|██████████| 30/30 [00:04<00:00,  6.41it/s]


In [26]:
# read the split table (the CSV file whose path is held in `splits_dir`) into a DataFrame
splits = pd.read_csv(splits_dir)

In [27]:
## create the train / val
# One CSV per fold k (0..9): the participant_id of every row of `splits`
# whose 'fold-<k>' column equals 'validation', one id per line, without
# header or index. The number of ids is printed for each fold.
# presumably the remaining subjects of the fold are the training ones --
# verify against the code that reads these files
os.makedirs(f'{savedir}/splits', exist_ok=True)  # to_csv does not create missing folders
for k in range(10):
    # single .loc call (row mask, column) instead of chained indexing
    subs = splits.loc[splits[f'fold-{k}'] == 'validation', 'participant_id']
    print(len(subs))
    subs.to_csv(f'{savedir}/splits/train_val_split_{k}.csv', index=False, header=False)

104
105
104
105
105
104
104
105
104
104


In [28]:
# test inter
# participant_id of every row whose 'set' column equals 'internal_test',
# written one id per line, without header or index; the count is printed.
os.makedirs(f'{savedir}/splits', exist_ok=True)  # to_csv does not create missing folders
subs = splits.loc[splits['set'] == 'internal_test', 'participant_id']
print(len(subs))
subs.to_csv(f'{savedir}/splits/internal_test.csv', index=False, header=False)

118


In [29]:
# test_exter
# participant_id of every row whose 'set' column equals 'external_test',
# written one id per line, without header or index; the count is printed.
os.makedirs(f'{savedir}/splits', exist_ok=True)  # to_csv does not create missing folders
subs = splits.loc[splits['set'] == 'external_test', 'participant_id']
print(len(subs))
subs.to_csv(f'{savedir}/splits/external_test.csv', index=False, header=False)

130
