In [135]:
import pandas as pd
import numpy as np
import os

In [136]:
# Subject IDs listed in full_subjects.csv: the file is read without a header
# row and its single column is named 'Subject'.
hcp_full_csv = '/neurospin/dico/data/deep_folding/current/datasets/hcp/full_subjects.csv'
hcp_full = pd.read_csv(hcp_full_csv, header=None)
hcp_full.columns = ['Subject']

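Subsample the same number of subjects from UKB and from HCP, balanced by sex (the initial target was 500 UKB subjects, 250 for each sex; the cells below select 196 for each sex in each dataset, which is the number of male HCP subjects available)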

In [137]:
# Gender of every HCP participant that is also listed in hcp_full
# (merge defaults to an inner join on 'Subject').
hcp = pd.read_csv("/neurospin/dico/data/deep_folding/current/datasets/hcp/participants.csv")
hcp_gender = hcp[['Subject', 'Gender']].merge(hcp_full, on='Subject')

# One row per Family_ID taken from the restricted HCP table, then restricted to
# the subjects present in hcp_gender.
# NOTE(review): groupby(...).first() keeps the first non-null value of each
# column per family, and it runs BEFORE the merge, so a family whose first
# subject is missing from hcp_gender is dropped entirely - confirm this is intended.
hcp_family = (
    pd.read_csv("/neurospin/dico/jchavas/RESTRICTED_jchavas_1_18_2022_3_17_51.csv")
    .groupby(by='Family_ID')
    .first()
    .merge(hcp_gender, on='Subject')
    .loc[:, ['Subject', 'Gender']]
)

In [138]:
# Display the retained HCP subjects (one row per Family_ID) with their gender.
hcp_family

Unnamed: 0,Subject,Gender
0,257946,F
1,571144,M
2,213017,M
3,589567,M
4,213421,F
...,...,...
419,618952,M
420,650746,M
421,516742,M
422,114823,F


In [139]:
# Count the retained HCP subjects of each gender with boolean masks.
is_male = hcp_family['Gender'] == 'M'
is_female = hcp_family['Gender'] == 'F'
print('nb M', int(is_male.sum()))
print('nb F', int(is_female.sum()))

nb M 196
nb F 228


In [140]:
# Select the same number of subjects of each sex in both datasets.
# nb_subs is derived from the data instead of being hard-coded: it is the size
# of the smaller HCP sex group (196 with the counts printed above), so the two
# groups stay balanced even if the input tables change.
hcp_M = hcp_family.loc[hcp_family['Gender'] == 'M']
hcp_F = hcp_family.loc[hcp_family['Gender'] == 'F']
nb_subs = min(len(hcp_M), len(hcp_F))
assert nb_subs > 0, "no HCP subject available for at least one sex"


def _balanced(group, n):
    """Return `n` rows of `group`.

    The group is returned unchanged (original order) when it already has
    exactly `n` rows; otherwise `n` rows are drawn at random with a fixed
    seed (random_state=0) so that the selection is reproducible.
    """
    if len(group) == n:
        return group
    return group.sample(n, random_state=0)


hcp_subsample_M = _balanced(hcp_M, nb_subs)
hcp_subsample_F = _balanced(hcp_F, nb_subs)
# Males first, then females; only the subject IDs are kept.
hcp_subsample = pd.concat((hcp_subsample_M, hcp_subsample_F))
hcp_subsample = hcp_subsample['Subject'].tolist()

In [141]:
# Sex and age of the UKB participants; the three columns are renamed by position.
ukb_sex = pd.read_csv("/neurospin/dico/data/deep_folding/current/datasets/UkBioBank/participants_sex_age.csv")
ukb_sex.columns = ['Subject', 'Sex', 'Age']
# Subjects listed in the ORBITAL left-skeleton crop table.
ukb_subs = pd.read_csv("/neurospin/dico/data/deep_folding/current/datasets/UkBioBank/crops/2mm/ORBITAL/mask/Lskeleton_subject.csv")
# Inner join on 'Subject', then order from youngest to oldest.
ukb = ukb_subs.merge(ukb_sex, on='Subject').sort_values(by=['Age'])
# Take the nb_subs youngest subjects for each sex code (0, then 1).
# NOTE(review): which code is male/female is not visible here - confirm.
youngest_per_sex = [
    ukb.loc[ukb['Sex'] == sex_code, 'Subject'].head(nb_subs).tolist()
    for sex_code in (0, 1)
]
ukb_1, ukb_2 = youngest_per_sex
ukb_subsample = ukb_1 + ukb_2

In [142]:
# Single-column table of all selected subjects: UKB first, then HCP.
all_subjects = ukb_subsample + hcp_subsample
df_subs = pd.DataFrame({'Subject': all_subjects})

In [143]:
# Site label for each subject, in the same order as df_subs:
# 0 for the UKB subjects, 1 for the HCP subjects.
# The label counts follow the actual list lengths, so the labels stay aligned
# even if one of the datasets yields a different number of subjects.
df_label = df_subs.copy()
df_label['Site'] = [0] * len(ukb_subsample) + [1] * len(hcp_subsample)

In [144]:
# Directory of the combined UKB + HCP dataset; subjects.csv and labels.csv are written here.
save_dir='/neurospin/dico/data/deep_folding/current/datasets/site_effect_ukb_hcp/'

In [145]:
# Make sure the output directory exists so the cell works on a fresh run.
os.makedirs(save_dir, exist_ok=True)
# subjects.csv: bare list of subject IDs (no index column, no header row).
df_subs.to_csv(os.path.join(save_dir, 'subjects.csv'), index=False, header=False)
# labels.csv: Subject and Site columns with a header row, no index column.
df_label.to_csv(os.path.join(save_dir, 'labels.csv'), index=False)

In [147]:
## create symlinks
def link_subject_files(subjects, source_dir, target_dir, prefix):
    """Symlink '<prefix>_<subject>.nii.gz' and its '.minf' for every subject.

    Parameters
    ----------
    subjects : iterable
        Subject identifiers inserted in the file names.
    source_dir : str
        Directory the links point into.
    target_dir : str
        Directory in which the links are created (created if missing).
    prefix : str
        File-name prefix placed before '_<subject>.nii.gz'.

    The function can be re-run safely: a link that already points to the
    expected source is left alone, and a link pointing elsewhere is replaced.
    An existing regular file is never overwritten (os.symlink then raises
    FileExistsError). Sources are not checked for existence, so a missing
    source file yields a dangling link.
    """
    os.makedirs(target_dir, exist_ok=True)
    for subject in subjects:
        filename = f'{prefix}_{subject}.nii.gz'
        for name in (filename, filename + '.minf'):
            source = os.path.join(source_dir, name)
            link = os.path.join(target_dir, name)
            if os.path.islink(link):
                if os.readlink(link) == source:
                    continue  # already linked correctly
                os.remove(link)  # stale link: recreate it below
            os.symlink(source, link)


sides = ['L', 'R']
resolutions=['raw', '2mm']
# File-name stems for each resolution, in the order (skeleton, foldlabel).
resampled_args = {'raw': ['skeleton_generated', 'foldlabel_generated'],
                 '2mm': ['resampled_skeleton', 'resampled_foldlabel']} ### CHANGE ORDER : RAW 2MM INSTEAD OF SKELETON FOLDLABEL
datasets_root = '/neurospin/dico/data/deep_folding/current/datasets'
for side in sides:
    for resolution in resolutions:
        datatypes = [f'skeletons/{resolution}', f'foldlabels/{resolution}']
        for datatype, dataname in zip(datatypes, resampled_args[resolution]):
            target = f'{datasets_root}/site_effect_ukb_hcp/{datatype}/{side}'
            # Same layout for both source datasets: <dataset>/<datatype>/<side>.
            for dataset, subjects in (('UkBioBank', ukb_subsample), ('hcp', hcp_subsample)):
                source_dir = f'{datasets_root}/{dataset}/{datatype}/{side}'
                link_subject_files(subjects, source_dir, target, f'{side}{dataname}')

In [98]:
# same with transforms
def link_transforms(subjects, source_dir, target_dir, side):
    """Symlink '<side>transform_to_ICBM2009c_<subject>.trm' and its '.minf'.

    Parameters
    ----------
    subjects : iterable
        Subject identifiers inserted in the file names.
    source_dir : str
        Directory the links point into.
    target_dir : str
        Directory in which the links are created (created if missing).
    side : str
        Side prefix of the file names (e.g. 'R').

    The function can be re-run safely: a link that already points to the
    expected source is left alone, and a link pointing elsewhere (for
    instance to another dataset's directory) is replaced. An existing
    regular file is never overwritten (os.symlink then raises FileExistsError).
    """
    os.makedirs(target_dir, exist_ok=True)
    for subject in subjects:
        filename = f'{side}transform_to_ICBM2009c_{subject}.trm'
        for name in (filename, filename + '.minf'):
            source = os.path.join(source_dir, name)
            link = os.path.join(target_dir, name)
            if os.path.islink(link):
                if os.readlink(link) == source:
                    continue  # already linked correctly
                os.remove(link)  # stale or wrong link: recreate it below
            os.symlink(source, link)


# NOTE(review): only the side set here is processed; unlike the skeleton and
# foldlabel links, the other side has to be handled by re-running with side = 'L'.
side = 'R'
target = f'/neurospin/dico/data/deep_folding/current/datasets/site_effect_ukb_hcp/transforms/{side}'
ukb_skels_dir = f'/neurospin/dico/data/deep_folding/current/datasets/UkBioBank/transforms/{side}'
hcp_skels_dir = f'/neurospin/dico/data/deep_folding/current/datasets/hcp/transforms/{side}'
# Each dataset is linked from its own transforms directory.
link_transforms(ukb_subsample, ukb_skels_dir, target, side)
link_transforms(hcp_subsample, hcp_skels_dir, target, side)