In [1]:
import numpy as np
import sparse
import os
import pandas as pd
from tqdm import tqdm
import yaml
import copy

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
# Hemisphere prefixes to convert; each prefix is prepended to the input and output file names.
sides = ['L']
# The bare string literal below is a no-op: it only keeps the complete list of
# sulcus regions at hand so that it can be copied into `sulcus_list` when needed.
"""
sulcus_list = ['F.Coll.-S.Rh.', 'S.F.median-S.F.pol.tr.-S.F.sup.', 'S.F.inf.-BROCA-S.Pe.C.inf.', \
               'S.Po.C.', 'fronto-parietal_medial_face.', 'F.I.P.', 'S.T.s.-S.GSM.', 'CINGULATE.', \
               'F.C.L.p.-S.GSM.', 'S.C.-S.Po.C.', 'S.F.inter.-S.F.sup.', 'F.C.M.post.-S.p.C.', \
               'S.s.P.-S.Pa.int.', 'S.Or.-S.Olf.', 'F.P.O.-S.Cu.-Sc.Cal.', 'S.F.marginal-S.F.inf.ant.', \
               'S.F.int.-F.C.M.ant.', 'S.T.i.-S.T.s.-S.T.pol.', 'S.F.int.-S.R.', 'Lobule_parietal_sup.', \
               'S.T.i.-S.O.T.lat.', 'S.Pe.C.', 'S.T.s.br.', 'Sc.Cal.-S.Li.', 'S.T.s.', 'F.C.L.p.-subsc.-F.C.L.a.-INSULA.', \
               'S.C.-sylv.', 'S.C.-S.Pe.C.', 'OCCIPITAL', 'S.Or.']
"""
# Regions actually processed by the conversion loop below.
sulcus_list = ['S.Or.']

BEWARE: we assume here that the skeletons, foldlabels, distbottoms and extremities are stored in the same subject order in their respective numpy arrays.

In [10]:
# Suffix appended to the names of the input arrays (`{side}skeleton{rotated}.npy`, ...);
# it is also used as a sub-folder of the output root defined below.
rotated = '_rotated'
#rotated = ''
# NOTE: with rotated = '' the path below contains an empty component ('crops//2mm/').
root_save_dir = f'/volatile/jl277509/data/UkBioBank/crops/{rotated}/2mm/'

In [11]:
# Convert the dense crops of every (sulcus, side) pair into per-subject sparse files.
#
# For each subject, the coordinates of the non-zero skeleton voxels are saved once
# (`coords/`), and the non-zero values of each modality are saved in their own folder
# (`skeleton/`, `foldlabel/`, `distbottom/`, `extremities/`). Only the skeleton
# coordinates are stored, so every modality is masked by the skeleton beforehand.
for sulcus in tqdm(sulcus_list):
    for side in sides:

        data_dir = f'/neurospin/dico/data/deep_folding/current/datasets/UkBioBank/crops/2mm/{sulcus}/mask'
        subjects = pd.read_csv(os.path.join(data_dir, f'{side}skeleton_subject.csv'))
        subjects = subjects['Subject'].tolist()
        skels = np.load(os.path.join(data_dir, f'{side}skeleton{rotated}.npy'))

        # The k-th subject of the csv is matched with the k-th volume of every array:
        # a length mismatch would silently mislabel or drop subjects.
        if len(subjects) != len(skels):
            raise ValueError(
                f'{sulcus} ({side}): {len(subjects)} subjects listed in the csv '
                f'but {len(skels)} skeleton volumes in the array')

        # Voxels outside the skeleton, computed once and reused for every modality.
        background = skels == 0

        # need to make sure that skels, foldlabels, and distbottoms have the same coordinates
        foldlabels = np.load(os.path.join(data_dir, f'{side}label{rotated}.npy'))
        foldlabels[background] = 0  # TODO: should be assert instead ?

        distbottoms = np.load(os.path.join(data_dir, f'{side}distbottom{rotated}.npy'))
        # Zeros are re-encoded as -1 so that they are kept by the sparse conversion,
        # then everything outside the skeleton is cleared.
        distbottoms[distbottoms == 0] = -1
        distbottoms[background] = 0

        extremities = np.load(os.path.join(data_dir, f'{side}extremities{rotated}.npy'))
        extremities[background] = 0
        # NOTE(review): this sets the non-zero extremities to -1 and leaves the zero ones
        # at 0, so the saved extremities values may not cover every skeleton coordinate
        # (unlike distbottoms, where zeros are re-encoded as -1) -- confirm this is intended.
        extremities[np.logical_and(skels != 0, extremities)] = -1

        save_dir = f'{root_save_dir}/{sulcus}/mask/{side}skeleton_sparse'
        # (output sub-folder, dense array) pairs; the order defines the saving order.
        modalities = (('skeleton', skels),
                      ('foldlabel', foldlabels),
                      ('distbottom', distbottoms),
                      ('extremities', extremities))
        for subdir in ('coords',) + tuple(name for name, _ in modalities):
            os.makedirs(os.path.join(save_dir, subdir), exist_ok=True)

        for k, subject in enumerate(subjects):
            for name, volumes in modalities:
                # Drop the trailing channel axis before the sparse conversion.
                s = sparse.COO.from_numpy(volumes[k, :, :, :, 0])
                if name == 'skeleton':
                    # Coordinates are saved only once, from the skeleton.
                    np.save(os.path.join(save_dir, f'coords/{side}{subject}_coords.npy'), s.coords)
                np.save(os.path.join(save_dir, f'{name}/{side}{subject}_{name}_values.npy'), s.data)

100%|██████████| 1/1 [00:25<00:00, 25.51s/it]


In [6]:
# Consistency check (DONE): compare the dense arrays with the volumes rebuilt
# from the sparse files, here for one side and one sulcus.
# Note that this cell overrides `side`, `sulcus`, `root_save_dir`, `data_dir`
# and `subjects` defined by the cells above.
side, sulcus = 'R', 'CINGULATE.'
root_save_dir = '/volatile/jl277509/data/UkBioBank/crops/2mm/'
data_dir = f'/neurospin/dico/data/deep_folding/current/datasets/UkBioBank/crops/2mm/{sulcus}/mask'
subjects = pd.read_csv(os.path.join(data_dir, f'{side}skeleton_subject.csv'))

# Folder holding the per-subject sparse files of the selected side/sulcus.
sparse_root = f'{root_save_dir}{sulcus}/mask/{side}skeleton_sparse'
coords_dir = f'{sparse_root}/coords'
distbottom_dir = f'{sparse_root}/distbottom'
foldlabel_dir = f'{sparse_root}/foldlabel'

In [5]:
def convert_sparse_to_numpy(data, coords, input_size, dtype):
    """
    Rebuild a dense numpy array from sparse coordinates and values.

    Parameters
    ----------
    data : values associated with the coordinates.
    coords : coordinates of the values, as accepted by `sparse.COO`.
    input_size : shape of the dense volume to rebuild.
    dtype : dtype of the returned array.

    Returns
    -------
    Dense array of shape `input_size` with one extra trailing axis of
    length 1, cast to `dtype`.
    """
    dense = sparse.COO(coords, data, shape=input_size).todense()
    # Append a trailing axis of length 1, then cast.
    return dense[..., np.newaxis].astype(dtype)

Check that the sparse distbottoms can be properly reconstructed.

In [5]:
# Dense reference distbottoms of the selected side/sulcus (file without the rotation suffix).
distbottoms = np.load(os.path.join(data_dir, f'{side}distbottom.npy'))

In [10]:
# Rebuild the dense distbottom volume of every subject from its sparse files.
# The file prefix follows `side`, like the directories the files are read from
# (it used to be hard-coded to 'R').
l = []
for sub in tqdm(subjects.Subject):
    distbottom_arr = np.load(os.path.join(distbottom_dir, f'{side}{sub}_distbottom_values.npy'))
    coords_arr = np.load(os.path.join(coords_dir, f'{side}{sub}_coords.npy'))
    # [16,37,37] is the hard-coded shape of the crop being checked.
    sample_distbottoms = convert_sparse_to_numpy(distbottom_arr, coords_arr,
                                                 [16,37,37], 'int32')
    # Undo the encoding applied before the sparse conversion: voxels absent from
    # the sparse files become 32500, and the -1 placeholder goes back to 0.
    sample_distbottoms[sample_distbottoms==0]=32500
    sample_distbottoms[sample_distbottoms==-1]=0
    l.append(sample_distbottoms)

In [16]:
# Stack the per-subject volumes along a new leading axis.
distbottoms_reconstructed = np.stack(l, axis=0)

In [25]:
# 32500 and 32501 are treated as the same value: map both to 32501 (in place)
# in the reference and in the reconstructed arrays before comparing them.
for arr in (distbottoms, distbottoms_reconstructed):
    arr[arr == 32500] = 32501

In [26]:
# Number of voxels that differ between the reference and the reconstruction.
# Counting the non-zero differences (instead of summing signed differences)
# prevents positive and negative mismatches from cancelling out to 0.
diff = distbottoms-distbottoms_reconstructed
np.count_nonzero(diff)

0

Same check for the foldlabels.

In [8]:
# Dense reference foldlabels of the selected side/sulcus (file without the rotation suffix).
foldlabels = np.load(os.path.join(data_dir, f'{side}label.npy'))

In [9]:
# Rebuild the dense foldlabel volume of every subject from its sparse files.
# The file prefix follows `side`, like the directories the files are read from
# (it used to be hard-coded to 'R').
l = []
for sub in tqdm(subjects.Subject):
    foldlabel_arr = np.load(os.path.join(foldlabel_dir, f'{side}{sub}_foldlabel_values.npy'))
    coords_arr = np.load(os.path.join(coords_dir, f'{side}{sub}_coords.npy'))
    # [16,37,37] is the hard-coded shape of the crop being checked.
    sample_foldlabels = convert_sparse_to_numpy(foldlabel_arr, coords_arr,
                                                [16,37,37], 'int32')
    l.append(sample_foldlabels)

100%|██████████| 21045/21045 [00:11<00:00, 1812.03it/s]


In [10]:
# Stack the per-subject volumes along a new leading axis.
foldlabels_reconstructed = np.stack(l, axis=0)

In [11]:
# Number of voxels that differ between the reference and the reconstruction.
# Counting the non-zero differences (instead of summing signed differences)
# prevents positive and negative mismatches from cancelling out to 0.
diff = foldlabels-foldlabels_reconstructed
np.count_nonzero(diff)

0

Same check for the skeletons?

In [2]:
# Generate one dataset yaml per (sulcus, side) from a hand-written template.
#
# The template is the config of `starting_sulcus`: it must already exist for both
# sides (NEED TO WRITE THE SOr RIGHT AND LEFT CONFIG MANUALLY FIRST).
starting_sulcus = 'S.Or.'
# Switch to '_rotated' to read the `*_rotated.npy` arrays and to use the
# `_rotated` sub-folder of the config directory.
rotated = ''
no_dot_starting_sulcus = starting_sulcus.replace('.', '')
config_path = f'/volatile/jl277509/Runs/02_STS_babies/Program/2023_jlaval_STSbabies/contrastive/configs/dataset/julien/sparse_load/{rotated}'

# Regions for which a config is generated.
sulcus_list = [
    'F.Coll.-S.Rh.', 'S.F.inf.-BROCA-S.Pe.C.inf.',
    'S.Po.C.', 'fronto-parietal_medial_face.', 'S.T.s.-S.GSM.',
    'F.C.L.p.-S.GSM.', 'S.C.-S.Po.C.', 'S.F.inter.-S.F.sup.', 'F.C.M.post.-S.p.C.',
    'S.s.P.-S.Pa.int.', 'S.Or.-S.Olf.', 'F.P.O.-S.Cu.-Sc.Cal.', 'S.F.marginal-S.F.inf.ant.',
    'S.F.int.-F.C.M.ant.', 'S.T.i.-S.T.s.-S.T.pol.', 'S.F.int.-S.R.', 'Lobule_parietal_sup.',
    'S.T.i.-S.O.T.lat.', 'S.Pe.C.', 'S.T.s.br.', 'Sc.Cal.-S.Li.', 'S.T.s.',
    'F.C.L.p.-subsc.-F.C.L.a.-INSULA.',
    'S.C.-S.Pe.C.', 'OCCIPITAL',
]

sides = {'right': 'R', 'left': 'L'}
for side, side_abrev in sides.items():
    template_name = f'{no_dot_starting_sulcus}_{side}_UKB_sparse_load.yaml'
    with open(os.path.join(config_path, template_name), 'r') as file:
        starting_config = yaml.safe_load(file)

    for target_sulcus in sulcus_list:
        no_dot_target_sulcus = target_sulcus.replace('.', '')
        dataset_name = f'{no_dot_target_sulcus}_{side}_UKB_sparse_load'

        target_config = copy.deepcopy(starting_config)
        target_config['dataset_name'] = dataset_name

        # The crop shape is read from the array header only (memory-mapped load);
        # it is stored as a string, with a leading 1.
        filename = f'/neurospin/dico/data/deep_folding/current/datasets/UkBioBank/crops/2mm/{target_sulcus}/mask/{side_abrev}skeleton{rotated}.npy'
        mmapped_array = np.load(filename, mmap_mode='r')
        shape = (1, *mmapped_array.shape[1:4])
        print(f'{target_sulcus} size: {np.prod(shape)} vx')
        target_config['input_size'] = str(shape)

        # Point every top-level string entry that mentions the template sulcus
        # to the target sulcus instead.
        for key, value in starting_config.items():
            if isinstance(value, str) and starting_sulcus in value:
                target_config[key] = value.replace(starting_sulcus, target_sulcus)

        save_path = os.path.join(config_path, f'{dataset_name}.yaml')
        with open(save_path, 'w') as outfile:
            # Header line written before the yaml content.
            outfile.write(f'# @package dataset.{dataset_name}\n')
            yaml.dump(target_config, outfile, sort_keys=False)

F.Coll.-S.Rh. size: 86428 vx
S.F.inf.-BROCA-S.Pe.C.inf. size: 44720 vx
S.Po.C. size: 57498 vx
fronto-parietal_medial_face. size: 114144 vx
S.T.s.-S.GSM. size: 63648 vx
F.C.L.p.-S.GSM. size: 52245 vx
S.C.-S.Po.C. size: 88494 vx
S.F.inter.-S.F.sup. size: 82150 vx
F.C.M.post.-S.p.C. size: 24769 vx
S.s.P.-S.Pa.int. size: 33626 vx
S.Or.-S.Olf. size: 28272 vx
F.P.O.-S.Cu.-Sc.Cal. size: 60060 vx
S.F.marginal-S.F.inf.ant. size: 44928 vx
S.F.int.-F.C.M.ant. size: 52326 vx
S.T.i.-S.T.s.-S.T.pol. size: 116424 vx
S.F.int.-S.R. size: 57171 vx
Lobule_parietal_sup. size: 88560 vx
S.T.i.-S.O.T.lat. size: 110448 vx
S.Pe.C. size: 76000 vx
S.T.s.br. size: 44688 vx
Sc.Cal.-S.Li. size: 48048 vx
S.T.s. size: 63648 vx
F.C.L.p.-subsc.-F.C.L.a.-INSULA. size: 83028 vx
S.C.-S.Pe.C. size: 100674 vx
OCCIPITAL size: 48300 vx


In [71]:
# Same yaml generation, for the 5000-subject configs (right side only).
#
# The template is the existing right-side config of `starting_sulcus`.
# NOTE(review): unlike the cell generating the `*_UKB_sparse_load.yaml` configs,
# no `# @package ...` header line is written here -- confirm this is intended.
starting_sulcus = 'S.F.median-S.F.pol.tr.-S.F.sup.'
no_dot_starting_sulcus = starting_sulcus.replace('.', '')
config_path = '/volatile/jl277509/Runs/02_STS_babies/Program/2023_jlaval_STSbabies/contrastive/configs/dataset/julien/sparse_load/5000_subjects'
template_name = f'{no_dot_starting_sulcus}_right_UKB_sparse_load_5000.yaml'
with open(os.path.join(config_path, template_name), 'r') as file:
    starting_config = yaml.safe_load(file)

# Regions for which a config is generated.
sulcus_list = [
    'F.Coll.-S.Rh.', 'S.F.inf.-BROCA-S.Pe.C.inf.',
    'S.Po.C.', 'fronto-parietal_medial_face.', 'F.I.P.', 'S.T.s.-S.GSM.', 'CINGULATE.',
    'F.C.L.p.-S.GSM.', 'S.C.-S.Po.C.', 'S.F.inter.-S.F.sup.', 'F.C.M.post.-S.p.C.',
    'S.s.P.-S.Pa.int.', 'S.Or.-S.Olf.', 'F.P.O.-S.Cu.-Sc.Cal.', 'S.F.marginal-S.F.inf.ant.',
    'S.F.int.-F.C.M.ant.', 'S.T.i.-S.T.s.-S.T.pol.', 'S.F.int.-S.R.', 'Lobule_parietal_sup.',
    'S.T.i.-S.O.T.lat.', 'S.Pe.C.', 'S.T.s.br.', 'Sc.Cal.-S.Li.', 'S.T.s.',
    'F.C.L.p.-subsc.-F.C.L.a.-INSULA.',
    'S.C.-sylv.', 'S.C.-S.Pe.C.', 'OCCIPITAL', 'S.Or.',
]

for target_sulcus in sulcus_list:
    no_dot_target_sulcus = target_sulcus.replace('.', '')
    dataset_name = f'{no_dot_target_sulcus}_right_UKB_sparse_load_5000'

    target_config = copy.deepcopy(starting_config)
    target_config['dataset_name'] = dataset_name

    # The crop shape is read from the array header only (memory-mapped load);
    # it is stored as a string, with a leading 1.
    filename = f'/neurospin/dico/data/deep_folding/current/datasets/UkBioBank/crops/2mm/{target_sulcus}/mask/Rskeleton.npy'
    mmapped_array = np.load(filename, mmap_mode='r')
    shape = (1, *mmapped_array.shape[1:4])
    print(f'{target_sulcus} size: {np.prod(shape)} vx')
    target_config['input_size'] = str(shape)

    # Point every top-level string entry that mentions the template sulcus
    # to the target sulcus instead.
    for key, value in starting_config.items():
        if isinstance(value, str) and starting_sulcus in value:
            target_config[key] = value.replace(starting_sulcus, target_sulcus)

    save_path = os.path.join(config_path, f'{dataset_name}.yaml')
    with open(save_path, 'w') as outfile:
        yaml.dump(target_config, outfile, sort_keys=False)

F.Coll.-S.Rh. size: 86428 vx
S.F.inf.-BROCA-S.Pe.C.inf. size: 44720 vx
S.Po.C. size: 57498 vx
fronto-parietal_medial_face. size: 114144 vx
F.I.P. size: 77220 vx
S.T.s.-S.GSM. size: 63648 vx
CINGULATE. size: 21904 vx
F.C.L.p.-S.GSM. size: 52245 vx
S.C.-S.Po.C. size: 88494 vx
S.F.inter.-S.F.sup. size: 82150 vx
F.C.M.post.-S.p.C. size: 24769 vx
S.s.P.-S.Pa.int. size: 33626 vx
S.Or.-S.Olf. size: 28272 vx
F.P.O.-S.Cu.-Sc.Cal. size: 60060 vx
S.F.marginal-S.F.inf.ant. size: 44928 vx
S.F.int.-F.C.M.ant. size: 52326 vx
S.T.i.-S.T.s.-S.T.pol. size: 116424 vx
S.F.int.-S.R. size: 57171 vx
Lobule_parietal_sup. size: 88560 vx
S.T.i.-S.O.T.lat. size: 110448 vx
S.Pe.C. size: 76000 vx
S.T.s.br. size: 44688 vx
Sc.Cal.-S.Li. size: 48048 vx
S.T.s. size: 63648 vx
F.C.L.p.-subsc.-F.C.L.a.-INSULA. size: 83028 vx
S.C.-sylv. size: 69972 vx
S.C.-S.Pe.C. size: 100674 vx
OCCIPITAL size: 48300 vx
S.Or. size: 20332 vx
