In [1]:
import os
import shutil
import pandas as pd
import nilearn
import nilearn.datasets
import numpy as np

In [2]:
### TO UPDATE: adapt the three paths below to your environment.
# Directory handed to nilearn as data_dir; the parcellated images (.npz files)
# are read from a subfolder of it.
data_dir = '/bigdisk2/nilearn_data/'
# Folder containing the split files (train.csv, val.csv, test.csv).
split_path = '../../dataset/split'
# Output folder for the formatted dataset.
save_path = './IBC'
# exist_ok=True keeps the cell re-runnable and avoids the check-then-create
# race of a separate os.path.exists() test.
os.makedirs(save_path, exist_ok=True)

In [None]:
# Fetch the IBC dataset (NeuroVault collection 6618), using data_dir as the
# nilearn data directory.
data = nilearn.datasets.fetch_neurovault_ids(
    data_dir=data_dir,
    collection_ids=[6618],
)
# Folder, inside data_dir, in which the files of that collection are looked up.
data_path = os.path.join(data_dir, 'neurovault/collection_6618/')

In [4]:
def create_an_empty_folder(save_path, folder_name):
    """
    Create ``{save_path}/{folder_name}`` as an empty folder.

    A folder already present at that location is deleted with all of its
    content first. Missing parent folders (including ``save_path`` itself)
    are created.

    Parameters:
        save_path -- parent folder.
        folder_name -- name of the folder to (re)create inside save_path.
    """
    folder = f'{save_path}/{folder_name}'
    if os.path.exists(folder):
        shutil.rmtree(folder)
    # makedirs rather than mkdir so that a missing save_path is not an error.
    os.makedirs(folder)

def classes_per_split(split, split_path):
    """
    Return the distinct values of the 'Label' column of a split file.

    Parameters:
        split -- name of the split; the file read is {split_path}/{split}.csv.
        split_path -- folder containing the split csv files.
    """
    split_table = load_csv_to_pd(f'{split_path}/{split}.csv')
    return split_table.Label.unique()

def load_csv_to_pd(filename):
    """Read the csv file ``filename`` and return it as a pandas DataFrame."""
    return pd.read_csv(filename)

def make_dir(save_path, folder_name, subfolders_names):
    """
    Create one subfolder per name inside ``{save_path}/{folder_name}``.

    Each subfolder is created with os.mkdir, so the parent folder must
    already exist and none of the subfolders may exist yet.

    Parameters:
        save_path -- root folder.
        folder_name -- folder, inside save_path, receiving the subfolders.
        subfolders_names -- iterable of subfolder names to create.
    """
    parent = f'{save_path}/{folder_name}'
    for subfolder in subfolders_names:
        os.mkdir(f'{parent}/{subfolder}')

def download_images(split_path, data_path, save_path, data):
    """
    Put the images of IBC in the IBC folder, sorted by split and by class.

    For every entry of ``data.images_meta`` whose name does not contain the
    token 'ffx', the array stored under the key 'X' of the matching
    parcellated .npz file (looked up in ``data_path``) is saved again,
    compressed, as ``{save_path}/{split}/{label}/{image}.npz``.
    Entries whose label belongs to none of the splits are skipped.

    Parameters:
        split_path -- path towards a folder containing train.csv, val.csv and test.csv.
        data_path -- path in which the data are stored.
        save_path -- path towards the folder IBC.
        data -- nilearn dataset. Here, nilearn.datasets.fetch_neurovault_ids(collection_ids=[6618]).

    Errors raised by numpy (e.g. a missing parcellated file or a missing
    destination folder) are not caught.
    """
    # Splits are tested in this order, so a label listed in several split
    # files goes to the first match: val, then train, then test.
    classes_by_split = [
        (split, classes_per_split(split, split_path))
        for split in ('val', 'train', 'test')
    ]
    # We read all images one at a time and look at the class of each image.
    for meta in data.images_meta:
        # Only consider ap/pa images, i.e. skip names containing the token 'ffx'.
        if 'ffx' in meta['name'].split('_'):
            continue
        # Retrieve the class name and the split it belongs to, if any.
        label = meta['contrast_definition']
        split = next(
            (name for name, classes in classes_by_split if label in classes),
            None,
        )
        if split is None:
            continue
        # The parcellated image has the same base name as the original image,
        # with up to two trailing extensions (presumably '.nii.gz' -- TODO
        # confirm) replaced by '.npz'.
        image_name = os.path.basename(meta['relative_path'])
        stem = os.path.splitext(os.path.splitext(image_name)[0])[0]
        npz_name = f'{stem}.npz'
        # Close the archive as soon as the array has been read.
        with np.load(os.path.join(data_path, npz_name)) as archive:
            parcels = archive['X']
        # Save the npz file in the subfolder of its split and class.
        np.savez_compressed(f'{save_path}/{split}/{label}/{npz_name}', X=parcels)
            

In [5]:
# Start from empty train, val and test folders.
# (Here, train is the base dataset, val the validation dataset and test, the novel dataset.)
for split_name in ('train', 'val', 'test'):
    create_an_empty_folder(save_path, split_name)
# Retrieve the list of classes in each split file.
train_classes, val_classes, test_classes = [
    classes_per_split(split_name, split_path)
    for split_name in ('train', 'val', 'test')
]
# In train, val and test, create one folder per class, named after the class.
for split_name, split_classes in (
    ('train', train_classes),
    ('val', val_classes),
    ('test', test_classes),
):
    make_dir(save_path, split_name, split_classes)
# Store the images in the folder named after their class.
download_images(split_path, data_path, save_path, data)

In [6]:
# Check the number of classes (one subfolder per class) in train, val and test.
for path in (f"{save_path}/train", f"{save_path}/val", f"{save_path}/test"):
    print(len(os.listdir(path)))

65
22
19


In [7]:
# Check the number of images per split.
def images_per_split(save_path, split):
    """
    Count the entries stored in the class subfolders of a split.

    Parameters:
        save_path -- root folder of the formatted dataset.
        split -- name of the split folder inside save_path.

    Returns the total number of entries found one level below
    ``{save_path}/{split}`` (0 when the split folder is empty).
    """
    split_dir = f"{save_path}/{split}"
    return sum(
        len(os.listdir(f'{split_dir}/{class_folder}'))
        for class_folder in os.listdir(split_dir)
    )

# Print the image count of each split, in the order train, val, test.
for split_name in ('train', 'val', 'test'):
    print(images_per_split(save_path, split_name))

2573
625
650
