# README

This notebook prepares a dataset of slices.

The input data are produced beforehand from the nii.gz sequences by `0 - Prepare Size and Slices.ipynb`.

The dataset created by this notebook is a pair of dictionaries:
* the first dictionary maps each subject directory to the list of its normalized slices
* the second dictionary maps each subject directory to the label of that subject

This structure is later used for random partitioning and shuffling to set up the sets of slices and labels.

The subject list for a given set of sequences is validated: only subjects that have all the requested sequences are kept; the others are filtered out.

In [1]:
from gliomi import *

In [2]:
import numpy as np

In [3]:
## import pickle

def load_data_single_channel_for_subject(subjects_dir, subject, sequence_name, side, percentile):
    """
    Load the slices of one sequence for one subject, filtered by ROI size.

    Two pickles are read: the ROI sizes, stored in the ``roi-sizes``
    directory that sits next to ``subjects_dir``, and the slice stack of
    ``sequence_name``, stored in the subject's own directory.  The third axis
    of the slice stack is then indexed with the result of
    ``ordered_index_percentile_of_sizes(roi_sizes, percentile)``.

    The subject name and the shapes of both loaded objects are printed as a
    progress trace.
    """
    sizes_path = f"{subjects_dir}/../roi-sizes/roi-sizes-{subject}-{side}.pickle"
    slices_path = f"{subjects_dir}/{subject}/slices-{sequence_name}-{side}.pickle"

    with open(sizes_path, "rb") as sizes_file:
        roi_sizes = pickle.load(sizes_file)

    with open(slices_path, "rb") as slices_file:
        slices = pickle.load(slices_file)

    print(subject, ": ", slices.shape, roi_sizes.shape)

    # NOTE(review): presumably the helper selects the slices whose ROI size
    # reaches the requested percentile -- confirm against its definition.
    selected = ordered_index_percentile_of_sizes(roi_sizes, percentile)
    return slices[:, :, selected]

In [4]:
def load_data(subjects_dir, subjects, sequence_names, side, percentile):
    """
    Load the filtered slices of several subjects as one array.

    For every subject, the slices of each sequence in ``sequence_names`` are
    loaded with ``load_data_single_channel_for_subject`` and stacked along a
    new axis 3, in the order the sequences are given.  The per-subject stacks
    are concatenated along axis 2, and that axis is finally moved to the
    front of the returned array.
    """
    per_subject = []
    for subject in subjects:
        channels = []
        for sequence_name in sequence_names:
            channels.append(
                load_data_single_channel_for_subject(
                    subjects_dir, subject, sequence_name, side, percentile
                )
            )
        per_subject.append(np.stack(channels, axis=3))

    all_slices = np.concatenate(per_subject, axis=2)
    return np.moveaxis(all_slices, 2, 0)

In [5]:
import os

def get_valid_subject_list(subjects_dir, subjects, sequence_names, side):
    """
    Return the subjects that own a slice file for every requested sequence.

    A subject is kept when, for each name in ``sequence_names``, the file
    ``slices-<sequence>-<side>.pickle`` exists inside its directory under
    ``subjects_dir``.  The order of ``subjects`` is preserved.
    """
    def has_all_sequences(subject):
        return all(
            os.path.exists(f"{subjects_dir}/{subject}/slices-{sequence_name}-{side}.pickle")
            for sequence_name in sequence_names
        )

    return [subject for subject in subjects if has_all_sequences(subject)]

In [6]:
import os

def get_valid_subject_list(subjects_dir, subjects, sequence_names, side):
    """
    Filter ``subjects`` down to those that have every requested sequence.

    For each subject, the file ``slices-<sequence>-<side>.pickle`` is looked
    up in the subject's directory for every entry of ``sequence_names``; the
    subject is returned only when none of those files is missing.  Input
    order is kept.

    NOTE: a function with this same name and signature is also defined in
    the previous cell; this definition rebinds the name.
    """
    valid = []
    for subject in subjects:
        missing = [
            sequence_name
            for sequence_name in sequence_names
            if not os.path.exists(f"{subjects_dir}/{subject}/slices-{sequence_name}-{side}.pickle")
        ]
        if not missing:
            valid.append(subject)
    return valid

In [7]:
import glob

def get_subject_list(subjects_dir):
    """
    Return the subjects found in a dataset directory, sorted by name.

    Every directory directly inside ``subjects_dir`` is treated as a subject
    and its base name is returned; regular files are ignored.  Because the
    entries come from the ``*`` glob pattern, names starting with a dot are
    not matched.

    ``glob.glob`` yields entries in arbitrary, filesystem-dependent order, so
    the result is sorted to make the subject order (and everything derived
    from it) reproducible across machines and runs.

    Returns an empty list when the directory is empty or does not exist.
    """
    return sorted(
        os.path.basename(path)
        for path in glob.glob(f"{subjects_dir}/*")
        if os.path.isdir(path)
    )

# Dataset Management

In [8]:
import pandas as pd
from numpy import load
import numpy as np
import sys

def load_dataset(subjects_dir, subjects, sequence_names, side, percentile):
    """
    Build a dictionary of normalized slices keyed by subject.

    ``subjects`` is first reduced, via ``get_valid_subject_list``, to the
    subjects that have all the sequences in ``sequence_names``.  For each
    remaining subject the slices are loaded with ``load_data`` (one subject
    at a time) and passed through
    ``normalize(..., max_value=1., axis=(1, 2))``.

    Returns a dict whose keys are the subject names and whose values are the
    normalized slice arrays.
    """
    # Only subjects that own every requested sequence are loaded.
    eligible = get_valid_subject_list(subjects_dir, subjects, sequence_names, side)

    return {
        subject: normalize(
            load_data(subjects_dir, [subject], sequence_names, side, percentile),
            max_value=1.,
            axis=(1, 2),
        )
        for subject in eligible
    }

In [9]:
import pandas as pd

def save_dataset(subjects_dir, sequence_names, side, percentile, output_dir):
    """
    Build the slice dataset of a subjects directory and pickle it.

    All subjects returned by ``get_subject_list(subjects_dir)`` are passed to
    ``load_dataset``; the resulting dictionary is written to
    ``<output_dir>/<sequences>-<side>-<percentile>-perc.pickle``, where
    ``<sequences>`` is the lower-cased, dash-joined ``sequence_names``.
    """
    dataset = load_dataset(
        subjects_dir,
        get_subject_list(subjects_dir),
        sequence_names,
        side,
        percentile,
    )

    sequences_tag = "-".join(sequence_names).lower()
    output_path = f"{output_dir}/{sequences_tag}-{side}-{percentile}-perc.pickle"

    with open(output_path, "wb") as output_file:
        pickle.dump(dataset, output_file)

In [10]:
import pickle

def get_dataset_for_classification(dataset_path, classification_path):
    """
    Pair a pickled slice dataset with the labels of a classification CSV.

    The pickle at ``dataset_path`` is loaded as a mapping keyed by subject.
    The CSV at ``classification_path`` is read with pandas; its second column
    is taken as the subject and its third column as the label.  Only rows
    whose subject is present in the pickled dataset are kept.

    Returns a pair ``(X_new, y_new)`` of dictionaries keyed by subject: the
    first holds the dataset entry of the subject, the second its label.
    """
    with open(dataset_path, "rb") as dataset_file:
        slices_by_subject = pickle.load(dataset_file)

    table = pd.read_csv(classification_path)
    subjects = np.array(table.iloc[:, 1])
    labels = np.array(table.iloc[:, 2])

    X_new = {}
    y_new = {}
    for subject, label in zip(subjects, labels):
        # Rows for subjects without slices are skipped.
        if subject not in slices_by_subject:
            continue
        X_new[subject] = slices_by_subject[subject]
        y_new[subject] = label

    return X_new, y_new

# Generate Datasets

In [11]:
%%time

# Generate one single-sequence dataset per sequence from the tumor-crop slices.
root_path = "/data/RMN/dataset-gliomi-cnn"

for dataset in [f"{root_path}/slices-tumor-crop"]:

    for sequence_name in ["T1", "T2", "rCBV", "ADC", "FLAIR", "MPRAGE"]:

        for percentile in [100]:

            # Progress trace: which dataset / sequence / percentile is being built.
            print(dataset, sequence_name, percentile)

            # 224 is passed as the `side` argument.  The output directory is
            # derived from root_path (same value as the previously hard-coded
            # literal) so that changing the root moves inputs and outputs together.
            save_dataset(dataset, [sequence_name], 224, percentile, f"{root_path}/2-datasets-tumor-crop")

/data/RMN/dataset-gliomi-cnn/slices-tumor-crop T1 100
ALESSANDRINI_GLAUCO :  (224, 224, 63) (63,)
ANGELONI_GIUSEPPINA :  (224, 224, 16) (16,)
ASSANTO_MARIA :  (224, 224, 92) (92,)
BAGNOLI_VINCENZO :  (224, 224, 92) (92,)
BARONTINI_MARIA_GIOVANNA :  (224, 224, 56) (56,)
BATTISTA_DOMENICA :  (224, 224, 54) (54,)
BERGNACH_SILVANO :  (224, 224, 68) (68,)
BERNOLA_TERESA :  (224, 224, 32) (32,)
BERTUZZI_LUISA :  (224, 224, 89) (89,)
BEVILACQUA_RITA :  (224, 224, 42) (42,)
BIANCHI_GIOVANNI :  (224, 224, 14) (14,)
BIANCHI_ORAZIO :  (224, 224, 69) (69,)
BIAVATI_S :  (224, 224, 168) (168,)
BOEZI_MARIO :  (224, 224, 78) (78,)
BOVE_A :  (224, 224, 123) (123,)
CACACE_PAOLO :  (224, 224, 74) (74,)
CALDARONI_ANNA :  (224, 224, 62) (62,)
CAMACCI_FILIBERTO :  (224, 224, 77) (77,)
CAMPLESE_CANDEROLA :  (224, 224, 71) (71,)
CAPEZZONE :  (224, 224, 80) (80,)
CARULLI_L :  (224, 224, 95) (95,)
CARZEDDA_PAOLO :  (224, 224, 56) (56,)
CATALANI_F :  (224, 224, 134) (134,)
CIMPUREANU_N :  (224, 224, 106) (106,)


MARTINEZ :  (224, 224, 69) (69,)
MASCI_ADA :  (224, 224, 75) (75,)
MEDICI_GIOVANNA :  (224, 224, 6) (6,)
MICHELI_MICHELE :  (224, 224, 59) (59,)
MITCHELL_CHARLENE_ANN :  (224, 224, 73) (73,)
MONACELLI_LAURA :  (224, 224, 70) (70,)
MONTI_E :  (224, 224, 81) (81,)
MOSCARDINI_GIACINTO :  (224, 224, 55) (55,)
MOVIA_A :  (224, 224, 120) (120,)
MUSAT_DORINA :  (224, 224, 32) (32,)
NERONE_GIANLUCA :  (224, 224, 67) (67,)
NERVEGNA_G :  (224, 224, 148) (148,)
ORLANDI_PAOLO :  (224, 224, 6) (6,)
PAGANNONE_GIANNI :  (224, 224, 67) (67,)
PAGLIAROLI_LUCIA :  (224, 224, 39) (39,)
PAGNOTTA :  (224, 224, 59) (59,)
PALMA :  (224, 224, 73) (73,)
PALMIERI :  (224, 224, 83) (83,)
PANETTI :  (224, 224, 90) (90,)
PASCAL :  (224, 224, 33) (33,)
PASSARI :  (224, 224, 68) (68,)
PELUSO_A :  (224, 224, 58) (58,)
PENNICCHI_R :  (224, 224, 137) (137,)
PIERI :  (224, 224, 52) (52,)
PINEDA_MARIA_ASSUNTA :  (224, 224, 88) (88,)
PISTOIA_CARLO :  (224, 224, 41) (41,)
PODAGROSI_TERESA :  (224, 224, 71) (71,)
PODDA_ANTON

  images_centered = (images - u_extended) / s_extended


MAIOLINI_SANTA :  (224, 224, 52) (52,)
MARAGNO_CLARA :  (224, 224, 55) (55,)
MARCOLINI :  (224, 224, 66) (66,)
MARIANI_BERNARDO :  (224, 224, 66) (66,)
MARTINEZ :  (224, 224, 69) (69,)
MASCI_ADA :  (224, 224, 75) (75,)
MEDICI_GIOVANNA :  (224, 224, 6) (6,)
MICHELI_MICHELE :  (224, 224, 59) (59,)
MITCHELL_CHARLENE_ANN :  (224, 224, 73) (73,)
MOSCARDINI_GIACINTO :  (224, 224, 55) (55,)
MOVIA_A :  (224, 224, 120) (120,)
NERONE_GIANLUCA :  (224, 224, 67) (67,)
ORLANDI_PAOLO :  (224, 224, 6) (6,)
PAGLIAROLI_LUCIA :  (224, 224, 39) (39,)
PAGNOTTA :  (224, 224, 59) (59,)
PALMA :  (224, 224, 73) (73,)
PALMIERI :  (224, 224, 83) (83,)
PANETTI :  (224, 224, 90) (90,)
PASCAL :  (224, 224, 33) (33,)
PASSARI :  (224, 224, 68) (68,)
PIERI :  (224, 224, 52) (52,)
PISTOIA_CARLO :  (224, 224, 41) (41,)
POMPEI_F :  (224, 224, 99) (99,)
PRINCIPI_ANNA_MARIA :  (224, 224, 72) (72,)
PROIETTI_GIOVANNI :  (224, 224, 51) (51,)
PROIETTI_MARIA :  (224, 224, 90) (90,)
QUACQUARELLI_A :  (224, 224, 57) (57,)
QUATTR

BERGNACH_SILVANO :  (224, 224, 68) (68,)
BERNOLA_TERESA :  (224, 224, 32) (32,)
BERTUZZI_LUISA :  (224, 224, 89) (89,)
BEVILACQUA_RITA :  (224, 224, 42) (42,)
BIANCHI_GIOVANNI :  (224, 224, 14) (14,)
BIANCHI_ORAZIO :  (224, 224, 69) (69,)
BIAVATI_S :  (224, 224, 168) (168,)
BOEZI_MARIO :  (224, 224, 78) (78,)
BOVE_A :  (224, 224, 123) (123,)
CACACE_PAOLO :  (224, 224, 74) (74,)
CALDARONI_ANNA :  (224, 224, 62) (62,)
CAMACCI_FILIBERTO :  (224, 224, 77) (77,)
CAMPLESE_CANDEROLA :  (224, 224, 71) (71,)
CAPEZZONE :  (224, 224, 80) (80,)
CARULLI_L :  (224, 224, 95) (95,)
CARZEDDA_PAOLO :  (224, 224, 56) (56,)
CATALANI_F :  (224, 224, 134) (134,)
CELLINI_T :  (224, 224, 161) (161,)
CHERRI_M :  (224, 224, 153) (153,)
CIMPUREANU_N :  (224, 224, 106) (106,)
COLAFRANCESCO_ROCCO :  (224, 224, 97) (97,)
COLAMARTINI_GIUSEPPINA :  (224, 224, 57) (57,)
COLAZZO_LUIGI_GIUSEPPE :  (224, 224, 70) (70,)
COLETTA_MARIA :  (224, 224, 64) (64,)
COSIMI_MASSIMO :  (224, 224, 64) (64,)
COSTANZI_P :  (224, 224, 1

GIACCHERINI_M :  (224, 224, 104) (104,)
GIANFELICI_LUISA :  (224, 224, 48) (48,)
GIOIA_COSMO_DAMIANO :  (224, 224, 71) (71,)
GIORDANO_STEFANIA :  (224, 224, 73) (73,)
INCITI_DONATA :  (224, 224, 68) (68,)
IONTA_LUCIANA :  (224, 224, 38) (38,)
ISMAIL_A :  (224, 224, 109) (109,)
ISONI_FRANCESCO :  (224, 224, 68) (68,)
LABELLA_ADRIANA :  (224, 224, 93) (93,)
LANDONE_ANNUNZIATA :  (224, 224, 83) (83,)
LIBERATI_G_L :  (224, 224, 160) (160,)
LIOCE_CARMELA :  (224, 224, 56) (56,)
LONGO_ROSALIA :  (224, 224, 65) (65,)
LO_BELLO_MARIO :  (224, 224, 81) (81,)
LUPI_GIANCARLO :  (224, 224, 79) (79,)
LUPO_ASSUNTA :  (224, 224, 61) (61,)
MAIOLINI_SANTA :  (224, 224, 52) (52,)
MARAGNO_CLARA :  (224, 224, 55) (55,)
MARCOLINI :  (224, 224, 66) (66,)
MARCONI_E :  (224, 224, 118) (118,)
MARIANI_BERNARDO :  (224, 224, 66) (66,)
MAROCCHI_CORRADO :  (224, 224, 83) (83,)
MARTELLA_COSIMO :  (224, 224, 34) (34,)
MARTINEZ :  (224, 224, 69) (69,)
MASCI_ADA :  (224, 224, 75) (75,)
MEDICI_GIOVANNA :  (224, 224, 6) 

In [None]:
%%time

# Generate one single-sequence dataset per sequence from the full-brain slices.
root_path = "/data/RMN/dataset-gliomi-cnn"

for dataset in [f"{root_path}/slices-full-brain"]:

    for sequence_name in ["T1", "T2", "rCBV", "ADC", "FLAIR", "MPRAGE"]:

        for percentile in [100]:

            # Progress trace: which dataset / sequence / percentile is being built.
            print(dataset, sequence_name, percentile)

            # 224 is passed as the `side` argument.  The output directory is
            # derived from root_path (same value as the previously hard-coded
            # literal) so that changing the root moves inputs and outputs together.
            save_dataset(dataset, [sequence_name], 224, percentile, f"{root_path}/2-datasets-full-brain")