# README

This notebook prepares a dataset of slices.

The input data come from the dataset that `0 - Prepare Size and Slices.ipynb` creates from the nii.gz sequences.

The dataset created by this notebook is a pair of dictionaries:
* the first dictionary maps each subject directory to the list of its normalized slices
* the second dictionary maps each subject directory to the label for that subject

This structure is used for random partitioning and shuffling in order to set up a set of labels and slices.

The subject list for a given set of sequences is validated: only subjects that have all the requested sequences are kept; the others are filtered out.

In [1]:
from gliomi import *

In [2]:
import numpy as np

In [7]:
class SliceProvider:
    """Load a subject's slices through a sequence repository.

    ``sequence_repo`` is stored as-is and used for availability checks and
    slice loading; ``full_brain`` is forwarded unchanged to
    ``get_slices_for_subject``.
    """

    def __init__(self, sequence_repo, full_brain=False):
        self.sequence_repo = sequence_repo
        self.full_brain = full_brain

    def has(self, subject, sequence_name):
        """Return what the repository reports for ``has(subject, sequence_name)``."""
        return self.sequence_repo.has(subject, sequence_name)

    def get_slices(self, sequence_name, subject, side):
        """Load the slices of ``sequence_name`` for ``subject``.

        Prints a progress line and delegates to ``get_slices_for_subject``
        with this provider's repository and ``full_brain`` flag.

        NOTE(review): the original note on this method described the result
        as the slices of the whole brain reshaped to a square of size
        ``side`` -- confirm against ``get_slices_for_subject``.
        """
        print("Loading slices from:", subject)

        return get_slices_for_subject(self.sequence_repo,
                                      sequence_name,
                                      subject,
                                      side,
                                      full_brain=self.full_brain)

In [27]:
import pickle
import glob
import os
import pandas as pd
from numpy import load
import numpy as np
import sys

class GliomiDataset:
    """Build per-subject slice arrays for the subjects found under a directory.

    Parameters
    ----------
    subjects_dir:
        Directory that contains the ``HGG_Training_data_Brats_2019`` folder
        whose sub-directories are treated as subjects.
    slice_provider:
        Object exposing ``has(subject, sequence_name)`` and
        ``get_slices(sequence_name, subject, side)``.
    roi_sizes_dir:
        Directory holding the ``roi-sizes-{subject}-{side}.pickle`` files.
        Defaults to ``DEFAULT_ROI_SIZES_DIR``.
    """

    # Location of the pickled ROI sizes used when no directory is given.
    DEFAULT_ROI_SIZES_DIR = "/data/RMN/dataset-gliomi-cnn/roi-sizes-brats19"

    def __init__(self, subjects_dir, slice_provider, roi_sizes_dir=None):
        self.slice_provider = slice_provider
        self.subjects_dir = subjects_dir
        self.roi_sizes_dir = (
            self.DEFAULT_ROI_SIZES_DIR if roi_sizes_dir is None else roi_sizes_dir
        )

    def load_data_single_channel_for_subject(self, subject, sequence_name, side, percentile):
        """Load the slices of one sequence for one subject, filtered by ROI size.

        The pickled ROI sizes of the subject are read from ``roi_sizes_dir``
        and the slices are indexed along their third axis with
        ``ordered_index_percentile_of_sizes(roi_sizes, percentile)``.
        Per the original note, only slices whose ROI size is greater than or
        equal to the given percentile are meant to be kept.

        NOTE(review): the pickle is assumed to come from a trusted local
        pipeline; ``pickle.load`` must not be pointed at untrusted files.
        """
        # The ROI sizes are read from disk. Earlier code also recomputed them
        # from the ROI mask and then overwrote that result with the pickled
        # values, so the unused recomputation has been removed.
        sizes_path = f"{self.roi_sizes_dir}/roi-sizes-{subject}-{side}.pickle"
        with open(sizes_path, "rb") as file:
            roi_sizes = pickle.load(file)

        slices = self.slice_provider.get_slices(sequence_name, subject, side)

        print(f"Loaded {subject}: {slices.shape}, {roi_sizes.shape}")

        return slices[:, :, ordered_index_percentile_of_sizes(roi_sizes, percentile)]

    def load_data(self, subjects, sequence_names, side, percentile):
        """Load the filtered slices of several subjects with one channel per sequence.

        For every subject the per-sequence arrays are stacked on a new axis 3
        in the order of ``sequence_names``; the subjects are then concatenated
        along axis 2, and that axis is moved to the front.

        Assumes each per-sequence array is (rows, cols, n_slices), so the
        result would be (n_slices_total, rows, cols, n_sequences) -- TODO
        confirm against the slice provider.
        """
        slices = [
            np.stack(
                [
                    self.load_data_single_channel_for_subject(
                        subject, sequence_name, side, percentile)
                    for sequence_name in sequence_names
                ],
                axis=3,
            )
            for subject in subjects
        ]
        return np.moveaxis(np.concatenate(slices, axis=2), 2, 0)

    def get_valid_subject_list(self, subjects, sequence_names):
        """Return the subjects, in input order, that have every requested sequence."""
        # all() stops at the first missing sequence instead of probing the rest.
        return [
            subject
            for subject in subjects
            if all(
                self.slice_provider.has(subject, sequence_name)
                for sequence_name in sequence_names
            )
        ]

    def get_subject_list(self):
        """Return the names of the directories under the dataset folder.

        Every directory directly inside
        ``{subjects_dir}/HGG_Training_data_Brats_2019`` counts as a subject;
        plain files are ignored. The order is whatever ``glob`` yields.
        """
        return [
            os.path.basename(entry)
            for entry in glob.glob(f"{self.subjects_dir}/HGG_Training_data_Brats_2019/*")
            if os.path.isdir(entry)
        ]

    def load_dataset(self, sequence_names, side, percentile):
        """Build a ``{subject: images}`` dictionary for all valid subjects.

        Subjects are discovered with ``get_subject_list`` and restricted to
        those that have all ``sequence_names``. Each subject's images are
        loaded with ``load_data`` and passed through
        ``normalize(images, max_value=1., axis=(1, 2))``.
        """
        all_subjects = self.get_subject_list()

        # Keep only the subjects that have all the sequences.
        valid_subjects = self.get_valid_subject_list(all_subjects, sequence_names)

        X = {}

        for i, subject in enumerate(valid_subjects):
            print(i, subject)
            images = self.load_data([subject], sequence_names, side, percentile)
            X[subject] = normalize(images, max_value=1., axis=(1, 2))

        return X

# Dataset Management

In [13]:
# Directory handed to both the sequence repository and the dataset.
subjects_dir = "/data/RMN2/datasets/BRATS2019/HGG_Training_data_Brats_2019"

sequence_repo = SequenceRepoBrats19(subjects_dir)

# full_brain=True is forwarded by SliceProvider to get_slices_for_subject.
slice_provider = SliceProvider(sequence_repo, full_brain=True)

# Reuse subjects_dir instead of repeating the path literal, so the repository
# and the dataset cannot drift apart when the location changes.
dataset = GliomiDataset(subjects_dir, slice_provider)

In [29]:
# Side value passed to the slice provider; it is also part of the ROI-size
# pickle file names and of the output file name below.
side = 64

# Percentile handed to load_dataset to select slices by ROI size.
percentile = 100

# Build one single-channel dataset per sequence and pickle each of them.
for sequence_name in ["t1", "t1ce", "flair", "t2"]:
    
    # X maps each valid subject to its normalized images (see load_dataset).
    X = dataset.load_dataset([sequence_name], side, percentile)
    
    with open(f"/data/RMN/dataset-gliomi-cnn/datasets-full-brain-brats19/{sequence_name}-{side}-{percentile}.pickle", "wb") as file:
        pickle.dump(X, file)

In [15]:
# Side value used by the ad-hoc cells below (same value as the generation cell).
side = 64

In [17]:
# Percentile used by the ad-hoc cells below (same value as the generation cell).
percentile = 100

In [16]:
# Sequence used by the ad-hoc cells below; "t1" is one of the generated sequences.
sequence_name="t1"

In [30]:
# Names of the directories found under the nested dataset folder; this uses
# the same glob pattern as GliomiDataset.get_subject_list.
ret = [
    os.path.basename(entry)
    for entry in glob.glob(f"{subjects_dir}/HGG_Training_data_Brats_2019/*")
    if os.path.isdir(entry)
]
ret

['BraTS19_CBICA_AVJ_1',
 'BraTS19_TCIA01_221_1',
 'BraTS19_CBICA_AXQ_1',
 'BraTS19_TCIA03_138_1',
 'BraTS19_CBICA_AUN_1',
 'BraTS19_CBICA_APZ_1',
 'BraTS19_CBICA_ASF_1',
 'BraTS19_TCIA02_314_1',
 'BraTS19_CBICA_AQU_1',
 'BraTS19_TCIA02_274_1',
 'BraTS19_2013_17_1',
 'BraTS19_2013_23_1',
 'BraTS19_TCIA08_218_1',
 'BraTS19_CBICA_BBG_1',
 'BraTS19_TCIA03_498_1',
 'BraTS19_2013_4_1',
 'BraTS19_CBICA_AWH_1',
 'BraTS19_CBICA_BAN_1',
 'BraTS19_CBICA_BJY_1',
 'BraTS19_TCIA02_179_1',
 'BraTS19_CBICA_ASK_1',
 'BraTS19_CBICA_BKV_1',
 'BraTS19_TCIA02_606_1',
 'BraTS19_CBICA_BEM_1',
 'BraTS19_CBICA_ANZ_1',
 'BraTS19_TCIA08_162_1',
 'BraTS19_TCIA06_184_1',
 'BraTS19_CBICA_AVG_1',
 'BraTS19_TCIA01_378_1',
 'BraTS19_CBICA_AUA_1',
 'BraTS19_TCIA04_328_1',
 'BraTS19_TCIA06_165_1',
 'BraTS19_TCIA06_372_1',
 'BraTS19_TCIA03_474_1',
 'BraTS19_CBICA_ATV_1',
 'BraTS19_CBICA_AYI_1',
 'BraTS19_CBICA_ATN_1',
 'BraTS19_CBICA_AQZ_1',
 'BraTS19_CBICA_ARF_1',
 'BraTS19_CBICA_AAL_1',
 'BraTS19_TCIA01_131_1',
 'BraTS

In [31]:
# Keep the directory names listed above as the subject list for the next cells.
all_subjects=ret

In [33]:
def get_bounding_box(mask):
    """Return the first and last non-zero index of a mask along three axes.

    ``mask`` must expose its voxel array as ``mask.dataobj``. The result is
    ``((rmin, rmax), (cmin, cmax), (zmin, zmax))`` for axes 0, 1 and 2; both
    bounds are indices of non-zero positions, so they are inclusive.

    Raises ``IndexError`` when the mask has no non-zero element.
    """
    volume = mask.dataobj

    def extent(collapsed_axes):
        # Flag the positions along the remaining axis that hold any non-zero
        # voxel, then take the first and the last flagged index.
        occupied = np.any(volume, axis=collapsed_axes)
        hits = np.nonzero(occupied)[0]
        return hits[0], hits[-1]

    rows = extent((1, 2))
    cols = extent((0, 2))
    depth = extent((0, 1))

    return (rows, cols, depth)

In [37]:
# Scratch cell: for every subject, print the ROI bounding box and the shape of
# the pickled ROI-size vector.
#
# The previous text of this cell could not be parsed: it continued with code
# pasted from GliomiDataset methods that used `self` and `return` at module
# level, and it ended with an unterminated ''' after `X`. Only the part that
# produced the recorded output is kept; use GliomiDataset.load_dataset to
# build the actual slice dictionary.
X = {}

for i, subject in enumerate(all_subjects):
    print(i, subject)

    # NOTE(review): `nb` is not imported in this notebook; presumably it is
    # nibabel re-exported by `from gliomi import *` -- confirm.
    roi = nb.load(f"{subjects_dir}/HGG_Training_data_Brats_2019/{subject}/{subject}_seg.nii.gz")

    ((rmin, rmax), (cmin, cmax), (zmin, zmax)) = get_bounding_box(roi)

    print(rmin, rmax, cmin, cmax, zmin, zmax)

    # Differences between the last and first non-zero index on each axis.
    z_height = zmax - zmin
    width = rmax - rmin
    height = cmax - cmin

    resampled_roi = mask_crop_resize(roi, roi, width, height, z_height)

    # NOTE(review): this value is replaced by the pickled sizes loaded below.
    roi_sizes = get_roi_size(resampled_roi, 2)

    # Load sizes
    with open(f"/data/RMN/dataset-gliomi-cnn/roi-sizes-brats19/roi-sizes-{subject}-{side}.pickle", "rb") as file:
        roi_sizes = pickle.load(file)

    print(roi_sizes.shape)

X

0 BraTS19_CBICA_AVJ_1
54 116 92 159 47 135
(88,)
1 BraTS19_TCIA01_221_1
61 117 93 167 71 139
(68,)
2 BraTS19_CBICA_AXQ_1
82 119 128 193 62 125
(63,)
3 BraTS19_TCIA03_138_1
117 180 77 163 30 97
(67,)
4 BraTS19_CBICA_AUN_1
65 145 63 155 61 142
(81,)
5 BraTS19_CBICA_APZ_1
115 183 81 163 28 103
(75,)
6 BraTS19_CBICA_ASF_1
51 120 84 173 30 112
(82,)
7 BraTS19_TCIA02_314_1
112 177 77 194 26 114
(88,)
8 BraTS19_CBICA_AQU_1
57 110 108 200 45 132
(87,)
9 BraTS19_TCIA02_274_1
92 175 51 151 51 128
(77,)
10 BraTS19_2013_17_1
81 173 42 130 55 135
(80,)
11 BraTS19_2013_23_1
108 178 51 130 46 117
(71,)
12 BraTS19_TCIA08_218_1


KeyboardInterrupt: 

In [26]:
import pandas as pd

def save_dataset(subjects_dir, sequence_names, side, percentile, output_dir):
    """Load the dataset for ``sequence_names`` and pickle it into ``output_dir``.

    The output file is named
    ``{sequences}-{side}-{percentile}-perc.pickle``, where ``sequences`` is
    the lower-cased, dash-joined list of sequence names.

    NOTE(review): ``get_subject_list`` and ``load_dataset`` are called here as
    module-level functions taking ``subjects_dir``; the definitions visible in
    this notebook are ``GliomiDataset`` methods with different signatures --
    confirm where these functions come from.
    """
    subject_names = get_subject_list(subjects_dir)

    slices_by_subject = load_dataset(
        subjects_dir, subject_names, sequence_names, side, percentile)

    sequence_tag = "-".join(sequence_names).lower()

    with open(f"{output_dir}/{sequence_tag}-{side}-{percentile}-perc.pickle", "wb") as handle:
        pickle.dump(slices_by_subject, handle)

In [None]:
import pickle

def get_dataset_for_classification(dataset_path, classification_path):
    """Pair a pickled ``{subject: slices}`` dictionary with labels from a CSV.

    The CSV is read with pandas; its second column (position 1) holds the
    subjects and its third column (position 2) the labels. Only subjects
    present in the pickled dictionary are kept.

    Returns ``(X_new, y_new)``: two dictionaries keyed by subject, holding the
    slices and the label respectively.
    """
    with open(dataset_path, "rb") as handle:
        slices_by_subject = pickle.load(handle)

    table = pd.read_csv(classification_path)
    subject_column = np.array(table.iloc[:, 1])
    label_column = np.array(table.iloc[:, 2])

    # Rows whose subject has slices, in CSV order.
    matched = [
        (subject, label)
        for subject, label in zip(subject_column, label_column)
        if subject in slices_by_subject
    ]

    X_new = {subject: slices_by_subject[subject] for subject, _ in matched}
    y_new = {subject: label for subject, label in matched}

    return X_new, y_new

# Generate Datasets

In [None]:
%%time

root_path = "/data/RMN/dataset-gliomi-cnn"

# Destination of the generated pickles, derived from root_path instead of
# repeating the literal.
output_dir = f"{root_path}/2-datasets-tumor-crop"

# The loop variable is named slices_dir so that it does not rebind the
# notebook-global `dataset`, which holds the GliomiDataset instance.
for slices_dir in [f"{root_path}/slices-tumor-crop"]:

    for sequence_name in ["T1", "T2", "rCBV", "ADC", "FLAIR", "MPRAGE"]:

        for percentile in [100]:

            print(slices_dir, sequence_name, percentile)

            save_dataset(slices_dir, [sequence_name], 224, percentile, output_dir)

In [None]:
%%time

root_path = "/data/RMN/dataset-gliomi-cnn"

# Destination of the generated pickles, derived from root_path instead of
# repeating the literal.
output_dir = f"{root_path}/2-datasets-full-brain"

# The loop variable is named slices_dir so that it does not rebind the
# notebook-global `dataset`, which holds the GliomiDataset instance.
for slices_dir in [f"{root_path}/slices-full-brain"]:

    for sequence_name in ["T1", "T2", "rCBV", "ADC", "FLAIR", "MPRAGE"]:

        for percentile in [100]:

            print(slices_dir, sequence_name, percentile)

            save_dataset(slices_dir, [sequence_name], 224, percentile, output_dir)

In [None]:
# Pair the pickled T1 tumor-crop slices with the labels of dataset-survivor.csv.
# X and y are dictionaries keyed by subject (see get_dataset_for_classification).
X, y = get_dataset_for_classification(f"{root_path}/2-datasets-tumor-crop/t1-224-100-perc.pickle", 
                                      f"{root_path}/dataset-survivor.csv")

In [None]:
# Iterate over the label tables, one CSV per classification target.
for dataframe in [f"{root_path}/dataset-survivor.csv",
                  f"{root_path}/dataset-mgmt.csv",
                  f"{root_path}/dataset-idh.csv",
                  f"{root_path}/dataset-ki67.csv",
                  f"{root_path}/dataset-egfr.csv"]:
    # TODO(review): this loop had no body, which is a SyntaxError; the
    # placeholder keeps the cell parseable until the intended step is written.
    pass



In [None]:
%%time

root_path = "/data/RMN/dataset-gliomi-cnn"

# For every label table, slice directory, sequence and percentile, print the
# combination and call save_dataset.
# NOTE(review): save_dataset as defined in this notebook takes five positional
# arguments (subjects_dir, sequence_names, side, percentile, output_dir). The
# call below passes six (an extra `dataframe`), which raises TypeError against
# that definition -- confirm which signature is intended.
# NOTE(review): the loop variable `dataset` rebinds the notebook-global name
# that also holds the GliomiDataset instance.
for dataframe in [f"{root_path}/dataset-survivor.csv", 
                  f"{root_path}/dataset-mgmt.csv", 
                  f"{root_path}/dataset-idh.csv", 
                  f"{root_path}/dataset-ki67.csv", 
                  f"{root_path}/dataset-egfr.csv"]:

    for dataset in [f"{root_path}/slices-full-brain"]:
        
            for sequence_name in ["T1", "T2", "rCBV", "ADC", "FLAIR", "MPRAGE"]:
                
                for percentile in [100]:
                    
                    print(dataset, dataframe, sequence_name, percentile)
                    save_dataset(dataset, dataframe, [sequence_name], 224, percentile, "/data/RMN/dataset-gliomi-cnn/2-datasets-full-brain")
        