<a href="https://colab.research.google.com/github/pablojrios/fluence_maps/blob/master/create_tfrecords/tf2_create_tfrecords.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
def isGoogleColab():
    """Tell whether this code is running inside Google Colab.

    The decision is based on the module that defines the class of the active
    IPython shell: 'google.colab._shell' in Google Colab, as opposed to
    e.g. 'ipykernel.zmqshell' on our own Jupyter server.

    Returns:
        bool: True only when the active shell class comes from
        'google.colab._shell'. False is also returned when ``get_ipython``
        is not defined at all (plain Python interpreter, for instance when
        the notebook has been exported to a script) instead of raising
        NameError.
    """
    try:
        shell = get_ipython()
    except NameError:
        # Not running under IPython at all, hence not in Colab either.
        return False
    return shell.__class__.__module__ == 'google.colab._shell'

In [2]:
# import lodgepole.image_tools as lit doesn't work, the following is equivalent
# from importlib.machinery import SourceFileLoader
# somemodule = SourceFileLoader('lit', '/content/lodgepole/lodgepole/image_tools.py').load_module()
import sys
import time
import tensorflow as tf

from sklearn.utils import shuffle
from sklearn.model_selection import KFold
from os import path
import os

In [3]:
# Report the TensorFlow release in use and whether eager execution is active.
print(f'Tensorflow version = {tf.__version__}')
print(f'Executing eagerly = {tf.executing_eagerly()}')

Tensorflow version = 2.2.0
Executing eagerly = True


In [4]:
if isGoogleColab():
    # if os.path.exists('lodgepole'):
    #     !rm -fr lodgepole

    # !git clone https://gitlab.com/brohrer/lodgepole.git
    # !pip install -e lodgepole

    %cd -q '/content'
    if os.path.exists('fluence_maps'):
        !rm -fr fluence_maps

    ## Install required dependencies
    !pip install -q pydicom

    GIT_USERNAME = "pablojrios"
    GIT_TOKEN = "1d88a0b85d2b00a03796e4d8b7e5f7b249b12f9b"
    !git clone -s https://{GIT_TOKEN}@github.com/{GIT_USERNAME}/fluence_maps.git

    %cd -q '/content/fluence_maps/create_tfrecords'
        
    from google.colab import drive
    drive.mount('/content/drive')
       
    ARG_DATASET_DIR='/content/drive/My Drive/Healthcare/Radioterapia/data/ciolaplata'

In [5]:
from dataset_utils import _dataset_exists, _get_filenames_and_gamma_values, _convert_dataset
from tf2_oversampling_dicom_files import do_oversampling

In [6]:
# ===============================================DEFINE YOUR ARGUMENTS=================================================
# Root folder of the dataset. In Google Colab it is set by the Colab setup cell instead.
if not isGoogleColab():
    ARG_DATASET_DIR='/hdd/data/radioterapia/ciolaplata'
# The number of shards to split the dataset into
ARG_NUM_SHARDS=1
# Fraction of the cases assigned to the validation partition (simple partition only).
# When a test partition is requested it gets the same number of cases.
ARG_VALIDATION_SIZE=0.2
# if False only training and validation partition are created.
ARG_TEST_PARTITION=False
# if cross validation is enabled then ARG_VALIDATION_SIZE and ARG_TEST_PARTITION are *not* taken into account.
# Further, oversampling (ARG_OVERSAMPLING) is not supported when creating a dataset for cross-validation.
ARG_CROSS_VALIDATION_ENABLED=True
# num folds cross validation (default is 5). Only taken into account if cross validation is enabled
ARG_NUM_CV_FOLDS=5
# Seed for repeatability.
ARG_RANDOM_SEED=23456
# folder under ARG_DATASET_DIR path.
# NOTE(review): 'unsersampled' looks like a typo of 'undersampled'; it is left untouched
# because it is part of the output folder name.
ARG_TFDATASET_FOLDER=f'tfds.2019-2018-2017.localnorm.DS10%.{ARG_RANDOM_SEED}.gammaGT95.unsersampled'
# Fraction of the cases to sample; must satisfy 0 < value <= 1.0.
# 0.1172 for 500 cases, 0.3515 for 1500 cases
ARG_SAMPLE_DATASET=1596/(7245-1596) # >0, 1.0 uses all cases (no sample is taken)
# file with gamma values under ARG_DATASET_DIR path.
# ARG_DICOM_AND_GAMMA_CSV='codex.2018-2019.csv' # 3mm/3% maps with dose shift, years 2018 and 2019
# ARG_DICOM_AND_GAMMA_CSV = "codex-2019-3mm3%-doseshift.csv" # 3mm/3% maps with dose shift, year 2019
# ARG_DICOM_AND_GAMMA_CSV = "codex-2019-2mm2%-doseshift.csv" # 2mm/2% maps with dose shift, year 2019
# ARG_DICOM_AND_GAMMA_CSV = "codex-2019-3mm3%-doseshift-TR40%.csv" # maps with 40% tolerance adjustment
# ARG_DICOM_AND_GAMMA_CSV = "codex-2019-3mm3%-doseshift-40TH-localnorm.csv" # CODEX maps 3mm 3% 40TH Local Norm, year 2019
# ARG_DICOM_AND_GAMMA_CSV = "codex-2019-3mm3%-doseshift-40TH-localnorm-nooutliers.csv" # CODEX maps 3mm 3% 40TH Local Norm, year 2019 (outliers with gamma <= 65% removed, 6 in total)
# ARG_DICOM_AND_GAMMA_CSV = "codex.2019.3mm3%Doseshift10%-localnorm-sinoutliers74.csv"
# ARG_DICOM_AND_GAMMA_CSV = "codex.2018-2019.3mm3%Doseshift10%-localnorm-sinoutliers74.csv"
# ARG_DICOM_AND_GAMMA_CSV = "codex.2018-2019-3mm3%Doseshift10%-localnorm-undersampling95.csv"
ARG_DICOM_AND_GAMMA_CSV = "codex.2019-2018-2017.3mm3%Doseshift10%-localnorm-sinoutliers74.csv"
# ARG_DICOM_AND_GAMMA_CSV = "codex.2017.3mm3%Doseshift10%-localnorm-sinoutliers74.csv"
# ARG_DICOM_AND_GAMMA_CSV = "codex.2019-2017.3mm3%Doseshift10%-localnorm-sinoutliers74.csv" # 4269 cases
# ARG_IMAGE_TYPE: 0 - RGB; 1 - Grayscale: Convert color images to 3D grayscale images (channel is repeated 3 times);
# 2 - Dicom
ARG_IMAGE_TYPE=2
# if True the training partition is passed through do_oversampling (copy of images is performed).
ARG_OVERSAMPLING=False
ARG_OVERSAMPLING_GAMMA_THRESHOLD = 97.0 # percentage
ARG_OVERSAMPLING_FACTOR = 3.0 # 1 is 100%

# if True the cases are filtered by gamma index against ARG_INCLUDE_GAMMA_VALUE.
ARG_INCLUDE_GAMMA=True
ARG_INCLUDE_GAMMA_VALUE=95.0
# True: exclude maps with gammas >= ARG_INCLUDE_GAMMA_VALUE (i.e.: train a model with problematic gammas)
# False: exclude maps with gammas < ARG_INCLUDE_GAMMA_VALUE
ARG_INCLUDE_GAMMA_LOWERTHAN=False

In [7]:
#=================================================CHECKS==============================================
# Check if there is a dataset directory entered
if ARG_DATASET_DIR == "":
    raise ValueError('dataset_dir is empty. Please state a dataset_dir argument.')

if ARG_TFDATASET_FOLDER == "":
    raise ValueError('tfdataset_folder is empty. Please state a tfdataset_dir argument.')

# If the TFRecord files already exist in the directory, then exit without creating the files again
# NOTE(review): in cross-validation mode the records are written to '<tfdataset_dir>.fold<k>'
# (see do_build_cv_partition), so this guard presumably never fires in that mode - confirm
# against _dataset_exists.
tfdataset_dir = path.join(ARG_DATASET_DIR, ARG_TFDATASET_FOLDER)
if _dataset_exists(dataset_dir = tfdataset_dir, _NUM_SHARDS = ARG_NUM_SHARDS):
    print(f'Dataset files already exist in {tfdataset_dir}. Exiting without re-creating them.')
    sys.exit()

# An out-of-range sampling fraction is a configuration error: fail loudly, in the same way
# as the other argument checks, instead of printing and exiting with a success status.
if not (0 < ARG_SAMPLE_DATASET <= 1.0):
    raise ValueError(f'Wrong value for input param ARG_SAMPLE_DATASET: {ARG_SAMPLE_DATASET}')

# Make sure the output folder exists; makedirs succeeds when it is already there.
tf.io.gfile.makedirs(tfdataset_dir)
#==============================================END OF CHECKS==========================================

# Get a pandas dataframe of image full filenames and gamma index values.
df_dcm_out = _get_filenames_and_gamma_values(ARG_DICOM_AND_GAMMA_CSV, ARG_DATASET_DIR,
                                             sample=ARG_SAMPLE_DATASET, seed=ARG_RANDOM_SEED)
print(df_dcm_out.shape)
print(df_dcm_out.head(10))
if ARG_INCLUDE_GAMMA:
    # Keep only one side of the gamma threshold: below it when ARG_INCLUDE_GAMMA_LOWERTHAN
    # is True, at or above it otherwise.
    if ARG_INCLUDE_GAMMA_LOWERTHAN:
        df_dcm_out = df_dcm_out.loc[df_dcm_out['gamma_index'] < ARG_INCLUDE_GAMMA_VALUE]
    else:
        df_dcm_out = df_dcm_out.loc[df_dcm_out['gamma_index'] >= ARG_INCLUDE_GAMMA_VALUE]
    print(df_dcm_out.shape)

(2046, 2)
                                    dicom_full_filepath  gamma_index
5987  /hdd/data/radioterapia/ciolaplata/2018/1.3.6.1...      98.5301
1607  /hdd/data/radioterapia/ciolaplata/2017/1.3.6.1...      97.5114
6181  /hdd/data/radioterapia/ciolaplata/2018/1.3.6.1...      98.7316
5497  /hdd/data/radioterapia/ciolaplata/2018/1.3.6.1...      97.9247
5270  /hdd/data/radioterapia/ciolaplata/2018/1.3.6.1...      97.4779
5451  /hdd/data/radioterapia/ciolaplata/2018/1.3.6.1...      97.8391
4561  /hdd/data/radioterapia/ciolaplata/2018/1.3.6.1...      93.0490
5981  /hdd/data/radioterapia/ciolaplata/2018/1.3.6.1...      98.5240
3673  /hdd/data/radioterapia/ciolaplata/2019/1.3.6.1...      88.1338
6742  /hdd/data/radioterapia/ciolaplata/2018/1.3.6.1...      99.2280
(1588, 2)


In [8]:
def write_dataset(tfdataset_dir, training_filenames, training_gamma, validation_filenames, validation_gamma,
                  testing_filenames=None, testing_gamma=None):
    """Convert the given partitions into dataset splits under ``tfdataset_dir``.

    Each split is handed to ``_convert_dataset`` together with the notebook
    settings ARG_NUM_SHARDS and ARG_IMAGE_TYPE.

    Args:
        tfdataset_dir: output directory passed to ``_convert_dataset``.
        training_filenames, training_gamma: cases of the 'train' split.
        validation_filenames, validation_gamma: cases of the 'validation'
            split; written when cross-validation is enabled or when the
            list is not empty.
        testing_filenames, testing_gamma: cases of the 'test' split; only
            written (after the validation split) when cross-validation is
            disabled and ARG_TEST_PARTITION is set.
    """
    def _emit(split_name, split_filenames, split_gamma):
        # Single place where the conversion settings are attached.
        _convert_dataset(split_name, split_filenames, split_gamma,
                         dataset_dir=tfdataset_dir, _NUM_SHARDS=ARG_NUM_SHARDS, image_type=ARG_IMAGE_TYPE)

    t0 = time.time()
    print(f"Writing TF records to {tfdataset_dir}")

    # The training split is always written.
    _emit('train', training_filenames, training_gamma)

    has_validation = ARG_CROSS_VALIDATION_ENABLED or len(validation_filenames) > 0
    if has_validation:
        _emit('validation', validation_filenames, validation_gamma)

        # A test split only exists for the simple (non cross-validation) partition.
        if ARG_TEST_PARTITION and not ARG_CROSS_VALIDATION_ENABLED:
            _emit('test', testing_filenames, testing_gamma)

    elapsed = time.time() - t0
    print(f'Finished converting the dataset in {elapsed:.2f} seconds.')

In [9]:
def do_build_simple_partition():
    """Split the loaded cases into train/validation(/test) and write them.

    Reads the module-level dataframe ``df_dcm_out`` (columns
    'dicom_full_filepath' and 'gamma_index'), shuffles it with
    ARG_RANDOM_SEED and slices it: the first ``num_validation`` rows are the
    validation partition, the next ``num_validation`` rows the test
    partition (only when ARG_TEST_PARTITION is set) and the remaining rows
    the training partition. When ARG_OVERSAMPLING is set the training
    partition is passed through ``do_oversampling``. The partitions are then
    handed to ``write_dataset`` with ``tfdataset_dir`` as destination.

    Side effects:
        Rebinds the global ``df_dcm_out`` to its shuffled version.
    """
    global df_dcm_out

    def _as_lists(frame):
        # write_dataset expects plain lists, not pandas Series.
        return frame['dicom_full_filepath'].to_list(), frame['gamma_index'].to_list()

    print("Create simple partition dataset.")

    # The images are read from the dataset folder; tfdataset_dir is only the output location.
    print("Reading images from {}".format(ARG_DATASET_DIR))

    # Find the number of validation examples we need; the test partition, when
    # requested, has the same size and also comes out of the training cases.
    num_validation = int(ARG_VALIDATION_SIZE * len(df_dcm_out))
    num_testing = num_validation if ARG_TEST_PARTITION else 0
    num_training = max(len(df_dcm_out) - num_validation - num_testing, 0)

    print(f'\nNum. training images = {num_training}, num. validation images = {num_validation}')
    if ARG_TEST_PARTITION:
        print(f'Num. testing images = {num_testing}')

    print(f'\nrandom seed partition = {ARG_RANDOM_SEED}')
    # Shuffle before slicing so that the partitions are random but repeatable.
    df_dcm_out = shuffle(df_dcm_out, random_state=ARG_RANDOM_SEED)

    df_validation = df_dcm_out[:num_validation]
    df_testing = df_dcm_out[num_validation:num_validation + num_testing] if ARG_TEST_PARTITION else None
    df_training = df_dcm_out[num_validation + num_testing:]

    # Oversample the maps at or below a gamma threshold in the training partition only.
    if ARG_OVERSAMPLING:
        df_training = do_oversampling(df_training, ARG_OVERSAMPLING_GAMMA_THRESHOLD, ARG_OVERSAMPLING_FACTOR)

    training_filenames, training_gamma = _as_lists(df_training)
    validation_filenames, validation_gamma = _as_lists(df_validation)

    if df_testing is None:
        write_dataset(tfdataset_dir, training_filenames, training_gamma, validation_filenames, validation_gamma)
    else:
        testing_filenames, testing_gamma = _as_lists(df_testing)
        write_dataset(tfdataset_dir, training_filenames, training_gamma, validation_filenames, validation_gamma,
                      testing_filenames, testing_gamma)

In [10]:
def do_build_cv_partition():
    """Write one train/validation dataset per cross-validation fold.

    Shuffles the module-level dataframe ``df_dcm_out`` with ARG_RANDOM_SEED,
    splits its 'dicom_full_filepath' and 'gamma_index' columns with a
    shuffled ``KFold`` of ARG_NUM_CV_FOLDS folds and, for fold ``k``, hands
    the resulting lists to ``write_dataset`` with the directory
    ``tfdataset_dir + ".fold" + str(k)`` (created when missing).

    Side effects:
        Rebinds the global ``df_dcm_out`` to its shuffled version.
    """
    global df_dcm_out

    print(f"Create dataset for cross-validation with k={ARG_NUM_CV_FOLDS}")

    print(f"Reading images from {ARG_DATASET_DIR}")
    print(f'Total images = {len(df_dcm_out)}')
    print(f'random seed partition = {ARG_RANDOM_SEED}')

    # Shuffle introduced when the 2018 and 2019 maps were merged; it was not
    # done while only the 2019 maps were used.
    df_dcm_out = shuffle(df_dcm_out, random_state=ARG_RANDOM_SEED)

    filenames = df_dcm_out['dicom_full_filepath']
    gamma = df_dcm_out['gamma_index']

    splitter = KFold(ARG_NUM_CV_FOLDS, shuffle=True, random_state=ARG_RANDOM_SEED)
    for fold, (train_idx, valid_idx) in enumerate(splitter.split(filenames, gamma)):
        # The split yields positional indices, hence .iloc.
        fold_train_files = filenames.iloc[train_idx]
        fold_valid_files = filenames.iloc[valid_idx]

        print('first 5 filenames from training and validation:')
        print(fold_train_files.head(5))
        print(fold_valid_files.head(5))

        # write_dataset works with plain lists.
        training_filenames = fold_train_files.tolist()
        training_gamma = gamma.iloc[train_idx].tolist()
        validation_filenames = fold_valid_files.tolist()
        validation_gamma = gamma.iloc[valid_idx].tolist()

        print(f'\nfold={fold}, train size={len(training_filenames)}, validation size={len(validation_filenames)}')

        # Every fold goes to its own sibling directory of tfdataset_dir.
        fold_dir = f"{tfdataset_dir}.fold{fold}"
        if not tf.io.gfile.exists(fold_dir):
            tf.io.gfile.mkdir(fold_dir)

        write_dataset(fold_dir, training_filenames, training_gamma, validation_filenames, validation_gamma)

In [11]:
# Build the dataset in the layout selected by ARG_CROSS_VALIDATION_ENABLED:
# one dataset per fold, or a single train/validation(/test) partition.
if ARG_CROSS_VALIDATION_ENABLED:
    do_build_cv_partition()
else:
    do_build_simple_partition()

Create dataset for cross-validation with k=5
Reading images from /hdd/data/radioterapia/ciolaplata
Total images = 1588
random seed partition = 23456
first 5 filenames from training and validation:
2384    /hdd/data/radioterapia/ciolaplata/2017/1.3.6.1...
2116    /hdd/data/radioterapia/ciolaplata/2017/1.3.6.1...
2088    /hdd/data/radioterapia/ciolaplata/2017/1.3.6.1...
2342    /hdd/data/radioterapia/ciolaplata/2017/1.3.6.1...
4242    /hdd/data/radioterapia/ciolaplata/2019/1.3.6.1...
Name: dicom_full_filepath, dtype: object
1613    /hdd/data/radioterapia/ciolaplata/2017/1.3.6.1...
1914    /hdd/data/radioterapia/ciolaplata/2017/1.3.6.1...
1580    /hdd/data/radioterapia/ciolaplata/2017/1.3.6.1...
1682    /hdd/data/radioterapia/ciolaplata/2017/1.3.6.1...
6181    /hdd/data/radioterapia/ciolaplata/2018/1.3.6.1...
Name: dicom_full_filepath, dtype: object

fold=0, train size=1270, validation size=318
Writing TF records to /hdd/data/radioterapia/ciolaplata/tfds.2019-2018-2017.localnorm.DS10%.234

In [12]:
# Colab only: flush pending writes and unmount Google Drive so that the files
# written in this session show up in Drive. `drive` is the module imported by
# the Colab setup cell at the top of the notebook.
if isGoogleColab():
    drive.flush_and_unmount()
    print('All changes made in this colab session should now be visible in Drive.')