<a href="https://colab.research.google.com/github/pablojrios/fluence_maps/blob/master/create_tfrecords/tf2_create_tfrecords.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
def isGoogleColab():
    """Return True when the notebook is executing inside Google Colab.

    The check looks at the module that defines the active IPython shell class:
    'ipykernel.zmqshell' on a regular Jupyter server,
    'google.colab._shell' on Google Colab.

    Returns:
        bool: True on Google Colab, False anywhere else -- including a plain
        Python interpreter, where IPython's `get_ipython` is not injected.
    """
    try:
        shell = get_ipython()
    except NameError:
        # Not running under IPython at all (e.g. notebook exported as a script).
        return False
    return shell.__class__.__module__ == 'google.colab._shell'

In [2]:
# import lodgepole.image_tools as lit doesn't work, the following is equivalent
# from importlib.machinery import SourceFileLoader
# somemodule = SourceFileLoader('lit', '/content/lodgepole/lodgepole/image_tools.py').load_module()
import sys
import time
import tensorflow as tf

from dataset_utils import _dataset_exists, _get_filenames_and_gamma_values, _convert_dataset
from sklearn.utils import shuffle
from os import path
from tf2_oversampling_dicom_files import do_oversampling
import os

In [3]:
# Report the TensorFlow build in use and confirm eager execution is active.
print(f'Tensorflow version = {tf.__version__}')
print(f'Executing eagerly = {tf.executing_eagerly()}')

Tensorflow version = 2.2.0
Executing eagerly = True


In [4]:
if isGoogleColab():
    # if os.path.exists('lodgepole'):
    #     !rm -fr lodgepole

    # !git clone https://gitlab.com/brohrer/lodgepole.git
    # !pip install -e lodgepole

    %cd -q '/content'
    if os.path.exists('fluence_maps'):
        !rm -fr fluence_maps

    ## Install required dependencies
    !pip install -q pydicom

    GIT_USERNAME = "pablojrios"
    GIT_TOKEN = "1d88a0b85d2b00a03796e4d8b7e5f7b249b12f9b"
    !git clone -s https://{GIT_TOKEN}@github.com/{GIT_USERNAME}/fluence_maps.git

    from google.colab import drive
    drive.mount('/content/drive')
    
    ARG_DATASET_DIR='/content/drive/My Drive/Healthcare/Radioterapia/data/ciolaplata'

In [5]:
# =============================== ARGUMENTS ===============================
# Everything a user may want to tune before generating the TFRecords.

# Dataset root. On Google Colab it is set by the Colab setup cell instead.
if not isGoogleColab():
    ARG_DATASET_DIR = '/hdd/data/radioterapia/ciolaplata'

# Number of shards each partition is split into.
ARG_NUM_SHARDS = 4
# Fraction of the images reserved for the validation partition.
ARG_VALIDATION_SIZE = 0.2
# Seed used when shuffling, for repeatability.
ARG_RANDOM_SEED = 12345
# Output folder, created under ARG_DATASET_DIR.
ARG_TFDATASET_FOLDER = f'tfds.2019.localnorm.{ARG_RANDOM_SEED}.ovs97x3'

# CSV file with the gamma values, located under ARG_DATASET_DIR.
# Alternatives used in previous runs:
# ARG_DICOM_AND_GAMMA_CSV='codex.2018-2019.csv' # 3mm/3% maps with dose shift, years 2018 and 2019
# ARG_DICOM_AND_GAMMA_CSV = "codex-2019-3mm3%-doseshift.csv" # 3mm/3% maps with dose shift, year 2019
# ARG_DICOM_AND_GAMMA_CSV = "codex-2019-2mm2%-doseshift.csv" # 2mm/2% maps with dose shift, year 2019
# ARG_DICOM_AND_GAMMA_CSV = "codex-2019-3mm3%-doseshift-TR40%.csv" # maps with 40% tolerance adjustment
# ARG_DICOM_AND_GAMMA_CSV = "codex-2019-3mm3%-doseshift-40TH-localnorm.csv" # CODEX 3mm 3% 40TH local-norm maps, year 2019
# Active choice: CODEX 3mm 3% 40TH local-norm maps, year 2019, with the
# outliers (gamma <= 65%, 6 in total) removed.
ARG_DICOM_AND_GAMMA_CSV = "codex-2019-3mm3%-doseshift-40TH-localnorm-nooutliers.csv"

# Image type: 0 - RGB; 1 - Grayscale (color images converted to 3D grayscale,
# the channel is repeated 3 times); 2 - Dicom.
ARG_IMAGE_TYPE = 2
# When False, only the training and validation partitions are created.
ARG_TEST_PARTITION = False
# When True, copies of images are made (oversampling).
ARG_OVERSAMPLING = True
# Gamma threshold for oversampling, as a percentage.
ARG_OVERSAMPLING_GAMMA_THRESHOLD = 97.0
# Oversampling factor; 1 is 100%.
ARG_OVERSAMPLING_FACTOR = 3.0

In [6]:
# ================================ CHECKS ================================
# Both the dataset root and the output folder name are mandatory.
if ARG_DATASET_DIR == "":
    raise ValueError('dataset_dir is empty. Please state a dataset_dir argument.')

if ARG_TFDATASET_FOLDER == "":
    raise ValueError('tfdataset_folder is empty. Please state a tfdataset_dir argument.')

# TFRecords are written to <ARG_DATASET_DIR>/<ARG_TFDATASET_FOLDER>.
# If the TFRecord files already exist there, stop without creating them again;
# otherwise make sure the output directory exists.
tfdataset_dir = path.join(ARG_DATASET_DIR, ARG_TFDATASET_FOLDER)
if _dataset_exists(dataset_dir=tfdataset_dir, _NUM_SHARDS=ARG_NUM_SHARDS):
    print('Dataset files already exist. Exiting without re-creating them.')
    sys.exit()
elif not tf.io.gfile.exists(tfdataset_dir):
    tf.io.gfile.mkdir(tfdataset_dir)

print("Reading images from {} and writing TF records to {}".format(ARG_DATASET_DIR, tfdataset_dir))

# ============================= END OF CHECKS =============================
# Get a pandas dataframe of image full filenames and gamma index values.
df_dcm_out = _get_filenames_and_gamma_values(ARG_DICOM_AND_GAMMA_CSV, ARG_DATASET_DIR)

# Partition sizes. The optional test partition has the same size as the
# validation partition; whatever remains is used for training.
num_validation = int(ARG_VALIDATION_SIZE * len(df_dcm_out))
num_testing = num_validation if ARG_TEST_PARTITION else 0
num_training = len(df_dcm_out) - num_validation - num_testing

# Report the real training size (before oversampling): it must also exclude
# the test partition when one is requested.
print(f'\nNum. training images = {num_training}, num. validation images = {num_validation}')
if ARG_TEST_PARTITION:
    print(f'Num. test images = {num_testing}')

print(f'\nrandom seed partition = {ARG_RANDOM_SEED}')
# Shuffle with a fixed seed, then cut the shuffled frame into consecutive
# slices: [validation | test (optional) | training].
df_dcm_out = shuffle(df_dcm_out, random_state=ARG_RANDOM_SEED)
df_validation = df_dcm_out[:num_validation]
df_training = df_dcm_out[num_validation + num_testing:]
if ARG_TEST_PARTITION:
    df_testing = df_dcm_out[num_validation:num_validation + num_testing]

# Oversample, in the training partition only, the maps selected by the gamma
# threshold (see do_oversampling for the exact selection rule).
if ARG_OVERSAMPLING:
    df_training = do_oversampling(df_training, ARG_OVERSAMPLING_GAMMA_THRESHOLD, ARG_OVERSAMPLING_FACTOR)

# Convert to plain lists because a dataframe column is a pandas Series.
training_filenames = df_training['dicom_full_filepath'].to_list()
training_gamma = df_training['gamma_index'].to_list()
validation_filenames = df_validation['dicom_full_filepath'].to_list()
validation_gamma = df_validation['gamma_index'].to_list()
if ARG_TEST_PARTITION:
    testing_filenames = df_testing['dicom_full_filepath'].to_list()
    testing_gamma = df_testing['gamma_index'].to_list()

start = time.time()
# First, convert the training set.
_convert_dataset('train', training_filenames, training_gamma,
                 dataset_dir=tfdataset_dir, _NUM_SHARDS=ARG_NUM_SHARDS, image_type=ARG_IMAGE_TYPE)

# The validation set (and the equally sized test set) is written only when it
# is not empty.
if num_validation > 0:
    _convert_dataset('validation', validation_filenames, validation_gamma,
                     dataset_dir=tfdataset_dir, _NUM_SHARDS=ARG_NUM_SHARDS, image_type=ARG_IMAGE_TYPE)

    if ARG_TEST_PARTITION:
        _convert_dataset('test', testing_filenames, testing_gamma,
                         dataset_dir=tfdataset_dir, _NUM_SHARDS=ARG_NUM_SHARDS, image_type=ARG_IMAGE_TYPE)

# `end` holds the elapsed wall-clock time in seconds, not a timestamp.
end = time.time() - start
print(f'\nFinished converting the dataset in {end} seconds')

Reading images from /hdd/data/radioterapia/ciolaplata and writing TF records to /hdd/data/radioterapia/ciolaplata/tfds.2019.localnorm.12345.ovs97x3

Num. training images = 978, num. validation images = 244

random seed partition = 12345
Hay 148 mapas con un gamma menor o igual que 97.0 y 830 con un gamma mayor, sobre un total de 978 mapas.
Oversampling factor es 3.00, se van a hacer 444 copias al azar de mapas con un gamma menor o igual que 97.0.
Ovesampling completado, total de mapas despues del oversampling: 1422.
>> Image 1422/1422 with 3851 bytes in shard 3 converted
>> Image 244/244 with 4601 bytes in shard 3 converted

Finished converting the dataset in 4.894845485687256 seconds


In [7]:
# On Google Colab only: flush pending writes and unmount Drive so that the
# files written during this session are persisted.
if isGoogleColab():
    drive.flush_and_unmount()
    # Let the user know it is now safe to look for the results in Drive.
    print('All changes made in this colab session should now be visible in Drive.')