In [2]:
import math
import os
import random
import tarfile
import urllib

from absl import app
from absl import flags
import tensorflow as tf
import pandas as pd
import numpy as np

In [3]:
# Number of TFRecord shard files written for each split.
TRAINING_SHARDS = 32
VALIDATION_SHARDS = 16

# File-name prefixes of the shard files of each split (these are passed as
# `prefix` to `_process_dataset`, not used as sub-directories).
TRAINING_DIRECTORY = 'train'
VALIDATION_DIRECTORY = 'valid'

# Root prepended to every dataset-relative path (the CSV files and the image
# paths listed inside them).
path = './../../../../main/'

In [4]:
def _int64_feature(value):
  """Build an int64 `tf.train.Feature`.

  Args:
    value: a list of integers, or a single value which is wrapped in a
      one-element list.
  Returns:
    A `tf.train.Feature` holding an `Int64List`.
  """
  values = value if isinstance(value, list) else [value]
  int64_list = tf.train.Int64List(value=values)
  return tf.train.Feature(int64_list=int64_list)

def _floats_feature(value):
  """Build a float `tf.train.Feature` from an array-like of numbers.

  Args:
    value: numpy array, (nested) list/tuple or scalar. It is converted with
      `np.asarray` first, so inputs without a `reshape` method (e.g. the
      plain list documented by `_convert_to_example`) no longer raise
      AttributeError.
  Returns:
    A `tf.train.Feature` holding a `FloatList` with the flattened values.
  """
  flat_values = np.asarray(value).reshape(-1)
  return tf.train.Feature(float_list=tf.train.FloatList(value=flat_values))

def _bytes_feature(value):
  """Build a bytes `tf.train.Feature` holding the single given value."""
  bytes_list = tf.train.BytesList(value=[value])
  return tf.train.Feature(bytes_list=bytes_list)

def _check_or_create_dir(directory):
  """Create `directory` (via `tf.gfile.MakeDirs`) unless it already exists."""
  if tf.gfile.Exists(directory):
    return
  tf.gfile.MakeDirs(directory)

In [5]:
def _convert_to_example(filename, image_buffer, label, height, width):
  """Assemble the `tf.train.Example` proto for one image.

  Args:
    filename: string, path to the image file. Accepted for interface
      compatibility; it is not stored in the proto.
    image_buffer: string, JPEG encoding of RGB image
    label: ground-truth values for the network, stored as a float list via
      `_floats_feature`
    height: integer, image height in pixels
    width: integer, image width in pixels
  Returns:
    Example proto with the 'image/*' features listed below.
  """
  # Colorspace, channel count and format are fixed for every record.
  feature_map = {
      'image/height': _int64_feature(height),
      'image/width': _int64_feature(width),
      'image/colorspace': _bytes_feature(b'RGB'),
      'image/channels': _int64_feature(3),
      'image/label': _floats_feature(label),
      'image/format': _bytes_feature(b'JPEG'),
      'image/encoded': _bytes_feature(image_buffer),
  }
  features = tf.train.Features(feature=feature_map)
  return tf.train.Example(features=features)

In [6]:
class ImageCoder(object):
  """Decodes JPEG bytes into RGB arrays through one shared TensorFlow session."""

  def __init__(self):
    # Graph input (raw JPEG bytes) and the op that decodes it to 3 channels.
    jpeg_input = tf.placeholder(dtype=tf.string)
    self._decode_jpeg_data = jpeg_input
    self._decode_jpeg = tf.image.decode_jpeg(jpeg_input, channels=3)

    # A single session serves every decode call made through this coder.
    self._sess = tf.Session()

  def decode_jpeg(self, image_data):
    """Run the decode op on `image_data` and return the decoded image.

    Args:
      image_data: JPEG-encoded bytes fed to the decode placeholder.
    Returns:
      The session result for the decode op; asserted to be rank 3 with
      3 channels on the last axis.
    Raises:
      AssertionError: if the decoded image is not rank 3 with 3 channels.
    """
    feeds = {self._decode_jpeg_data: image_data}
    decoded = self._sess.run(self._decode_jpeg, feed_dict=feeds)
    decoded_shape = decoded.shape
    assert len(decoded_shape) == 3
    assert decoded_shape[2] == 3
    return decoded

In [7]:
def _process_image(filename, coder):
  """Read a single image file and measure it.

  Args:
    filename: string, image path relative to the module-level `path` root,
      e.g. 'CheXpert-v1.0-small/train/patient00001/study1/...'.
    coder: instance of ImageCoder to provide TensorFlow image coding utils.
  Returns:
    image_buffer: string, the JPEG bytes exactly as read from disk.
    height: integer, height in pixels of the decoded image.
    width: integer, width in pixels of the decoded image.
  Raises:
    AssertionError: if the decoded image is not rank 3 with 3 channels.
  """
  # Read the image file; paths are relative to the dataset root.
  filename = path+filename
  with tf.gfile.FastGFile(filename, 'rb') as f:
    image_data = f.read()

  # Decode the RGB JPEG. The decoded array is used only to validate the file
  # and to measure it: the bytes written to the record are the original JPEG.
  # No resize op is created here. The previous `tf.image.resize_images` call
  # added a new op to the graph for every image, and its result was used only
  # to read height/width, so every record was stamped 256x256 regardless of
  # the size of the encoded image.
  image = coder.decode_jpeg(image_data)

  # Check that image converted to RGB
  assert len(image.shape) == 3
  assert image.shape[2] == 3
  # Plain ints so the values can go straight into an int64 feature.
  height = int(image.shape[0])
  width = int(image.shape[1])

  return image_data, height, width

In [8]:
def _process_image_files_batch(coder, output_file, filenames, labels):
  """Processes and saves list of images as one TFRecord file.

  Args:
    coder: instance of ImageCoder to provide TensorFlow image coding utils.
    output_file: string, path of the TFRecord file to write.
    filenames: list of strings; each string is a path to an image file
    labels: sequence of labels, paired positionally with `filenames`
      (`labels[i]` is stored with `filenames[i]`).
  """
  # The context manager closes the writer even when an image fails to
  # process or the run is interrupted, so the file handle is not leaked.
  with tf.python_io.TFRecordWriter(output_file) as writer:
    for filename, label in zip(filenames, labels):
      image_buffer, height, width = _process_image(filename, coder)
      example = _convert_to_example(filename, image_buffer, label,
                                    height, width)
      writer.write(example.SerializeToString())

In [9]:
def _process_dataset(filenames, labels, output_directory, prefix,
                     num_shards):
  """Processes and saves list of images as sharded TFRecords.

  Args:
    filenames: list of strings; each string is a path to an image file
    labels: sequence (e.g. numpy array) of labels aligned index-for-index
      with `filenames`
    output_directory: path where output files should be created
    prefix: string; prefix for each file
    num_shards: number of chunks to split the filenames into

  Returns:
    files: list of tf-record filepaths created from processing the dataset.

  Raises:
    ValueError: if `filenames` and `labels` differ in length.
  """
  if len(filenames) != len(labels):
    raise ValueError(
        'filenames and labels must have the same length, got %d and %d'
        % (len(filenames), len(labels)))

  _check_or_create_dir(output_directory)
  # Integer ceiling division. `math.ceil(a / b)` truncates first under
  # Python 2 integer division, which made the chunks too small and silently
  # dropped the trailing files.
  chunksize = (len(filenames) + num_shards - 1) // num_shards
  coder = ImageCoder()

  files = []

  for shard in range(num_shards):
    start = shard * chunksize
    stop = start + chunksize
    chunk_files = filenames[start:stop]
    # Slice the labels with the same bounds as the files. Passing the full
    # label array paired every shard after the first with the labels that
    # belong to the first chunk.
    chunk_labels = labels[start:stop]
    output_file = os.path.join(
        output_directory, '%s-%.5d-of-%.5d' % (prefix, shard, num_shards))
    _process_image_files_batch(coder, output_file, chunk_files,
                               chunk_labels)
    tf.logging.info('Finished writing file: %s' % output_file)
    files.append(output_file)
  return files

In [10]:
# Load the two label tables and tag each row with the split it came from.
train = pd.read_csv(path+'./CheXpert-v1.0-small/train.csv')
valid = pd.read_csv(path+'./CheXpert-v1.0-small/valid.csv')
train['validation'] = False
valid['validation'] = True

# Keep only the image path, the five target findings and the split flag.
columns = ['Path', 'Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Pleural Effusion', 'validation']
df = pd.concat([train, valid])[columns]

# Labels equal to -1 (presumably the "uncertain" marker; confirm against the
# dataset documentation) become 1 for these findings...
for feature in ['Atelectasis', 'Edema']:
    df[feature] = df[feature].replace(-1, 1)

# ...and 0 for these.
for feature in ['Cardiomegaly', 'Consolidation', 'Pleural Effusion']:
    df[feature] = df[feature].replace(-1, 0)

In [11]:
# Replace every remaining NaN in `df` with 0, in place (presumably treating
# unlabeled findings as negative; confirm this is the intended policy).
df.fillna(0, inplace=True)

In [12]:
df.head()

Unnamed: 0,Path,Atelectasis,Cardiomegaly,Consolidation,Edema,Pleural Effusion,validation
0,CheXpert-v1.0-small/train/patient00001/study1/...,0.0,0.0,0.0,0.0,0.0,False
1,CheXpert-v1.0-small/train/patient00002/study2/...,1.0,0.0,0.0,1.0,0.0,False
2,CheXpert-v1.0-small/train/patient00002/study1/...,0.0,0.0,0.0,0.0,0.0,False
3,CheXpert-v1.0-small/train/patient00002/study1/...,0.0,0.0,0.0,0.0,0.0,False
4,CheXpert-v1.0-small/train/patient00003/study1/...,0.0,0.0,0.0,1.0,0.0,False


In [16]:
def convert_to_tf_records(_df, out_dir='./tfrecord_full'):
  """Convert the labelled image table into sharded TF-Record dumps.

  Args:
    _df: DataFrame with a 'Path' column, a boolean 'validation' column and
      one column for each of the five label names listed below.
    out_dir: directory in which the shard files of both splits are created.

  Returns:
    (training_records, validation_records): the lists of shard file paths
    returned by `_process_dataset` for each split.
  """
  label_columns = ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema',
                   'Pleural Effusion']

  train_rows = _df[~_df.validation]
  training_files = train_rows['Path'].tolist()
  training_labels = np.array(train_rows[label_columns])

  valid_rows = _df[_df.validation]
  validation_files = valid_rows['Path'].tolist()
  validation_labels = np.array(valid_rows[label_columns])

  # Shuffle training records to ensure we are distributing classes
  # across the batches. The seed is fixed so the order is reproducible.
  # Files and labels are permuted with the same index order so each image
  # keeps its own label. (The shuffle helper was previously defined but never
  # called, and `random.shuffle(range(n))` fails on Python 3.)
  random.seed(0)
  shuffle_idx = list(range(len(training_files)))
  random.shuffle(shuffle_idx)
  training_files = [training_files[i] for i in shuffle_idx]
  training_labels = training_labels[shuffle_idx]

  # Create training data
  tf.logging.info('Processing the training data.')
  training_records = _process_dataset(
      training_files, training_labels, out_dir,
      TRAINING_DIRECTORY, TRAINING_SHARDS)

  # Create validation data (kept in its original order)
  tf.logging.info('Processing the validation data.')
  validation_records = _process_dataset(
      validation_files, validation_labels, out_dir,
      VALIDATION_DIRECTORY, VALIDATION_SHARDS)

  return training_records, validation_records

In [17]:
# Run the full conversion for both splits. `out_dir` is left at its default,
# so the shard files are written under './tfrecord_full'.
training_records, validation_records = convert_to_tf_records(_df=df)

KeyboardInterrupt: 

In [None]:
import tensorflow as tf

# Sanity check: parse the first record of the first training shard.
# The shard is read from './tfrecord_full', the default `out_dir` of
# `convert_to_tf_records`; nothing in this notebook writes './tfrecord_out'.
# `result` is None when the shard holds no records, so the next cell never
# hits an undefined name.
first_record = next(
    tf.python_io.tf_record_iterator("./tfrecord_full/train-00000-of-00032"),
    None)
result = (None if first_record is None
          else tf.train.Example.FromString(first_record))

In [None]:
result