# List files from Cloud Storage

In [1]:
import tensorflow as tf

In [2]:
tf.enable_eager_execution()

In [3]:
AUTOTUNE = tf.data.experimental.AUTOTUNE

In [4]:
from google.cloud import storage
import pandas as pd
import matplotlib.pyplot as plt

In [5]:
import numpy as np

In [6]:
def list_blobs(bucket_name, prefix):
    """Return the names of the blobs listed for `bucket_name` under `prefix`.

    Args:
        bucket_name: Name of the Cloud Storage bucket to list.
        prefix: Passed to the listing call as its `prefix` filter.

    Returns:
        A list with the `name` of every blob yielded by the listing call.
    """
    # Note: Client.list_blobs requires at least package version 1.17.0.
    client = storage.Client()
    return [blob.name for blob in client.list_blobs(bucket_name, prefix=prefix)]

In [None]:
prefix = 'datasets/xray-chest-nih/images'
bucket_name = 'tensorflow-samples'

In [None]:
images_paths = list_blobs(bucket_name, prefix)

# Filter image paths to create the dataset

In [7]:
images_prefix = 'gs://tensorflow-samples/datasets/xray-chest-nih/images/'

In [8]:
labels_path = 'gs://tensorflow-samples/datasets/xray-chest-nih/labels/labels.csv'
df_files = pd.read_csv(labels_path)

In [9]:
# Filter images with label Effusion (any row whose label text contains it)
is_effusion = df_files['Finding Labels'].str.contains('Effusion')

# Keep only image name and label. The explicit .copy() makes this an
# independent frame, so the column assignments below never write into a
# slice of `df_files` (avoids pandas' SettingWithCopyWarning).
effusion_images = df_files.loc[is_effusion, ['Image Index', 'Finding Labels']].copy()
# Turn the bare file name into a full gs:// path.
effusion_images['Image Index'] = images_prefix + effusion_images['Image Index']
# Replace the label text with the positive class id.
effusion_images['Finding Labels'] = 1

In [10]:
# Filter images with label No Finding
is_no_finding = df_files['Finding Labels'].str.contains('No Finding')

# Keep only image name and label. The explicit .copy() makes this an
# independent frame, so the column assignments below never write into a
# slice of `df_files` (avoids pandas' SettingWithCopyWarning).
nofindings_images = df_files.loc[is_no_finding, ['Image Index', 'Finding Labels']].copy()
# Turn the bare file name into a full gs:// path.
nofindings_images['Image Index'] = images_prefix + nofindings_images['Image Index']
# Replace the label text with the negative class id.
nofindings_images['Finding Labels'] = 0

In [11]:
# Only a subset of the main dataset: keep the first 20000 "No Finding" rows.
nofindings_images = nofindings_images.head(20000)

#### Let's split each class into train/val/test and concatenate the lists

In [14]:
effusion_range = len(effusion_images)
nofindings_range = len(nofindings_images)

Train Images and Labels

In [17]:
# First 90% of each class (in frame order): Effusion rows, then No Finding rows.
train_images = (
    list(effusion_images['Image Index'].iloc[:int(effusion_range * 0.90)])
    + list(nofindings_images['Image Index'].iloc[:int(nofindings_range * 0.90)])
)

In [18]:
# Labels for the same rows, in the same order, as the training image paths.
train_labels = (
    list(effusion_images['Finding Labels'].iloc[:int(effusion_range * 0.90)])
    + list(nofindings_images['Finding Labels'].iloc[:int(nofindings_range * 0.90)])
)

Val Images and Labels

In [19]:
# Rows from the 90% mark up to the 97% mark of each class form the validation split.
val_images = (
    list(effusion_images['Image Index']
         .iloc[int(effusion_range * 0.90):int(effusion_range * 0.97)])
    + list(nofindings_images['Image Index']
           .iloc[int(nofindings_range * 0.90):int(nofindings_range * 0.97)])
)

In [20]:
# Labels for the same rows, in the same order, as the validation image paths.
val_labels = (
    list(effusion_images['Finding Labels']
         .iloc[int(effusion_range * 0.90):int(effusion_range * 0.97)])
    + list(nofindings_images['Finding Labels']
           .iloc[int(nofindings_range * 0.90):int(nofindings_range * 0.97)])
)

Test Images and Labels

In [21]:
# Everything from the 97% mark onwards of each class forms the test split.
test_images = (
    list(effusion_images['Image Index'].iloc[int(effusion_range * 0.97):])
    + list(nofindings_images['Image Index'].iloc[int(nofindings_range * 0.97):])
)

In [22]:
# Labels for the same rows, in the same order, as the test image paths.
test_labels = (
    list(effusion_images['Finding Labels'].iloc[int(effusion_range * 0.97):])
    + list(nofindings_images['Finding Labels'].iloc[int(nofindings_range * 0.97):])
)

In [36]:
# Report the number of examples in the train, val and test splits (one per line).
for split_paths in (train_images, val_images, test_images):
    print(len(split_paths))

29985
2332
1000


# Generate TFRecord with images

In [42]:
# First let's create a tf.data.dataset from file paths.
# The training pairs are shuffled with a fixed seed and without reshuffling on
# every iteration: this dataset is iterated more than once when it is split
# into shards, and a shuffle that yields a new order on each pass would make
# the shards overlap and drop examples.
SHUFFLE_SEED = 42
train_dataset = (
    tf.data.Dataset.from_tensor_slices((train_images, train_labels))
    .shuffle(buffer_size=len(train_images),
             seed=SHUFFLE_SEED,
             reshuffle_each_iteration=False)
)
val_dataset = tf.data.Dataset.from_tensor_slices((val_images, val_labels))
test_dataset = tf.data.Dataset.from_tensor_slices((test_images, test_labels))

In [43]:
def process_image_tfrecord(image_path, label):
    """Read a PNG file, resize it to 512x512, keep one channel and re-encode it as PNG.

    Args:
        image_path: Path of the PNG file to read.
        label: Label associated with the image; returned unchanged.

    Returns:
        A tuple (encoded_png, label) where `encoded_png` is the PNG encoding of
        the resized single-channel image.
    """
    raw_bytes = tf.io.read_file(image_path)
    decoded = tf.io.decode_png(raw_bytes)
    resized = tf.image.resize(decoded, [512, 512], method='nearest')
    # Keep only the first channel while preserving a trailing channel axis of size 1.
    first_channel = resized[:, :, 0:1]
    return tf.image.encode_png(first_channel), label

In [44]:
def build_dataset_tfrecord(paths_dataset):
    """Map every (image_path, label) element through `process_image_tfrecord`.

    Args:
        paths_dataset: Dataset whose elements are (image_path, label) pairs.

    Returns:
        The mapped dataset; the map runs with `num_parallel_calls=AUTOTUNE`.
    """
    return paths_dataset.map(process_image_tfrecord, num_parallel_calls=AUTOTUNE)

In [45]:
def tf_serialize_example(image, label):
    """Serialize one (image, label) pair as a `tf.train.Example` byte string.

    Args:
        image: Encoded image bytes, either as a plain bytes value or as an
            eager tensor holding them.
        label: Value stored in the int64 'label' feature.

    Returns:
        The serialized `tf.train.Example` with the features 'image'
        (bytes list) and 'label' (int64 list).
    """
    # BytesList won't unpack a string from an EagerTensor, so extract the raw value first.
    if isinstance(image, type(tf.constant(0))):
        image = image.numpy()

    features = tf.train.Features(feature={
        'image': tf.train.Feature(bytes_list=tf.train.BytesList(value=[image])),
        'label': tf.train.Feature(int64_list=tf.train.Int64List(value=[label])),
    })
    return tf.train.Example(features=features).SerializeToString()

In [46]:
# Create TFRecord with `n_shards` shards (GZIP)
def create_tfrecord(ds, n_shards, name):
    """Write `ds` into `n_shards` GZIP-compressed TFRecord files.

    The dataset is traversed exactly once and its elements are dealt
    round-robin to the shard files (element k goes to shard k % n_shards).
    Iterating `ds` once per shard instead is unsafe when `ds` is shuffled:
    every new iteration may produce a different order, so shards could
    contain repeated examples while other examples are never written.

    Args:
        ds: Dataset of (image_path, label) pairs, as accepted by
            `build_dataset_tfrecord`.
        n_shards: Number of output files; must be at least 1.
        name: Prefix of the output files, which are named
            '{name}-output_file-part-{i}.tfrecord'.

    Raises:
        ValueError: If `n_shards` is smaller than 1.
    """
    from contextlib import ExitStack

    if n_shards < 1:
        raise ValueError('n_shards must be >= 1, got {}'.format(n_shards))

    # ExitStack closes every writer, even if writing fails part-way through.
    with ExitStack() as stack:
        writers = []
        for i in range(n_shards):
            path = '{name}-output_file-part-{i}.tfrecord'.format(i=i, name=name)
            # Report the real output path (including the `name` prefix).
            print('Creating TFRecord ... {path}'.format(path=path))
            writers.append(stack.enter_context(tf.io.TFRecordWriter(path, 'GZIP')))

        for k, (image, label) in enumerate(build_dataset_tfrecord(ds)):
            writers[k % n_shards].write(tf_serialize_example(image, label))

In [None]:
# (dataset, number of shards, file-name prefix) for each split.
for split_ds, split_shards, split_name in (
    (train_dataset, 30, 'train'),
    (val_dataset, 6, 'val'),
    (test_dataset, 4, 'test'),
):
    create_tfrecord(split_ds, split_shards, split_name)

In [None]:
# Copy the locally written shard files to the bucket folders read later on.
# NOTE(review): the train upload is commented out while val/test are still
# copied — confirm whether the train shards should be uploaded as well.
# !gsutil -m cp train*.tfrecord gs://tensorflow-samples/datasets/xray-chest-nih/tfrecords/train/
!gsutil -m cp val*.tfrecord gs://tensorflow-samples/datasets/xray-chest-nih/tfrecords/val/
!gsutil -m cp test*.tfrecord gs://tensorflow-samples/datasets/xray-chest-nih/tfrecords/test/

# Consume TFRecords and create tf.data.Dataset

In [215]:
# Glob patterns matching the TFRecord shards of each split.
TRAIN_TFRECORDS = 'gs://tensorflow-samples/datasets/xray-chest-nih/tfrecords/train/*'
VAL_TFRECORDS = 'gs://tensorflow-samples/datasets/xray-chest-nih/tfrecords/val/*'
TEST_TFRECORDS = 'gs://tensorflow-samples/datasets/xray-chest-nih/tfrecords/test/*'
BATCH_SIZE = 32
# Number of whole batches per epoch. Derived from BATCH_SIZE (instead of a
# second hard-coded 32) so the step counts follow any batch-size change.
# NOTE: relies on `train_images` / `val_images` from the split cells.
STEPS_TRAIN = len(train_images) // BATCH_SIZE
STEPS_VAL = len(val_images) // BATCH_SIZE

In [120]:
@tf.function
def parse_function(example_proto):
    """Parse serialized `tf.Example` protos into a dict of tensors.

    Args:
        example_proto: Serialized `tf.Example` protos to parse.

    Returns:
        A dict with the keys 'image' (tf.string) and 'label' (tf.int64).
    """
    # Schema of the records: one scalar string and one scalar int64 per example.
    schema = {
        'image': tf.io.FixedLenFeature([], tf.string),
        'label': tf.io.FixedLenFeature([], tf.int64),
    }
    return tf.io.parse_example(example_proto, schema)

In [121]:
@tf.function
def process_image(record):
    """Decode a batch of PNG-encoded images and convert them to float32.

    Args:
        record: Dict with the keys 'image' (batch of PNG-encoded strings)
            and 'label'.

    Returns:
        A tuple (images, labels): `images` is the decoded batch converted to
        tf.float32 with `tf.image.convert_image_dtype`; `labels` is
        `record['label']` unchanged.
    """
    # decode_png works on a single encoded string, so it is mapped over the batch.
    images = tf.map_fn(tf.io.decode_png, record['image'], dtype=tf.uint8)
    # convert_image_dtype is elementwise and accepts the whole batch at once,
    # so no second map_fn is needed.
    images = tf.image.convert_image_dtype(images, dtype=tf.float32)

    return images, record['label']

In [136]:
# Convert grayscale => RGB to use InceptionV3
# NOT USED
@tf.function
def grayscale_to_rgb(images, labels):
    """Convert `images` with `tf.image.grayscale_to_rgb`; pass `labels` through unchanged."""
    return tf.image.grayscale_to_rgb(images), labels

In [123]:
@tf.function
def get_tfrecord(filename):
    """Open `filename` as a GZIP-compressed `TFRecordDataset`.

    Args:
        filename: Path (or paths) of the TFRecord file(s) to read.

    Returns:
        A `tf.data.TFRecordDataset` created with `num_parallel_reads=AUTOTUNE`.
    """
    reader_options = dict(compression_type='GZIP', num_parallel_reads=AUTOTUNE)
    return tf.data.TFRecordDataset(filename, **reader_options)

In [137]:
def build_dataset(dataset, batch_size=BATCH_SIZE):
    """Turn a dataset of TFRecord file names into an endless dataset of batches.

    Args:
        dataset: Dataset of TFRecord file names.
        batch_size: Number of records per batch.

    Returns:
        A repeated, prefetched dataset whose elements are the
        (images, labels) batches produced by `process_image`.
    """
    return (
        dataset
        # Read the records of the individual files, interleaved.
        .interleave(get_tfrecord, num_parallel_calls=AUTOTUNE)
        # Batch first so parsing and decoding operate on whole batches.
        .batch(batch_size=batch_size)
        .map(parse_function, num_parallel_calls=AUTOTUNE)
        .map(process_image, num_parallel_calls=AUTOTUNE)
        .repeat()
        # Pipeline next iteration
        .prefetch(buffer_size=AUTOTUNE)
    )

In [138]:
# Expand each glob pattern into the list of matching TFRecord files.
train_files, val_files, test_files = (
    tf.io.gfile.glob(pattern)
    for pattern in (TRAIN_TFRECORDS, VAL_TFRECORDS, TEST_TFRECORDS)
)

# One dataset of file names per split.
train_filenames_dataset, val_filenames_dataset, test_filenames_dataset = (
    tf.data.Dataset.from_tensor_slices(files)
    for files in (train_files, val_files, test_files)
)

In [139]:
# Build the batched input pipeline of every split from its file-name dataset.
train_dataset, val_dataset, test_dataset = (
    build_dataset(filenames)
    for filenames in (train_filenames_dataset,
                      val_filenames_dataset,
                      test_filenames_dataset)
)

# Train Model

In [180]:
IMG_SHAPE = (512,512,1)

In [181]:
img_inputs = tf.keras.Input(shape=IMG_SHAPE)

In [182]:
def conv_group(inputs, filters):
    """Apply one convolutional group to `inputs`.

    The group is Conv2D(`filters`, 5x5, no bias) -> BatchNormalization ->
    ReLU -> MaxPool2D(3x3 window, stride 2).

    Args:
        inputs: Tensor produced by the previous layer.
        filters: Number of filters of the convolution.

    Returns:
        The output tensor of the max-pooling layer.
    """
    x = tf.keras.layers.Conv2D(filters, (5,5), use_bias=False)(inputs)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Activation('relu')(x)
    return tf.keras.layers.MaxPool2D((3,3),(2,2))(x)


# Groups 1 to 4: identical structure, doubling the filter count each time.
x = img_inputs
for group_filters in (64, 128, 256, 512):
    x = conv_group(x, group_filters)

In [186]:
# 5th group
x = tf.keras.layers.GlobalAvgPool2D()(x)

In [187]:
# Classification
# Single output unit; no activation is specified here, and the loss used when
# compiling the model is configured with from_logits=True.
outputs = tf.keras.layers.Dense(1)(x)

In [190]:
model = tf.keras.Model(img_inputs, outputs)

In [192]:
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_11 (InputLayer)        [(None, 512, 512, 1)]     0         
_________________________________________________________________
conv2d_204 (Conv2D)          (None, 508, 508, 64)      1600      
_________________________________________________________________
batch_normalization_204 (Bat (None, 508, 508, 64)      256       
_________________________________________________________________
activation_202 (Activation)  (None, 508, 508, 64)      0         
_________________________________________________________________
max_pooling2d_22 (MaxPooling (None, 253, 253, 64)      0         
_________________________________________________________________
conv2d_205 (Conv2D)          (None, 249, 249, 128)     204800    
_________________________________________________________________
batch_normalization_205 (Bat (None, 249, 249, 128)     512   

In [218]:
def scheduler(epoch):
    """Return the learning rate to use for the given epoch index.

    Schedule (plateau = 0.1 * (32/512) / 5):
        epoch <  6 : epoch * plateau (linear ramp)
        epoch < 30 : plateau
        epoch < 60 : plateau / 10
        otherwise  : plateau / 100

    The boundaries are contiguous so that every epoch gets a float: with
    strict `>` / `<` comparisons on both sides, epochs 6, 30, 60 and
    everything from 90 on matched no branch and the function returned None.

    Args:
        epoch: Epoch index (non-negative integer).

    Returns:
        The learning rate as a float.
    """
    if epoch < 6:
        # NOTE(review): the ramp yields 0.0 for epoch 0 and reaches
        # 5 * plateau at epoch 5, then drops back to plateau — confirm
        # that this warm-up shape is intended.
        return epoch*0.1*(32/512)/5
    if epoch < 30:
        return 0.1*(32/512)/5
    if epoch < 60:
        return (0.1*(32/512)/5)/10
    # Keep the final rate for every remaining epoch (including >= 90).
    return (0.1*(32/512)/5)/100

callback = tf.keras.callbacks.LearningRateScheduler(scheduler)

In [219]:
# Callbacks are an argument of model.fit(), not of model.compile(): passed to
# compile() they are never invoked (and newer Keras versions reject the
# unknown keyword), so the kwarg is dropped here.
# from_logits=True because the model's final Dense(1) layer has no activation.
model.compile(optimizer=tf.keras.optimizers.SGD(momentum=0.9),
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [220]:
# NOTE(review): the LearningRateScheduler `callback` is not passed to fit(),
# so the schedule is not applied in this run. Before wiring it in with
# callbacks=[callback], note that scheduler(0) evaluates to 0.0, i.e. a
# single-epoch run would train with a zero learning rate — confirm intent.
fit_options = dict(
    epochs=1,
    validation_data=val_dataset,
    steps_per_epoch=STEPS_TRAIN,
    validation_steps=STEPS_VAL,
)
history = model.fit(train_dataset, **fit_options)

Train for 937 steps, validate for 72 steps
122/937 [==>...........................] - ETA: 13:41 - loss: 0.6524 - acc: 0.6335

KeyboardInterrupt: 