# Step ONE

Read images one by one from a Google Cloud Storage bucket (gs).

Note that this may not be very efficient as we have a lot of small files to read (random reads).

In [1]:
import tensorflow as tf

In [2]:
from numpy import zeros
import numpy as np
from datetime import datetime

In [3]:
# Enable XLA jit graph compilation
# Performance gains for fixed size images
tf.config.optimizer.set_jit(True)

In [38]:
SOURCE = 'gs://renatoleite-tf-datapipeline-poc/*/*'
RESOLUTION = (224,224)
NUM_TOTAL_IMAGES = 20500
IMG_SHAPE=(224,224,3)

AUTOTUNE = tf.data.experimental.AUTOTUNE

In [5]:
# Get labels from folders
path = 'gs://renatoleite-tf-datapipeline-poc/*'
folders_name = tf.io.gfile.glob(path)

labels = []
for folder in folders_name:
    labels.append(folder.split(sep='/')[-1])

In [6]:
# Generate a Label Map
label_map = {labels[i]:i for i in range(len(labels))}
inv_label_map = {i:labels[i] for i in range(len(labels))}

In [7]:
# List all files in bucket
filepath = 'gs://renatoleite-tf-datapipeline-poc/*/*'
filepath = tf.io.gfile.glob(filepath)

In [8]:
# Function to One hot encode the inputs
def one_hot_encode(label_map, filepath):
    dataset = dict()
    
    for i in range(len(filepath)):
        encoding = zeros(len(label_map), dtype='uint8')
        encoding[label_map[filepath[i].split(sep='/')[-2]]] = 1
        
        dataset.update({filepath[i]:list(encoding)})
    
    return dataset

In [9]:
dataset = one_hot_encode(label_map, filepath)
dataset = [[k,v] for k,v in dataset.items()]

features = [i[0] for i in dataset]
labels = [i[1] for i in dataset]

In [10]:
# Create Dataset from Features and Labels
dataset = tf.data.Dataset.from_tensor_slices((features, labels))

In [11]:
# Function to download bytes from Cloud Storage
def get_bytes_label(filepath, label):
    raw_bytes = tf.io.read_file(filepath)
    return raw_bytes, label

In [37]:
# Preprocess Image
def process_image(raw_bytes, label):
    image = tf.io.decode_jpeg(raw_bytes, channels=3)
    image = tf.image.convert_image_dtype(image, dtype=tf.float32)
    image = tf.image.resize(image, [224,224])
    
    return image, label

In [13]:
def build_dataset(dataset, batch_size=32):
    dataset = dataset.shuffle(NUM_TOTAL_IMAGES)
    
    # Extraction: IO Intensive
    dataset = dataset.map(get_bytes_label, num_parallel_calls=AUTOTUNE)

    # Transformation: CPU Intensive
    dataset = dataset.map(process_image, num_parallel_calls=AUTOTUNE)
    dataset = dataset.repeat()
    dataset = dataset.batch(batch_size=batch_size)
    
    # Pipeline next iteration
    dataset = dataset.prefetch(buffer_size=AUTOTUNE)
    
    return dataset

In [34]:
# Start tracing execution
tf.summary.trace_on(profiler=True)

In [15]:
dataset = build_dataset(dataset)

In [19]:
# Define Model
base_model = tf.keras.applications.ResNet50V2(weights='imagenet', 
                                         input_shape=IMG_SHAPE,
                                         include_top=False)

In [20]:
base_model.trainable = False

In [22]:
model = tf.keras.Sequential([
    base_model,
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(len(label_map))
])

In [23]:
model_loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
model.compile(optimizer=tf.keras.optimizers.Adam(),
             loss=model_loss,
             metrics=['accuracy'])

In [24]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
resnet50v2 (Model)           (None, 7, 7, 2048)        23564800  
_________________________________________________________________
flatten_1 (Flatten)          (None, 100352)            0         
_________________________________________________________________
dense_2 (Dense)              (None, 512)               51380736  
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 120)               61560     
Total params: 75,007,096
Trainable params: 51,442,296
Non-trainable params: 23,564,800
_________________________________________________________________


In [25]:
log_dir = "logs/fit/" + datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

In [None]:
# Scheduler to analise Learning Rate Decay

#lr_scheduler = tf.keras.callbacks.LearningRateScheduler(
#    lambda epoch: 1e-8 * 10**(epoch / 20))

In [35]:
history = model.fit(dataset,
              epochs=1, 
              callbacks=[tensorboard_callback], 
              steps_per_epoch=644)

Train for 644 steps


In [36]:
def read_one_image(filepath):
    image = tf.io.read_file(filepath)
    image = tf.io.decode_jpeg(image, channels=3)
    image = tf.image.resize(image, RESOLUTION)
    image = tf.image.convert_image_dtype(image, dtype=tf.float32)
    image = tf.expand_dims(image,0)
    
    return image

In [40]:
filepath = 'gs://renatoleite-tf-datapipeline-poc/n02085782-Japanese_spaniel/n02085782_668.jpg'
dog = read_one_image(filepath)

In [41]:
model.predict_classes([dog])

array([81])

In [43]:
inv_label_map[81]

'n02106166-Border_collie'

In [None]:
# Stop tracing execution
tf.summary.trace_export(name='Loading Data', profiler_outdir='/home/jupyter/logs/')

In [50]:
from tensorboard import notebook
notebook.list() # View open TensorBoard instances

Known TensorBoard instances:
  - port 6008: logdir /home/jupyter/tensorflow-data-pipeline/logs/fit (started 0:05:06 ago; pid 18019)
  - port 6006: logdir /home/jupyter/tensorflow-data-pipeline/logs (started 21:51:12 ago; pid 2297)
  - port 6007: logdir /home/jupyter/tensorflow-data-pipeline/logs (started 5:35:05 ago; pid 13135)


In [None]:
notebook.display(port=6006, height=1000)