# Building High-Performance Data Pipelines with tf.data and Google Cloud Storage

This article presents some recipes for building a high-performance input pipeline using TensorFlow's `tf.data` API and Google Cloud Storage.
The concepts and techniques are presented as a sequence of notebook cells, each building on the previous one.

This article uses the Stanford Dogs Dataset, which contains ~20,000 images across 120 classes.

## Set up the environment

In [1]:
# Benchmark function for dataset
import time
# Default number of batches pulled from the dataset by each timeit() run.
default_timeit_steps = 1000


def timeit(ds, steps=default_timeit_steps, batch_size=None):
    """Benchmark an input pipeline by pulling batches and printing throughput.

    Args:
        ds: Iterable that yields batches (for example a batched dataset).
        steps: Maximum number of batches to pull from ``ds``.
        batch_size: Number of images per batch, used only to compute the
            Images/s figure. When omitted, the module-level ``BATCH_SIZE``
            is used.

    Prints a progress dot every 10 batches, then the number of batches
    consumed, the elapsed time in seconds and the resulting images per second.
    If ``ds`` runs out before ``steps`` batches, the figures cover the batches
    that were actually produced instead of raising ``StopIteration``.
    """
    # Resolve the batch size up front so a missing BATCH_SIZE fails before
    # the (potentially long) benchmark loop instead of after it.
    if batch_size is None:
        batch_size = BATCH_SIZE

    completed = 0
    # perf_counter is monotonic, so the measurement is immune to clock changes.
    start = time.perf_counter()
    it = iter(ds)
    for i in range(steps):
        try:
            next(it)
        except StopIteration:
            # Finite dataset shorter than `steps`: report what was consumed.
            break
        completed += 1
        if i % 10 == 0:
            print('.', end='')
    end = time.perf_counter()
    print()

    duration = end - start
    print("{} batches: {} s".format(completed, duration))
    # Guard against a zero-length interval on very short runs.
    rate = batch_size * completed / duration if duration > 0 else float('inf')
    print("{:0.5f} Images/s".format(rate))

In [4]:
# First let's import Tensorflow
import tensorflow as tf

In [7]:
# Now import some additional libraries
from numpy import zeros
import numpy as np
from datetime import datetime

In [8]:
# Global variables
# Glob pattern for the image files: every object one folder below the bucket
# root. one_hot_encode() takes the second-to-last path component as the label.
FILENAMES = 'gs://renatoleite-tf-datapipeline-poc/*/*'
# Glob pattern for the entries directly under the bucket root; get_label_map()
# turns the last path component of each match into a label.
FOLDERS = 'gs://renatoleite-tf-datapipeline-poc/*'

# NOTE(review): presumably the target (height, width) of the images; it matches
# the [224,224] literal in process_image() but is not referenced there.
RESOLUTION = (224,224)
# Initial estimate only: overwritten with the real file count once the bucket
# has been listed. build_dataset() uses this value as its shuffle buffer size.
NUM_TOTAL_IMAGES = 20500
# NOTE(review): presumably (height, width, channels) of a processed image;
# not used in the cells shown here.
IMG_SHAPE=(224,224,3)

# Passed as num_parallel_calls / prefetch buffer_size in build_dataset().
AUTOTUNE = tf.data.experimental.AUTOTUNE

In [9]:
# Get labels from folders name
def get_label_map(path):
    """Build label <-> index lookup tables from folder names.

    Args:
        path: Glob pattern passed to ``tf.io.gfile.glob``. The last path
            component of every match is used as a label name.

    Returns:
        A ``(label_map, inv_label_map)`` tuple where ``label_map`` maps each
        label name to an integer index and ``inv_label_map`` maps each index
        back to its label name. Indices follow the sorted order of the labels.
    """
    # List folders in this path
    folders_name = tf.io.gfile.glob(path)

    # Strip trailing slashes so a match such as ".../label/" yields "label"
    # instead of an empty string. Sorting the de-duplicated names makes the
    # index assigned to each label independent of the listing order.
    labels = sorted(
        {folder.rstrip('/').split('/')[-1] for folder in folders_name}
    )

    # Generate a Label Map
    label_map = {label: index for index, label in enumerate(labels)}
    inv_label_map = dict(enumerate(labels))

    return label_map, inv_label_map

In [10]:
# Function to One hot encode the inputs
def one_hot_encode(label_map, filepath):
    """One-hot encode the label of every file path.

    The label of a file is the second-to-last ``/``-separated component of
    its path (the name of the folder that contains it).

    Args:
        label_map: Dict mapping label names to integer indices.
        filepath: Iterable of file path strings.

    Returns:
        Dict mapping each file path to a list of ``len(label_map)`` uint8
        values with a 1 at the label's index and 0 elsewhere. Repeated paths
        collapse into a single entry.

    Raises:
        KeyError: If a file's folder name is not a key of ``label_map``.
    """
    num_classes = len(label_map)
    dataset = {}

    for path in filepath:
        folder = path.split('/')[-2]
        try:
            index = label_map[folder]
        except KeyError:
            # Same exception type as a plain lookup, but name the bad path.
            raise KeyError(
                "label {!r} of file {!r} is not in label_map".format(
                    folder, path
                )
            ) from None

        encoding = zeros(num_classes, dtype='uint8')
        encoding[index] = 1
        # Keep list(...) rather than .tolist() so the elements stay uint8.
        dataset[path] = list(encoding)

    return dataset

In [11]:
# Build the label -> index and index -> label tables from the entries matched
# by the FOLDERS glob pattern.
label_map, inv_label_map = get_label_map(FOLDERS)

In [12]:
# List all files in the bucket that match the FILENAMES glob pattern
filepath = tf.io.gfile.glob(FILENAMES)
# Replace the hard-coded estimate with the number of files actually found;
# build_dataset() uses it as the shuffle buffer size.
NUM_TOTAL_IMAGES = len(filepath)

In [13]:
# Pair every file path with its one-hot label, then split the pairs into two
# parallel lists: file paths (features) and encodings (labels).
dataset = [
    [path, encoding]
    for path, encoding in one_hot_encode(label_map, filepath).items()
]

features = [path for path, _ in dataset]
labels = [encoding for _, encoding in dataset]

In [14]:
# Create a Dataset from the features (file paths) and labels (one-hot lists);
# the two lists are passed together as a tuple.
dataset = tf.data.Dataset.from_tensor_slices((features, labels))

In [15]:
# Function to download bytes from Cloud Storage
def get_bytes_label(filepath, label):
    """Read the file at `filepath` and pair its raw contents with `label`.

    The label is returned untouched; only the path is replaced by the bytes
    that `tf.io.read_file` returns for it.
    """
    return tf.io.read_file(filepath), label

In [16]:
# Preprocess Image
def process_image(raw_bytes, label, size=None):
    """Decode a JPEG, convert it to float32 and resize it.

    Args:
        raw_bytes: Encoded JPEG contents, e.g. the output of
            ``tf.io.read_file``.
        label: Label belonging to the image; returned unchanged.
        size: Optional ``(height, width)`` to resize to. Defaults to the
            module-level ``RESOLUTION`` so the target size is defined in
            one place.

    Returns:
        An ``(image, label)`` tuple where ``image`` is a float32 tensor with
        3 channels resized to ``size``.
    """
    if size is None:
        size = RESOLUTION

    # Always decode to 3 channels so every image has the same depth.
    image = tf.io.decode_jpeg(raw_bytes, channels=3)
    # Convert before resizing so the resize runs on float data.
    image = tf.image.convert_image_dtype(image, dtype=tf.float32)
    image = tf.image.resize(image, size)

    return image, label

In [17]:
def build_dataset(dataset, batch_size=32):
    """Turn a dataset of (file path, label) pairs into a batched image pipeline.

    Stages, in order: shuffle with a buffer of NUM_TOTAL_IMAGES elements,
    read the file bytes, decode/resize the images, repeat indefinitely,
    batch, and prefetch.

    Args:
        dataset: Dataset whose elements are accepted by get_bytes_label.
        batch_size: Number of elements per batch.

    Returns:
        The batched, prefetched dataset.
    """
    # Shuffling happens first, while elements are still (path, label) pairs.
    shuffled = dataset.shuffle(NUM_TOTAL_IMAGES)

    # Extraction (IO intensive) followed by transformation (CPU intensive),
    # both with an automatically tuned level of parallelism.
    raw = shuffled.map(get_bytes_label, num_parallel_calls=AUTOTUNE)
    images = raw.map(process_image, num_parallel_calls=AUTOTUNE)

    batched = images.repeat().batch(batch_size=batch_size)

    # Prepare the next batches while the current one is being consumed.
    return batched.prefetch(buffer_size=AUTOTUNE)

In [18]:
# Assemble the full input pipeline using build_dataset()'s default batch size.
dataset = build_dataset(dataset)

