# Step ONE

Read images one by one from a Google Cloud Storage bucket (gs).

Note that this may not be very efficient as we have a lot of small files to read (random reads).

In [1]:
import tensorflow as tf

In [2]:
from numpy import zeros
import numpy as np
from datetime import datetime

In [3]:
# Enable XLA jit graph compilation
# Performance gains for fixed size images
tf.config.optimizer.set_jit(True)

In [4]:
SOURCE = 'gs://renatoleite-tf-datapipeline-poc/*/*'
RESOLUTION = (224,224)
NUM_TOTAL_IMAGES = 20000
IMG_SHAPE=(224,224,3)

AUTOTUNE = tf.data.experimental.AUTOTUNE

In [5]:
# Get labels from folders
path = 'gs://renatoleite-tf-datapipeline-poc/*'
folders_name = tf.io.gfile.glob(path)

labels = []
for folder in folders_name:
    labels.append(folder.split(sep='/')[-1])

In [6]:
# Generate a Label Map
label_map = {labels[i]:i for i in range(len(labels))}

In [7]:
# List all files in bucket
filepath = 'gs://renatoleite-tf-datapipeline-poc/*/*'
filepath = tf.io.gfile.glob(filepath)

In [8]:
# Function to One hot encode the inputs
def one_hot_encode(label_map, filepath):
    dataset = dict()
    
    for i in range(len(filepath)):
        encoding = zeros(len(label_map), dtype='uint8')
        encoding[label_map[filepath[i].split(sep='/')[-2]]] = 1
        
        dataset.update({filepath[i]:list(encoding)})
    
    return dataset

In [9]:
dataset = one_hot_encode(label_map, filepath)
dataset = [[k,v] for k,v in dataset.items()]

features = [i[0] for i in dataset]
labels = [i[1] for i in dataset]

In [10]:
# Create Dataset from Features and Labels
dataset = tf.data.Dataset.from_tensor_slices((features, labels))

In [11]:
# Function to download bytes from Cloud Storage
def get_bytes_label(filepath, label):
    raw_bytes = tf.io.read_file(filepath)
    return raw_bytes, label

In [12]:
# Preprocess Image
def process_image(raw_bytes, label):
    image = tf.io.decode_jpeg(raw_bytes, channels=3)
    image = tf.image.resize(image, RESOLUTION)
    image = tf.image.convert_image_dtype(image, dtype=tf.float32)
    
    return image, label

In [13]:
def build_dataset(dataset, batch_size=32):
    dataset = dataset.shuffle(NUM_TOTAL_IMAGES)
    
    # Extraction: IO Intensive
    dataset = dataset.map(get_bytes_label, num_parallel_calls=AUTOTUNE)

    # Transformation: CPU Intensive
    dataset = dataset.map(process_image, num_parallel_calls=AUTOTUNE)
    dataset = dataset.repeat()
    dataset = dataset.batch(batch_size=batch_size)
    
    # Pipeline next iteration
    dataset = dataset.prefetch(buffer_size=AUTOTUNE)
    
    return dataset

In [60]:
# Start tracing execution
tf.summary.trace_on(profiler=True)

In [15]:
dataset = build_dataset(dataset)

In [16]:
# Define Model
base_model = tf.keras.applications.ResNet50V2(weights='imagenet', 
                                         input_shape=IMG_SHAPE,
                                         include_top=False)

In [17]:
base_model.trainable = False

In [18]:
model = tf.keras.Sequential([
    base_model,
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(len(label_map))
])

In [19]:
model.compile(optimizer=tf.keras.optimizers.RMSprop(lr=0.01),
             loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
             metrics=['accuracy'])

In [20]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
resnet50v2 (Model)           (None, 7, 7, 2048)        23564800  
_________________________________________________________________
flatten (Flatten)            (None, 100352)            0         
_________________________________________________________________
dense (Dense)                (None, 64)                6422592   
_________________________________________________________________
dense_1 (Dense)              (None, 120)               7800      
Total params: 29,995,192
Trainable params: 6,430,392
Non-trainable params: 23,564,800
_________________________________________________________________


In [21]:
log_dir = "logs/fit/" + datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

In [22]:
model.fit(dataset, epochs=2, callbacks=[tensorboard_callback], steps_per_epoch=644)

Train for 644 steps
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f0248455610>

In [51]:
def read_one_image(filepath):
    image = tf.io.read_file(filepath)
    image = tf.io.decode_jpeg(image)
    
    image = tf.image.convert_image_dtype(image, dtype=tf.float32)
    image = tf.image.resize(image, (224,224))
    image = tf.expand_dims(image, 0)
    
    return image

In [52]:
filepath = '/home/jupyter/dog2.jpg'
dog = read_one_image(filepath)

In [54]:
model.predict_classes(dog)

array([2])

In [58]:
predict_dog

<tf.Tensor: shape=(1, 120), dtype=float32, numpy=
array([[-4.7049637, -4.7844744, -4.3209066, -4.707664 , -4.512874 ,
        -4.470879 , -4.6343493, -4.6655245, -4.7251616, -4.461989 ,
        -4.5652614, -4.4336233, -4.496088 , -4.784021 , -4.755663 ,
        -4.796954 , -4.883999 , -4.7736945, -4.8929534, -4.4629025,
        -4.726665 , -4.653229 , -4.633904 , -4.5716534, -4.7087483,
        -4.5384765, -4.409237 , -4.74601  , -4.734668 , -4.7057376,
        -4.6125097, -4.8233166, -4.7765408, -4.6572323, -4.623863 ,
        -4.548248 , -4.6547174, -4.8636007, -4.4565043, -4.438702 ,
        -4.431171 , -4.50789  , -4.7174845, -4.6180997, -4.768159 ,
        -4.675538 , -4.6620765, -4.7656054, -4.761192 , -4.468682 ,
        -4.6717873, -4.750771 , -4.83908  , -4.648381 , -4.748275 ,
        -4.833719 , -4.8652806, -4.6325874, -4.716644 , -4.8483562,
        -4.6835585, -4.6527214, -4.845661 , -4.6880455, -4.618646 ,
        -4.9034214, -4.749999 , -4.7604537, -4.822083 , -4.9187546

In [27]:
label_map

{'n02085620-Chihuahua': 0,
 'n02085782-Japanese_spaniel': 1,
 'n02085936-Maltese_dog': 2,
 'n02086079-Pekinese': 3,
 'n02086240-Shih-Tzu': 4,
 'n02086646-Blenheim_spaniel': 5,
 'n02086910-papillon': 6,
 'n02087046-toy_terrier': 7,
 'n02087394-Rhodesian_ridgeback': 8,
 'n02088094-Afghan_hound': 9,
 'n02088238-basset': 10,
 'n02088364-beagle': 11,
 'n02088466-bloodhound': 12,
 'n02088632-bluetick': 13,
 'n02089078-black-and-tan_coonhound': 14,
 'n02089867-Walker_hound': 15,
 'n02089973-English_foxhound': 16,
 'n02090379-redbone': 17,
 'n02090622-borzoi': 18,
 'n02090721-Irish_wolfhound': 19,
 'n02091032-Italian_greyhound': 20,
 'n02091134-whippet': 21,
 'n02091244-Ibizan_hound': 22,
 'n02091467-Norwegian_elkhound': 23,
 'n02091635-otterhound': 24,
 'n02091831-Saluki': 25,
 'n02092002-Scottish_deerhound': 26,
 'n02092339-Weimaraner': 27,
 'n02093256-Staffordshire_bullterrier': 28,
 'n02093428-American_Staffordshire_terrier': 29,
 'n02093647-Bedlington_terrier': 30,
 'n02093754-Border_terr

In [61]:
# Stop tracing execution
tf.summary.trace_export(name='Loading Data', profiler_outdir='/home/jupyter/logs/')

In [62]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

In [None]:
%tensorboard --logdir /home/jupyter/tensorflow-data-pipeline/logs

In [None]:
print(label_map)