# Efficient Ingesting


## Setup


In [None]:
import os
import time

import numpy as np
import matplotlib.pylab as plt

import tensorflow as tf
from tensorflow.keras import Sequential, layers
import tensorflow_hub as hub

In [None]:
# Report the TensorFlow version and the accelerators this runtime can use.
print(f"Tensorflow version: {tf.version.VERSION}")
# Ask the build itself: listing logical devices only tells whether a GPU is
# visible right now, not whether this TensorFlow binary has GPU support.
print(
    f"Built with GPU support? {'Yes!' if tf.test.is_built_with_gpu_support() else 'Noo!'}"
)
print(f"There are {len(tf.config.list_physical_devices('GPU'))} GPUs available.")

# Abort early when no GPU device is reported (an empty name means none).
device_name = tf.test.gpu_device_name()
if not device_name:
    raise SystemError("GPU device not found")
print(f"Found GPU at: {device_name}")

# Select the "COMPRESSED" model load format for TensorFlow Hub.
os.environ["TFHUB_MODEL_LOAD_FORMAT"] = "COMPRESSED"

In [None]:
# Sentinel that lets tf.data pick parallelism / buffer sizes at runtime.
AUTOTUNE = tf.data.AUTOTUNE

# Image geometry used for resizing and as the model input shape.
IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS = 448, 448, 3

# Class labels; the model's output size is the length of this list.
CLASS_NAMES = ["daisy", "dandelion", "roses", "sunflowers", "tulips"]

# File patterns of the TFRecord shards in Cloud Storage.
TRAIN_URL = "gs://practical-ml-vision-book/flowers_tfr/train-0000[01]-*"
VALID_URL = "gs://practical-ml-vision-book/flowers_tfr/valid-*"

## Helper Function


In [None]:
def get_logdir():
    """Return a timestamped log directory path for this chapter.

    The run id is built from the local time at the moment of the call, so
    calls made in different seconds yield different directories.

    Returns:
        str: Relative path of the form
        ``../../reports/logs/chapter_7_ingesting/run_<YYYYmmdd-HHMMSS>``,
        joined with ``os.path.join``.
    """
    logs_root = ("..", "..", "reports", "logs", "chapter_7_ingesting")
    return os.path.join(*logs_root, time.strftime("run_%Y%m%d-%H%M%S"))

## Original Code


In [None]:
class _Preprocessor:
    """Reads images from TFRecords or JPEG files and resizes them for the model."""

    def __init__(self):
        """Create a preprocessor; it holds no state."""

    def read_from_tfr(self, proto):
        """Parse one serialized TFRecord example into an image and its label.

        Args:
            proto: Serialized example with the features "image", "shape",
                "label" and "label_int"

        Returns:
            tuple: The float32 image reshaped to its stored shape, and the
            int64 label
        """
        schema = {
            "image": tf.io.VarLenFeature(tf.float32),
            "shape": tf.io.VarLenFeature(tf.int64),
            "label": tf.io.FixedLenFeature([], tf.string, default_value=""),
            "label_int": tf.io.FixedLenFeature([], tf.int64, default_value=0),
        }
        example = tf.io.parse_single_example(proto, schema)
        # Variable-length features are parsed as sparse tensors; densify both
        # the stored dimensions and the flat pixel values before reshaping.
        dims = tf.sparse.to_dense(example["shape"])
        pixels = tf.sparse.to_dense(example["image"])
        return tf.reshape(pixels, dims), example["label_int"]

    def read_from_jpegfile(self, filename):
        """Read a JPEG file and decode it into a 3-channel float32 image.

        Args:
            filename: Path of the JPEG file

        Returns:
            image: Decoded image converted to float32
        """
        raw_bytes = tf.io.read_file(filename)
        decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
        return tf.image.convert_image_dtype(decoded, tf.float32)

    def preprocess(self, image):
        """Resize the image with padding to (IMG_HEIGHT, IMG_WIDTH).

        Args:
            image: Image tensor to resize

        Returns:
            image: Image resized with padding to the target size
        """
        resized = tf.image.resize_with_pad(image, IMG_HEIGHT, IMG_WIDTH)
        return resized

In [None]:
def create_preproc_dataset_plain(pattern):
    """Build the dataset sequentially, without any parallel reading or mapping.

    Args:
        pattern (string): Pattern of the files in Cloud Storage

    Returns:
        trainds: Dataset with the preprocessed images and the labels
    """
    preproc = _Preprocessor()

    def _resize_keep_label(img, label):
        # Only the image is transformed; the label passes through untouched.
        return preproc.preprocess(img), label

    filenames = list(tf.io.gfile.glob(pattern))
    records = tf.data.TFRecordDataset(filenames, compression_type="GZIP")
    parsed = records.map(preproc.read_from_tfr)
    return parsed.map(_resize_keep_label)


def create_preproc_dataset_parallelmap(pattern):
    """Build the dataset from TFRecord files with both map steps parallelized.

    Parsing and preprocessing each run with ``num_parallel_calls=AUTOTUNE``;
    the files themselves are still read by a single TFRecordDataset.

    Args:
        pattern (string): Pattern of the name of the files in Cloud Storage

    Returns:
        trainds: Dataset containing the preprocessed images and the labels
    """
    preproc = _Preprocessor()
    matched_files = list(tf.io.gfile.glob(pattern))

    raw_records = tf.data.TFRecordDataset(matched_files, compression_type="GZIP")
    decoded = raw_records.map(preproc.read_from_tfr, num_parallel_calls=AUTOTUNE)
    return decoded.map(
        lambda img, label: (preproc.preprocess(img), label),
        num_parallel_calls=AUTOTUNE,
    )


def create_preproc_dataset_interleave(pattern, num_parallel=None):
    """Build the dataset by interleaving reads from two halves of the file list.

    When the pattern matches more than one file, the file list is split into
    two halves and a two-element range dataset is interleaved over them, each
    half being read by its own TFRecordDataset. Otherwise a single
    TFRecordDataset reads the matched files directly. In both cases the
    records are then parsed and preprocessed with two map steps.

    Args:
        pattern (string): Pattern of the files in the Cloud Storage
        num_parallel (int, optional): Number of parallel calls used by both map
            steps (parsing and preprocessing). The interleave itself always
            uses AUTOTUNE. Defaults to None.

    Returns:
        trainds: Dataset containing the preprocessed images and the labels
    """
    preproc = _Preprocessor()
    files = [file for file in tf.io.gfile.glob(pattern)]
    if len(files) > 1:
        print(f"Interleaving the reading of {len(files)} files.")

        # x is an element of Dataset.range(2): 0 selects the first half of the
        # files, anything else the second half.
        # NOTE(review): tf.data presumably traces this function, so x would be
        # a tensor rather than a Python int — confirm before restructuring
        # the branch.
        def _create_half_ds(x):
            if x == 0:
                half = files[: len(files) // 2]
            else:
                half = files[len(files) // 2 :]
            return tf.data.TFRecordDataset(half, compression_type="GZIP")

        trainds = tf.data.Dataset.range(2).interleave(
            _create_half_ds, num_parallel_calls=AUTOTUNE
        )

    else:
        # Zero or one matching file: nothing to interleave.
        trainds = tf.data.TFRecordDataset(files, compression_type="GZIP")

    # Resize the image and pass the label through unchanged.
    def _preproc_image_label(image, label):
        return (preproc.preprocess(image), label)

    trainds = trainds.map(preproc.read_from_tfr, num_parallel_calls=num_parallel).map(
        _preproc_image_label, num_parallel_calls=num_parallel
    )

    return trainds


def create_preproc_image(filename):
    """Read a JPEG file and preprocess it the same way as the training images.

    Args:
        filename (string): Path of the JPEG file to read

    Returns:
        image: Decoded image resized with padding to (IMG_HEIGHT, IMG_WIDTH)
    """
    preproc = _Preprocessor()
    # Bind the decoded image to the same name that is passed on below; the
    # previous mismatch (img vs. image) raised a NameError on every call.
    image = preproc.read_from_jpegfile(filename)
    return preproc.preprocess(image)


## Speeding up the reading of data

In [None]:
def loop_through_dataset(dataset, epochs):
    """Iterate over the dataset for several epochs doing a cheap computation.

    Each epoch draws a random threshold in [0.3, 0.7), accumulates the mean of
    every image after zeroing the pixels that do not exceed the threshold, and
    prints the average of those means. A dot is printed every 100 elements.

    Args:
        dataset: Iterable yielding (image, label) pairs
        epochs (int): Number of passes over the dataset

    Returns:
        lowest_mean: Smallest per-epoch average seen, starting from 1.0
    """
    best = tf.constant(1.0)
    for _ in range(epochs):
        cutoff = np.random.uniform(0.3, 0.7)
        seen = 0
        running_total = tf.constant(0.0)
        for img, _label in dataset:
            running_total += tf.reduce_mean(tf.where(img > cutoff, img, 0))
            seen += 1
            # Progress marker every 100 elements.
            if not seen % 100:
                print(".", end="")
        epoch_mean = running_total / seen
        print(epoch_mean)
        if epoch_mean < best:
            best = epoch_mean

    return best


In [None]:
# Number of passes over the dataset used by the timing cells.
NUM_EPOCHS = 3

In [None]:
%%time
# Baseline: single reader, sequential map steps.
dataset = create_preproc_dataset_plain(TRAIN_URL)
loop_through_dataset(dataset, NUM_EPOCHS)

In [None]:
%%time
# Single reader, map steps parallelized with AUTOTUNE.
dataset = create_preproc_dataset_parallelmap(TRAIN_URL)
loop_through_dataset(dataset, NUM_EPOCHS)

In [None]:
%%time 
# Interleaved readers, map steps without a parallelism setting.
dataset = create_preproc_dataset_interleave(TRAIN_URL, num_parallel=None)
loop_through_dataset(dataset, NUM_EPOCHS)

In [None]:
%%time
# Interleaved readers and map steps parallelized with AUTOTUNE.
dataset = create_preproc_dataset_interleave(TRAIN_URL, num_parallel=AUTOTUNE)
loop_through_dataset(dataset, NUM_EPOCHS)

## ML Model

In [None]:
def train_simple_model(dataset, epochs):
    """Fit a minimal softmax classifier on the dataset.

    The model flattens the image and feeds it to one dense softmax layer with
    one unit per class; it is compiled with Adam and sparse categorical
    cross-entropy and tracks accuracy.

    Args:
        dataset: Batched dataset of (image, label) pairs
        epochs (int): Number of epochs to train for
    """
    input_shape = (IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS)
    num_classes = len(CLASS_NAMES)

    model = Sequential()
    model.add(layers.Flatten(input_shape=input_shape))
    model.add(layers.Dense(num_classes, activation="softmax"))

    compile_options = {
        "optimizer": tf.keras.optimizers.Adam(),
        "loss": tf.keras.losses.SparseCategoricalCrossentropy(),
        "metrics": ["accuracy"],
    }
    model.compile(**compile_options)
    model.fit(dataset, epochs=epochs)

In [None]:
%%time
# Training baseline: single reader, sequential map steps, batch size 1.
dataset = create_preproc_dataset_plain(TRAIN_URL).batch(1)
train_simple_model(dataset, NUM_EPOCHS)

In [None]:
%%time
# Single reader, map steps parallelized with AUTOTUNE, batch size 1.
dataset = create_preproc_dataset_parallelmap(TRAIN_URL).batch(1)
train_simple_model(dataset, NUM_EPOCHS)

In [None]:
%%time
# Interleaved readers, map steps without a parallelism setting, batch size 1.
dataset = create_preproc_dataset_interleave(TRAIN_URL, num_parallel=None).batch(1)
train_simple_model(dataset, NUM_EPOCHS)

In [None]:
%%time
# Interleaved readers and map steps parallelized with AUTOTUNE, batch size 1.
dataset = create_preproc_dataset_interleave(TRAIN_URL, num_parallel=AUTOTUNE).batch(1)
train_simple_model(dataset, NUM_EPOCHS)

## Speeding up the handling of data

In [None]:
def create_prepoc_dataset(pattern):
    """Build the dataset with interleaved reads and AUTOTUNE-parallel maps.

    Args:
        pattern (string): Pattern of the files in Cloud Storage

    Returns:
        trainds: Dataset containing the preprocessed images and the labels
    """
    trainds = create_preproc_dataset_interleave(pattern, num_parallel=AUTOTUNE)
    return trainds

In [None]:
%%time
# Prefetch single elements, then batch with size 1.
dataset = create_prepoc_dataset(TRAIN_URL).prefetch(AUTOTUNE).batch(1)
train_simple_model(dataset, NUM_EPOCHS)

In [None]:
%%time
# Same pipeline with batch size 8.
dataset = create_prepoc_dataset(TRAIN_URL).prefetch(AUTOTUNE).batch(8)
train_simple_model(dataset, NUM_EPOCHS)

In [None]:
%%time
# Same pipeline with batch size 16.
dataset = create_prepoc_dataset(TRAIN_URL).prefetch(AUTOTUNE).batch(16)
train_simple_model(dataset, NUM_EPOCHS)

In [None]:
%%time
# Same pipeline with batch size 32.
dataset = create_prepoc_dataset(TRAIN_URL).prefetch(AUTOTUNE).batch(32)
train_simple_model(dataset, NUM_EPOCHS)

In [None]:
%%time
# Cache the preprocessed elements before prefetching; batch size 32.
dataset = create_prepoc_dataset(TRAIN_URL).cache().prefetch(AUTOTUNE).batch(32)
train_simple_model(dataset, NUM_EPOCHS)