# Notebook for Training ResNet on The TinyImageNet dataset Using Multiple GPUs in the Ring-All-Reduce Architecture. Notebook (3/4) in the End-to-End Scalable Deep Learning Pipeline on Hops.

This notebook will read the TFRecords that were written by notebook number 1 ([Notebook number one](./Step1_Save_to_Featurestore.ipynb)) and feed them into ResNet for multiple-GPU training with the ring-all-reduce architecture and the horovod library. 

Moreover, it will read the hyperparameters produced by the distributed hyperparameter search in notebook number 2 ([Notebook number two](./Step2_Model_Training_Parallel_Experiments.ipynb)).

The distributed/parallel training over several GPUs is performed with the ring-all-reduce architecture.

![step3.png](./../images/step3.png)

The ring-all-reduce architecture is network-optimal: it utilizes all links between GPUs rather than overloading a single link to a parameter server, which can easily become a bottleneck.

## Package Imports


In [17]:
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from hops import experiment, tensorboard, featurestore, hdfs
from tensorflow import keras
from tensorflow.python.keras.callbacks import TensorBoard
import numpy as np
import math
import json
from tensorflow.keras import metrics
from tensorflow.keras.models import load_model

## Constants

In [18]:
# Image dimensions; the input pipeline reshapes every record to (HEIGHT, WIDTH, CHANNELS).
HEIGHT = 64
WIDTH = 64
CHANNELS = 3
# Length of one flattened image: 64 * 64 * 3 = 12288.
INPUT_SHAPE = HEIGHT * WIDTH * CHANNELS
# Number of label classes (depth of the one-hot labels and of the model output).
NUM_CLASSES = 200

# Input pipeline and optimisation settings.
BATCH_SIZE = 100
SHUFFLE_BUFFER_SIZE = 1000
NUM_EPOCHS = 50
LEARNING_RATE = 0.001

# Names of the datasets as registered in the feature store.
TRAIN_DATASET = "train_dataset_tinyimagenet"
VAL_DATASET = "val_dataset_tinyimagenet"
TEST_DATASET = "test_dataset_tinyimagenet"

# Model identity used when exporting for serving.
MODEL_NAME = "tinyimagenet_resnet50"
MODEL_VERSION = 1

# File names for hyperparameters and validation results.
HYPERPARAMS_FILE = "tinyimagenet_best_hyperparams.json"
VALIDATION_RESULTS_FILE = "tinyimagenet_distributed_training_val_results.json"

In [19]:
# Size of the training dataset, as returned by .count() on the feature store dataset.
TRAIN_DATASET_SIZE = featurestore.get_training_dataset(TRAIN_DATASET).count()
# Number of whole batches in one pass over the training data (rounded down).
STEPS_PER_EPOCH = int(TRAIN_DATASET_SIZE // BATCH_SIZE)

In [20]:
%%local
# Same file name as in the constants cell, assigned again because this cell runs
# under the %%local magic (presumably a separate, local namespace -- TODO confirm).
HYPERPARAMS_FILE = "tinyimagenet_best_hyperparams.json"

## Define Model

In [21]:
def define_model():
    """
    Builds the image classifier used in this notebook.

    The network is the ResNet50 architecture shipped with Keras. It is created
    with weights=None, so no pre-trained weights are loaded and training starts
    from a fresh initialization.

    Returns:
           ResNet50 model for (HEIGHT, WIDTH, CHANNELS) inputs and NUM_CLASSES classes
    """
    image_shape = (HEIGHT, WIDTH, CHANNELS)
    return ResNet50(weights=None, input_shape=image_shape, classes=NUM_CLASSES)

## Define Model Input

In [34]:
def create_tf_dataset(dataset_name, shuffle_buffer_size, batch_size, num_epochs):
    """
    Creates a Tensorflow Dataset from TFRecords on HopsFS stored in the feature store.

    The pipeline is: read TFRecords -> decode -> shuffle examples -> batch ->
    repeat -> prefetch.

    Args:
        :dataset_name: name of the training dataset in the featurestore (TFRecords format)
        :shuffle_buffer_size: number of individual examples held in the in-memory shuffle buffer
        :batch_size: the size of the batch
        :num_epochs: number of epochs to repeat the dataset

    Returns:
           Tensorflow dataset yielding (image, one-hot label) batches
    """

    # Get Path and Schema from feature store metadata
    tf_record_schema = featurestore.get_training_dataset_tf_record_schema(dataset_name)
    dataset_dir = featurestore.get_training_dataset_path(dataset_name)

    # Only the part files of the dataset directory are read
    input_files = tf.gfile.Glob(dataset_dir + "/part-r-*")
    dataset = tf.data.TFRecordDataset(input_files)

    def decode(example_proto):
        # Parse one serialized record using the schema stored in the feature store
        example = tf.parse_single_example(example_proto, tf_record_schema)
        label_int = example["label"]
        image_flat = example["image"]
        # The image is stored flattened; restore its (HEIGHT, WIDTH, CHANNELS) layout
        image = tf.reshape(image_flat, (HEIGHT, WIDTH, CHANNELS))
        label = tf.one_hot(label_int, NUM_CLASSES)
        return image, label

    # Shuffle BEFORE batching so that individual examples are shuffled and each
    # batch gets a new composition every epoch. Shuffling after batching would
    # only reorder whole batches and would buffer shuffle_buffer_size batches
    # (not examples) in memory.
    dataset = dataset.map(decode).shuffle(shuffle_buffer_size).batch(batch_size)
    dataset = dataset.repeat(num_epochs)
    # prefetch 1 batch to make bottleneck on GPU bandwidth less likely
    dataset = dataset.prefetch(1)
    return dataset

## Define Train Loop

In [41]:
def export_model(classifier, version):
    """
    Saves the trained estimator as a SavedModel and hands it to Hops serving.

    Args:
        :classifier: the estimator to export (its export_savedmodel method is called)
        :version: version under which the model is exported
    """
    import os
    from hops import hdfs
    from hops import serving

    def _serving_input_receiver_fn():
        # The dictionary key ("input_1") has to be the same as the input key
        # used when building a prediction request.
        image_input = tf.placeholder(dtype=tf.int64, shape=[1, HEIGHT, WIDTH, CHANNELS])
        tensors = {"input_1": image_input}
        return tf.estimator.export.ServingInputReceiver(tensors, tensors)

    # The SavedModel is first written below the current working directory.
    saved_model_path = classifier.export_savedmodel(os.getcwd(), _serving_input_receiver_fn)
    # The returned path is decoded to a str before it is passed on for export.
    serving.export(saved_model_path.decode("utf-8"), MODEL_NAME, version)

In [42]:
def train_fn(learning_rate):
    """
    Defines the training loop:

    1. Get Model
    2. Define custom metrics
    3. Compile Model
    4. Convert Keras model to TF Estimator
    5. Fit model on train dataset
    6. Evaluate model on validation dataset
    7. Save validation results to HopsFS
    8. Export trained model for serving

    Args:
        :learning_rate: learning rate given to the Adam optimizer

    Returns:
           top-1 accuracy on the validation dataset, formatted as a string
    """
    # Tell Keras we are training (in case it does different functionality between train/test time)
    tf.keras.backend.set_learning_phase(True)

    # 1. Get model
    print("Defning the model")
    model = define_model()
    print("Defining the model complete")

    # 2. Define custom metrics
    # The function names are also the keys used to look the metrics up in the
    # evaluation results in step 6, so they must not be renamed.
    def top3_acc(y_true, y_pred):
        return metrics.top_k_categorical_accuracy(y_true, y_pred, k=3)

    def top5_acc(y_true, y_pred):
        return metrics.top_k_categorical_accuracy(y_true, y_pred, k=5)

    # 3. Compile the model
    print("Compiling the model")
    model.compile(optimizer=tf.train.AdamOptimizer(learning_rate), loss='categorical_crossentropy',
                  metrics=['accuracy', metrics.mae, top3_acc, top5_acc])
    print("Compiling the model complete")

    # 4. Convert Keras model to TF Estimator
    # Define DistributionStrategies and convert the Keras Model to an
    # Estimator that utilizes these DistributionStrategies.
    # Evaluator is a single worker, so using MirroredStrategy.
    # Training is automatically distributed on all available GPUs when using MirroredStrategy
    print("Convert keras model to a Tensorflow Estimator")
    run_config = tf.estimator.RunConfig(
            train_distribute=tf.contrib.distribute.MirroredStrategy())
    keras_estimator = tf.keras.estimator.model_to_estimator(keras_model=model,
               config=run_config, model_dir=tensorboard.logdir())
    print("Keras model to estimator conversion complete")

    def _train_input_fn():
        # Training data is repeated for NUM_EPOCHS passes
        return create_tf_dataset(TRAIN_DATASET, SHUFFLE_BUFFER_SIZE, BATCH_SIZE, NUM_EPOCHS)

    def _eval_input_fn():
        # A single pass over the validation data: every validation example only
        # needs to be scored once, so repeating it NUM_EPOCHS times would just
        # multiply the evaluation time.
        return create_tf_dataset(VAL_DATASET, SHUFFLE_BUFFER_SIZE, BATCH_SIZE, 1)

    # 5. Fit model on training dataset
    print("Starting training...")
    tf.estimator.train_and_evaluate(
        keras_estimator,
        train_spec=tf.estimator.TrainSpec(input_fn=_train_input_fn),
        eval_spec=tf.estimator.EvalSpec(input_fn=_eval_input_fn))
    print("Training complete")

    # 6. Evaluate model on validation dataset
    print("Evaluating model on validation dataset")
    eval_results = keras_estimator.evaluate(_eval_input_fn)
    val_top1acc = str(eval_results["accuracy"])
    val_top3acc = str(eval_results["top3_acc"])
    val_top5acc = str(eval_results["top5_acc"])
    # NOTE: the key names are kept exactly as they were ("top1_acc" has no "val_"
    # prefix, unlike the other two) because readers of the results file may depend on them.
    validation_results = {
        "top1_acc": val_top1acc,
        "val_top3_acc": val_top3acc,
        "val_top5_acc": val_top5acc
    }
    print("Evaluation complete")

    # 7. Save validation results to HopsFS
    print("Saving validation results to HopsFS..")
    val_results_path = hdfs.project_path() + "Resources/" + VALIDATION_RESULTS_FILE
    hdfs.dump(json.dumps(validation_results), val_results_path)
    print("Saving validation results complete")

    # 8. Exporting the trained model
    print("Exporting model...")
    export_model(keras_estimator, MODEL_VERSION)
    print("Model exported")
    return val_top1acc

## Distributed Training Experiments

To run this, you should first have started your Jupyter notebook server or job with the "Distributed Training" configuration, choosing the MirroredStrategy and the number of GPUs to use.

In [43]:
# Settings for this run; these names were already assigned in the constants cell
# and are rebound here before the experiment is launched.
BATCH_SIZE = 100
NUM_EPOCHS = 1
LEARNING_RATE = 0.01
# Arguments for the experiment: the key matches train_fn's parameter name and
# the value is a list holding the single learning rate to use.
args_d = {"learning_rate": [LEARNING_RATE]}

In [44]:
# Launch train_fn as a Hops experiment with the arguments in args_d and keep
# the value returned by experiment.launch.
experiment_result_path = experiment.launch(
    train_fn,
    args_dict=args_d,
    name="tinyimagenet_resnet_distributed_training",
    description="Training TinyImageNet Using Distributed Training",
    local_logdir=True,
)

Finished Experiment