# Exercise07 : Hyperparameter Tuning

AML provides framework-independent hyperparameter tuning capability.    
This capability monitors accuracy in AML logs.

*back to [index](https://github.com/tsmatz/azureml-tutorial-tensorflow-v1/)*

## Save your training code

First, you must save your training code.    
Here we should use the source code in "[Exercise06 : Experimentation Logs and Outputs](./exercise06_experimentation.ipynb)", which sends logs periodically into AML run history.

Create the ```script``` directory.

In [1]:
import os
# Local folder that holds the training script: a later cell writes
# train_experiment.py into it, and it is used as the source directory
# when the script run is configured.
script_folder = './script'
os.makedirs(script_folder, exist_ok=True)  # no error if the folder already exists

Save the source code as ```./script/train_experiment.py```.

In [2]:
%%writefile script/train_experiment.py
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys
import os
import shutil
import argparse
import math

import tensorflow as tf

from azureml.core.run import Run

# Module-level settings shared by the input and model functions below.
batch_size = 100  # number of examples per mini-batch
FLAGS = None      # replaced by the parsed command-line arguments in the main section

# Keep an already-defined ``run`` if there is one; otherwise obtain the
# AML run context this script is executing in (used for metric logging).
if 'run' not in locals():
    run = Run.get_context()

#
# define functions for Estimator
#

def _my_input_fn(filepath, num_epochs):
    """Build the queue-based input pipeline for one TFRecord file.

    Each record is parsed into an 'image_raw' byte string (decoded into 784
    uint8 values and rescaled to float32 by 1/255) and an int64 'label'
    (cast to int32).

    Args:
        filepath: Path of the TFRecord file to read.
        num_epochs: How many times the file is read before the producer
            signals the end of the data (OutOfRange).

    Returns:
        A tuple ``({'inputs': image_batch}, label_batch)`` where both batches
        hold ``batch_size`` examples.
    """
    # The file name is repeated ``num_epochs`` times; OutOfRange is raised
    # once the data is exhausted.
    filename_queue = tf.train.string_input_producer(
        [filepath],
        num_epochs=num_epochs)
    record_reader = tf.TFRecordReader()
    _, serialized_record = record_reader.read(filename_queue)

    # Layout of one serialized example.
    feature_spec = {
        'image_raw': tf.FixedLenFeature([], tf.string),
        'label': tf.FixedLenFeature([], tf.int64),
    }
    parsed = tf.parse_single_example(serialized_record, features=feature_spec)

    # image: 784 (= 28 x 28) grey-scale values, rescaled from bytes to [0, 1]
    pixels = tf.decode_raw(parsed['image_raw'], tf.uint8)
    pixels.set_shape([784])
    pixels = tf.cast(pixels, tf.float32) * (1. / 255)

    # label: digit (0, 1, ..., 9)
    digit = tf.cast(parsed['label'], tf.int32)

    image_batch, label_batch = tf.train.batch(
        [pixels, digit],
        batch_size=batch_size)
    return {'inputs': image_batch}, label_batch

def _get_input_fn(filepath, num_epochs):
    """Return a zero-argument input function bound to the given file and epoch count.

    Args:
        filepath: Path of the TFRecord file to read.
        num_epochs: Number of passes over the file.

    Returns:
        A callable taking no arguments that returns
        ``_my_input_fn(filepath, num_epochs)``.
    """
    def _bound_input_fn():
        return _my_input_fn(filepath, num_epochs)

    return _bound_input_fn

def _dense_layer(inputs, in_units, out_units, scope, activation=None):
    """Build one fully connected layer inside the name scope ``scope``.

    Weights are initialised from a truncated normal distribution with
    stddev ``1 / sqrt(in_units)``; biases start at zero. The variables are
    created with the names 'weights' and 'biases' inside the scope.

    Args:
        inputs: 2-D float tensor whose second dimension is ``in_units``.
        in_units: Number of input features.
        out_units: Number of neurons in this layer.
        scope: Name scope the layer's variables and ops are created in.
        activation: Optional callable applied to ``inputs @ weights + biases``.

    Returns:
        The output tensor of the layer.
    """
    with tf.name_scope(scope):
        weights = tf.Variable(
            tf.truncated_normal(
                [in_units, out_units],
                stddev=1.0 / math.sqrt(float(in_units))),
            name='weights')
        biases = tf.Variable(
            tf.zeros([out_units]),
            name='biases')
        outputs = tf.matmul(inputs, weights) + biases
        if activation is not None:
            outputs = activation(outputs)
    return outputs


class _CustomLoggingHook(tf.train.SessionRunHook):
    """Session hook that samples training accuracy and loss and sends them to AML.

    The accuracy, loss and global step are fetched on every session run.
    Values are buffered on every 10th global step and sent with
    ``run.log_list`` on every 100th global step. Whatever is still buffered
    when the session ends is sent as well, so the last samples are not lost.
    """

    def __init__(self, accuracy_tensor, loss_tensor, step_tensor):
        self._fetches = [accuracy_tensor, loss_tensor, step_tensor]
        self.training_accuracy = []
        self.training_loss = []

    def begin(self):
        self.training_accuracy = []
        self.training_loss = []

    def before_run(self, run_context):
        return tf.train.SessionRunArgs(self._fetches)

    def after_run(self, run_context, run_values):
        result_accuracy, result_loss, result_step = run_values.results
        if result_step % 10 == 0:
            self.training_accuracy.append(result_accuracy)
            self.training_loss.append(result_loss)
        if result_step % 100 == 0:  # save logs in each 100 steps
            self._flush()

    def end(self, session):
        # Send the samples collected after the last 100-step flush.
        self._flush()

    def _flush(self):
        """Send the buffered samples to the AML run and clear the buffers."""
        if self.training_accuracy:
            run.log_list('training_accuracy', self.training_accuracy)
            run.log_list('training_loss', self.training_loss)
        self.training_accuracy = []
        self.training_loss = []


def _my_model_fn(features, labels, mode):
    """Estimator model function for a 2-hidden-layer fully connected classifier.

    Network: 784 -> FLAGS.first_layer -> FLAGS.second_layer -> 10, with ReLU
    on both hidden layers and a linear output layer producing the logits.

    Args:
        features: Dict with key 'inputs' holding the float image batch
            (second dimension 784).
        labels: Integer class labels of the batch; not used in PREDICT mode.
        mode: One of ``tf.estimator.ModeKeys.TRAIN`` / ``EVAL`` / ``PREDICT``.

    Returns:
        A ``tf.estimator.EstimatorSpec`` for the requested mode.

    Raises:
        ValueError: If ``mode`` is not one of the three supported modes.
    """
    # with tf.device(...): # You can set device if using GPUs

    # define network and inference
    hidden1 = _dense_layer(
        features['inputs'], 784, FLAGS.first_layer,
        'hidden1', activation=tf.nn.relu)
    hidden2 = _dense_layer(
        hidden1, FLAGS.first_layer, FLAGS.second_layer,
        'hidden2', activation=tf.nn.relu)
    logits = _dense_layer(
        hidden2, FLAGS.second_layer, 10,
        'softmax_linear')

    predicted_indices = tf.argmax(input=logits, axis=1)

    if mode == tf.estimator.ModeKeys.PREDICT:
        probabilities = tf.nn.softmax(logits, name='softmax_tensor')
        predictions = {
            'classes': predicted_indices,
            'probabilities': probabilities
        }
        export_outputs = {
            'prediction': tf.estimator.export.PredictOutput(predictions)
        }
        return tf.estimator.EstimatorSpec(
            mode,
            predictions=predictions,
            export_outputs=export_outputs)

    # compute evaluation metrics (needed for both TRAIN and EVAL)
    label_indices = tf.cast(labels, tf.int32)
    accuracy = tf.metrics.accuracy(label_indices, predicted_indices)
    tf.summary.scalar('accuracy', accuracy[1])  # output to TensorBoard
    loss = tf.losses.sparse_softmax_cross_entropy(
        labels=labels,
        logits=logits)

    if mode == tf.estimator.ModeKeys.TRAIN:
        global_step = tf.train.get_or_create_global_step()
        optimizer = tf.train.GradientDescentOptimizer(
            learning_rate=FLAGS.learning_rate)
        train_op = optimizer.minimize(
            loss=loss,
            global_step=global_step)
        # Ask for accuracy and loss in each step and forward them to AML
        logging_hook = _CustomLoggingHook(accuracy[1], loss, global_step)
        return tf.estimator.EstimatorSpec(
            mode,
            training_chief_hooks=[logging_hook],
            loss=loss,
            train_op=train_op)

    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metric_ops = {
            'accuracy': accuracy
        }
        return tf.estimator.EstimatorSpec(
            mode,
            loss=loss,
            eval_metric_ops=eval_metric_ops)

    raise ValueError('Unsupported mode: {}'.format(mode))

def _my_serving_input_fn():
    """Build the serving input receiver used when exporting the model.

    Returns:
        A ``ServingInputReceiver`` whose features and receiver tensors are the
        same dict: key 'inputs' mapped to a float32 placeholder of shape
        [None, 784].
    """
    image_placeholder = tf.placeholder(tf.float32, [None, 784])
    receiver = {'inputs': image_placeholder}
    # The same dict serves as both the model features and the receiver tensors.
    return tf.estimator.export.ServingInputReceiver(receiver, receiver)

#
# Main
#

# Command-line arguments. The last three are the hyperparameters that
# ``_my_model_fn`` reads through FLAGS.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--data_folder',
    type=str,
    default='./data',
    help='Folder path for input data')
parser.add_argument(
    '--chkpoint_folder',
    type=str,
    default='./logs',  # AML experiments logs folder
    help='Folder path for checkpoint files')
parser.add_argument(
    '--model_folder',
    type=str,
    default='./outputs',  # AML experiments outputs folder
    help='Folder path for model output')
parser.add_argument(
    '--learning_rate',
    type=float,
    default=0.07,
    help='Learning Rate')
parser.add_argument(
    '--first_layer',
    type=int,
    default=128,
    help='Neuron number for the first hidden layer')
parser.add_argument(
    '--second_layer',
    type=int,
    default=64,
    help='Neuron number for the second hidden layer')
# parse_known_args: arguments this script does not define are collected in
# ``unparsed`` instead of raising an error.
FLAGS, unparsed = parser.parse_known_args()


def _clean_folder(folder):
    """Remove every file and sub-directory inside ``folder``.

    The folder itself is kept. Nothing happens if it does not exist.
    """
    if not os.path.exists(folder):
        return
    for entry_name in os.listdir(folder):
        entry_path = os.path.join(folder, entry_name)
        if os.path.isfile(entry_path):
            os.remove(entry_path)
        elif os.path.isdir(entry_path):
            shutil.rmtree(entry_path)


# clean checkpoint and model folder if exists
_clean_folder(FLAGS.chkpoint_folder)
_clean_folder(FLAGS.model_folder)

# read TF_CONFIG
run_config = tf.estimator.RunConfig()

# Step counts are derived from example counts and epochs; integer division
# keeps them ints (true division would produce 1200.0 / 100.0).
# NOTE(review): 60000 / 10000 are presumably the number of examples in
# train.tfrecords / test.tfrecords - confirm against the prepared data.
train_examples, train_epochs = 60000, 2
eval_examples, eval_epochs = 10000, 1
train_steps = train_examples * train_epochs // batch_size
eval_steps = eval_examples * eval_epochs // batch_size

# create Estimator
mnist_fullyconnected_classifier = tf.estimator.Estimator(
    model_fn=_my_model_fn,
    model_dir=FLAGS.chkpoint_folder,
    config=run_config)
train_spec = tf.estimator.TrainSpec(
    input_fn=_get_input_fn(
        os.path.join(FLAGS.data_folder, 'train.tfrecords'), train_epochs),
    max_steps=train_steps)
eval_spec = tf.estimator.EvalSpec(
    input_fn=_get_input_fn(
        os.path.join(FLAGS.data_folder, 'test.tfrecords'), eval_epochs),
    steps=eval_steps,
    start_delay_secs=0)

# run !
eval_res = tf.estimator.train_and_evaluate(
    mnist_fullyconnected_classifier,
    train_spec,
    eval_spec
)

# save model and variables
model_dir = mnist_fullyconnected_classifier.export_savedmodel(
    export_dir_base=FLAGS.model_folder,
    serving_input_receiver_fn=_my_serving_input_fn)
print('current working directory is ', os.getcwd())
print('model is saved ', model_dir)

# send logs to AML: the hyperparameters used and the final evaluation result
# (eval_res[0] is the dict of evaluation metrics)
run.log('learning_rate', FLAGS.learning_rate)
run.log('1st_layer', FLAGS.first_layer)
run.log('2nd_layer', FLAGS.second_layer)
run.log('final_accuracy', eval_res[0]['accuracy'])
run.log('final_loss', eval_res[0]['loss'])

Writing script/train_experiment.py


## Get workspace setting

Before starting, you must read your configuration settings. (See "[Exercise01 : Prepare Config Settings](./exercise01_prepare_config.ipynb)".)

In [3]:
from azureml.core import Workspace
import azureml.core

# Load the workspace handle from the saved configuration settings
# (the ones prepared in Exercise01).
ws = Workspace.from_config()

## Create AML compute

Create AML compute pool for computing environment.

In [4]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Name of the AML compute cluster used for the tuning runs.
cluster_name = 'hypertest01'

# Reuse the cluster when it already exists in the workspace; otherwise
# provision a new one (0 to 4 Standard_D2_v2 nodes) and wait until
# provisioning has finished.
try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    print('creating new.')
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='Standard_D2_v2',
        min_nodes=0,
        max_nodes=4)
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
    compute_target.wait_for_completion(show_output=True)
else:
    print('found existing:', compute_target.name)

creating new.
InProgress......
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [5]:
# Print the current status of the cluster (node counts, allocation state,
# scale settings, ...) as a serialized dict.
print(compute_target.status.serialize())

{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2021-07-12T02:44:31.432000+00:00', 'errors': None, 'creationTime': '2021-07-12T02:44:08.484005+00:00', 'modifiedTime': '2021-07-12T02:44:34.482116+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT1800S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_D2_V2'}


## Prepare Dataset

You can mount your dataset (See "[Exercise02 : Prepare Data](./exercise02_prepare_data.ipynb)") into your AML compute.<br>
Now we get the registered dataset reference.

In [6]:
from azureml.core import Dataset

# Get a reference to the latest version of the registered dataset.
dataset = Dataset.get_by_name(ws, 'mnist_tfrecords_dataset', version='latest')

# Alternative for data that is NOT registered as a dataset:
# build a file dataset directly from a path in the default datastore.
# from azureml.core import Datastore
# from azureml.core import Dataset
# ds = ws.get_default_datastore()
# ds_paths = [(ds, 'tfdata/')]
# dataset = Dataset.File.from_files(path = ds_paths)

## Generate Hyperparameter Sampling

Set how to explore the script's arguments (the arguments in ```train_experiment.py```).<br>
You can choose from ```GridParameterSampling```, ```RandomParameterSampling```, and ```BayesianParameterSampling```.

In [7]:
# Explicit imports instead of ``import *`` so it is clear where each name
# comes from. BanditPolicy, HyperDriveConfig and PrimaryMetricGoal are used
# by the HyperDrive config cell further down; the Grid/Bayesian sampling
# classes are imported so the alternatives mentioned above can be tried
# without changing this line.
from azureml.train.hyperdrive import (
    BanditPolicy,
    BayesianParameterSampling,
    GridParameterSampling,
    HyperDriveConfig,
    PrimaryMetricGoal,
    RandomParameterSampling,
    choice,
)

# Search space: each key is a command-line argument of train_experiment.py,
# and each run gets one randomly sampled value per argument.
param_sampling = RandomParameterSampling(
    {
        '--learning_rate': choice(0.01, 0.05, 0.9),
        '--first_layer': choice(100, 125, 150),
        '--second_layer': choice(30, 60, 90)
    }
)

## Generate script run config

In [8]:
from azureml.core import Environment, Experiment, ScriptRunConfig

# The dataset is mounted on the compute and its mount point is passed to
# the script as --data_folder.
script_arguments = ['--data_folder', dataset.as_mount()]

# Environment in which the script runs (TensorFlow 1.13, CPU)
tf_env = Environment.get(workspace=ws, name='AzureML-TensorFlow-1.13-CPU')

# Script run config: which script to run, with which arguments, where,
# and in which environment
src = ScriptRunConfig(
    source_directory='./script',
    script='train_experiment.py',
    arguments=script_arguments,
    compute_target=compute_target,
    environment=tf_env
)

## Generate HyperDrive config

Generate a run config with an early termination policy (```BanditPolicy```). With this policy, the training will terminate if the primary metric falls outside of the top 10% range (checking every 2 iterations).

In [9]:
# Early termination policy:
# a run is stopped when its primary metric falls outside of the top 10% (0.1)
# range, checked every 2 iterations.
policy = BanditPolicy(slack_factor=0.1, evaluation_interval=2)

# HyperDrive run config: wraps the script run config with the sampling
# space, the metric to maximize, the termination policy and the run limits.
hd_config = HyperDriveConfig(
    run_config=src,
    hyperparameter_sampling=param_sampling,
    policy=policy,
    primary_metric_name='training_accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=20,
    max_concurrent_runs=4)

## Run script and wait for completion

This will start training with 4 parallel nodes. (You can scale as you like.)

In [10]:
from azureml.core import Experiment

# Submit the HyperDrive configuration under the 'hyperdrive_test' experiment
# and wait for the run to complete, showing its log output in the notebook.
experiment = Experiment(workspace=ws, name='hyperdrive_test')
run = experiment.submit(config=hd_config)
run.wait_for_completion(show_output=True)

RunId: HD_ec00753d-b579-44c3-9bad-de871bf8298f
Web View: https://ml.azure.com/runs/HD_ec00753d-b579-44c3-9bad-de871bf8298f?wsid=/subscriptions/b3ae1c15-4fef-4362-8c3a-5d804cdeb18d/resourcegroups/TEST20210712/workspaces/ws01&tid=72f988bf-86f1-41af-91ab-2d7cd011db47

Streaming azureml-logs/hyperdrive.txt

"<START>[2021-07-12T03:18:20.164025][API][INFO]Experiment created<END>\n""<START>[2021-07-12T03:18:20.783424][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n""<START>[2021-07-12T03:18:21.255626][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n"

Execution Summary
RunId: HD_ec00753d-b579-44c3-9bad-de871bf8298f
Web View: https://ml.azure.com/runs/HD_ec00753d-b579-44c3-9bad-de871bf8298f?wsid=/subscriptions/b3ae1c15-4fef-4362-8c3a-5d804cdeb18d/resourcegroups/TEST20210712/workspaces/ws01&tid=72f988bf-86f1-41af-91ab-2d7cd011db47



{'runId': 'HD_ec00753d-b579-44c3-9bad-de871bf8298f',
 'target': 'hypertest01',
 'status': 'Completed',
 'startTimeUtc': '2021-07-12T03:18:19.778295Z',
 'endTimeUtc': '2021-07-12T03:27:04.682514Z',
 'properties': {'primary_metric_config': '{"name": "training_accuracy", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '5d5a755b-a30b-4c6c-bbeb-2e3f98bbc7fe',
  'score': '0.8945545554161072',
  'best_child_run_id': 'HD_ec00753d-b579-44c3-9bad-de871bf8298f_11',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://ws010492426588.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_ec00753d-b579-44c3-9bad-de871bf8298f/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=VQroknzRFLUXQtsjrnCcyh%2BVZzk7fcsnS0P%2F5sULEkE%3D&st=2021-07-12T03%3A17%3A12Z&se=2021-07-12T11%3A27%3A12Z&sp=r'},
 'submittedBy': 'Tsuyos

## View logs

You can view logs and metrics in Experiments on [Azure ML studio UI](https://ml.azure.com/).

![AML Hyperdrive Metrics](https://tsmatz.github.io/images/github/azure-ml-tensorflow-complete-sample/20220225_Hyperdrive_Metrics.jpg)

In your notebook, you can also view using AML run history widget as follows.

In [11]:
from azureml.widgets import RunDetails
# Show the run history widget for the submitted HyperDrive run.
RunDetails(run_instance=run).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

You can also explore the metrics with your Python code.

In [12]:
# Fetch the logged metrics of the HyperDrive run; the result is a dict keyed
# by child run id, each holding that run's metric values.
allmetrics = run.get_metrics()
print(allmetrics)

{'HD_ec00753d-b579-44c3-9bad-de871bf8298f_17': {'training_accuracy': [0.15800000727176666, 0.1574999988079071, 0.164000004529953, 0.17874999344348907, 0.19859999418258667, 0.2201666682958603, 0.24500000476837158, 0.2686249911785126, 0.28644445538520813, 0.2989000082015991, 0.3137272596359253, 0.328083336353302, 0.34184616804122925, 0.35428571701049805, 0.36453333497047424, 0.37406250834465027, 0.38370588421821594, 0.3915555477142334, 0.39794737100601196, 0.4038499891757965, 0.4124285578727722, 0.4189545512199402, 0.42543476819992065, 0.4321250021457672, 0.4369199872016907, 0.4426538348197937, 0.4475925862789154, 0.4525357186794281, 0.4594137966632843, 0.46533334255218506, 0.47148385643959045, 0.47715625166893005, 0.4821818172931671, 0.488029420375824, 0.4925428628921509, 0.49816668033599854, 0.5030270218849182, 0.5077105164527893, 0.5131282210350037, 0.5184000134468079, 0.5231951475143433, 0.5285476446151733, 0.533418595790863, 0.5388181805610657, 0.5428444147109985, 0.5473478436470032

## Remove AML compute

In [13]:
# Delete cluster (nodes) and remove from AML workspace
mycompute = AmlCompute(workspace=ws, name='hypertest01')
mycompute.delete()