# Install SageMaker Experiments

In [None]:
!pip install sagemaker-experiments

# Simple Experiments

In [None]:
from sagemaker import get_execution_role
from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
import time
from sagemaker.tensorflow import TensorFlow

# S3 location of the gzip-compressed pickle that the training jobs read.
input_uri = 's3://sagemaker-us-east-1-233037139193/mbp3/dataset/dataset.pkl.gz'

# Execution role of this notebook; handed to every estimator as `role`.
role = get_execution_role()

## Experiment creation

In [None]:
# Create the experiment that the trials in this notebook are attached to,
# then print the returned object for inspection.
my_experiment = Experiment.create(
    experiment_name="my-private-exp3",
    description="It's private",
)
print(my_experiment)

In [None]:
%%writefile mnist_softmax.py
import tensorflow as tf
import argparse
import os, time
import numpy as np
import json
import gzip, pickle

if __name__ == "__main__":
    
    start = time.time()
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_dir', type=str)
    parser.add_argument('--sm-model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAINING'))
    parser.add_argument('--hosts', type=list, default=json.loads(os.environ.get('SM_HOSTS')))
    parser.add_argument('--current-host', type=str, default=os.environ.get('SM_CURRENT_HOST'))  
    args, _ = parser.parse_known_args()

    input_path = os.path.join(args.train, 'dataset.pkl.gz')
    with gzip.open(input_path, 'rb') as f:
        train_data, train_label, test_data, test_label = pickle.load(f)
        
    model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(10, activation='softmax')
    ])

    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    
    model.fit(train_data, train_label, epochs=3, verbose=2)
    model.evaluate(test_data, test_label, verbose=0)
    
    model.save(os.path.join(args.sm_model_dir, '000000001'), 'my_model.h5')
        
    print("training time: {}".format(time.time() - start))

In [None]:
%%writefile mnist_simple_nn.py
import tensorflow as tf
import argparse
import os, time
import numpy as np
import json
import gzip, pickle

if __name__ == "__main__":
    
    start = time.time()
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_dir', type=str)
    parser.add_argument('--sm-model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAINING'))
    parser.add_argument('--hosts', type=list, default=json.loads(os.environ.get('SM_HOSTS')))
    parser.add_argument('--current-host', type=str, default=os.environ.get('SM_CURRENT_HOST'))  
    args, _ = parser.parse_known_args()

    input_path = os.path.join(args.train, 'dataset.pkl.gz')
    with gzip.open(input_path, 'rb') as f:
        train_data, train_label, test_data, test_label = pickle.load(f)
        
    model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(10, activation='softmax')
    ])

    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    
    model.fit(train_data, train_label, epochs=3, verbose=2)
    model.evaluate(test_data, test_label, verbose=0)
    
    model.save(os.path.join(args.sm_model_dir, '000000001'), 'my_model.h5')
        
    print("training time: {}".format(time.time() - start))

## Trial creation & run

In [None]:
# The epoch-seconds suffix gives each run of this cell a distinct name; the
# same name is reused as the training job name when the estimator is fitted.
trial_name = f"simple-nn-64-{int(time.time())}"
trial = Trial.create(
    experiment_name=my_experiment.experiment_name,
    trial_name=trial_name,
)

In [None]:
# TensorFlow 2.1 / Python 3 estimator for the 64-unit network script, trained
# on a single spot instance with a run limit and a larger overall wait limit.
estimator = TensorFlow(
    entry_point='mnist_simple_nn.py',
    framework_version='2.1.0',
    py_version='py3',
    role=role,
    train_instance_type='ml.m5.xlarge',
    train_instance_count=1,
    train_use_spot_instances=True,
    train_max_run=600,
    train_max_wait=1200,
)

In [None]:
# Start training on the dataset at `input_uri`. The job is named after the
# trial and recorded in that trial as a component displayed as "Training".
estimator.fit(
    inputs=input_uri,
    job_name=trial_name,
    experiment_config={
        "TrialComponentDisplayName": "Training",
        "TrialName": trial.trial_name,
    },
)

## Evaluate trials

In [None]:
# Filter used by the analytics queries: keep only trial components whose
# display name is exactly "Training".
search_expression = {
    "Filters": [
        {"Name": "DisplayName", "Operator": "Equals", "Value": "Training"},
    ],
}

In [None]:
from sagemaker.analytics import ExperimentAnalytics

# Query the "Training" components of this experiment, sorted by maximum
# evaluation accuracy in descending order. To analyse an experiment created in
# an earlier session, pass its name (e.g. "my-private-exp3") as
# `experiment_name` instead.
trial_component_analytics = ExperimentAnalytics(
    experiment_name=my_experiment.experiment_name,
    search_expression=search_expression,
    metric_names=['accuracy_EVAL', 'loss_EVAL'],
    parameter_names=['SageMaker.InstanceType', 'sagemaker_program'],
    sort_by="metrics.accuracy_EVAL.max",
    sort_order="Descending",
)

In [None]:
# Show the result of the analytics query as the cell output.
trial_component_analytics.dataframe()

# Debugger

In [None]:
! pip install smdebug

In [None]:
from smdebug.trials import create_trial
# Open a debugger trial on the artifacts path reported by `estimator` for its
# latest training job.
debug_trial = create_trial(estimator.latest_job_debugger_artifacts_path())

In [None]:
# List the tensor names available in the debugger trial.
debug_trial.tensor_names()

In [None]:
# List only the tensor names that belong to the "losses" collection.
debug_trial.tensor_names(collection="losses")

In [None]:
import matplotlib.pyplot as plt

# Plot the recorded 'loss' tensor against the steps at which it was saved.
plt.figure(num=1, figsize=(8, 8), dpi=80, facecolor='w', edgecolor='k')

tensor = debug_trial.tensor('loss')
steps = tensor.steps()
data = list(map(tensor.value, steps))  # one value per saved step

plt.plot(steps, data, label='Loss')

# Legend is anchored just outside the right edge of the axes.
plt.legend(loc='upper left', bbox_to_anchor=(1.04, 1))
plt.xlabel('Iteration')
plt.ylabel('')
plt.show()

# Hyperparameter trial loop

In [None]:
%%writefile mnist_simple_nn_h.py
import tensorflow as tf
import argparse
import os, time
import numpy as np
import json
import gzip, pickle

if __name__ == "__main__":
    
    start = time.time()
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_dir', type=str)
    parser.add_argument('--sm-model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAINING'))
    parser.add_argument('--hosts', type=list, default=json.loads(os.environ.get('SM_HOSTS')))
    parser.add_argument('--current-host', type=str, default=os.environ.get('SM_CURRENT_HOST'))
    parser.add_argument('--number-of-nodes', type=int, default=128) # parameterize
    args, _ = parser.parse_known_args()

    input_path = os.path.join(args.train, 'dataset.pkl.gz')
    with gzip.open(input_path, 'rb') as f:
        train_data, train_label, test_data, test_label = pickle.load(f)
        
    model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(args.number_of_nodes, activation='relu'),
        tf.keras.layers.Dense(10, activation='softmax')
    ])

    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    
    model.fit(train_data, train_label, epochs=3, verbose=2)
    model.evaluate(test_data, test_label, verbose=0)
    
    model.save(os.path.join(args.sm_model_dir, '000000001'), 'my_model.h5')
        
    print("training time: {}".format(time.time() - start))

In [None]:
# Train one trial per hidden-layer width and record each in the experiment.
for number_of_nodes in [32, 128]:
    # The epoch-seconds suffix keeps trial and training-job names unique across runs.
    trial_name = f"simple-nn-{number_of_nodes}-{int(time.time())}"
    trial = Trial.create(
        trial_name=trial_name,
        experiment_name=my_experiment.experiment_name,
    )

    estimator = TensorFlow(
        entry_point='mnist_simple_nn_h.py',
        role=role,
        train_instance_count=1,
        train_instance_type='ml.m5.xlarge',
        hyperparameters={
            # Spelled with hyphens so it arrives as '--number-of-nodes', the
            # option the training script declares. The underscore spelling is
            # not recognised by argparse and the script's default was used.
            'number-of-nodes': number_of_nodes,
        },
        train_use_spot_instances=True,
        train_max_run=600,
        train_max_wait=1200,
        framework_version='2.1.0',
        py_version='py3',
    )

    estimator.fit(
        inputs=input_uri,
        job_name=trial_name,
        experiment_config={
            "TrialName": trial.trial_name,
            "TrialComponentDisplayName": "Training",
        },
    )
    # Short pause between consecutive jobs (kept from the original cell).
    time.sleep(2)

In [None]:
# Re-run the "Training" component query now that more trials exist, sorted by
# maximum evaluation accuracy in descending order, and show the result.
trial_analytics2 = ExperimentAnalytics(
    experiment_name=my_experiment.experiment_name,
    search_expression=search_expression,
    metric_names=['accuracy_EVAL', 'loss_EVAL'],
    parameter_names=['SageMaker.InstanceType', 'sagemaker_program'],
    sort_by="metrics.accuracy_EVAL.max",
    sort_order="Descending",
)
trial_analytics2.dataframe()

# Hyperparameter optimization

In [None]:
# Base estimator for hyperparameter tuning: same script and instance settings
# as the manual trials, but with no fixed hyperparameters.
hpo_estimator = TensorFlow(
    entry_point='mnist_simple_nn_h.py',
    framework_version='2.1.0',
    py_version='py3',
    role=role,
    train_instance_type='ml.m5.xlarge',
    train_instance_count=1,
    train_use_spot_instances=True,
    train_max_run=600,
    train_max_wait=1200,
)

In [None]:
from sagemaker.tuner import IntegerParameter, HyperparameterTuner

# Tune the hidden-layer width to minimise the 'loss' value parsed from the
# training log.
tuner = HyperparameterTuner(
    estimator=hpo_estimator,
    objective_metric_name='loss',
    objective_type='Minimize',
    hyperparameter_ranges={
        # Spelled with hyphens so it arrives as '--number-of-nodes', the option
        # the training script declares. The underscore spelling is not
        # recognised by argparse, so every job trained the default width and
        # the search had no effect.
        'number-of-nodes': IntegerParameter(32, 128),
    },
    metric_definitions=[{
        'Name': 'loss',
        'Regex': 'loss: ([0-9\\.]+)',
    }],
    max_jobs=10,
    max_parallel_jobs=2,  # open question from the author: what number is the best?
    early_stopping_type='Auto',
)

In [None]:
import time

# Name the tuning job after the current UTC time, then launch it on the dataset.
timestamp = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
hpo_job_name = 'hpo-job-{}'.format(timestamp)
tuner.fit(
    inputs=input_uri,
    job_name=hpo_job_name,
)

## HPO Results

In [None]:
import boto3

sm_client = boto3.client('sagemaker')

# Run this cell to check the current status of the hyperparameter tuning job.
# To inspect an earlier job, set `hpo_job_name` to its name first
# (e.g. 'hpo-job-2020-07-01-05-41-03').
tuning_job_result = sm_client.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=hpo_job_name
)

status = tuning_job_result['HyperParameterTuningJobStatus']
if status != 'Completed':
    print('Reminder: the tuning job has not been completed.')

job_count = tuning_job_result['TrainingJobStatusCounters']['Completed']
print("%d training jobs have completed" % job_count)

# Objective settings read back from the job: the metric name, and whether it is
# being minimised (any type other than 'Maximize' counts as minimise).
objective_name = tuning_job_result['HyperParameterTuningJobConfig']['HyperParameterTuningJobObjective']['MetricName']
is_minimize = not (
    tuning_job_result['HyperParameterTuningJobConfig']['HyperParameterTuningJobObjective']['Type'] == 'Maximize'
)

In [None]:
import pandas as pd
from sagemaker import HyperparameterTuningJobAnalytics

tuner_result = HyperparameterTuningJobAnalytics(hpo_job_name)

full_df = tuner_result.dataframe()

# Start from the full (possibly empty) frame so `df` is always defined: the
# original left it unbound when no jobs existed, which raised NameError on the
# last line of this cell and in the plotting cells that read `df`.
df = full_df

if len(full_df) > 0:
    # Drop jobs whose objective is -inf (no valid result reported).
    df = full_df[full_df['FinalObjectiveValue'] > -float('inf')]

if len(df) > 0:
    # Best job first: ascending when minimising, descending when maximising.
    df = df.sort_values('FinalObjectiveValue', ascending=is_minimize)
    print("Number of training jobs with valid objective: %d" % len(df))
    print({"lowest":min(df['FinalObjectiveValue']),"highest": max(df['FinalObjectiveValue'])})
    # Don't truncate TrainingJobName. None is the supported "unlimited" value
    # in pandas 1.0+; fall back to the legacy -1 on older pandas, which
    # rejects None.
    try:
        pd.set_option('display.max_colwidth', None)
    except ValueError:
        pd.set_option('display.max_colwidth', -1)
else:
    print("No training jobs have reported valid results yet.")

df

In [None]:
# This is for showing the HPO results by time
import bokeh
import bokeh.io
# Configure Bokeh to render its output in the notebook.
bokeh.io.output_notebook()
from bokeh.plotting import figure, show
from bokeh.models import HoverTool

class HoverHelper:
    """Builds Bokeh tool lists whose hover tooltips describe tuning results."""

    def __init__(self, tuning_analytics):
        # Analytics object; its `tuning_ranges` keys name the tuned hyperparameters.
        self.tuner = tuning_analytics

    def hovertool(self):
        """Return a HoverTool showing the objective, the job name and each tuned hyperparameter."""
        fixed_fields = [
            ("FinalObjectiveValue", "@FinalObjectiveValue"),
            ("TrainingJobName", "@TrainingJobName"),
        ]
        # One tooltip row per tuned hyperparameter; the name is wrapped in
        # braces inside the "@{...}" field reference.
        range_fields = [
            (name, "@{%s}" % name) for name in self.tuner.tuning_ranges.keys()
        ]
        return HoverTool(tooltips=fixed_fields + range_fields)

    def tools(self, standard_tools='pan,crosshair,wheel_zoom,zoom_in,zoom_out,undo,reset'):
        """Return a list holding the hover tool followed by the `standard_tools` string."""
        hover_tool = self.hovertool()
        return [hover_tool, standard_tools]

hover = HoverHelper(tuner_result)

# Scatter each training job's final objective value against its start time.
p = figure(
    plot_width=900,
    plot_height=400,
    tools=hover.tools(),
    x_axis_type='datetime',
)
p.circle(source=df, x='TrainingStartTime', y='FinalObjectiveValue')
show(p)

In [None]:
# Imported explicitly because `bokeh.layouts.Column` is used below; the
# original relied on another import having loaded the submodule.
import bokeh.layouts


def is_num(x):
    """Return True when `x` can be converted to a float."""
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not numeric"; anything else
        # (e.g. KeyboardInterrupt) is no longer swallowed by a bare except.
        return False
    return True


# Draw one "objective vs hyperparameter" scatter per tuned hyperparameter.
ranges = tuner_result.tuning_ranges
figures = []
for hp_name, hp_range in ranges.items():
    categorical_args = {}
    if hp_range.get('Values'):
        # This is marked as categorical.  Check if all options are actually numbers.
        vals = hp_range['Values']
        if all(is_num(x) for x in vals):
            # Bokeh has issues plotting a "categorical" range that's actually numeric, so plot as numeric
            print("Hyperparameter %s is tuned as categorical, but all values are numeric" % hp_name)
        else:
            # Set up extra options for plotting categoricals.  A bit tricky when they're actually numbers.
            categorical_args['x_range'] = vals

    # Now plot it
    p = figure(plot_width=500, plot_height=500,
               title="Objective vs %s" % hp_name,
               tools=hover.tools(),
               x_axis_label=hp_name, y_axis_label=objective_name,
               **categorical_args)
    p.circle(source=df, x=hp_name, y='FinalObjectiveValue')
    figures.append(p)
# Stack all figures vertically in a single output.
show(bokeh.layouts.Column(*figures))

# Overall Experiment Results

In [None]:
# Experiment that the final trial is attached to and that the closing
# analytics query reads from.
# NOTE(review): no cell in this notebook creates an experiment with this name —
# presumably it already exists in the account; confirm before running.
experiment_name = "our-experiment-3"

In [None]:
# Estimator for the final run with a fixed hidden-layer width of 83. It also
# reports the script's printed "training time" as the 'Training:seconds' metric.
final_estimator = TensorFlow(
    entry_point='mnist_simple_nn_h.py',
    role=role,
    train_instance_count=1,
    train_instance_type='ml.m5.xlarge',
    hyperparameters={
        # Spelled with hyphens so it arrives as '--number-of-nodes', the option
        # the training script declares. The underscore spelling is not
        # recognised by argparse and the script's default width was used.
        'number-of-nodes': 83,
    },
    metric_definitions=[
        {'Name': 'Training:seconds', 'Regex': 'training time: ([0-9\\.]+)'},
    ],
    train_use_spot_instances=True,
    train_max_run=600,
    train_max_wait=1200,
    framework_version='2.1.0',
    py_version='py3',
)

In [None]:
# Create a uniquely named trial in the `experiment_name` experiment and train
# the final estimator under that same name.
my_trial_name = f"simple-nn-83-{int(time.time())}"
my_trial = Trial.create(
    experiment_name=experiment_name,
    trial_name=my_trial_name,
)
final_estimator.fit(
    inputs=input_uri,
    job_name=my_trial_name,
    experiment_config={
        "TrialComponentDisplayName": "Training",
        "TrialName": my_trial.trial_name,
    },
)

In [None]:
# Query every "Training" component of the `experiment_name` experiment, with
# the training-time metric included, sorted by maximum evaluation accuracy in
# descending order, and show the result.
all_trial_analytics = ExperimentAnalytics(
    experiment_name=experiment_name,
    search_expression=search_expression,
    metric_names=['accuracy_EVAL', 'loss_EVAL', 'Training:seconds'],
    parameter_names=['SageMaker.InstanceType', 'sagemaker_program'],
    sort_by="metrics.accuracy_EVAL.max",
    sort_order="Descending",
)
all_trial_analytics.dataframe()