In [1]:

import os
import numpy as np
from tensorflow.keras.datasets import fashion_mnist

# Pull the Fashion-MNIST arrays through Keras (downloads on first use).
(x_train, y_train), (x_val, y_val) = fashion_mnist.load_data()

# Persist each split as an .npz archive holding an `image` and a `label`
# array; np.savez appends the ".npz" extension to the given path.
os.makedirs("./data", exist_ok=True)
for archive, images, labels in (
    ('./data/training', x_train, y_train),
    ('./data/validation', x_val, y_val),
):
    np.savez(archive, image=images, label=labels)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz


In [2]:
import sagemaker

# Record the SDK version next to the results for reproducibility.
print(sagemaker.__version__)

# Session, IAM role and default bucket used by every later cell.
sess = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sess.default_bucket()
prefix = 'keras2-fashion-mnist'

# Upload the two .npz archives written earlier; each call returns the S3 URI
# of the uploaded object.
training_input_path = sess.upload_data(
    'data/training.npz',
    key_prefix=prefix + '/training',
)
validation_input_path = sess.upload_data(
    'data/validation.npz',
    key_prefix=prefix + '/validation',
)

# Destinations for model artifacts and checkpoints under the same prefix.
output_path = f's3://{bucket}/{prefix}/output/'
chk_path = f's3://{bucket}/{prefix}/checkpoints/'

print(
    training_input_path,
    validation_input_path,
    output_path,
    chk_path,
    sep='\n',
)

2.42.1
s3://sagemaker-us-east-1-886035371869/keras2-fashion-mnist/training/training.npz
s3://sagemaker-us-east-1-886035371869/keras2-fashion-mnist/validation/validation.npz
s3://sagemaker-us-east-1-886035371869/keras2-fashion-mnist/output/
s3://sagemaker-us-east-1-886035371869/keras2-fashion-mnist/checkpoints/


In [3]:
# Show the default S3 bucket name returned by sess.default_bucket().
bucket

'sagemaker-us-east-1-886035371869'

In [8]:
# Show the execution role ARN returned by sagemaker.get_execution_role().
# NOTE(review): the rendered ARN includes the AWS account id; consider
# clearing this output before sharing the notebook.
role

'arn:aws:iam::886035371869:role/torchserve-workshop-SageMakerAPIExecutionRole'

In [5]:
from sagemaker.tensorflow import TensorFlow

# Estimator describing a single training job for the fmnist3.py script.
# NOTE(review): chk_path (the checkpoints prefix computed earlier) is not
# passed to this estimator even though spot instances are enabled; confirm
# whether a checkpoint location was intended.
tf_estimator = TensorFlow(
    # What to run and with which permissions.
    entry_point='fmnist3.py',
    role=role,
    framework_version='2.1.0',
    py_version='py3',
    hyperparameters={'epochs': 2},
    # Where to run it.
    instance_type='ml.p3.2xlarge',
    instance_count=1,
    # Where the model artifact is written.
    output_path=output_path,
    # Spot capacity: max_run and max_wait are expressed in seconds.
    use_spot_instances=True,
    max_run=3600,
    max_wait=7200,
)

In [6]:
# Metric the tuner optimises, and the direction of optimisation.
objective_metric_name = 'val_acc'
objective_type = 'Maximize'

# One metric definition: the capture group of the regex extracts the number
# that follows "Best val_accuracy: ".
metric_definitions = [
    dict(Name=objective_metric_name, Regex='Best val_accuracy: ([0-9\\.]+)'),
]

In [7]:
from sagemaker.tuner import ContinuousParameter, IntegerParameter

# Search space explored by the tuner, keyed by hyperparameter name.
hyperparameter_ranges = {
    # Learning rate searched on a logarithmic scale between 0.001 and 0.2.
    'learning_rate': ContinuousParameter(
        min_value=0.001,
        max_value=0.2,
        scaling_type='Logarithmic',
    ),
    # Integer batch size between 32 and 512.
    'batch-size': IntegerParameter(min_value=32, max_value=512),
}

In [8]:
from sagemaker.tuner import HyperparameterTuner

# Tuning job configuration: 4 training jobs in total, at most 2 running
# concurrently, with automatic early stopping enabled.
tuner = HyperparameterTuner(
    estimator=tf_estimator,
    objective_metric_name=objective_metric_name,
    objective_type=objective_type,
    metric_definitions=metric_definitions,
    hyperparameter_ranges=hyperparameter_ranges,
    max_jobs=4,
    max_parallel_jobs=2,
    early_stopping_type='Auto',
)

In [9]:
# Launch the tuning job with the uploaded archives as the `training` and
# `validation` input channels.
tuner.fit(
    inputs=dict(
        training=training_input_path,
        validation=validation_input_path,
    )
)


..........................................................................................................................................................................................................!


In [10]:
from sagemaker.analytics import HyperparameterTuningJobAnalytics

# Fetch the results of the tuning job that was just launched.
exp = HyperparameterTuningJobAnalytics(
    hyperparameter_tuning_job_name=tuner.latest_tuning_job.name,
)

# One row per training job.
jobs = exp.dataframe()

# Display the jobs ranked best-first by their final objective value.
jobs.sort_values(by='FinalObjectiveValue', ascending=False)

Unnamed: 0,batch-size,learning_rate,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
1,87.0,0.120563,tensorflow-training-210615-0118-003-358756ff,Completed,0.8944,2021-06-15 01:32:15+00:00,2021-06-15 01:34:43+00:00,148.0
0,88.0,0.2,tensorflow-training-210615-0118-004-4bcd244a,Completed,0.8868,2021-06-15 01:29:04+00:00,2021-06-15 01:31:32+00:00,148.0
2,264.0,0.194908,tensorflow-training-210615-0118-002-1d6ac9b5,Completed,0.7083,2021-06-15 01:21:40+00:00,2021-06-15 01:24:09+00:00,149.0
3,490.0,0.002266,tensorflow-training-210615-0118-001-a952361a,Completed,0.3181,2021-06-15 01:22:12+00:00,2021-06-15 01:24:04+00:00,112.0


In [11]:

import time

# Endpoint name made unique with a UTC timestamp suffix.
tf_endpoint_name = ''.join([
    'keras-tf-fmnist-',
    time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime()),
])

# Deploy through the tuner onto a single ml.m5.large instance.
tf_predictor = tuner.deploy(
    endpoint_name=tf_endpoint_name,
    instance_type='ml.m5.large',
    initial_instance_count=1,
)


2021-06-13 15:16:22 Starting - Preparing the instances for training
2021-06-13 15:16:22 Downloading - Downloading input data
2021-06-13 15:16:22 Training - Training image download completed. Training in progress.
2021-06-13 15:16:22 Uploading - Uploading generated training model
2021-06-13 15:16:22 Completed - Training job completed

update_endpoint is a no-op in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.



-------------!