In [1]:
# Copyright 2019 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Train MNIST with SageMaker Local Execution

This notebook demonstrates how to train a TensorFlow model using the AWS SageMaker framework in 'local' mode and deploy that model as an endpoint for inference.

## Overview
This example uses SageMaker's 'script mode', added after TensorFlow 1.11, which makes training a model on SageMaker very similar to running a script locally or on Google Cloud AI Platform.

The TensorFlow model you will be running in this example is located in trainer/task.py and trainer/model.py (the `source_dir` passed to the estimator below is `trainer/`). The file task.py contains the driver loop for training the model. The file model.py contains functions that define the model and the input function needed for reading the data. Without any changes you should be able to run task.py from the command line as a script.

This notebook shows you how to run that same script locally using the TensorFlow estimator from the SageMaker Python SDK.



In [2]:
from sagemaker.tensorflow import TensorFlow  # Allows sagemaker to train using the TensorFlow estimator API.
import numpy as np

## Local Model Training

In [3]:
# STOP! IMPORTANT!
# SageMaker requires an IAM role for SageMaker execution. Create that role in IAM
# and assign its identity to `role`, for example:
#     role='arn:aws:iam::XXXXXXXXXXXX:role/sagemaker-local'
# The role is loaded from a separate module here so that the author's own role
# (which would not work for you) is not used by accident.
# Delete the two lines below and replace them with the role= line above.
import sagemaker_role
role = sagemaker_role.role

# Parameters and hyperparameters collected in this dictionary are sent to the
# command line of the training script when the model is fit.
hyperparameters = {
    'steps': 12000,  # 600 steps per epoch, 20 epochs
    'batch-size': 100,
    'learning-rate': 0.001,
    'verbosity': 'INFO',
}

# Build the SageMaker estimator with the sagemaker.tensorflow Python SDK.
# entry_point is the training script inside source_dir; the 'local' instance
# type selects local execution instead of a hosted training instance.
tf_estimator = TensorFlow(
    entry_point='task.py',
    source_dir='trainer/',
    role=role,
    framework_version='1.12',
    py_version='py3',
    train_instance_count=1,
    train_instance_type='local',
    hyperparameters=hyperparameters,
)

In [4]:
# With the estimator created, .fit() starts training. Each key of the input
# dictionary names a data channel: the value for 'train' is exposed through the
# environment variable SM_CHANNEL_TRAIN and the value for 'eval' through
# SM_CHANNEL_EVAL. Your SageMaker script must read these locations to find the
# model training and eval data.
#
# Calling .fit() in local mode causes SageMaker to start a Docker container on
# your machine that contains the SageMaker TensorFlow training image.
data_channels = {
    'train': 's3://sagemaker-us-east-2-708267171719/sagemaker/ml-model-migration/data/mnist/train',
    'eval': 's3://sagemaker-us-east-2-708267171719/sagemaker/ml-model-migration/data/mnist/test',
}

tf_estimator.fit(data_channels)



Creating tmp_f_209z7_algo-1-us3ds_1 ... 
[1BAttaching to tmp_f_209z7_algo-1-us3ds_12mdone[0m
[36malgo-1-us3ds_1  |[0m 2019-07-16 20:38:48,791 sagemaker-containers INFO     Imported framework sagemaker_tensorflow_container.training
[36malgo-1-us3ds_1  |[0m 2019-07-16 20:38:48,805 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)
[36malgo-1-us3ds_1  |[0m 2019-07-16 20:38:49,168 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)
[36malgo-1-us3ds_1  |[0m 2019-07-16 20:38:49,196 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)
[36malgo-1-us3ds_1  |[0m 2019-07-16 20:38:49,210 sagemaker-containers INFO     Invoking user script
[36malgo-1-us3ds_1  |[0m 
[36malgo-1-us3ds_1  |[0m Training Env:
[36malgo-1-us3ds_1  |[0m 
[36malgo-1-us3ds_1  |[0m {
[36malgo-1-us3ds_1  |[0m     "additional_framework_parameters": {},
[36malgo-1-us3ds_1  |[0m     "channel_input_dirs": {
[36malgo-1-us3ds_1  |[0m  

[36malgo-1-us3ds_1  |[0m INFO:tensorflow:Using config: {'_model_dir': 's3://sagemaker-us-east-2-708267171719/sagemaker-tensorflow-scriptmode-2019-07-16-20-38-37-418/model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 6000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
[36malgo-1-us3ds_1  |[0m graph_options {
[36malgo-1-us3ds_1  |[0m   rewrite_options {
[36malgo-1-us3ds_1  |[0m     meta_optimizer_iterations: ONE
[36malgo-1-us3ds_1  |[0m   }
[36malgo-1-us3ds_1  |[0m }
[36malgo-1-us3ds_1  |[0m , '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fb1eb900208>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '

[36malgo-1-us3ds_1  |[0m INFO:tensorflow:global_step/sec: 25.9568
[36malgo-1-us3ds_1  |[0m INFO:tensorflow:loss = 0.104231246, step = 901 (3.853 sec)
[36malgo-1-us3ds_1  |[0m INFO:tensorflow:global_step/sec: 26.2533
[36malgo-1-us3ds_1  |[0m INFO:tensorflow:loss = 0.02981455, step = 1001 (3.809 sec)
[36malgo-1-us3ds_1  |[0m INFO:tensorflow:global_step/sec: 25.7262
[36malgo-1-us3ds_1  |[0m INFO:tensorflow:loss = 0.035081938, step = 1101 (3.887 sec)
[36malgo-1-us3ds_1  |[0m INFO:tensorflow:global_step/sec: 25.7279
[36malgo-1-us3ds_1  |[0m INFO:tensorflow:loss = 0.027441312, step = 1201 (3.887 sec)
[36malgo-1-us3ds_1  |[0m INFO:tensorflow:global_step/sec: 26.5801
[36malgo-1-us3ds_1  |[0m INFO:tensorflow:loss = 0.018025596, step = 1301 (3.762 sec)
[36malgo-1-us3ds_1  |[0m INFO:tensorflow:global_step/sec: 26.1506
[36malgo-1-us3ds_1  |[0m INFO:tensorflow:loss = 0.11797112, step = 1401 (3.824 sec)
[36malgo-1-us3ds_1  |[0m INFO:tensorflow:global_step/sec: 27.0888
[36m

[36malgo-1-us3ds_1  |[0m INFO:tensorflow:Graph was finalized.
[36malgo-1-us3ds_1  |[0m INFO:tensorflow:Restoring parameters from s3://sagemaker-us-east-2-708267171719/sagemaker-tensorflow-scriptmode-2019-07-16-20-38-37-418/model/model.ckpt-6000
[36malgo-1-us3ds_1  |[0m INFO:tensorflow:Running local_init_op.
[36malgo-1-us3ds_1  |[0m INFO:tensorflow:Done running local_init_op.
[36malgo-1-us3ds_1  |[0m INFO:tensorflow:Evaluation [60/600]
[36malgo-1-us3ds_1  |[0m INFO:tensorflow:Evaluation [120/600]
[36malgo-1-us3ds_1  |[0m INFO:tensorflow:Evaluation [180/600]
[36malgo-1-us3ds_1  |[0m INFO:tensorflow:Evaluation [240/600]
[36malgo-1-us3ds_1  |[0m INFO:tensorflow:Evaluation [300/600]
[36malgo-1-us3ds_1  |[0m INFO:tensorflow:Evaluation [360/600]
[36malgo-1-us3ds_1  |[0m INFO:tensorflow:Evaluation [420/600]
[36malgo-1-us3ds_1  |[0m INFO:tensorflow:Evaluation [480/600]
[36malgo-1-us3ds_1  |[0m INFO:tensorflow:Evaluation [540/600]
[36malgo-1-us3ds_1  |[0m INFO:tensorf

[36malgo-1-us3ds_1  |[0m INFO:tensorflow:global_step/sec: 23.7819
[36malgo-1-us3ds_1  |[0m INFO:tensorflow:loss = 0.0023001344, step = 9401 (4.212 sec)
[36malgo-1-us3ds_1  |[0m INFO:tensorflow:global_step/sec: 24.7211
[36malgo-1-us3ds_1  |[0m INFO:tensorflow:loss = 0.018700419, step = 9501 (4.040 sec)
[36malgo-1-us3ds_1  |[0m INFO:tensorflow:global_step/sec: 25.1949
[36malgo-1-us3ds_1  |[0m INFO:tensorflow:loss = 0.0073048444, step = 9601 (3.967 sec)
[36malgo-1-us3ds_1  |[0m INFO:tensorflow:global_step/sec: 24.4175
[36malgo-1-us3ds_1  |[0m INFO:tensorflow:loss = 0.00087744923, step = 9701 (4.095 sec)
[36malgo-1-us3ds_1  |[0m INFO:tensorflow:global_step/sec: 27.216
[36malgo-1-us3ds_1  |[0m INFO:tensorflow:loss = 0.008382603, step = 9801 (3.675 sec)
[36malgo-1-us3ds_1  |[0m INFO:tensorflow:global_step/sec: 25.7021
[36malgo-1-us3ds_1  |[0m INFO:tensorflow:loss = 0.0040357076, step = 9901 (3.890 sec)
[36malgo-1-us3ds_1  |[0m INFO:tensorflow:global_step/sec: 25.263

## Local Endpoint Deployment

Now that your model has been trained, it can be deployed by calling .deploy() on the same Estimator instance we created above. The instance_type is set to 'local', which creates the endpoint in a Docker container on your local machine.

In [8]:

# Deploy the trained estimator as a single endpoint using the 'local' instance
# type, and keep the returned predictor for the inference calls that follow.
mnist_predictor = tf_estimator.deploy(instance_type='local', initial_instance_count=1)

W0716 15:48:31.788869 4464944576 connectionpool.py:662] Retrying (Retry(total=2, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<urllib3.connection.HTTPConnection object at 0x147f9e940>: Failed to establish a new connection: [Errno 61] Connection refused')': /ping
W0716 15:48:31.795378 4464944576 connectionpool.py:662] Retrying (Retry(total=1, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<urllib3.connection.HTTPConnection object at 0x147f9e668>: Failed to establish a new connection: [Errno 61] Connection refused')': /ping
W0716 15:48:31.797409 4464944576 connectionpool.py:662] Retrying (Retry(total=0, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<urllib3.connection.HTTPConnection object at 0x147f9e710>: Failed to establish a new connection: [Errno 61] Connection refused')': /ping


Attaching to tmpt3mxknya_algo-1-mjh6b_1
[36malgo-1-mjh6b_1  |[0m INFO:__main__:starting services
[36malgo-1-mjh6b_1  |[0m INFO:__main__:using default model name: model
[36malgo-1-mjh6b_1  |[0m INFO:__main__:tensorflow serving model config: 
[36malgo-1-mjh6b_1  |[0m model_config_list: {
[36malgo-1-mjh6b_1  |[0m   config: {
[36malgo-1-mjh6b_1  |[0m     name: "model",
[36malgo-1-mjh6b_1  |[0m     base_path: "/opt/ml/model",
[36malgo-1-mjh6b_1  |[0m     model_platform: "tensorflow"
[36malgo-1-mjh6b_1  |[0m   },
[36malgo-1-mjh6b_1  |[0m }
[36malgo-1-mjh6b_1  |[0m 
[36malgo-1-mjh6b_1  |[0m 
[36malgo-1-mjh6b_1  |[0m INFO:__main__:nginx config: 
[36malgo-1-mjh6b_1  |[0m load_module modules/ngx_http_js_module.so;
[36malgo-1-mjh6b_1  |[0m 
[36malgo-1-mjh6b_1  |[0m worker_processes auto;
[36malgo-1-mjh6b_1  |[0m daemon off;
[36malgo-1-mjh6b_1  |[0m pid /tmp/nginx.pid;
[36malgo-1-mjh6b_1  |[0m error_log  /dev/stderr info;
[36malgo-1-mjh6b_1  |[0m 
[36malgo-

![36malgo-1-mjh6b_1  |[0m 172.18.0.1 - - [16/Jul/2019:20:48:36 +0000] "GET /ping HTTP/1.1" 200 0 "-" "-"


Lastly, you can test the deployed endpoint by passing MNIST data to it. The predictor's .predict() function will
take care of the REST calls for you.

In [9]:
from keras.datasets import mnist
def load_mnist_data():
    """Load MNIST via ``mnist.load_data()`` and prepare it for the model.

    Both image arrays are reshaped to ``(-1, 28, 28, 1)``, cast to float32 and
    divided by 255. Label arrays are passed through unchanged.

    Returns:
        A ``(train_data, test_data)`` tuple. Each element is a dict with the
        keys ``'images'`` and ``'labels'``.
    """
    def _prepare_images(raw_images):
        # Add the trailing channel axis, convert to float32, then scale by 255.
        prepared = np.reshape(raw_images, [-1, 28, 28, 1]).astype(np.float32)
        prepared /= 255
        return prepared

    (train_images, train_labels), (test_images, test_labels) = mnist.load_data()

    train_data = {'images': _prepare_images(train_images), 'labels': train_labels}
    test_data = {'images': _prepare_images(test_images), 'labels': test_labels}
    return train_data, test_data

train_data, test_data = load_mnist_data()

# Only the test split is used for the predictions below.
test_images = test_data['images']
test_labels = test_data['labels']

# Send test examples 1 through 9 to the deployed endpoint, one at a time.
for ex in range(1, 10):
    # Shape the single test image as a batch of one: (1, 28, 28, 1).
    example = test_images[ex].reshape(1, 28, 28, 1)
    # The response is a dict of the form {'predictions': [[...]]}.
    predictions = mnist_predictor.predict(example)
    # predictions['predictions'][0] holds the network's softmax activations;
    # the index of the largest activation is the predicted digit.
    predicted_label = np.argmax(predictions['predictions'][0])
    label = test_labels[ex]
    print("Example {}: Predicted label: {}  Actual label:{}".format(ex, predicted_label, label))

Example 1: Predicted label: 2  Actual label:2
[36malgo-1-mjh6b_1  |[0m 172.18.0.1 - - [16/Jul/2019:20:48:41 +0000] "POST /invocations HTTP/1.1" 200 151 "-" "-"
Example 2: Predicted label: 1  Actual label:1
[36malgo-1-mjh6b_1  |[0m 172.18.0.1 - - [16/Jul/2019:20:48:41 +0000] "POST /invocations HTTP/1.1" 200 148 "-" "-"
Example 3: Predicted label: 0  Actual label:0
[36malgo-1-mjh6b_1  |[0m 172.18.0.1 - - [16/Jul/2019:20:48:41 +0000] "POST /invocations HTTP/1.1" 200 152 "-" "-"
Example 4: Predicted label: 4  Actual label:4
[36malgo-1-mjh6b_1  |[0m 172.18.0.1 - - [16/Jul/2019:20:48:41 +0000] "POST /invocations HTTP/1.1" 200 151 "-" "-"
Example 5: Predicted label: 1  Actual label:1
[36malgo-1-mjh6b_1  |[0m 172.18.0.1 - - [16/Jul/2019:20:48:41 +0000] "POST /invocations HTTP/1.1" 200 152 "-" "-"
Example 6: Predicted label: 4  Actual label:4
[36malgo-1-mjh6b_1  |[0m 172.18.0.1 - - [16/Jul/2019:20:48:41 +0000] "POST /invocations HTTP/1.1" 200 156 "-" "-"
Example 7: Predicted label: 