In [2]:
# Copyright 2019 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Train MNIST with SageMaker Cloud/Remote Execution

This notebook demonstrates how to train a TensorFlow model using the AWS SageMaker framework and deploy that model as an endpoint for inference.

## Overview
This example uses SageMaker's 'script mode', added after TensorFlow 1.11, which makes training a model on SageMaker very similar to running a script locally or on Google Cloud AI Platform.

The TensorFlow model you will be running in this example is located in `training_job/task.py` and `training_job/model.py`. The file `task.py` contains the driver loop for training the model. The file `model.py` contains functions that define the model and the input function needed for reading the data. Without any changes, you should be able to run `task.py` from the command line as a script.

This notebook shows you how to run that same script remotely on SageMaker-managed cloud instances using the TensorFlow estimator from the SageMaker Python SDK.

In [1]:
from sagemaker.tensorflow import TensorFlow
import numpy as np
import sagemaker

## Cloud Model Training

In [7]:
# Look up the SageMaker role assigned to this notebook instance. The training
# job runs under this role, which carries the permissions the job needs.
role = sagemaker.get_execution_role()

# Parameters and hyperparameters that are passed to the entry-point script on
# its command line when the estimator is fit.
hyperparameters = {
    'steps': 12000,  # 600 steps per epoch, 20 epochs
    'batch-size': 100,
    'learning-rate': 0.001,
    'verbosity': 'INFO',
}

# Describe the training job with the TensorFlow estimator from the SageMaker
# Python SDK.
#
# Note: this example uses a CPU-only instance, ml.m4.xlarge. If you wish to use
# a GPU instance such as 'ml.p2.xlarge' you may need to request a quota
# increase via http://aws.amazon.com/contact-us/ec2-request
tf_estimator = TensorFlow(
    entry_point='task.py',
    source_dir='trainer/',
    role=role,
    framework_version='1.12',
    py_version='py3',
    train_instance_type='ml.m4.xlarge',
    train_instance_count=1,
    hyperparameters=hyperparameters,
)

In [None]:
# With the estimator created, .fit() starts the training job. Each key of the
# inputs mapping names a data channel: the location given for 'train' is
# exposed to the script through the environment variable SM_CHANNEL_TRAIN and
# the location given for 'eval' through SM_CHANNEL_EVAL. Your training script
# must read these variables to find the model training and eval data.
#
# The estimator is configured with an 'ml.m4.xlarge' instance type rather than
# local mode, so the job is expected to run on a SageMaker-managed instance
# instead of in a docker container on your machine.

tf_estimator.fit(dict(
    train='s3://sagemaker-us-east-2-708267171719/sagemaker/ml-model-migration/data/mnist/train',
    eval='s3://sagemaker-us-east-2-708267171719/sagemaker/ml-model-migration/data/mnist/test',
))

## Remote Endpoint Deployment

Now that your model has been trained, it can be deployed by calling `.deploy()` on the same Estimator instance you created above. The `instance_type` argument is set to the AWS machine type you would like to deploy to.

In [5]:
# Deploy the trained model to an endpoint backed by one ml.t2.medium instance
# and keep the returned predictor for the inference calls.
# NOTE(review): nothing in the visible cells deletes this endpoint; presumably
# it keeps running (and billing) until removed - confirm cleanup is handled.
mnist_predictor = tf_estimator.deploy(
    instance_type='ml.t2.medium',
    initial_instance_count=1,
)

--------------------------------------------------------------------------------------!

In [6]:
from keras.datasets import mnist
def load_mnist_data():
    """Load MNIST via ``mnist.load_data()`` and prepare it as model input.

    Images are reshaped to ``(-1, 28, 28, 1)``, cast to float32 and divided
    by 255. Labels are passed through unchanged.

    Returns:
        A ``(train_data, test_data)`` tuple. Each element is a dict with the
        keys ``'images'`` and ``'labels'``.
    """
    def _as_model_input(raw_images):
        # Add the trailing channel axis, then scale in float32.
        return np.reshape(raw_images, (-1, 28, 28, 1)).astype(np.float32) / 255

    (train_images, train_labels), (test_images, test_labels) = mnist.load_data()
    train_data = {'images': _as_model_input(train_images), 'labels': train_labels}
    test_data = {'images': _as_model_input(test_images), 'labels': test_labels}
    return train_data, test_data

# Fetch the prepared MNIST splits; only the test split is used below.
train_data, test_data = load_mnist_data()

# Send test examples 1 through 9 to the endpoint one at a time and print the
# predicted digit next to the actual label.
for ex in range(1, 10):
    # Take one test image and give it a leading batch axis: (1, 28, 28, 1).
    example = np.expand_dims(test_data['images'][ex], axis=0)
    # The response is a dict of the form {'predictions': [[...]]}.
    predictions = mnist_predictor.predict(example)
    # predictions['predictions'][0] holds the network's softmax activations;
    # the index of the largest one is the predicted digit.
    predicted_label = np.argmax(predictions['predictions'][0])
    label = test_data['labels'][ex]
    print("Example {}: Predicted label: {}  Actual label:{}".format(ex, predicted_label, label))

Using TensorFlow backend.


Downloading data from https://s3.amazonaws.com/img-datasets/mnist.npz
Example 1: Predicted label: 2  Actual label:2
Example 2: Predicted label: 1  Actual label:1
Example 3: Predicted label: 0  Actual label:0
Example 4: Predicted label: 4  Actual label:4
Example 5: Predicted label: 1  Actual label:1
Example 6: Predicted label: 4  Actual label:4
Example 7: Predicted label: 9  Actual label:9
Example 8: Predicted label: 5  Actual label:5
Example 9: Predicted label: 9  Actual label:9
