# Building CPU and GPU containers for Keras-MXNet on Amazon SageMaker

In [109]:
import boto3
from sagemaker.amazon.amazon_estimator import get_image_uri
# Look up the image URI of the 'xgboost' algorithm for the region of the
# default boto3 session.
session_region = boto3.Session().region_name
container = get_image_uri(session_region, 'xgboost')



In [110]:
# Show the resolved image URI as the cell's output.
container

'475088953585.dkr.ecr.ap-southeast-1.amazonaws.com/xgboost:1'

In [111]:
import sagemaker
# One SageMaker session is shared by the later cells.
sess = sagemaker.Session()

# Region and account ID come from the session's underlying boto3 session;
# both are used further down to build the ECR image URI.
region = sess.boto_session.region_name
caller_identity = sess.boto_session.client('sts').get_caller_identity()
account = caller_identity['Account']

# Execution role handed to the training job and to the hosted model.
role = sagemaker.get_execution_role()

## Prepare files required to build the containers

In [112]:
# Print the CPU Dockerfile so its contents are visible in the notebook.
!cat Dockerfile.cpu

FROM ubuntu:16.04

RUN apt-get update && \
    apt-get -y install build-essential libopencv-dev libopenblas-dev libjemalloc-dev libgfortran3 \
    python-dev python3-dev python3-pip wget curl

COPY mnist_cnn.py /opt/program/train
RUN chmod +x /opt/program/train

RUN mkdir /root/.keras
COPY keras.json /root/.keras/

RUN pip3 install mxnet --upgrade --pre && \
    pip3 install keras-mxnet --upgrade --pre

RUN rm -rf /var/lib/apt/lists/*
RUN rm -rf /root/.cache

ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib"

ENV PATH="/opt/program:${PATH}"

WORKDIR /opt/program

In [113]:
# Print the GPU Dockerfile so its contents are visible in the notebook.
!cat Dockerfile.gpu

FROM nvidia/cuda:9.0-runtime

RUN apt-get update && \
    apt-get -y install build-essential libopencv-dev libopenblas-dev libjemalloc-dev libgfortran3 \
    python-dev python3-dev python3-pip wget curl

COPY mnist_cnn.py /opt/program/train
RUN chmod +x /opt/program/train

RUN mkdir /root/.keras
COPY keras.json /root/.keras/

RUN pip3 install mxnet-cu90 --upgrade --pre && \
    pip3 install keras-mxnet --upgrade --pre

RUN rm -rf /var/lib/apt/lists/*
RUN rm -rf /root/.cache

ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib"

ENV PATH="/opt/program:${PATH}"

WORKDIR /opt/program

In [114]:
mkdir build

mkdir: cannot create directory ‘build’: File exists


In [115]:
# Copy every Dockerfile.* variant (e.g. Dockerfile.cpu and Dockerfile.gpu)
# into build/, the directory used as the Docker build context.
!cp Dockerfile.* build/

In [116]:
# Copy training script and config file
!cp mnist_cnn.py build/
!cp keras.json build/

## Create and log in to a repository in ECR

### CPU settings

In [117]:
repo_name = 'keras-mxnet-cpu' # ECR repository
image_tag = 'keras-mxnet1.2.0-cpu-py3' # ECR image tag
base_job_name = 'keras-mxnet-mnist-cnn' # SageMaker training prefix

%env dockerfile Dockerfile.cpu

train_instance_type='ml.c5.9xlarge'
gpu_count=0
batch_size=128

env: dockerfile=Dockerfile.cpu


### GPU settings

In [118]:
repo_name = 'keras-mxnet-gpu' # ECR repository
image_tag = 'keras-mxnet1.2.0-gpu-py3' # ECR image tag
base_job_name = 'keras-mxnet-mnist-cnn' # SageMaker training prefix

%env dockerfile Dockerfile.gpu

train_instance_type='ml.p3.8xlarge'
gpu_count=2
batch_size=256

env: dockerfile=Dockerfile.gpu


### Common settings

In [119]:
%env account {account}
%env region {region}
%env repo_name {repo_name}
%env image_tag {image_tag}

env: account=349934754982
env: region=ap-southeast-1
env: repo_name=keras-mxnet-gpu
env: image_tag=keras-mxnet1.2.0-gpu-py3


### Create repository and log in

In [120]:
%%sh

# Create the ECR repository only when describing it fails, i.e. when it does
# not exist yet. Output of both commands is discarded.
if ! aws ecr describe-repositories --repository-names $repo_name > /dev/null 2>&1
then
    aws ecr create-repository --repository-name $repo_name > /dev/null
fi

# The command substitution runs whatever `aws ecr get-login` prints, which
# logs Docker in to the registry.
# NOTE(review): `get-login` is reportedly unavailable in AWS CLI v2 - confirm
# the CLI version on the notebook instance before upgrading.
$(aws ecr get-login --region $region --no-include-email)

Login Succeeded


https://docs.docker.com/engine/reference/commandline/login/#credentials-store



## Build and tag Docker image

In [121]:
%cd build
!docker build -t $image_tag -f $dockerfile .
%cd ..    

/home/ec2-user/SageMaker/dlnotebooks/keras/01-custom-container/build
Sending build context to Docker daemon  11.78kB
Step 1/12 : FROM nvidia/cuda:9.0-runtime
 ---> ed2bb7e1254e
Step 2/12 : RUN apt-get update &&     apt-get -y install build-essential libopencv-dev libopenblas-dev libjemalloc-dev libgfortran3     python-dev python3-dev python3-pip wget curl
 ---> Using cache
 ---> 5f62e5baa4e4
Step 3/12 : COPY mnist_cnn.py /opt/program/train
 ---> fc6bd3e6c937
Step 4/12 : RUN chmod +x /opt/program/train
 ---> Running in af0c21b783c4
Removing intermediate container af0c21b783c4
 ---> c7880012620f
Step 5/12 : RUN mkdir /root/.keras
 ---> Running in a4c183cd1b15
Removing intermediate container a4c183cd1b15
 ---> dcdb047a66dc
Step 6/12 : COPY keras.json /root/.keras/
 ---> 58a9aee20ca7
Step 7/12 : RUN pip3 install mxnet-cu90 --upgrade --pre &&     pip3 install keras-mxnet --upgrade --pre
 ---> Running in b3ec029776b7
Collecting mxnet-cu90
  Downloading https://files.pythonhosted.org/packages

In [122]:
!docker tag $image_tag $account.dkr.ecr.$region.amazonaws.com/$repo_name:latest

In [123]:
# List local images to check that the new image and its ECR tag are present.
!docker images

REPOSITORY                                                                       TAG                 IMAGE ID            CREATED             SIZE
349934754982.dkr.ecr.ap-southeast-1.amazonaws.com/keras-mxnet-gpu                latest              b8c27305a445        1 second ago        3.23GB
keras-mxnet1.2.0-gpu-py3                                                         latest              b8c27305a445        1 second ago        3.23GB
349934754982.dkr.ecr.ap-southeast-1.amazonaws.com/keras-mxnet-gpu                <none>              18dd6b65edb0        28 minutes ago      3.23GB
349934754982.dkr.ecr.ap-southeast-1.amazonaws.com/keras-mxnet-gpu                <none>              227bf52b892d        44 minutes ago      3.23GB
349934754982.dkr.ecr.ap-southeast-1.amazonaws.com/keras-mxnet-gpu                <none>              e12bbebc012e        About an hour ago   3.23GB
349934754982.dkr.ecr.ap-southeast-1.amazonaws.com/sagemaker-tf-cifar10-example   latest              0a41375

In [101]:
# It's probably a good idea to inspect your container before pushing it :)
# !docker run -it $image_tag /bin/bash

## Push Docker image to ECR

In [124]:
!docker push $account.dkr.ecr.$region.amazonaws.com/$repo_name:latest

The push refers to repository [349934754982.dkr.ecr.ap-southeast-1.amazonaws.com/keras-mxnet-gpu]

[1Ba5a44257: Preparing 
[1B935a6123: Preparing 
[1Bdf01a092: Preparing 
[1Beaca2436: Preparing 
[1Bb67622ce: Preparing 
[1B8ff508f2: Preparing 
[1B05ec5dbd: Preparing 
[1B2669ceef: Preparing 
[1B49db96b8: Preparing 
[1B6bc0e30f: Preparing 
[1B7f750489: Preparing 
[1Beddd58ba: Preparing 
[1Ba0c9a8cd: Preparing 
[1B91ae09b8: Preparing 
[1B8b4c3da7: Preparing 
[14Bf01a092: Pushed   1.439GB/1.435GB[16A[1K[K[14A[1K[K[14A[1K[K[14A[1K[K[13A[1K[K[14A[1K[K[11A[1K[K[9A[1K[K[8A[1K[K[5A[1K[K[14A[1K[K[2A[1K[K[14A[1K[K[11A[1K[K[14A[1K[K[10A[1K[K[14A[1K[K[14A[1K[K[14A[1K[K[14A[1K[K[14A[1K[K[14A[1K[K[14A[1K[K[14A[1K[K[14A[1K[K[14A[1K[K[14A[1K[K[14A[1K[K[14A[1K[K[14A[1K[K[14A[1K[K[14A[1K[K[14A[1K[K[14A[1K[K[14A[1K[K[14A[1K[K[14A[1K[K[14A[1K[K[14A[1K[K[14A[1K[K[14A[1K[K

## Upload MNIST data to S3

In [103]:
# Local folder holding the train/ and validation/ subfolders, and the S3 key
# prefix (under the repository name) that the data is uploaded to.
local_directory = 'data'
prefix = repo_name + '/input'

# Upload each channel; the returned values are passed to estimator.fit() later.
# No bucket is given, so the session chooses one (presumably its default bucket).
train_input_path = sess.upload_data(
    local_directory + '/train/',
    key_prefix=prefix + '/train',
)
validation_input_path = sess.upload_data(
    local_directory + '/validation/',
    key_prefix=prefix + '/validation',
)

## Train with the custom container

In [125]:
# S3 location for the training output, and the full name of the image that
# was tagged and pushed to ECR.
output_path = 's3://{}/{}/output'.format(sess.default_bucket(), repo_name)
image_name = '{}.dkr.ecr.{}.amazonaws.com/{}:latest'.format(account, region, repo_name)

print(output_path, image_name, sep='\n')

# Generic estimator that runs the custom container on a single instance.
estimator = sagemaker.estimator.Estimator(
    image_name=image_name,
    role=role,
    base_job_name=base_job_name,
    train_instance_type=train_instance_type,
    train_instance_count=1,
    output_path=output_path,
    sagemaker_session=sess,
)

# gpus and batch_size come from the CPU/GPU settings cell that was run.
hyperparameters = {'lr': 0.01, 'epochs': 10, 'gpus': gpu_count, 'batch_size': batch_size}
estimator.set_hyperparameters(**hyperparameters)

# One input channel per uploaded data set.
channels = {'training': train_input_path, 'validation': validation_input_path}
estimator.fit(channels)

INFO:sagemaker:Creating training-job with name: keras-mxnet-mnist-cnn-2019-03-13-00-39-33-681


s3://sagemaker-ap-southeast-1-349934754982/keras-mxnet-gpu/output
349934754982.dkr.ecr.ap-southeast-1.amazonaws.com/keras-mxnet-gpu:latest
2019-03-13 00:39:33 Starting - Starting the training job...
2019-03-13 00:39:35 Starting - Launching requested ML instances.........
2019-03-13 00:41:07 Starting - Preparing the instances for training...
2019-03-13 00:42:01 Downloading - Downloading input data...
2019-03-13 00:42:08 Training - Downloading the training image...
2019-03-13 00:43:02 Training - Training image download completed. Training in progress..
[31mUsing MXNet backend[0m
[31mHyper parameters: {'batch_size': '256', 'gpus': '2', 'epochs': '10', 'lr': '0.01'}[0m
[31mInput parameters: {'validation': {'TrainingInputMode': 'File', 'RecordWrapperType': 'None', 'S3DistributionType': 'FullyReplicated'}, 'training': {'TrainingInputMode': 'File', 'RecordWrapperType': 'None', 'S3DistributionType': 'FullyReplicated'}}[0m
[31mFiles loaded[0m
[31mx_train shape: (60000, 1, 28, 28)[0m



2019-03-13 00:43:34 Uploading - Uploading generated training model
2019-03-13 00:43:34 Completed - Training job completed
[31m11520/60000 [====>.........................] - ETA: 0s - loss: 0.0367 - acc: 0.9874[0m
[31mEpoch 9/10
[0m
[31m  256/60000 [..............................] - ETA: 1s - loss: 0.0174 - acc: 1.0000
 3072/60000 [>.............................] - ETA: 1s - loss: 0.0386 - acc: 0.9860
 5888/60000 [=>............................] - ETA: 1s - loss: 0.0338 - acc: 0.9886
 8704/60000 [===>..........................] - ETA: 0s - loss: 0.0327 - acc: 0.9889[0m
[31m11520/60000 [====>.........................] - ETA: 0s - loss: 0.0323 - acc: 0.9891[0m
[31mEpoch 10/10

  256/60000 [..............................] - ETA: 1s - loss: 0.0362 - acc: 0.9922
 3072/60000 [>.............................] - ETA: 1s - loss: 0.0259 - acc: 0.9912
 5888/60000 [=>............................] - ETA: 1s - loss: 0.0280 - acc: 0.9908
 8704/60000 [===>..........................] - ETA: 0s 

In [126]:
# Take the artifact location from the estimator that was just fitted instead
# of hard-coding the job name of an earlier run, which goes stale with every
# new training job.
model_path = estimator.model_data

In [None]:
!aws cp model_path .

In [107]:
from sagemaker.mxnet import MXNet, MXNetModel

# Wrap the trained artifact in an MXNetModel, served through the
# default_classifier.py entry point on Python 3.
sagemaker_model = MXNetModel(
    model_data=model_path,
    role=role,
    entry_point='default_classifier.py',
    py_version='py3',
)

In [108]:
# Host the model on a single ml.m4.xlarge endpoint.
# NOTE(review): the recorded run of this cell failed the endpoint's ping
# health check - check the CloudWatch logs for the endpoint before relying on it.
predictor = sagemaker_model.deploy(
    instance_type='ml.m4.xlarge',
    initial_instance_count=1,
)

INFO:sagemaker:Creating model with name: sagemaker-mxnet-2019-03-13-00-18-47-286
INFO:sagemaker:Creating endpoint with name sagemaker-mxnet-2019-03-13-00-18-47-286


---------------------------------------------------------------------------------------------------------------------------*

ValueError: Error hosting endpoint sagemaker-mxnet-2019-03-13-00-18-47-286: Failed Reason:  The primary container for production variant AllTraffic did not pass the ping health check. Please check CloudWatch logs for this endpoint.