In [8]:
import numpy as np
import pandas as pd

# Define IAM role
import boto3
import re
import sagemaker
from sagemaker import get_execution_role

# SageMaker SDK Documentation: http://sagemaker.readthedocs.io/en/latest/estimators.html

## Upload csv

In [4]:
# Bucket and object keys for the train / validation / test CSV files.
bucket_name = 'arch-ml-bucket-us-east-1'
training_file_key = 'biketrain/bike_train.csv'
validation_file_key = 'biketrain/bike_validation.csv'
test_file_key = 'biketrain/bike_test.csv'

# Full s3:// URIs built from the bucket name and the keys above.
s3_model_output_location = 's3://%s/biketrain/model' % bucket_name
s3_training_file_location = 's3://%s/%s' % (bucket_name, training_file_key)
s3_validation_file_location = 's3://%s/%s' % (bucket_name, validation_file_key)
s3_test_file_location = 's3://%s/%s' % (bucket_name, test_file_key)

In [5]:
# Echo each derived S3 location, one per line, as a sanity check.
for s3_location in (s3_model_output_location,
                    s3_training_file_location,
                    s3_validation_file_location,
                    s3_test_file_location):
    print(s3_location)

s3://arch-ml-bucket-us-east-1/biketrain/model
s3://arch-ml-bucket-us-east-1/biketrain/bike_train.csv
s3://arch-ml-bucket-us-east-1/biketrain/bike_validation.csv
s3://arch-ml-bucket-us-east-1/biketrain/bike_test.csv


In [4]:
# Writing to and reading from S3 is straightforward.
# Files are referred to as objects in S3.
# A file name is referred to as a key name in S3.
# Files stored in S3 are automatically replicated across 3 different availability zones 
# in the region where the bucket was created.

# http://boto3.readthedocs.io/en/latest/guide/s3.html
def write_to_s3(filename, bucket, key):
    """Upload a local file to S3 as the object ``key`` in ``bucket``.

    Args:
        filename: Path of the local file to upload.
        bucket: Name of the destination S3 bucket.
        key: Object key (the S3 "file name") to write.

    Returns:
        The result of the ``upload_fileobj`` call on the target S3 object.
    """
    # Open in binary mode so the bytes are handed to boto3 unchanged.
    with open(filename, 'rb') as local_file:
        s3_object = boto3.Session().resource('s3').Bucket(bucket).Object(key)
        return s3_object.upload_fileobj(local_file)

In [5]:
# Upload each local CSV file to its object key in the bucket.
for local_csv, s3_key in [('bike_train.csv', training_file_key),
                          ('bike_validation.csv', validation_file_key),
                          ('bike_test.csv', test_file_key)]:
    write_to_s3(local_csv, bucket_name, s3_key)

## Get Training Algorithm Docker Image

In [10]:
# Registry paths for algorithms provided by SageMaker:
#  https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html
# Each listed region uses a different registry account for the XGBoost image.
xgboost_registry_accounts = [('us-west-2', '433757028032'),
                             ('us-east-1', '811284229777'),
                             ('us-east-2', '825641698319'),
                             ('eu-west-1', '685385470294')]

# Maps region name -> full image path of the XGBoost training container.
containers = {
    region: '{0}.dkr.ecr.{1}.amazonaws.com/xgboost:latest'.format(account, region)
    for region, account in xgboost_registry_accounts
}

In [11]:
# IAM role of this notebook's execution environment, looked up through the SageMaker SDK.
role = sagemaker.get_execution_role()

In [12]:
# This role contains the permissions needed to train and deploy models.
# The SageMaker service is trusted to assume this role.
# NOTE(review): the printed ARN includes the AWS account ID; consider clearing
# this cell's output before sharing the notebook.
print(role)

arn:aws:iam::497576017422:role/service-role/AmazonSageMaker-ExecutionRole-20190426T115051


## Build Model

In [9]:
# SageMaker session object; it is handed to the Estimator via sagemaker_session.
sess = sagemaker.Session()

In [10]:
# Configure the training job:
#  - the algorithm container image registered for the current region
#  - how many instances to use for training and what machine type
#  - the S3 location where the trained model artifacts are stored
#  - optionally, a base_job_name used to name the training job
#   Reference: http://sagemaker.readthedocs.io/en/latest/estimators.html

training_image = containers[boto3.Session().region_name]
estimator_settings = dict(
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    output_path=s3_model_output_location,
    sagemaker_session=sess,
    base_job_name='xgboost-biketrain-v1',
)
estimator = sagemaker.estimator.Estimator(training_image, role, **estimator_settings)

In [13]:
# Hyperparameters that are appropriate for the training algorithm.
# XGBoost Training Parameter Reference:
#   https://github.com/dmlc/xgboost/blob/master/doc/parameter.md

# Same as those in localmode
xgboost_hyperparameters = {
    'max_depth': 3,
    'objective': "reg:linear",
    'eta': 0.3,
    'subsample': 1,
    'num_round': 100,
}
estimator.set_hyperparameters(**xgboost_hyperparameters)

In [14]:
# Display the hyperparameters currently set on the estimator.
estimator.hyperparameters()

{'max_depth': 3,
 'objective': 'reg:linear',
 'eta': 0.3,
 'subsample': 1,
 'num_round': 100}

## Train

In [15]:
# XGBoost accepts libsvm or csv content; both channels here are csv.
training_input_config, validation_input_config = (
    sagemaker.session.s3_input(s3_data=csv_location, content_type="csv")
    for csv_location in (s3_training_file_location, s3_validation_file_location)
)

In [16]:
# Show the channel configuration generated for each input.
for input_config in (training_input_config, validation_input_config):
    print(input_config.config)

{'DataSource': {'S3DataSource': {'S3DataDistributionType': 'FullyReplicated', 'S3DataType': 'S3Prefix', 'S3Uri': 's3://arch-ml-bucket-us-east-1/biketrain/bike_train.csv'}}, 'ContentType': 'csv'}
{'DataSource': {'S3DataSource': {'S3DataDistributionType': 'FullyReplicated', 'S3DataType': 'S3Prefix', 'S3Uri': 's3://arch-ml-bucket-us-east-1/biketrain/bike_validation.csv'}}, 'ContentType': 'csv'}


In [17]:
# XGBoost supports the "train" and "validation" channels.
# Reference: Supported channels by algorithm
#   https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html
# NOTE(review): the link above is the registry-paths page; confirm it is the
# intended reference for supported channels.
data_channels = {'train': training_input_config,
                 'validation': validation_input_config}
estimator.fit(data_channels)

INFO:sagemaker:Creating training-job with name: xgboost-biketrain-v1-2019-04-28-04-24-39-638


2019-04-28 04:24:39 Starting - Starting the training job...
2019-04-28 04:24:41 Starting - Launching requested ML instances......
2019-04-28 04:25:49 Starting - Preparing the instances for training......
2019-04-28 04:27:05 Downloading - Downloading input data
2019-04-28 04:27:05 Training - Downloading the training image..
[31mArguments: train[0m
[31m[2019-04-28:04:27:16:INFO] Running standalone xgboost training.[0m
[31m[2019-04-28:04:27:16:INFO] File size need to be processed in the node: 0.5mb. Available memory size in the node: 8405.0mb[0m
[31m[2019-04-28:04:27:16:INFO] Determined delimiter of CSV input is ','[0m
[31m[04:27:16] S3DistributionType set as FullyReplicated[0m
[31m[04:27:16] 7620x13 matrix with 99060 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[31m[2019-04-28:04:27:16:INFO] Determined delimiter of CSV input is ','[0m
[31m[04:27:16] S3DistributionType set as FullyReplicated[0m
[31m[04:27:16] 3266x13 matrix with 42


2019-04-28 04:27:29 Uploading - Uploading generated training model
2019-04-28 04:27:29 Completed - Training job completed
Billable seconds: 40


## Deploy Model

In [2]:
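# Alternative (left disabled): deploy straight from the fitted estimator
# instead of rebuilding the model from its S3 artifact.
# Ref: http://sagemaker.readthedocs.io/en/latest/estimators.html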
# predictor = estimator.deploy(initial_instance_count=1,
#                              instance_type='ml.t2.medium',
#                              endpoint_name = 'xgboost-biketrain-v1')

In [27]:
# Rebuild a deployable model from the artifact written by an earlier training job.
# The job name is the one reported in the training log. The artifact location is
# derived from s3_model_output_location so the bucket and prefix are defined in
# one place, and it is passed as an s3:// URI like every other location here.
training_job_name = 'xgboost-biketrain-v1-2019-04-28-04-24-39-638'
model_artifact_location = '{0}/{1}/output/model.tar.gz'.format(s3_model_output_location,
                                                                training_job_name)

# Use the same region-specific XGBoost image and execution role as for training.
model = sagemaker.model.Model(model_data=model_artifact_location,
                              image=containers[boto3.Session().region_name],
                              role=role)

In [28]:
# Create the model and an endpoint backed by a single ml.t2.medium instance.
# No endpoint name is passed here; the log output shows the name that was used.
model.deploy(initial_instance_count=1, instance_type='ml.t2.medium')

INFO:sagemaker:Creating model with name: xgboost-2019-04-28-14-17-01-673
INFO:sagemaker:Creating endpoint with name xgboost-2019-04-28-14-17-01-673


---------------------------------------------------------------------------------------------------!

## Run Predictions

In [30]:
# Endpoint name copied from the deploy log output.
# NOTE(review): the name looks auto-generated with a timestamp, so it likely
# changes on every redeploy and would need updating here — confirm.
endpoint_name = 'xgboost-2019-04-28-14-17-01-673'
predictor = sagemaker.predictor.RealTimePredictor(endpoint=endpoint_name)

In [31]:
# NOTE(review): json_deserializer is imported but not used in the cells shown here.
from sagemaker.predictor import csv_serializer, json_deserializer

# Send request payloads to the endpoint as CSV text.
predictor.content_type = 'text/csv'
predictor.serializer = csv_serializer
# No deserializer is set; the prediction below comes back as raw bytes.
predictor.deserializer = None

In [32]:
# Predict for a single observation with 12 feature values (the training matrix
# had 13 columns with the label in column 0).
# presumably the features follow the training CSV's column order — TODO confirm
predictor.predict([[3,0,1,2,28.7,33.335,79,12.998,2011,7,7,3]])

b'32.7826042175'