# SageMaker's XGBoost Built-in Algorithm on AWS

In [1]:
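# Install or upgrade the SageMaker SDK and boto3 if you don't already have them.
# Use %pip (not !pip) so the packages are installed into this kernel's environment.
#%pip install --upgrade sagemaker
#%pip install --upgrade boto3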

## Remember these four steps:
1) Upload Train and Validation files to S3

2) Specify Algorithm and Hyperparameters

3) Configure type of server and number of servers to use for Training

4) Create a real-time Endpoint for interactive use cases

## Import required libraries

In [1]:
# Import required libraries
import numpy as np
import pandas as pd

import boto3
import re

import sagemaker
from sagemaker import get_execution_role
# SageMaker SDK Documentation: http://sagemaker.readthedocs.io/en/latest/estimators.html

In [2]:
# Show the installed SageMaker SDK version (the cells below use the SDK 2.x API)
sagemaker.__version__

'2.42.0'

## Upload Data to S3

In [3]:
# Replace this with the name of your own S3 bucket
bucket_name = 'aws-ml-test-nsadawi'

# Key prefixes (S3 "folders") for each dataset split. Keep the trailing slash:
# file names are appended directly to these prefixes when uploading.
training_folder = 'bikerental/training/'
validation_folder = 'bikerental/validation/'
test_folder = 'bikerental/test/'

# Full S3 URIs for the model artifacts and for each dataset split
s3_model_output_location = f's3://{bucket_name}/bikerental/model'
s3_training_file_location = f's3://{bucket_name}/{training_folder}'
s3_validation_file_location = f's3://{bucket_name}/{validation_folder}'
s3_test_file_location = f's3://{bucket_name}/{test_folder}'

In [4]:
# Show the resolved S3 locations, one per line
print(s3_model_output_location,
      s3_training_file_location,
      s3_validation_file_location,
      s3_test_file_location,
      sep='\n')

s3://aws-ml-test-nsadawi/bikerental/model
s3://aws-ml-test-nsadawi/bikerental/training/
s3://aws-ml-test-nsadawi/bikerental/validation/
s3://aws-ml-test-nsadawi/bikerental/test/


In [5]:
# Writing to and reading from S3 is just as easy.
# Files are referred to as objects in S3.
# The file name is referred to as the key name in S3.

# A file stored in S3 is automatically replicated across 3 different availability zones
# in the region where the bucket was created.

# http://boto3.readthedocs.io/en/latest/guide/s3.html
def write_to_s3(filename, bucket, key):
    """Upload a local file to an S3 bucket.

    Args:
        filename: Path of the local file to upload.
        bucket: Name of the destination S3 bucket.
        key: Object key (S3 "file name") to store the file under.

    Returns:
        The return value of the S3 object's ``upload_fileobj`` call.
    """
    # Open in binary mode so the bytes are uploaded exactly as stored on disk
    with open(filename, 'rb') as source_file:
        s3_object = boto3.Session().resource('s3').Bucket(bucket).Object(key)
        return s3_object.upload_fileobj(source_file)

In [6]:
# Upload each dataset split to its own prefix in the bucket
write_to_s3(filename='../Data/bike_train.csv',
            bucket=bucket_name,
            key=training_folder + 'bike_train.csv')

write_to_s3(filename='../Data/bike_validation.csv',
            bucket=bucket_name,
            key=validation_folder + 'bike_validation.csv')

write_to_s3(filename='../Data/bike_test.csv',
            bucket=bucket_name,
            key=test_folder + 'bike_test.csv')

## Training Algorithm Docker Image
### SageMaker maintains a separate image for each algorithm and region
https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html

In [7]:
# Establish a SageMaker session with AWS; its boto_region_name is used below
# to pick the region-specific algorithm image, and it is passed to the Estimator
sess = sagemaker.Session()

#### Important to use an IAM Role
https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html

In [8]:
# Resolve the ARN of the IAM role that is passed to the Estimator below.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    # Fallback when get_execution_role() raises ValueError (presumably when the
    # notebook runs outside a SageMaker-managed environment — TODO confirm):
    # look the execution role up by name through the IAM API instead.
    iam = boto3.client('iam')
    # Replace RoleName with a SageMaker execution role from your own account.
    # Expected ARN: arn:aws:iam::479320215787:role/service-role/AmazonSageMaker-ExecutionRole-20210306T134306
    role = iam.get_role(RoleName='AmazonSageMaker-ExecutionRole-20210306T134306')['Role']['Arn']

Couldn't call 'get_role' to get Role ARN from role name noureddin to get Role path.


In [9]:
# This role contains the permissions needed to train and deploy models.
# The SageMaker service is trusted to assume this role.
print(role)

arn:aws:iam::479320215787:role/service-role/AmazonSageMaker-ExecutionRole-20210306T134306


In [10]:
# Docs: https://sagemaker.readthedocs.io/en/stable/api/utility/image_uris.html#sagemaker.image_uris.retrieve

# In SDK 2.x, image_uris.retrieve looks up the container image location
# for the requested algorithm, region and version.

# Use the XGBoost 1.2 container
container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=sess.boto_region_name,
    version="1.2-1",
)

print(f'Using XGBoost Container {container}')

Using XGBoost Container 683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.2-1


## Build Model

In [11]:
# Configure the training job:
#   - the algorithm container image and the IAM role to run it with
#   - the number and type of instances to train on
#   - the S3 location where the final model artifacts are stored
#
# Reference: http://sagemaker.readthedocs.io/en/latest/estimators.html

# SDK 2.x names these arguments instance_count / instance_type (no "train" prefix)
estimator = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path=s3_model_output_location,
    sagemaker_session=sess,
    base_job_name='xgboost-bikerental-trainingjob',
)

In [12]:
# Specify hyperparameters that are appropriate for the training algorithm
# XGBoost Training Parameter Reference
#  https://github.com/dmlc/xgboost/blob/master/doc/parameter.rst#learning-task-parameters

# NOTE: XGBoost has deprecated the reg:linear objective; reg:squarederror is used here instead
estimator.set_hyperparameters(max_depth=5,
                              objective="reg:squarederror",
                              eta=0.1,
                              num_round=150)

In [13]:
# Display the hyperparameters currently set on the estimator
estimator.hyperparameters()

{'max_depth': 5, 'objective': 'reg:squarederror', 'eta': 0.1, 'num_round': 150}

### Specify Training Data Location and Optionally, Validation Data Location

In [14]:
# XGBoost accepts libsvm or csv content; both channels here read CSV files
# from an S3 prefix
training_input_config = sagemaker.session.TrainingInput(
    s3_data=s3_training_file_location,
    s3_data_type='S3Prefix',
    content_type='csv',
)

validation_input_config = sagemaker.session.TrainingInput(
    s3_data=s3_validation_file_location,
    s3_data_type='S3Prefix',
    content_type='csv',
)

# Channel name -> input configuration
data_channels = dict(train=training_input_config, validation=validation_input_config)

In [15]:
# Show the channel configuration built for each input, one per line
print(training_input_config.config,
      validation_input_config.config,
      sep='\n')

{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://aws-ml-test-nsadawi/bikerental/training/', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'csv'}
{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://aws-ml-test-nsadawi/bikerental/validation/', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'csv'}


### Train the model (takes a few minutes)

In [16]:
%%time
# Start the training job; the keys of data_channels ("train", "validation")
# are the channel names handed to the algorithm.
# XGBoost supports "train", "validation" channels
# Reference: Supported channels by algorithm
#   https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html
# NOTE(review): the URL above is the Docker registry paths page, not a channel
# reference — confirm the intended link.
estimator.fit(data_channels)

2022-05-23 10:39:07 Starting - Starting the training job...
2022-05-23 10:39:34 Starting - Preparing the instances for trainingProfilerReport-1653302346: InProgress
.........
2022-05-23 10:41:26 Downloading - Downloading input data...
2022-05-23 10:42:07 Training - Downloading the training image......
2022-05-23 10:43:14 Training - Training image download completed. Training in progress..[34m[2022-05-23 10:43:18.555 ip-10-2-139-106.ec2.internal:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Det


2022-05-23 10:43:47 Uploading - Uploading generated training model
2022-05-23 10:43:47 Completed - Training job completed
Training seconds: 142
Billable seconds: 142
CPU times: user 1.1 s, sys: 134 ms, total: 1.23 s
Wall time: 5min 8s


## Deploy Model (takes a few minutes)

In [17]:
%%time
# Deploy the trained model behind a real-time endpoint on a single instance.
# Ref: http://sagemaker.readthedocs.io/en/latest/estimators.html
# NOTE(review): endpoint_name is fixed, so re-running this cell while an endpoint
# with this name already exists will presumably fail — confirm.
predictor = estimator.deploy(initial_instance_count=1,
                             instance_type='ml.m4.xlarge',
                             endpoint_name = 'xgboost-may22')

-------!CPU times: user 293 ms, sys: 46.3 ms, total: 340 ms
Wall time: 3min 38s


## Make Predictions

In [18]:
# SDK 2.0 serializers
from sagemaker.serializers import CSVSerializer

In [19]:
# Serialize request payloads with the SDK's CSVSerializer
predictor.serializer = CSVSerializer()

In [20]:
# Request a prediction for a single row of 12 feature values.
# The endpoint's response is returned as raw bytes.
predictor.predict([[3,0,1,2,28.7,33.335,79,12.998,2011,7,7,3]])

b'3.7716636657714844'

## Summary

1. Ensure Training, Test and Validation data are in S3 Bucket
2. Select Algorithm Container Registry Path - Path varies by region
3. Configure Estimator for training - Specify Algorithm container, instance count, instance type, model output location
4. Specify algorithm-specific hyperparameters
5. Train model
6. Deploy model - Specify instance count, instance type and endpoint name
7. Make Predictions

## What if the Endpoint is Already Up and Running?