In [None]:
import sagemaker
import boto3
from sagemaker import image_uris
from sagemaker.session import Session
from sagemaker.inputs import TrainingInput
from sagemaker import get_execution_role

In [None]:
# get sagemaker-execution role
# https://github.com/aws/sagemaker-python-sdk/issues/300
# https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html
def resolve_sm_role():
    """Look up the ARN of an existing SageMaker execution role through IAM.

    Walks every IAM role under path prefix ``/`` and returns the first one
    whose name starts with ``AmazonSageMaker-ExecutionRole-``. The listing is
    paginated, so the search covers all roles in the account rather than only
    the first page of results.

    Returns:
        str: ARN of the first matching role.

    Raises:
        RuntimeError: If no role with the expected name prefix is found.
    """
    iam = boto3.client('iam', region_name=boto3.Session().region_name)
    # list_roles returns results in pages; the paginator follows the
    # continuation markers so no role is skipped.
    pages = iam.get_paginator('list_roles').paginate(PathPrefix='/')
    for page in pages:
        for role in page['Roles']:
            if role['RoleName'].startswith('AmazonSageMaker-ExecutionRole-'):
                return role['Arn']
    raise RuntimeError('need to create sagemaker execution role from aws console first')

# Ask the SageMaker SDK for the execution role first; if that lookup raises
# ValueError, fall back to searching IAM with resolve_sm_role().
try:
    execution_role_ARN = get_execution_role()
except ValueError:
    execution_role_ARN = resolve_sm_role()

# display the resolved ARN as the cell output
execution_role_ARN

In [None]:
# Name of the S3 bucket used throughout the notebook
bucket = "data14group1-ml"
# SageMaker session reused by the cells below
session = Session()

### Prepare the container and parameters for model training

In [None]:
# Look up the Amazon-provided XGBoost container image for the session's region.
# The same container is used for both training and inference.
region = session.boto_region_name
xgboost_container = image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="1.7-1",
)

# display the region as the cell output
region

In [None]:
# Starting hyperparameters for the XGBoost training job (values are passed as strings)
hyperparameters = dict(
    objective='binary:logistic',
    max_depth="5",
    eta="0.1",
    gamma="4",
    min_child_weight="6",
    subsample="0.8",
    early_stopping_rounds="10",
    num_round="500",
)

# S3 location where the trained model artifact will be saved
prefix = 'xgboost'
output_path = 's3://{}/{}/output'.format(bucket, prefix)

# Estimator that runs the XGBoost container retrieved earlier
xgb = sagemaker.estimator.Estimator(
    image_uri=xgboost_container,
    role=execution_role_ARN,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    volume_size=5,  # 5 GB
    output_path=output_path,
    hyperparameters=hyperparameters,
    sagemaker_session=session,
)

print("xgb object specified!")

### Fit the xgb estimator

In [None]:
# The label column is assumed to be the first column

# Data format and S3 prefix shared by the training and validation channels
content_type = "application/x-parquet"
prefix = "data"

train_uri = "s3://{}/{}/{}/".format(bucket, prefix, "train")
validation_uri = "s3://{}/{}/{}/".format(bucket, prefix, "validation")

train_input = TrainingInput(s3_data=train_uri, content_type=content_type)
validation_input = TrainingInput(s3_data=validation_uri, content_type=content_type)

# Run the XGBoost training job with the two named channels
channels = {'train': train_input, 'validation': validation_input}
xgb.fit(channels)

print('xgb estimator training complete')

### Load a trained model from S3 bucket

In [None]:
from sagemaker.model import Model

# Folder name under s3://<bucket>/xgboost/ that contains model.tar.gz.
# Fill this in before running the cell; an empty value yields an unusable path.
model_name = ""
# Define the S3 path to the model. The placeholders must be filled explicitly;
# a plain string literal would keep the braces verbatim in the URI.
model_url = "s3://{bucket}/xgboost/{model_name}/model.tar.gz".format(
    bucket=bucket, model_name=model_name
)

# Create a SageMaker model.
# It is bound to its own name so the `xgb` estimator is not overwritten;
# the hyperparameter tuner cell still needs `xgb` to be an Estimator.
xgb_model = Model(
    model_data=model_url,
    image_uri=xgboost_container,
    role=execution_role_ARN,
    sagemaker_session=session,
)

### Create the hyperparameter tuner

Now that the base estimator has been set up, we need to construct a hyperparameter tuner object, which we will use to ask SageMaker to run a hyperparameter tuning job.

**Note:** If you don't want the hyperparameter tuning job to take too long, make sure to not set the total number of models (jobs) too high.

In [None]:
# Import the relevant objects used to construct the tuner
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner

# Search space explored by the tuning job
hyperparameter_ranges = dict(
    max_depth=IntegerParameter(3, 12),
    eta=ContinuousParameter(0.05, 0.5),
    min_child_weight=IntegerParameter(2, 8),
    subsample=ContinuousParameter(0.5, 0.9),
    gamma=ContinuousParameter(0, 10),
)

# NOTE(review): the base hyperparameters use objective 'binary:logistic';
# confirm that RMSE is the intended metric for comparing the tuned models.
xgb_hyperparameter_tuner = HyperparameterTuner(
    estimator=xgb,                            # base object for the tuning jobs
    objective_metric_name='validation:rmse',  # metric used to compare trained models
    objective_type='Minimize',                # lower metric value is better
    hyperparameter_ranges=hyperparameter_ranges,
    max_jobs=4,                               # total number of models to train
    max_parallel_jobs=2,                      # number of models trained in parallel
)

print("xgb tunner object specified!")

In [None]:
### Fit the hyperparameter tuner

# Now that the hyperparameter tuner object has been constructed, it is time to fit the various models and find the best-performing one.

In [None]:
# The label column is assumed to be the first column

# Data format and S3 prefix shared by both input channels
content_type = "application/x-parquet"
prefix = "data"

# Build the training and validation inputs from the same URI template
train_input, validation_input = (
    TrainingInput("s3://{}/{}/{}/".format(bucket, prefix, channel), content_type=content_type)
    for channel in ("train", "validation")
)

# Run the hyperparameter tuning job on the two channels
xgb_hyperparameter_tuner.fit({'train': train_input, 'validation': validation_input})

print('xgb estimator tunning complete')

### Putting our model to work 
Now that the model is trained, we can attach to the best training job and deploy it to an endpoint.

In [None]:
# Attach an estimator to the tuner's best training job
best_job_name = xgb_hyperparameter_tuner.best_training_job()
xgb_attached = sagemaker.estimator.Estimator.attach(best_job_name)

In [None]:
# Deploy the attached estimator as an endpoint on a single ml.m4.xlarge instance
xgb_predictor = xgb_attached.deploy(
    instance_type='ml.m4.xlarge',
    initial_instance_count=1,
)
print("\nxgb_predictor is ready")

In [None]:
# Client for the sagemaker-runtime service, used to invoke the endpoint
import boto3

runtime = boto3.Session().client(service_name='sagemaker-runtime')

In [None]:
# Try to invoke the endpoint and see if it works.
# The endpoint name is taken from the predictor returned by deploy();
# `endpoint_name` is not defined anywhere else in the notebook.
endpoint_name = xgb_predictor.endpoint_name

# Body contains all the input features as a single CSV row
response = runtime.invoke_endpoint(
    EndpointName=endpoint_name,  # the name of the endpoint we created
    ContentType='text/csv',      # the data format that is expected
    Body='1,6.67578125,3418,209,0.595703125,514,10.0,11,57,11,1599,0.1498791297340854,1.2884770346494763,0.22388993120700437,9.017543859649123,0.017543859649122806,46,0.02127659574468085',
)

In [None]:
# Read and decode the response: the probability that a user buys the product
payload = response['Body'].read()
payload.decode('utf-8')