In [4]:
# Boto3 SageMaker Invoke Endpoint
# This example shows how to invoke SageMaker Endpoint from outside of AWS environment using Boto3 SDK
# Boto is the Amazon Web Services (AWS) SDK for Python
# https://boto3.amazonaws.com/v1/documentation/api/latest/index.html

# Common Data Formats
# https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-inference.html

# Endpoint: XGBoost - Kaggle Bike Rental - Regressor Trained in XGBoost Lectures
# Makesure Endpoint is deployed before running this example
# 
# Reference:
#  https://github.com/awslabs/amazon-sagemaker-examples

# NOTE: SageMaker SDK now requires additional permissions DescribeEndpoint, DescribeEndpointConfig in-addition to InvokeEndpoint
#   boto3 SDK requires just InvokeEndpoint permission.
#   Please update SageMakerInvokeEndpoint permissions to reflect this policy document:
#   Logon with my_admin account and update permissions (IAM->Policies->SageMakerInvokeEndpoint->Edit Policy)
#   
{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Sid": "VisualEditor0",
            "Effect": "Allow",
            "Action": [
                "sagemaker:DescribeEndpointConfig",
                "sagemaker:DescribeEndpoint",
                "sagemaker:InvokeEndpoint"
            ],
            "Resource": "*"
        }
    ]
}

{'Version': '2012-10-17',
 'Statement': [{'Sid': 'VisualEditor0',
   'Effect': 'Allow',
   'Action': ['sagemaker:DescribeEndpointConfig',
    'sagemaker:DescribeEndpoint',
    'sagemaker:InvokeEndpoint'],
   'Resource': '*'}]}

In [5]:
import boto3
import math
import dateutil
import json

In [21]:
# Establish a session with AWS
# Specify credentials and region to be used for this session.
# We will use a ml_user_predict credentials that has limited privileges
boto_session = boto3.Session(profile_name='user_tmd',region_name='us-east-1')

In [22]:
# Acquire a SageMaker Runtime client for us-east-1 region
client = boto_session.client(service_name='sagemaker-runtime',region_name='us-east-1')

In [23]:
# Specify Your Endpoint Name
endpoint_name = 'xgboost-biketrain-v1'

In [24]:
# Raw Data
#datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
# Actual=562
sample_one = ['2012-12-19 17:00:00',4,0,1,1,16.4,20.455,50,26.0027]
# Actual=569
sample_two = ['2012-12-19 18:00:00',4,0,1,1,15.58,19.695,50,23.9994]
# Actual=4
sample_three = ['2012-12-10 01:00:00',4,0,1,2,14.76,18.94,100,0]

In [25]:
# Single Observation
request = {
    "instances": [
        # First instance.
        {
            "features": sample_one
        }
    ]
}

In [26]:
print(json.dumps(request,indent=2))

{
  "instances": [
    {
      "features": [
        "2012-12-19 17:00:00",
        4,
        0,
        1,
        1,
        16.4,
        20.455,
        50,
        26.0027
      ]
    }
  ]
}


In [27]:
# Multiple Observations
request = {
    "instances": [
        # First instance.
        {
            "features": sample_one
        },
        # Second instance.
        {
            "features": sample_two
        },
        # Third instance.
        {
            "features": sample_three
        }
    ]
}

In [28]:
print(json.dumps(request,indent=2))

{
  "instances": [
    {
      "features": [
        "2012-12-19 17:00:00",
        4,
        0,
        1,
        1,
        16.4,
        20.455,
        50,
        26.0027
      ]
    },
    {
      "features": [
        "2012-12-19 18:00:00",
        4,
        0,
        1,
        1,
        15.58,
        19.695,
        50,
        23.9994
      ]
    },
    {
      "features": [
        "2012-12-10 01:00:00",
        4,
        0,
        1,
        2,
        14.76,
        18.94,
        100,
        0
      ]
    }
  ]
}


In [29]:
# Raw Data Structure: 
# datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count

# Model expects data in this format (it was trained with these features):
# season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,dayofweek,hour

def transform_data(data):
    features = data.copy()
    # Extract year, month, day, dayofweek, hour
    dt = dateutil.parser.parse(features[0])

    features.append(dt.year)
    features.append(dt.month)
    features.append(dt.day)
    features.append(dt.weekday())
    features.append(dt.hour)
    
    # Return the transformed data. skip datetime field
    return ','.join([str(feature) for feature in features[1:]])

In [30]:
print('Raw Data:\n',sample_one)
print('Transformed Data:\n',transform_data(sample_one))

Raw Data:
 ['2012-12-19 17:00:00', 4, 0, 1, 1, 16.4, 20.455, 50, 26.0027]
Transformed Data:
 4,0,1,1,16.4,20.455,50,26.0027,2012,12,19,2,17


In [31]:
# Single with error
request = {
    "instances": [
        # First instance.
        {
            "features": ["hi there",0,2]
        }
    ]
}

In [32]:
try:
    transformed_data = [transform_data(instance['features']) for instance in request["instances"]]
except Exception as err:
    print('Error when transforming: {0}'.format(err))

Error when transforming: Unknown string format: hi there


In [33]:
# Single Observation
request = {
    "instances": [
        # First instance.
        {
            "features": sample_one
        }
    ]
}

In [34]:
# Let's invoke prediction now
result = client.invoke_endpoint(EndpointName=endpoint_name, 
                       Body=transform_data(request['instances'][0]['features']).encode('utf-8'),
                       ContentType='text/csv')

In [35]:
result = result['Body'].read().decode('utf-8')

In [36]:
# Model was trained with log1p(count)
# So, we need to apply inverse transformation to get the actual count
# Predicted Count looks much better now
print ('Predicted Count', math.expm1(float(result)))

Predicted Count 1.329240521840346e+249


In [37]:
# Multiple Observations
request = {
    "instances": [
        # First instance.
        {
            "features": sample_one
        },
        # Second instance.
        {
            "features": sample_two
        },
        # Third instance.
        {
            "features": sample_three
        }
    ]
}

In [38]:
for instance in request["instances"]:
    print(instance)
    print('Transformed:')
    print(' ', transform_data(instance['features']))

{'features': ['2012-12-19 17:00:00', 4, 0, 1, 1, 16.4, 20.455, 50, 26.0027]}
Transformed:
  4,0,1,1,16.4,20.455,50,26.0027,2012,12,19,2,17
{'features': ['2012-12-19 18:00:00', 4, 0, 1, 1, 15.58, 19.695, 50, 23.9994]}
Transformed:
  4,0,1,1,15.58,19.695,50,23.9994,2012,12,19,2,18
{'features': ['2012-12-10 01:00:00', 4, 0, 1, 2, 14.76, 18.94, 100, 0]}
Transformed:
  4,0,1,2,14.76,18.94,100,0,2012,12,10,0,1


In [39]:
# XGBoost accepts data in CSV. It does not support JSON.
# So, we need to submit the request in CSV format
# Prediction for multiple observations in the same call
result = client.invoke_endpoint(EndpointName=endpoint_name, 
                       Body=('\n'.join(
                           [transform_data(instance['features']) 
                                for instance in request["instances"]]).encode('utf-8')),
                       ContentType='text/csv')

In [40]:
result = result['Body'].read().decode('utf-8')

In [41]:
result = result.split(',')
predictions = [math.expm1(float(r)) for r in result]

ValueError: could not convert string to float: '573.6282958984375\n547.5216064453125\n10.423816680908203\n'

In [42]:
predictions

NameError: name 'predictions' is not defined