<h1>XGBoost Cloud Prediction Invocation Template</h1>
<h4>Invoke SageMaker Prediction Service</h4>

In [2]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import os

import boto3
import re
from sagemaker import get_execution_role
import sagemaker

# SDK 2 serializers and deserializers
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

This Stackoverflow answer is useful:
https://stackoverflow.com/a/51086736

In [3]:
# SDK 2
# RealTimePredictor renamed to Predictor
# https://sagemaker.readthedocs.io/en/stable/v2.html

# Create a predictor and point to an existing endpoint
endpoint_name = 'xgboost-course-feb22'
predictor = sagemaker.predictor.Predictor(endpoint_name=endpoint_name)

In [4]:
predictor.serializer = CSVSerializer()

In [5]:
df_all = pd.read_csv('../Data/bike_test.csv')

In [6]:
df_all.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,dayofweek,hour
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027,2011,1,20,3,0
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0,2011,1,20,3,1
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0,2011,1,20,3,2
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014,2011,1,20,3,3
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014,2011,1,20,3,4


In [7]:
df_all.columns[1:]

Index(['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp',
       'humidity', 'windspeed', 'year', 'month', 'day', 'dayofweek', 'hour'],
      dtype='object')

In [8]:
# Need to pass an array to the prediction
# can pass a numpy array or a list of values [[19,1],[20,1]]
arr_test = df_all[df_all.columns[1:]].values

In [9]:
type(arr_test)

numpy.ndarray

In [10]:
arr_test.shape

(6493, 13)

In [11]:
arr_test[:5]

array([[1.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00, 1.06600e+01,
        1.13650e+01, 5.60000e+01, 2.60027e+01, 2.01100e+03, 1.00000e+00,
        2.00000e+01, 3.00000e+00, 0.00000e+00],
       [1.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00, 1.06600e+01,
        1.36350e+01, 5.60000e+01, 0.00000e+00, 2.01100e+03, 1.00000e+00,
        2.00000e+01, 3.00000e+00, 1.00000e+00],
       [1.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00, 1.06600e+01,
        1.36350e+01, 5.60000e+01, 0.00000e+00, 2.01100e+03, 1.00000e+00,
        2.00000e+01, 3.00000e+00, 2.00000e+00],
       [1.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00, 1.06600e+01,
        1.28800e+01, 5.60000e+01, 1.10014e+01, 2.01100e+03, 1.00000e+00,
        2.00000e+01, 3.00000e+00, 3.00000e+00],
       [1.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00, 1.06600e+01,
        1.28800e+01, 5.60000e+01, 1.10014e+01, 2.01100e+03, 1.00000e+00,
        2.00000e+01, 3.00000e+00, 4.00000e+00]])

In [12]:
result = predictor.predict(arr_test[:2])

In [13]:
result

b'2.589771270751953,1.8368737697601318'

In [14]:
arr_test.shape

(6493, 13)

### Split the input data into chunks
There are thousands of rows in this data set for which need inference.  
When communicating over internet, it is a good idea to split the data into chunks to prevent payload and timeout error

In [15]:
# For large number of predictions, we can split the input data and
# Query the prediction service.
# array_split is convenient to specify how many splits are needed
predictions = []
for arr in np.array_split(arr_test,10):
    result = predictor.predict(arr)
    result = result.decode("utf-8")
    result = result.split(',')
    print (arr.shape)
    predictions += [float(r) for r in result]

(650, 13)
(650, 13)
(650, 13)
(649, 13)
(649, 13)
(649, 13)
(649, 13)
(649, 13)
(649, 13)
(649, 13)


In [16]:
len(predictions)

6493

In [17]:
np.expm1(predictions)

array([ 12.32672304,   5.27688457,   3.47940409, ..., 135.70009781,
       108.74114976,  53.34338759])

In [18]:
df_all['count'] = np.expm1(predictions)

In [19]:
df_all.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,dayofweek,hour,count
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027,2011,1,20,3,0,12.326723
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0,2011,1,20,3,1,5.276885
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0,2011,1,20,3,2,3.479404
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014,2011,1,20,3,3,2.230289
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014,2011,1,20,3,4,2.116289


In [20]:
df_all[['datetime','count']].to_csv('../Data/predicted_count_cloud.csv',index=False)

In [None]:
# Delete Endpoint to prevent unnecessary charges
predictor.delete_endpoint()

In [21]:
predictions

[2.589771270751953,
 1.8368737697601318,
 1.4994900226593018,
 1.1725716590881348,
 1.1366429328918457,
 2.2046689987182617,
 3.3498733043670654,
 4.354689598083496,
 5.088196754455566,
 4.733828067779541,
 3.9883992671966553,
 4.017438888549805,
 4.217536926269531,
 4.188163757324219,
 4.151099681854248,
 4.248010635375977,
 4.482417106628418,
 5.07808256149292,
 5.0302886962890625,
 4.635706901550293,
 4.338649749755859,
 4.066508769989014,
 3.499997138977051,
 3.1057233810424805,
 2.686417579650879,
 2.1052627563476562,
 1.725441813468933,
 0.9196416139602661,
 1.1633021831512451,
 1.7262060642242432,
 3.3151869773864746,
 4.058823585510254,
 4.961440563201904,
 4.46460485458374,
 3.7753257751464844,
 3.9365196228027344,
 4.018100738525391,
 3.9153459072113037,
 3.931674003601074,
 3.9988858699798584,
 4.2511420249938965,
 4.658257961273193,
 4.59789514541626,
 4.235307693481445,
 3.885457992553711,
 3.5398011207580566,
 3.2939951419830322,
 3.1006083488464355,
 2.9626595973968506,
