## Load packages

In [2]:
# Import the libraries used throughout this notebook
import boto3
import re
from sagemaker import get_execution_role
from sagemaker.session import Session
from sagemaker.feature_store.feature_group import FeatureGroup
import yaml
import numpy as np                                # For matrix operations and numerical processing
import pandas as pd                               # For munging tabular data
import matplotlib.pyplot as plt                   # For charts and visualizations
from IPython.display import Image                 # For displaying images in the notebook
from IPython.display import display               # For displaying outputs in the notebook
from time import gmtime, strftime                 # For labeling SageMaker models, endpoints, etc.
import sys                                        # For writing outputs to notebook
import math                                       # For ceiling function
import json                                       # For parsing hosting outputs
import os                                         # For manipulating filepath names
import sagemaker 
import zipfile     # Amazon SageMaker's Python SDK provides many helper functions




## Load Configs

In [3]:
# AWS execution context: region of the default boto3 session and the role
# returned by get_execution_role().
region = boto3.Session().region_name
role = get_execution_role()

# S3 location for this notebook's artefacts: the SageMaker session's default
# bucket plus a fixed key prefix.
prefix = "sagemaker/xgboost_model"
bucket = sagemaker.Session().default_bucket()

# Feature group that the modelling data is queried from further down.
feature_group_name = "taxi-feature-group-no-encoding"

In [4]:
# Show the resolved default bucket name as the cell output.
bucket

'sagemaker-ap-southeast-2-949757562939'

## Get data from Feature Store

In [5]:
# Build a SageMaker session wired for Feature Store access.
boto_session = boto3.Session(region_name=region)

# The 'sagemaker' service client and the Feature Store runtime client, both
# created from the same boto3 session and region.
sagemaker_client = boto_session.client('sagemaker', region_name=region)
featurestore_runtime = boto_session.client('sagemaker-featurestore-runtime', region_name=region)

feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime,
)

# Handle on the feature group named in the config cell.
feature_group = FeatureGroup(name=feature_group_name, sagemaker_session=feature_store_session)

In [6]:
# Get an Athena query handle from the feature group and select every column
# of the table it reports.
fs_query = feature_group.athena_query()
fs_table = fs_query.table_name
query_string = f'SELECT * FROM "{fs_table}"'
print(f'Running {query_string}')

Running SELECT * FROM "taxi_feature_group_no_encoding_1693400933"


In [7]:
# Run the query through Athena with results written under the notebook prefix,
# wait for it to finish, then load the result into a pandas DataFrame.
query_output_location = f's3://{bucket}/{prefix}/fs_query_results/'
fs_query.run(query_string=query_string, output_location=query_output_location)
fs_query.wait()
model_data = fs_query.as_dataframe()

In [8]:
# Remove the identifier / timestamp / deletion-flag columns (judging by their
# names) so they are not passed on as model inputs.
# errors='ignore' keeps this cell idempotent: `model_data` is rebound to the
# reduced frame, so without it a second run of the cell raises KeyError because
# the columns are already gone.
FEATURE_STORE_METADATA_COLUMNS = ['fs_id', 'fs_time', 'write_time', 'api_invocation_time', 'is_deleted']
model_data = model_data.drop(columns=FEATURE_STORE_METADATA_COLUMNS, errors='ignore')

In [9]:
# Preview the first rows after the column drop.
model_data.head()

Unnamed: 0,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,tips,tolls,extras,trip_total,payment_type,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,start_month,start_day,start_hour,start_minute
0,1513.0,14.65,56,8,2.0,0,5,45.0,credit_card,41.792592,-87.769615,41.899602,-87.633308,1,30,23,0
1,1260.0,16.3,76,8,9.0,0,4,53.75,credit_card,41.980264,-87.913625,41.899602,-87.633308,1,30,23,0
2,1082.0,9.39,33,77,0.0,0,0,25.25,prcard,41.857184,-87.620335,41.986712,-87.663416,1,30,23,0
3,1144.0,11.97,8,40,0.0,0,0,31.0,prcard,41.899602,-87.633308,41.792357,-87.617931,1,30,23,0
4,731.0,4.33,32,6,0.0,0,0,14.25,cash,41.878866,-87.625192,41.944227,-87.655998,1,30,23,0


In [10]:
# (rows, columns) of the modelling frame.
model_data.shape

(174724, 17)

In [11]:
# Per-column dtypes and non-null counts; a count below the row total means the column has missing values.
model_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 174724 entries, 0 to 174723
Data columns (total 17 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   trip_seconds                174695 non-null  float64
 1   trip_miles                  174724 non-null  float64
 2   pickup_community_area       174724 non-null  int64  
 3   dropoff_community_area      174724 non-null  int64  
 4   tips                        174724 non-null  float64
 5   tolls                       174724 non-null  int64  
 6   extras                      174724 non-null  int64  
 7   trip_total                  174724 non-null  float64
 8   payment_type                174724 non-null  object 
 9   pickup_centroid_latitude    174724 non-null  float64
 10  pickup_centroid_longitude   174724 non-null  float64
 11  dropoff_centroid_latitude   174724 non-null  float64
 12  dropoff_centroid_longitude  174724 non-null  float64
 13  start_month   

## Create dummies

In [12]:
# One-hot encode the non-numeric columns (here: payment_type).
# The dtype is pinned explicitly because the default is version dependent:
# pandas < 2.0 produced uint8 indicator columns (the 0/1 values this notebook
# was run with), while pandas >= 2.0 defaults to bool, which `to_csv` would
# write as True/False in the headerless training files.
model_data_dum = pd.get_dummies(model_data, dtype='uint8')

In [13]:
# Per-column dtypes and non-null counts of the one-hot encoded frame.
model_data_dum.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 174724 entries, 0 to 174723
Data columns (total 23 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   trip_seconds                174695 non-null  float64
 1   trip_miles                  174724 non-null  float64
 2   pickup_community_area       174724 non-null  int64  
 3   dropoff_community_area      174724 non-null  int64  
 4   tips                        174724 non-null  float64
 5   tolls                       174724 non-null  int64  
 6   extras                      174724 non-null  int64  
 7   trip_total                  174724 non-null  float64
 8   pickup_centroid_latitude    174724 non-null  float64
 9   pickup_centroid_longitude   174724 non-null  float64
 10  dropoff_centroid_latitude   174724 non-null  float64
 11  dropoff_centroid_longitude  174724 non-null  float64
 12  start_month                 174724 non-null  int64  
 13  start_day     

In [14]:
# Lift the column display limit so every indicator column shows in the preview.
pd.options.display.max_columns = None
model_data_dum.head(n=5)

Unnamed: 0,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,tips,tolls,extras,trip_total,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,start_month,start_day,start_hour,start_minute,payment_type_cash,payment_type_credit_card,payment_type_dispute,payment_type_mobile,payment_type_no_charge,payment_type_prcard,payment_type_unknown
0,1513.0,14.65,56,8,2.0,0,5,45.0,41.792592,-87.769615,41.899602,-87.633308,1,30,23,0,0,1,0,0,0,0,0
1,1260.0,16.3,76,8,9.0,0,4,53.75,41.980264,-87.913625,41.899602,-87.633308,1,30,23,0,0,1,0,0,0,0,0
2,1082.0,9.39,33,77,0.0,0,0,25.25,41.857184,-87.620335,41.986712,-87.663416,1,30,23,0,0,0,0,0,0,1,0
3,1144.0,11.97,8,40,0.0,0,0,31.0,41.899602,-87.633308,41.792357,-87.617931,1,30,23,0,0,0,0,0,0,1,0
4,731.0,4.33,32,6,0.0,0,0,14.25,41.878866,-87.625192,41.944227,-87.655998,1,30,23,0,1,0,0,0,0,0,0


## Prepare Dataset for Training, Test, and Validation

In [15]:
# Shuffle the rows (fixed seed for reproducibility), then cut the shuffled
# frame into the first 70% (train), the next 20% (validation) and the last
# 10% (test).
# Positional slicing is used instead of np.split: np.split on a DataFrame goes
# through DataFrame.swapaxes, which newer pandas releases deprecate.
shuffled_data = model_data_dum.sample(frac=1, random_state=1729)
train_end = int(0.7 * len(shuffled_data))
validation_end = int(0.9 * len(shuffled_data))

train_data = shuffled_data.iloc[:train_end]
validation_data = shuffled_data.iloc[train_end:validation_end]
test_data = shuffled_data.iloc[validation_end:]

# Every row must land in exactly one split.
assert len(train_data) + len(validation_data) + len(test_data) == len(model_data_dum)

In [16]:
# Write the train and validation splits as headerless, index-free CSV files
# with the target (trip_total) moved to the first column.
for split_frame, csv_name in ((train_data, 'train.csv'), (validation_data, 'validation.csv')):
    target_first = pd.concat(
        [split_frame['trip_total'], split_frame.drop(columns=['trip_total'])],
        axis=1,
    )
    target_first.to_csv(csv_name, index=False, header=False)

In [17]:
# Upload the training and validation datasets to the S3 bucket.
# S3 object keys always use '/' as the separator, so the keys are built with
# explicit string formatting instead of os.path.join (which uses the host OS
# separator). A single bucket handle is reused for both uploads.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
for split_name in ('train', 'validation'):
    s3_bucket.Object(f'{prefix}/{split_name}/{split_name}.csv').upload_file(f'{split_name}.csv')

## Training

In [18]:
# Select the built-in XGBoost container image for this region.
container = sagemaker.image_uris.retrieve(region=region, framework='xgboost', version='1.7-1')

# Training and validation input channels (CSV).
# Both prefixes end with '/' so each channel only matches keys inside its own
# folder; a bare '.../train' prefix would also match sibling keys that merely
# start with 'train'.
s3_input_train = sagemaker.inputs.TrainingInput(
    s3_data=f's3://{bucket}/{prefix}/train/', content_type='csv'
)
s3_input_validation = sagemaker.inputs.TrainingInput(
    s3_data=f's3://{bucket}/{prefix}/validation/', content_type='csv'
)

sess = sagemaker.Session()

# XGBoost hyperparameters for a squared-error regression on trip_total.
hyperparameters = {
    "max_depth": "5",
    "eta": "0.2",
    "gamma": "4",
    "min_child_weight": "6",
    "subsample": "0.7",
    "objective": "reg:squarederror",
    "num_round": "500",
}

xgb = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path=f's3://{bucket}/{prefix}/output',
    sagemaker_session=sess,
    hyperparameters=hyperparameters,
)

# Launch the training job with both channels.
xgb.fit({'train': s3_input_train, 'validation': s3_input_validation})

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2023-08-31-04-17-30-930


2023-08-31 04:17:31 Starting - Starting the training job......
2023-08-31 04:18:06 Starting - Preparing the instances for training......
2023-08-31 04:19:14 Downloading - Downloading input data...
2023-08-31 04:19:39 Training - Downloading the training image...
2023-08-31 04:20:25 Training - Training image download completed. Training in progress....[34m[2023-08-31 04:20:41.831 ip-10-0-243-181.ap-southeast-2.compute.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2023-08-31 04:20:41.859 ip-10-0-243-181.ap-southeast-2.compute.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2023-08-31:04:20:42:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2023-08-31:04:20:42:INFO] Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[34mReturning the value itself[0m
[34m[2023-08-31:04:20:42:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2023-08-31:04:20:42:INFO] Running XG

## Host the model

In [19]:
# Deploy the trained estimator to an endpoint on one instance and keep the
# returned predictor handle.
endpoint_instance_type = 'ml.m4.xlarge'
xgb_predictor = xgb.deploy(initial_instance_count=1, instance_type=endpoint_instance_type)

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2023-08-31-04-22-13-849
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2023-08-31-04-22-13-849
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2023-08-31-04-22-13-849


------!

## Test the endpoint

In [20]:
# Serialize request payloads as CSV before they are sent to the endpoint.
xgb_predictor.serializer = sagemaker.serializers.CSVSerializer()

In [21]:
def predict(data, predictor, rows=500):
    """Score ``data`` with ``predictor`` in batches and return the predictions.

    Parameters
    ----------
    data : array-like
        Feature records to score. A 2-D input is treated as one record per
        row; a 1-D input is treated as a single record.
    predictor : object
        Anything with a ``predict(batch)`` method that returns the predictions
        as text (``bytes`` or ``str``), one number per record.
    rows : int, default 500
        Maximum number of records sent to ``predictor`` per call.

    Returns
    -------
    numpy.ndarray
        1-D float array with one prediction per input record, in input order.
        Empty input yields an empty array without calling ``predictor``.
    """
    records = np.asarray(data)
    if records.size == 0:
        # Nothing to score; avoid sending an empty payload to the endpoint.
        return np.array([], dtype=float)
    if records.ndim == 1:
        # A 1-D input is one feature vector, not a column of records, so it
        # must never be split along its feature axis.
        records = records.reshape(1, -1)

    # Ceiling division: exact multiples of `rows` do not get an extra batch.
    n_batches = int(np.ceil(records.shape[0] / float(rows)))

    values = []
    for batch in np.array_split(records, n_batches):
        response = predictor.predict(batch)
        if isinstance(response, bytes):
            response = response.decode('utf-8')
        # Accept comma- and/or newline-separated numbers, so the parsing does
        # not depend on which separator the serving container emits.
        values.extend(float(token) for token in response.replace(',', ' ').split())

    return np.array(values, dtype=float)

In [22]:
# Feature matrix for the held-out test split: every column except the target,
# as a NumPy array.
data = test_data.drop(columns=['trip_total']).to_numpy()

In [23]:
# Compare the endpoint's prediction with the true trip_total for the first
# 10 test rows (one endpoint call per row).
true_values = test_data["trip_total"].to_numpy()[:10]
for features, true_value in zip(data[:10], true_values):
    pred = predict(features, xgb_predictor)[0]
    print("Prediction value", pred, "and true value:", true_value)

Prediction value 9.125707626342773 and true value: 10.49
Prediction value 58.38532257080078 and true value: 58.5
Prediction value 43.28390121459961 and true value: 42.9
Prediction value 48.146820068359375 and true value: 45.0
Prediction value 40.64889907836914 and true value: 40.75
Prediction value 28.783700942993164 and true value: 28.0
Prediction value 57.06315612792969 and true value: 58.6
Prediction value 11.045339584350586 and true value: 12.0
Prediction value 48.19758987426758 and true value: 48.75
Prediction value 8.260760307312012 and true value: 8.5


## Cleanup

In [24]:
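# Uncomment and run when finished to delete the endpoint (and its endpoint config) so it stops incurring charges.
# xgb_predictor.delete_endpoint(delete_endpoint_config=True)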