## Load packages

In [2]:
# Import the AWS / SageMaker SDKs and supporting libraries
import boto3
import re
from sagemaker import get_execution_role
from sagemaker.session import Session
from sagemaker.feature_store.feature_group import FeatureGroup
import yaml
import numpy as np                                # For matrix operations and numerical processing
import pandas as pd                               # For munging tabular data
import matplotlib.pyplot as plt                   # For charts and visualizations
from IPython.display import Image                 # For displaying images in the notebook
from IPython.display import display               # For displaying outputs in the notebook
from time import gmtime, strftime                 # For labeling SageMaker models, endpoints, etc.
import sys                                        # For writing outputs to notebook
import math                                       # For ceiling function
import json                                       # For parsing hosting outputs
import os                                         # For manipulating filepath names
import sagemaker                                  # Amazon SageMaker's Python SDK provides many helper functions
import zipfile                                    # For zip archives — NOTE(review): not used in the visible cells




## Load Configs

In [7]:
# Storage: the session's default SageMaker bucket and the key prefix for this model's files
prefix = "sagemaker/xgboost_model"
bucket = sagemaker.Session().default_bucket()

# Name of the Feature Store feature group to read from
feature_group_name = "taxi-feature-group"

# Execution context: region of the default boto3 session and the notebook's IAM execution role
region = boto3.Session().region_name
role = get_execution_role()

In [8]:
# Display the resolved default bucket name.
# NOTE(review): the displayed name appears to embed the AWS account ID — consider clearing this output before sharing.
bucket

'sagemaker-ap-southeast-2-949757562939'

## Get data from Feature Store

In [9]:
# Open one boto3 session in the configured region and derive both service clients from it:
# the SageMaker control-plane client and the Feature Store runtime client.
boto_session = boto3.Session(region_name=region)
sagemaker_client = boto_session.client('sagemaker', region_name=region)
featurestore_runtime = boto_session.client('sagemaker-featurestore-runtime', region_name=region)

# Bundle the session and its clients into a SageMaker Session for Feature Store calls
feature_store_session = Session(boto_session=boto_session,
                                sagemaker_client=sagemaker_client,
                                sagemaker_featurestore_runtime_client=featurestore_runtime)

# Handle to the existing feature group, bound to that session
feature_group = FeatureGroup(name=feature_group_name,
                             sagemaker_session=feature_store_session)

In [10]:
# Compose the Athena SQL statement that selects every column of the feature group's table
fs_query = feature_group.athena_query()
fs_table = fs_query.table_name
query_string = f'SELECT * FROM "{fs_table}"'
print(f'Running {query_string}')

Running SELECT * FROM "taxi_feature_group_1692861713"


In [11]:
# Execute the query on Athena, writing its result files under the model prefix in S3,
# then wait for completion and read the result into a pandas DataFrame.
query_output_uri = f's3://{bucket}/{prefix}/fs_query_results/'
fs_query.run(query_string=query_string, output_location=query_output_uri)
fs_query.wait()
model_data = fs_query.as_dataframe()

In [12]:
# Remove the columns that are not model inputs (presumably the Feature Store record id,
# event time and offline-store metadata — confirm against the feature group definition).
# errors='ignore' keeps this cell re-runnable: it reassigns model_data from itself, so a
# second run would otherwise raise KeyError because the columns are already gone.
model_data = model_data.drop(
    columns=['fs_id', 'fs_time', 'write_time', 'api_invocation_time', 'is_deleted'],
    errors='ignore',
)

In [13]:
# Preview the first rows of the remaining columns
model_data.head()

Unnamed: 0,trip_seconds,trip_miles,tips,tolls,extras,trip_total,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,...,dropoff_community_area_65,dropoff_community_area_62,dropoff_community_area_67,dropoff_community_area_48,dropoff_community_area_58,dropoff_community_area_52,dropoff_community_area_18,dropoff_community_area_74,dropoff_community_area_63,dropoff_community_area_47
0,1470.0,18.8,0.0,0,6,52.25,41.980264,-87.913625,41.857184,-87.620335,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1406.0,12.5,0.0,0,0,32.25,41.944227,-87.655998,41.79409,-87.592311,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1462.0,12.57,9.31,0,4,46.56,41.785999,-87.750934,41.884987,-87.620993,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,660.0,0.1,0.0,0,1,9.75,41.899602,-87.633308,41.878866,-87.625192,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1241.0,11.82,0.0,0,0,30.75,41.842076,-87.633973,41.745758,-87.708366,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# (rows, columns) of the dataset that will be split below
model_data.shape

(174724, 178)

## Prepare Dataset for Training, Test, and Validation

In [16]:
# Shuffle all rows reproducibly (fixed random_state), then cut at the 70% and 90% marks:
# first 70% -> train, next 20% -> validation, final 10% -> test.
# Positional .iloc slices replace np.split(DataFrame, ...), which routes through the
# deprecated DataFrame.swapaxes and emits a FutureWarning on pandas >= 2.1; the resulting
# three frames contain the same rows in the same order.
shuffled_data = model_data.sample(frac=1, random_state=1729)
train_end = int(0.7 * len(shuffled_data))
validation_end = int(0.9 * len(shuffled_data))

train_data = shuffled_data.iloc[:train_end]
validation_data = shuffled_data.iloc[train_end:validation_end]
test_data = shuffled_data.iloc[validation_end:]

In [17]:
# Write the train and validation splits as CSV files without header or index,
# with the target column (trip_total) placed first and all other columns after it.
for split_df, out_name in ((train_data, 'train.csv'), (validation_data, 'validation.csv')):
    label_col = split_df['trip_total']
    other_cols = split_df.drop(['trip_total'], axis=1)
    pd.concat([label_col, other_cols], axis=1).to_csv(out_name, index=False, header=False)

In [18]:
# Upload the training and validation CSVs to <prefix>/train/ and <prefix>/validation/.
# S3 keys always use '/', so they are built with string formatting rather than
# os.path.join (which uses the local OS path separator). The bucket handle is created once.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
for channel, local_csv in (('train', 'train.csv'), ('validation', 'validation.csv')):
    s3_bucket.Object(f'{prefix}/{channel}/{local_csv}').upload_file(local_csv)

## Training

In [22]:
# Select the built-in XGBoost container image for this region
container = sagemaker.image_uris.retrieve(region=region, framework='xgboost', version='1.7-1')

# Training and validation input channels (CSV). Both S3 prefixes end with '/' so the
# prefix match cannot also pick up sibling keys such as '<prefix>/train_old/...'.
s3_input_train = sagemaker.inputs.TrainingInput(
    s3_data=f's3://{bucket}/{prefix}/train/', content_type='csv')
s3_input_validation = sagemaker.inputs.TrainingInput(
    s3_data=f's3://{bucket}/{prefix}/validation/', content_type='csv')

sess = sagemaker.Session()

# XGBoost hyperparameters, passed to the training job as strings
hyperparameters = {
    "max_depth": "5",
    "eta": "0.2",
    "gamma": "4",
    "min_child_weight": "6",
    "subsample": "0.7",
    "objective": "reg:squarederror",
    "num_round": "500",
}

# Estimator: one ml.m4.xlarge training instance; model artifacts go to <prefix>/output
xgb = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path=f's3://{bucket}/{prefix}/output',
    sagemaker_session=sess,
    hyperparameters=hyperparameters,
)

# Launch the training job on both channels
xgb.fit({'train': s3_input_train, 'validation': s3_input_validation})

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2023-08-24-09-07-21-493


2023-08-24 09:07:21 Starting - Starting the training job.........
2023-08-24 09:08:47 Starting - Preparing the instances for training......
2023-08-24 09:09:49 Downloading - Downloading input data......
2023-08-24 09:10:24 Training - Downloading the training image...
2023-08-24 09:11:10 Training - Training image download completed. Training in progress..[34m[2023-08-24 09:11:20.331 ip-10-0-141-178.ap-southeast-2.compute.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2023-08-24 09:11:20.364 ip-10-0-141-178.ap-southeast-2.compute.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2023-08-24:09:11:20:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2023-08-24:09:11:20:INFO] Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[34mReturning the value itself[0m
[34m[2023-08-24:09:11:20:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2023-08-24:09:11:20:INFO] Runnin

## Host the model

In [None]:
# Deploy the trained model to a single-instance real-time endpoint.
# A CSV serializer is attached so the returned predictor sends request rows as text/csv,
# the same format the model was trained on.
xgb_predictor = xgb.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
    serializer=sagemaker.serializers.CSVSerializer(),
)

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2023-08-24-09-23-10-319
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2023-08-24-09-23-10-319
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2023-08-24-09-23-10-319


-----!

## Cleanup

In [26]:
# Tear down the hosted endpoint together with its endpoint configuration.
# NOTE(review): the model resource created by deploy() is not removed here — confirm whether it should be deleted as well.
xgb_predictor.delete_endpoint(delete_endpoint_config=True)

INFO:sagemaker:Deleting endpoint configuration with name: sagemaker-xgboost-2023-08-24-09-23-10-319
INFO:sagemaker:Deleting endpoint with name: sagemaker-xgboost-2023-08-24-09-23-10-319
