## Load packages

In [2]:
# Import dependencies
import boto3
import re
from sagemaker import get_execution_role
from sagemaker.session import Session
from sagemaker.feature_store.feature_group import FeatureGroup
import yaml
import numpy as np                                # For matrix operations and numerical processing
import pandas as pd                               # For munging tabular data
import matplotlib.pyplot as plt                   # For charts and visualizations
from IPython.display import Image                 # For displaying images in the notebook
from IPython.display import display               # For displaying outputs in the notebook
from time import gmtime, strftime                 # For labeling SageMaker models, endpoints, etc.
import sys                                        # For writing outputs to notebook
import math                                       # For ceiling function
import json                                       # For parsing hosting outputs
import os                                         # For manipulating filepath names
import sagemaker                                  # Amazon SageMaker's Python SDK provides many helper functions
import zipfile                                    # For working with zip archives




## Load Configs

In [3]:
# AWS context the notebook runs in.
region = boto3.Session().region_name
role = get_execution_role()

# S3 location used for query results, datasets and model artifacts.
bucket = sagemaker.Session().default_bucket()
prefix = "sagemaker/xgboost_model"

# Feature group that is queried for the modelling data.
feature_group_name = "taxi-feature-group-no-encoding"

In [4]:
# Display the name of the default bucket returned by the SageMaker session.
bucket

'sagemaker-ap-southeast-2-949757562939'

## Get data from Feature Store

In [5]:
# Build a SageMaker session for Feature Store access. It wraps a boto3 session
# plus the `sagemaker` and `sagemaker-featurestore-runtime` clients, all bound
# to `region`.
boto_session = boto3.Session(region_name=region)
sagemaker_client = boto_session.client('sagemaker', region_name=region)
featurestore_runtime = boto_session.client('sagemaker-featurestore-runtime', region_name=region)

feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime,
)

# Handle to the feature group named in the config cell.
feature_group = FeatureGroup(
    name=feature_group_name,
    sagemaker_session=feature_store_session,
)

In [6]:
# Get the feature group's Athena query helper and the table it points at.
fs_query = feature_group.athena_query()
fs_table = fs_query.table_name

# Select every column of that table; the table name is double-quoted in the SQL.
query_string = f'SELECT * FROM "{fs_table}"'
print(f'Running {query_string}')

Running SELECT * FROM "taxi_feature_group_no_encoding_1693400933"


In [7]:
# Run the Athena query, writing its results under the notebook's S3 prefix,
# wait for it to finish, then load the result into a pandas DataFrame.
fs_query.run(
    query_string=query_string,
    output_location=f's3://{bucket}/{prefix}/fs_query_results/',
)
fs_query.wait()
model_data = fs_query.as_dataframe()

In [8]:
# Remove the columns that are not used as model inputs.
# `model_data` is reassigned from itself, so without errors='ignore' a second
# run of this cell would raise KeyError on the already-dropped columns.
model_data = model_data.drop(
    columns=['fs_id', 'fs_time', 'write_time', 'api_invocation_time', 'is_deleted'],
    errors='ignore',
)

In [9]:
# Preview the first rows of the remaining columns.
model_data.head()

Unnamed: 0,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,tips,tolls,extras,trip_total,payment_type,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,start_month,start_day,start_hour,start_minute
0,1697.0,12.08,56,8,0.07,0,4,36.57,credit_card,41.785999,-87.750934,41.892042,-87.631864,6,23,23,0
1,660.0,0.2,32,7,4.5,0,1,17.0,credit_card,41.878866,-87.625192,41.922686,-87.649489,6,23,23,0
2,535.0,0.87,6,6,3.0,0,0,10.25,credit_card,41.944227,-87.655998,41.944227,-87.655998,6,23,23,0
3,800.0,1.99,32,8,3.28,0,2,15.67,mobile,41.878866,-87.625192,41.899602,-87.633308,6,23,23,0
4,1140.0,0.0,56,35,6.3,0,5,37.3,credit_card,41.792592,-87.769615,41.835118,-87.618678,6,23,23,0


In [10]:
# (rows, columns) of the modelling frame.
model_data.shape

(174724, 17)

## Prepare Dataset for Training, Test, and Validation

In [11]:
# Shuffle once with a fixed seed, then slice into 70% train, 20% validation
# and 10% test. Explicit .iloc slices replace np.split, which on a DataFrame
# goes through the deprecated DataFrame.swapaxes and warns on recent pandas.
shuffled_data = model_data.sample(frac=1, random_state=1729)
train_end = int(0.7 * len(shuffled_data))
validation_end = int(0.9 * len(shuffled_data))

train_data = shuffled_data.iloc[:train_end]
validation_data = shuffled_data.iloc[train_end:validation_end]
test_data = shuffled_data.iloc[validation_end:]

# Every row must land in exactly one split.
assert len(train_data) + len(validation_data) + len(test_data) == len(model_data)

In [12]:
def _target_first(frame, target='trip_total'):
    """Return a new DataFrame with the `target` column placed first."""
    return pd.concat([frame[target], frame.drop(columns=[target])], axis=1)


# Write headerless, index-free CSVs with the target in the first column.
# Only the train and validation splits are written in this cell.
# NOTE(review): `payment_type` is exported as raw text (see the head() preview);
# confirm the training container treats it as intended, or encode it first.
_target_first(train_data).to_csv('train.csv', index=False, header=False)
_target_first(validation_data).to_csv('validation.csv', index=False, header=False)

In [13]:
# Upload the training and validation datasets into the S3 bucket.
# S3 keys always use "/" as the separator, so they are built explicitly rather
# than with os.path.join, which would insert "\" on Windows. One Bucket handle
# is reused for both uploads.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
s3_bucket.Object(f'{prefix}/train/train.csv').upload_file('train.csv')
s3_bucket.Object(f'{prefix}/validation/validation.csv').upload_file('validation.csv')

## Training

In [14]:
# Select the built-in XGBoost container image for this region.
container = sagemaker.image_uris.retrieve(region=region, framework='xgboost', version='1.7-1')

# Training and validation channels. Both URIs end with "/" so each channel
# matches only the keys inside its own folder; a bare ".../train" prefix would
# also match sibling keys such as ".../train_old/...".
s3_input_train = sagemaker.inputs.TrainingInput(
    s3_data='s3://{}/{}/train/'.format(bucket, prefix),
    content_type='csv',
)
s3_input_validation = sagemaker.inputs.TrainingInput(
    s3_data='s3://{}/{}/validation/'.format(bucket, prefix),
    content_type='csv',
)

sess = sagemaker.Session()

# Hyperparameters are passed to the container as strings.
hyperparameters = {
    "max_depth": "5",
    "eta": "0.2",
    "gamma": "4",
    "min_child_weight": "6",
    "subsample": "0.7",
    "objective": "reg:squarederror",   # regression on the first CSV column
    "num_round": "500",
}

# Single-instance training job; model artifacts are written under <prefix>/output.
xgb = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path='s3://{}/{}/output'.format(bucket, prefix),
    sagemaker_session=sess,
    hyperparameters=hyperparameters,
)

# Launch the training job; the job log is streamed into the cell output.
xgb.fit({'train': s3_input_train, 'validation': s3_input_validation})

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2023-08-31-00-53-30-347


2023-08-31 00:53:30 Starting - Starting the training job...
2023-08-31 00:53:45 Starting - Preparing the instances for training......
2023-08-31 00:54:54 Downloading - Downloading input data...
2023-08-31 00:55:19 Training - Downloading the training image......
2023-08-31 00:56:10 Training - Training image download completed. Training in progress.[34m[2023-08-31 00:56:23.891 ip-10-0-67-89.ap-southeast-2.compute.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2023-08-31 00:56:23.926 ip-10-0-67-89.ap-southeast-2.compute.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2023-08-31:00:56:24:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2023-08-31:00:56:24:INFO] Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[34mReturning the value itself[0m
[34m[2023-08-31:00:56:24:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2023-08-31:00:56:24:INFO] Running XGBoost S

## Host the model

In [15]:
# Host the trained model on a real-time endpoint backed by one ml.m4.xlarge instance.
xgb_predictor = xgb.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
)

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2023-08-31-00-57-43-765
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2023-08-31-00-57-43-765
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2023-08-31-00-57-43-765


-----!

## Cleanup

In [16]:
# Delete the endpoint together with its endpoint configuration.
# NOTE(review): the model resource created by deploy() is not removed here;
# confirm whether it should be deleted as part of cleanup.
xgb_predictor.delete_endpoint(
    delete_endpoint_config=True,
)

INFO:sagemaker:Deleting endpoint configuration with name: sagemaker-xgboost-2023-08-31-00-57-43-765
INFO:sagemaker:Deleting endpoint with name: sagemaker-xgboost-2023-08-31-00-57-43-765
