In [1]:
"""
https://docs.aws.amazon.com/sagemaker/latest/dg/ex1-train-model.html
"""
import os
# Display the current working directory of the notebook kernel.
os.getcwd()

'/home/ec2-user/SageMaker/AWSexample'

In [2]:
import sagemaker
import boto3
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.session import s3_input, Session

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


## create s3 bucket
Here we create a dedicated bucket named "examplecaodata"; alternatively, we could use the session's default bucket: `bucket = sagemaker.Session().default_bucket()`.

In [11]:
# Name of the S3 bucket used throughout this notebook.
bucket_name = "examplecaodata"

# Region of the current SageMaker session and the execution role of this notebook.
myregion = sagemaker.Session().boto_region_name
role = sagemaker.get_execution_role()

print("AWS Region: {}".format(myregion))
print("RoleArn: {}".format(role))

AWS Region: us-east-2
RoleArn: arn:aws:iam::339712797173:role/service-role/AmazonSageMaker-ExecutionRole-20240618T005267


In [6]:
#!aws s3 mb s3://examplecaodata --region {myregion}

In [5]:
# Create the S3 bucket that will hold the data folders for this example.
s3 = boto3.resource('s3')
try:
    if myregion == "us-east-1":
        # us-east-1 is the default location and must not be passed as a LocationConstraint.
        s3.create_bucket(Bucket=bucket_name)
    else:
        # Every other region has to be named explicitly, otherwise no bucket is created there.
        s3.create_bucket(Bucket=bucket_name, CreateBucketConfiguration={'LocationConstraint': myregion})
    print("S3 bucket created!")
except Exception as e:
    # Best effort: report the failure (for example, the bucket already exists) and continue.
    print("error:",e)

error: An error occurred (BucketAlreadyOwnedByYou) when calling the CreateBucket operation: Your previous request to create the named bucket succeeded and you already own it.


In [24]:
# Key prefix shared by every object this notebook writes to the bucket.
prefix = "xgboost-as-a-built-in-algo"

# S3 URI handed to the estimator as the destination for training output.
output_path = f"s3://{bucket_name}/{prefix}/output"

print(output_path)

s3://examplecaodata/xgboost-as-a-built-in-algo/output


## load data

In [8]:
# `import urllib` alone does not guarantee that the `request` submodule is loaded,
# so import the submodule explicitly.
import urllib.request

import pandas as pd

data_url = "https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-sagemaker/bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv"
try:
    # Download the cleaned bank dataset next to the notebook.
    urllib.request.urlretrieve(data_url, "bank_clean.csv")
except Exception as e:
    # Best effort: report the failure; a previously downloaded copy can still be read below.
    print("data load error:",e)

# The first CSV column is used as the row index.
model_data = pd.read_csv("./bank_clean.csv",index_col=0)
model_data.head()

Unnamed: 0,age,campaign,pdays,previous,no_previous_contact,not_working,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,...,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success,y_no,y_yes
0,56,1,999,0,1,0,0,0,0,1,...,0,1,0,0,0,0,1,0,1,0
1,57,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0
2,37,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0
3,40,1,999,0,1,0,1,0,0,0,...,0,1,0,0,0,0,1,0,1,0
4,56,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0


## data split
For simplicity, the test dataset is used as both the validation set and the out-of-time (OOT) set.

In [14]:
from sklearn.model_selection import train_test_split

# The label is the `y_yes` column; the features are everything except the two target columns.
data = model_data.drop(columns=['y_no', 'y_yes'])
labels = model_data['y_yes']

# Hold out 30% of the rows for testing, with a fixed seed so the split is repeatable.
data_train, data_test, labels_train, labels_test = train_test_split(
    data,
    labels,
    test_size=0.30,
    random_state=1729,
)

len(data_train), len(data_test)

(28831, 12357)

In [18]:
# Re-assemble each split with the label as the first column and write it
# to a CSV file without index or header row.
for split_labels, split_features, csv_name in (
    (labels_train, data_train, 'train.csv'),
    (labels_test, data_test, 'test.csv'),
):
    pd.concat([split_labels, split_features], axis=1).to_csv(csv_name, index=False, header=False)

## mapping train/test data to S3 Bucket

In [20]:
# Upload the local CSV files into the S3 bucket and declare the matching
# training inputs (one S3 prefix per channel, CSV content type).
# S3 keys always use "/" as the separator, so they are built directly instead of
# with os.path.join, whose separator depends on the local platform.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket_name)

s3_bucket.Object(f"{prefix}/train/train.csv").upload_file('train.csv')
s3_input_train = sagemaker.TrainingInput(s3_data='s3://{}/{}/train'.format(bucket_name, prefix), content_type='csv')

s3_bucket.Object(f"{prefix}/test/test.csv").upload_file('test.csv')
s3_input_test = sagemaker.TrainingInput(s3_data='s3://{}/{}/test'.format(bucket_name, prefix), content_type='csv')

In [22]:
# List every object stored under the example prefix to check that the uploads landed.
! aws s3 ls {bucket_name}/{prefix}/ --recursive

2024-06-18 21:38:53    1519426 xgboost-as-a-built-in-algo/test/test.csv
2024-06-18 21:38:53    3545018 xgboost-as-a-built-in-algo/train/train.csv


## train built-in xgboost model

In [19]:
# Look up the image URI of the built-in XGBoost algorithm for the current region.
# `get_image_uri` was renamed in sagemaker>=2; `image_uris.retrieve` replaces it
# and takes the same framework name, region and version.
# Specify the version depending on your preference.
from sagemaker import image_uris

container = image_uris.retrieve(framework='xgboost',
                                region=boto3.Session().region_name,
                                version='latest')

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [20]:
# Show the container image URI that was resolved for training.
print(container)

825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest


In [22]:
# Hyperparameters passed to the XGBoost training container.
hyperparameters = dict(
    max_depth="5",
    eta="0.2",
    gamma="4",
    min_child_weight="6",
    subsample="0.8",
    objective="binary:logistic",
    num_round=50,
)

In [25]:
# Build the SageMaker estimator that runs the XGBoost container.
estimator = sagemaker.estimator.Estimator(
    image_uri=container,
    hyperparameters=hyperparameters,
    role=role,
    instance_count=1,
    instance_type='ml.m5.2xlarge',  # type of EC2 instance to use for training
    volume_size=5,  # 5 GB
    output_path=output_path,
    use_spot_instances=True,
    max_run=300,
    max_wait=600,
)

In [28]:
# Start the training job: the train split feeds the 'train' channel and the
# test split doubles as the 'validation' channel.
channels = {'train': s3_input_train, 'validation': s3_input_test}
estimator.fit(channels)

INFO:sagemaker:Creating training-job with name: xgboost-2024-06-18-22-09-30-909


2024-06-18 22:09:31 Starting - Starting the training job...
2024-06-18 22:09:44 Starting - Preparing the instances for training...
2024-06-18 22:10:30 Downloading - Downloading the training image......
2024-06-18 22:11:31 Training - Training image download completed. Training in progress.
2024-06-18 22:11:31 Uploading - Uploading generated training model[34mArguments: train[0m
[34m[2024-06-18:22:11:24:INFO] Running standalone xgboost training.[0m
[34m[2024-06-18:22:11:24:INFO] File size need to be processed in the node: 4.83mb. Available memory size in the node: 23884.43mb[0m
[34m[2024-06-18:22:11:24:INFO] Determined delimiter of CSV input is ','[0m
[34m[22:11:24] S3DistributionType set as FullyReplicated[0m
[34m[22:11:24] 28831x59 matrix with 1701029 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2024-06-18:22:11:24:INFO] Determined delimiter of CSV input is ','[0m
[34m[22:11:24] S3DistributionType set as FullyReplicated[0m


## training outputs

In [39]:
# List the objects under the output prefix and keep only the lines mentioning "model".
! aws s3 ls {bucket_name}/{prefix}/output/ --recursive | grep model

2024-06-18 22:11:37      37126 xgboost-as-a-built-in-algo/output/xgboost-2024-06-18-22-09-30-909/output/model.tar.gz


In [26]:
# Display the S3 output location the estimator was configured with.
estimator.output_path

's3://examplecaodata/xgboost-as-a-built-in-algo/output'

In [None]:
model_result_path = estimator.output_path + "/" + estimator.latest_training_job.job_name + "/output"
! aws s3 ls {model_result_path} --recursive

In [None]:
# Copy everything under the training job's `output` folder to the current workspace.
# NOTE(review): this path is `<job name>/output`, where the earlier listing shows model.tar.gz;
# the Debugger/profiler files appear under `<job name>/profiler-output` in the clean-up listing,
# so they are presumably not fetched by this command — confirm the intended path.
! aws s3 cp {model_result_path} ./ --recursive

In [None]:
# Display the location of the trained model artifact.
estimator.model_data

## Deploy Machine Learning Model as Endpoints

The deploy method creates a deployable model, configures the SageMaker hosting services endpoint, and launches the endpoint to host the model.

Instead of hosting an endpoint in production, you can run a one-time batch inference job to make predictions on a test dataset using the SageMaker batch transform. [Make Prediction with Batch Transform](https://docs.aws.amazon.com/sagemaker/latest/dg/ex1-model-deployment.html#ex1-batch-transform)

In [50]:
from sagemaker.serializers import CSVSerializer

# CSVSerializer turns input data of various formats (a NumPy array, list, file,
# or buffer) into a CSV-formatted string. It is used here because the XGBoost
# algorithm accepts input files in CSV format.
xgb_predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
    serializer=CSVSerializer(),
)

INFO:sagemaker:Creating model with name: xgboost-2024-06-18-22-38-08-230
INFO:sagemaker:Creating endpoint-config with name xgboost-2024-06-18-22-38-08-230
INFO:sagemaker:Creating endpoint with name xgboost-2024-06-18-22-38-08-230


------!

## Prediction

The endpoint receives the data through a serializer.


In [53]:
import numpy as np

# Load the test features into an array (the labels were split off earlier).
test_data_array = data_test.to_numpy()

# The predictor was deployed with CSVSerializer, so requests are already sent as CSV;
# no separate content-type assignment is needed.
predictions = xgb_predictor.predict(test_data_array).decode('utf-8') # predict!

# Parse the complete comma-separated response. Slicing off the first character
# (`predictions[1:]`) would drop the leading digit of the first score and could
# silently corrupt it.
predictions_array = np.fromstring(predictions, sep=',')
print(predictions_array.shape)

(12357,)


In [55]:
# Display the name of the endpoint that hosts the deployed model.
xgb_predictor.endpoint_name

'xgboost-2024-06-18-22-38-08-230'

## Evaluation

In [64]:
# Import the submodule explicitly: `import sklearn` alone does not guarantee
# that `sklearn.metrics` is loaded.
import sklearn.metrics

# Turn the prediction scores into 0/1 class labels once, at the chosen cutoff.
cutoff=0.5
predicted_labels = np.where(predictions_array > cutoff, 1, 0)

# confusion matrix
print(sklearn.metrics.confusion_matrix(labels_test, predicted_labels))

print(sklearn.metrics.classification_report(labels_test, predicted_labels))

[[10831   201]
 [ 1024   301]]
              precision    recall  f1-score   support

           0       0.91      0.98      0.95     11032
           1       0.60      0.23      0.33      1325

    accuracy                           0.90     12357
   macro avg       0.76      0.60      0.64     12357
weighted avg       0.88      0.90      0.88     12357



## Clean up

In [63]:
# Delete the hosted endpoint by name.
sm_session = sagemaker.Session()
sm_session.delete_endpoint(xgb_predictor.endpoint_name)

# Remove every object stored in the example bucket (the bucket itself is kept).
bucket_to_delete = boto3.resource('s3').Bucket(bucket_name)
bucket_to_delete.objects.all().delete()

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker:Deleting endpoint with name: xgboost-2024-06-18-22-38-08-230


[{'ResponseMetadata': {'RequestId': 'JGECMETN2H5EHTD9',
   'HostId': '/KOOzKg1pMFpvPG/dDOaGcvzmx0iYRXPMuiZngC90SmzL4Bg7/N/dtX/FpICfxNzHChchpLwA34nvhQT9ep5WA==',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': '/KOOzKg1pMFpvPG/dDOaGcvzmx0iYRXPMuiZngC90SmzL4Bg7/N/dtX/FpICfxNzHChchpLwA34nvhQT9ep5WA==',
    'x-amz-request-id': 'JGECMETN2H5EHTD9',
    'date': 'Tue, 18 Jun 2024 22:55:07 GMT',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3',
    'connection': 'close'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'xgboost-as-a-built-in-algo/output/xgboost-2024-06-18-22-09-30-909/profiler-output/system/incremental/2024061822/1718748600.algo-1.json'},
   {'Key': 'xgboost-as-a-built-in-algo/output/xgboost-2024-06-18-22-09-30-909/profiler-output/framework/training_job_end.ts'},
   {'Key': 'xgboost-as-a-built-in-algo/train/train.csv'},
   {'Key': 'xgboost-as-a-built-in-algo/output/xgboost-2024-06-18-22-09-30-909/profiler-output/system