## Setup

### Install sagemaker correct versions

In [2]:
!pip install sagemaker==2.140.1
!pip install sagemaker-experiments
from IPython.display import clear_output
clear_output()

### Setup Sagemaker session

In [3]:
import boto3
import sagemaker
from sagemaker import get_execution_role
print(sagemaker.__version__)
import sys
import IPython

role = get_execution_role()
sess = sagemaker.Session()
region = boto3.session.Session().region_name
print("Region = {}".format(region))
sm = boto3.Session().client('sagemaker')

2.140.1
Region = us-east-1


### Import tool packages

In [34]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import io
import os
import re

from time import sleep, gmtime, strftime
import json
import time

### Import Sagemaker Experiments

In [5]:
from sagemaker.analytics import ExperimentAnalytics
from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent
from smexperiments.tracker import Tracker

### Define the Amazon S3 buckets and folders for the project

In [6]:
rawbucket= sess.default_bucket() # Alternatively you can use our custom bucket here. 

prefix = 'sagemaker-modelmonitor' # use this prefix to store all files pertaining to this workshop.

dataprefix = prefix + '/data'
# traindataprefix = prefix + '/train_data'
# testdataprefix = prefix + '/test_data'
# testdatanolabelprefix = prefix + '/test_data_no_label'
# trainheaderprefix = prefix + '/train_headers'

### Load breast cancer wisconsin dataset

In [7]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
bc_data = load_breast_cancer()

X_bc_pd = pd.DataFrame(bc_data.data, columns=bc_data.feature_names)
Y_bc_pd = pd.DataFrame(bc_data.target, columns=['Label'])

X_train, X_test, Y_train, Y_test = train_test_split(X_bc_pd, Y_bc_pd, test_size=0.2)
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.25)

In [8]:
if not os.path.exists('data'):
    !mkdir data
else:
    pass

pd.concat([Y_train, X_train], axis=1).to_csv('data/train_data.csv', header=False, index=False)
pd.concat([Y_val, X_val], axis=1).to_csv('data/valid_data.csv', header=False, index=False)
pd.concat([Y_test, X_test], axis=1).to_csv('data/test_data.csv', header=False, index=False)

# Upload the raw dataset
train_data_location = sess.upload_data('data/train_data.csv', bucket=rawbucket, key_prefix=dataprefix)
valid_data_location = sess.upload_data('data/valid_data.csv', bucket=rawbucket, key_prefix=dataprefix)
test_data_location = sess.upload_data('data/test_data.csv', bucket=rawbucket, key_prefix=dataprefix)
print(train_data_location)
print(valid_data_location)
print(test_data_location)

s3://sagemaker-us-east-1-773627151292/sagemaker-modelmonitor/data/train_data.csv
s3://sagemaker-us-east-1-773627151292/sagemaker-modelmonitor/data/valid_data.csv
s3://sagemaker-us-east-1-773627151292/sagemaker-modelmonitor/data/test_data.csv


## Fit the model

In [9]:
xgboost_container = sagemaker.image_uris.retrieve("xgboost", region, "1.3-1")

s3_input_train = sagemaker.inputs.TrainingInput(s3_data=train_data_location, content_type='csv')
s3_input_valid = sagemaker.inputs.TrainingInput(s3_data=valid_data_location, content_type='csv')
s3_input_test = sagemaker.inputs.TrainingInput(s3_data=test_data_location, content_type='csv')

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


In [11]:
xgb = sagemaker.estimator.Estimator(image_uri=xgboost_container,
                                    role=role, 
                                    instance_count=1, 
                                    instance_type='ml.m4.xlarge',
                                    output_path='s3://{}/{}/models'.format(rawbucket, prefix),
                                    sagemaker_session=sess) # set to true for distributed training

xgb.set_hyperparameters(max_depth=5,
                        eta=0.2,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        verbosity=0,
                        objective='binary:logistic',
                        num_round=100)

xgb.fit(inputs = {'train':s3_input_train, 'validation':s3_input_valid})
time.sleep(2)

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2023-03-22-22-17-23-518


2023-03-22 22:17:24 Starting - Starting the training job...
2023-03-22 22:18:00 Starting - Preparing the instances for training.........
2023-03-22 22:19:21 Downloading - Downloading input data...
2023-03-22 22:19:57 Training - Downloading the training image...
2023-03-22 22:20:23 Training - Training image download completed. Training in progress...[34m[2023-03-22 22:20:40.055 ip-10-0-153-177.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2023-03-22 22:20:40.088 ip-10-0-153-177.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2023-03-22:22:20:40:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2023-03-22:22:20:40:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34m[2023-03-22:22:20:40:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2023-03-22:22:20:40:INFO] Running XGBoost Sagemaker in algorithm mode[0m
[

## Test the model

In [63]:
from sagemaker.model_monitor import DataCaptureConfig
from sagemaker.predictor import csv_serializer

sm_client = boto3.client('sagemaker')

latest_training_job = sm_client.list_training_jobs(MaxResults=1,
                                                SortBy='CreationTime',
                                                SortOrder='Descending')

training_job_name=TrainingJobName=latest_training_job['TrainingJobSummaries'][0]['TrainingJobName']

training_job_description = sm_client.describe_training_job(TrainingJobName=training_job_name)

model_data = training_job_description['ModelArtifacts']['S3ModelArtifacts']
container_uri = training_job_description['AlgorithmSpecification']['TrainingImage']

# create a model.
def create_model(role, model_name, container_uri, model_data):
    return sm_client.create_model(
        ModelName=model_name,
        PrimaryContainer={
        'Image': container_uri,
        'ModelDataUrl': model_data,
        },
        ExecutionRoleArn=role)
    

try:
    model = create_model(role, training_job_name, container_uri, model_data)
except Exception as e:
        sm_client.delete_model(ModelName=training_job_name)
        model = create_model(role, training_job_name, container_uri, model_data)
        

print('Model created: '+model['ModelArn'])

Model created: arn:aws:sagemaker:us-east-1:773627151292:model/sagemaker-xgboost-2023-03-22-22-17-23-518


In [64]:
# Endpoint Config name
endpoint_config_name = f"{training_job_name}-endpoint-config"

# Endpoint config parameters
production_variant_dict = {
                           "VariantName": "Alltraffic",
                           "ModelName": training_job_name,
                           "InitialInstanceCount": 1,
                           "InstanceType": "ml.m5.xlarge",
                           "InitialVariantWeight": 1
                          }

# Define bucket path
write_bucket = sess.default_bucket()
write_prefix = "breast-cancer-demo"
data_capture_key = f"{write_prefix}/data-capture"
data_capture_uri = f"s3://{write_bucket}/{data_capture_key}"

# Data capture config parameters
data_capture_config_dict = {
                            "EnableCapture": True,
                            "InitialSamplingPercentage": 100,
                            "DestinationS3Uri": data_capture_uri,
                            "CaptureOptions": [{"CaptureMode" : "Input"}, {"CaptureMode" : "Output"}]
                           }


# Create endpoint config if one with the same name does not exist
endpoint_config_matches = sm_client.list_endpoint_configs(NameContains=endpoint_config_name)["EndpointConfigs"]
if not endpoint_config_matches:
    endpoint_config_response = sm_client.create_endpoint_config(
                                                                EndpointConfigName=endpoint_config_name,
                                                                ProductionVariants=[production_variant_dict],
                                                                DataCaptureConfig=data_capture_config_dict
                                                               )
else:
    print(f"Endpoint config with name {endpoint_config_name} already exists! Change endpoint config name to create new")

In [65]:
endpoint_name = f"{training_job_name}-endpoint"

endpoint_matches = sm_client.list_endpoints(NameContains=endpoint_name)["Endpoints"]
if not endpoint_matches:
    endpoint_response = sm_client.create_endpoint(
                                                  EndpointName=endpoint_name,
                                                  EndpointConfigName=endpoint_config_name
                                                 )
else:
    print(f"Endpoint with name {endpoint_name} already exists! Change endpoint name to create new")

resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
status = resp["EndpointStatus"]
while status == "Creating":
    print(f"Endpoint Status: {status}...")
    time.sleep(60)
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
    status = resp["EndpointStatus"]
print(f"Endpoint Status: {status}")

Endpoint Status: Creating...
Endpoint Status: Creating...
Endpoint Status: Creating...
Endpoint Status: InService


In [66]:
sm_runtime_client = boto3.client("sagemaker-runtime")

# Fetch test data to run predictions with the endpoint
test_df = pd.read_csv(test_data_location, names = [str(x) for x in range(31)])
test_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
0,0,15.34,14.26,102.5,704.4,0.1073,0.2135,0.2077,0.09756,0.2521,...,18.07,19.08,125.1,980.9,0.139,0.5954,0.6305,0.2393,0.4667,0.09946
1,0,20.47,20.67,134.7,1299.0,0.09156,0.1313,0.1523,0.1015,0.2166,...,23.23,27.15,152.0,1645.0,0.1097,0.2534,0.3092,0.1613,0.322,0.06386
2,1,12.34,12.27,78.94,468.5,0.09003,0.06307,0.02958,0.02647,0.1689,...,13.61,19.27,87.22,564.9,0.1292,0.2074,0.1791,0.107,0.311,0.07592
3,1,17.85,13.23,114.6,992.1,0.07838,0.06217,0.04445,0.04178,0.122,...,19.82,18.42,127.1,1210.0,0.09862,0.09976,0.1048,0.08341,0.1783,0.05871
4,1,11.68,16.17,75.49,420.5,0.1128,0.09263,0.04279,0.03132,0.1853,...,13.32,21.59,86.57,549.8,0.1526,0.1477,0.149,0.09815,0.2804,0.08024


In [67]:

# For content type text/csv, payload should be a string with commas separating the values for each feature
# This is the inference request serialization step
# CSV serialization
csv_file = io.StringIO()
test_sample = test_df.drop(["0"], axis=1)
test_sample.to_csv(csv_file, sep=",", header=False, index=False)
payload = csv_file.getvalue()
response = sm_runtime_client.invoke_endpoint(
                                             EndpointName=endpoint_name,
                                             Body=payload,
                                             ContentType="text/csv",
                                             Accept="text/csv"
                                            )

# This is the inference response deserialization step
# This is a bytes object
result = response["Body"].read()
# Decoding bytes to a string
result = result.decode("utf-8")
# Converting to list of predictions
result = re.split(",|\n",result)
result.pop()

''

In [68]:
result = [float(x) for x in result]

In [59]:
output_df = pd.DataFrame(result)
output_df['Predicted']=np.round(output_df.values)
output_df['Label'] = test_df["0"].values
output_df

Unnamed: 0,0,Predicted,Label
0,0.037244,0.0,0
1,0.014608,0.0,0
2,0.987058,1.0,1
3,0.436265,0.0,1
4,0.987199,1.0,1
...,...,...,...
109,0.801476,1.0,1
110,0.983925,1.0,1
111,0.049463,0.0,0
112,0.049463,0.0,0


In [69]:
from sklearn.metrics import confusion_matrix, accuracy_score
confusion_matrix = pd.crosstab(output_df['Predicted'], output_df['Label'], rownames=['Actual'], colnames=['Predicted'], margins = True)
confusion_matrix

Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,38,2,40
1.0,4,70,74
All,42,72,114


In [70]:
prediction_df = pd.DataFrame()
prediction_df["Prediction"] = result[:5]
prediction_df["Label"] = test_df["0"].iloc[:5].values
prediction_df

Unnamed: 0,Prediction,Label
0,0.037244,0
1,0.014608,0
2,0.987058,1
3,0.436265,1
4,0.987199,1


In [62]:
# Delete model
sm_client.delete_model(ModelName=training_job_name)

# Delete endpoint configuration
sm_client.delete_endpoint_config(EndpointConfigName=endpoint_config_name)

# Delete endpoint
sm_client.delete_endpoint(EndpointName=endpoint_name)

{'ResponseMetadata': {'RequestId': '1582985a-93cd-45fa-8358-f73cd253bafd',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '1582985a-93cd-45fa-8358-f73cd253bafd',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Wed, 22 Mar 2023 23:14:49 GMT'},
  'RetryAttempts': 0}}