In [2]:
!pip3 install -U sagemaker

[0m

In [3]:
import os
import boto3
import sagemaker

role = sagemaker.get_execution_role()
sess = sagemaker.Session()
region = sess.boto_region_name

bucket = sess.default_bucket()
prefix = "DEMO-breast-cancer-prediction-xgboost-highlevel"

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [4]:
print(f'Default S3 bucket: {bucket}')

Default S3 bucket: sagemaker-us-east-1-652903355321


---
## Data sources

> Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.

> Breast Cancer Wisconsin (Diagnostic) Data Set [https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)].

> _Also see:_ Breast Cancer Wisconsin (Diagnostic) Data Set [https://www.kaggle.com/uciml/breast-cancer-wisconsin-data].

## Data preparation


Let's download the data and save it in the local folder with the name data.csv and take a look at it.

In [5]:
import pandas as pd
import numpy as np

s3 = boto3.client("s3")

filename = "wdbc.csv"
s3.download_file(
    f"sagemaker-example-files-prod-{region}", "datasets/tabular/breast_cancer/wdbc.csv", filename
)
data = pd.read_csv(filename, header=None)

# specify columns extracted from wbdc.names
data.columns = [
    "id",
    "diagnosis",
    "radius_mean",
    "texture_mean",
    "perimeter_mean",
    "area_mean",
    "smoothness_mean",
    "compactness_mean",
    "concavity_mean",
    "concave points_mean",
    "symmetry_mean",
    "fractal_dimension_mean",
    "radius_se",
    "texture_se",
    "perimeter_se",
    "area_se",
    "smoothness_se",
    "compactness_se",
    "concavity_se",
    "concave points_se",
    "symmetry_se",
    "fractal_dimension_se",
    "radius_worst",
    "texture_worst",
    "perimeter_worst",
    "area_worst",
    "smoothness_worst",
    "compactness_worst",
    "concavity_worst",
    "concave points_worst",
    "symmetry_worst",
    "fractal_dimension_worst",
]

# save the data
data.to_csv("data.csv", sep=",", index=False)

data.sample(8)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
496,914366,B,12.65,18.17,82.69,485.6,0.1076,0.1334,0.08017,0.05074,...,14.38,22.15,95.29,633.7,0.1533,0.3842,0.3582,0.1407,0.323,0.1033
184,873885,M,15.28,22.41,98.92,710.6,0.09057,0.1052,0.05375,0.03263,...,17.8,28.03,113.8,973.1,0.1301,0.3299,0.363,0.1226,0.3175,0.09772
92,861853,B,13.27,14.76,84.74,551.7,0.07355,0.05055,0.03261,0.02648,...,16.36,22.35,104.5,830.6,0.1006,0.1238,0.135,0.1001,0.2027,0.06206
341,898143,B,9.606,16.84,61.64,280.5,0.08481,0.09228,0.08422,0.02292,...,10.75,23.07,71.25,353.6,0.1233,0.3416,0.4341,0.0812,0.2982,0.09825
170,87139402,B,12.32,12.39,78.85,464.1,0.1028,0.06981,0.03987,0.037,...,13.5,15.64,86.97,549.1,0.1385,0.1266,0.1242,0.09391,0.2827,0.06771
350,899187,B,11.66,17.07,73.7,421.0,0.07561,0.0363,0.008306,0.01162,...,13.28,19.74,83.61,542.5,0.09958,0.06476,0.03046,0.04262,0.2731,0.06825
58,857810,B,13.05,19.31,82.61,527.2,0.0806,0.03789,0.000692,0.004167,...,14.23,22.25,90.24,624.1,0.1021,0.06191,0.001845,0.01111,0.2439,0.06289
331,896864,B,12.98,19.35,84.52,514.0,0.09579,0.1125,0.07107,0.0295,...,14.42,21.95,99.21,634.3,0.1288,0.3253,0.3439,0.09858,0.3596,0.09166


In [6]:
data.shape

(569, 32)

#### Key observations:
* The data has 569 observations and 32 columns.
* The first field is the 'id' attribute that we will want to drop before batch inference and add to the final inference output next to the probability of malignancy.
* Second field, 'diagnosis', is an indicator of the actual diagnosis ('M' = Malignant; 'B' = Benign).
* There are 30 other numeric features that we will use for training and inferencing.

Let's replace the M/B diagnosis with a 1/0 boolean value. 

In [7]:
data["diagnosis"] = data["diagnosis"].apply(lambda x: ((x == "M")) + 0)
data.sample(5)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
174,871642,0,10.66,15.15,67.49,349.6,0.08792,0.04302,0.0,0.0,...,11.54,19.2,73.2,408.3,0.1076,0.06791,0.0,0.0,0.271,0.06164
319,894335,0,12.43,17.0,78.6,477.3,0.07557,0.03454,0.01342,0.01699,...,12.9,20.21,81.76,515.9,0.08409,0.04712,0.02237,0.02832,0.1901,0.05932
153,87106,0,11.15,13.08,70.87,381.9,0.09754,0.05113,0.01982,0.01786,...,11.99,16.3,76.25,440.8,0.1341,0.08971,0.07116,0.05506,0.2859,0.06772
302,89263202,1,20.09,23.86,134.7,1247.0,0.108,0.1838,0.2283,0.128,...,23.68,29.43,158.8,1696.0,0.1347,0.3391,0.4932,0.1923,0.3294,0.09469
525,91805,0,8.571,13.1,54.53,221.3,0.1036,0.07632,0.02565,0.0151,...,9.473,18.45,63.3,275.6,0.1641,0.2235,0.1754,0.08512,0.2983,0.1049


In [8]:
data.sample(5)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
389,90312,1,19.55,23.21,128.9,1174.0,0.101,0.1318,0.1856,0.1021,...,20.82,30.44,142.0,1313.0,0.1251,0.2414,0.3829,0.1825,0.2576,0.07602
157,8711216,0,16.84,19.46,108.4,880.2,0.07445,0.07223,0.0515,0.02771,...,18.22,28.07,120.3,1032.0,0.08774,0.171,0.1882,0.08436,0.2527,0.05972
95,86208,1,20.26,23.03,132.4,1264.0,0.09078,0.1313,0.1465,0.08683,...,24.22,31.59,156.1,1750.0,0.119,0.3539,0.4098,0.1573,0.3689,0.08368
409,905501,0,12.27,17.92,78.41,466.1,0.08685,0.06526,0.03211,0.02653,...,14.1,28.88,89.0,610.2,0.124,0.1795,0.1377,0.09532,0.3455,0.06896
209,8810436,0,15.27,12.91,98.17,725.5,0.08182,0.0623,0.05892,0.03157,...,17.38,15.92,113.7,932.7,0.1222,0.2186,0.2962,0.1035,0.232,0.07474


Let's split the data as follows: 80% for training, 10% for validation and let's set 10% aside for our batch inference job. In addition, let's drop the 'id' field on the training set and validation set as 'id' is not a training feature. For our batch set however, we keep the 'id' feature. We'll want to filter it out prior to running our inferences so that the input data features match the ones of training set and then ultimately, we'll want to join it with inference result. We are however dropping the diagnosis attribute for the batch set since this is what we'll try to predict.

In [9]:
# data split in three sets, training, validation and batch inference
rand_split = np.random.rand(len(data))
train_list = rand_split < 0.8
val_list = (rand_split >= 0.8) & (rand_split < 0.9)
batch_list = rand_split >= 0.9

data_train = data[train_list].drop(["id"], axis=1)
data_val = data[val_list].drop(["id"], axis=1)
data_batch = data[batch_list].drop(["diagnosis"], axis=1)
data_batch_noID = data_batch.drop(["id"], axis=1)

Let's upload those data sets in S3

In [10]:
train_file = "train_data.csv"
data_train.to_csv(train_file, index=False, header=False)
sess.upload_data(train_file, key_prefix="{}/train".format(prefix))

validation_file = "validation_data.csv"
data_val.to_csv(validation_file, index=False, header=False)
sess.upload_data(validation_file, key_prefix="{}/validation".format(prefix))

batch_file = "batch_data.csv"
data_batch.to_csv(batch_file, index=False, header=False)
sess.upload_data(batch_file, key_prefix="{}/batch".format(prefix))

batch_file_noID = "batch_data_noID.csv"
data_batch_noID.to_csv(batch_file_noID, index=False, header=False)
sess.upload_data(batch_file_noID, key_prefix="{}/batch".format(prefix))

's3://sagemaker-us-east-1-652903355321/DEMO-breast-cancer-prediction-xgboost-highlevel/batch/batch_data_noID.csv'

---

## Training job and model creation

The below cell uses the [SageMaker Python SDK](https://github.com/aws/sagemaker-python-sdk) to kick off the training job using both our training set and validation set. Not that the objective is set to 'binary:logistic' which trains a model to output a probability between 0 and 1 (here the probability of a tumor being malignant).

In [11]:
%%time
from time import gmtime, strftime

job_name = "xgb-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
output_location = "s3://{}/{}/output/{}".format(bucket, prefix, job_name)
image = sagemaker.image_uris.retrieve(
    framework="xgboost", region=boto3.Session().region_name, version="1.7-1"
)

sm_estimator = sagemaker.estimator.Estimator(
    image,
    role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    volume_size=50,
    input_mode="File",
    output_path=output_location,
    sagemaker_session=sess,
)

sm_estimator.set_hyperparameters(
    objective="binary:logistic",
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    verbosity=0,
    num_round=100,
)

train_data = sagemaker.inputs.TrainingInput(
    "s3://{}/{}/train".format(bucket, prefix),
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
)
validation_data = sagemaker.inputs.TrainingInput(
    "s3://{}/{}/validation".format(bucket, prefix),
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
)
data_channels = {"train": train_data, "validation": validation_data}

# Start training by calling the fit method in the estimator
sm_estimator.fit(inputs=data_channels, job_name=job_name, logs=True)

INFO:sagemaker:Creating training-job with name: xgb-2024-05-23-16-27-19


2024-05-23 16:27:20 Starting - Starting the training job...
2024-05-23 16:27:35 Starting - Preparing the instances for training...
2024-05-23 16:28:06 Downloading - Downloading input data...
2024-05-23 16:28:31 Downloading - Downloading the training image...
2024-05-23 16:29:11 Training - Training image download completed. Training in progress..[34m[2024-05-23 16:29:19.535 ip-10-0-170-71.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2024-05-23 16:29:19.557 ip-10-0-170-71.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2024-05-23:16:29:19:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2024-05-23:16:29:19:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34m[2024-05-23:16:29:19:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2024-05-23:16:29:19:INFO] Running XGBoost Sagemaker in algorithm mode[0m
[34m[20

---

## Batch Transform

In SageMaker Batch Transform, we introduced 3 new attributes - __input_filter__, __join_source__ and __output_filter__. In the below cell, we use the [SageMaker Python SDK](https://github.com/aws/sagemaker-python-sdk) to kick-off several Batch Transform jobs using different configurations of these 3 new attributes. Please refer to [this page](https://docs.aws.amazon.com/sagemaker/latest/dg/batch-transform-data-processing.html) to learn more about how to use them.




#### 1. Create a transform job with the default configurations
Let's first skip these 3 new attributes and inspect the inference results. We'll use it as a baseline to compare to the results with data processing.

Let's inspect the output of the Batch Transform job in S3. It should show the list probabilities of tumors being malignant.

In [12]:
import re


def get_csv_output_from_s3(s3uri, batch_file):
    file_name = "{}.out".format(batch_file)
    match = re.match("s3://([^/]+)/(.*)", "{}/{}".format(s3uri, file_name))
    output_bucket, output_prefix = match.group(1), match.group(2)
    s3.download_file(output_bucket, output_prefix, file_name)
    return pd.read_csv(file_name, sep=",", header=None)

#### 1. Predict Results

* Set __input_filter__ to "$[1:]": indicates that we are excluding column 0 (the 'ID') before processing the inferences and keeping everything from column 1 to the last column (all the features or predictors)  
    
* Set __join_source__ to "Input": indicates our desire to join the input data with the inference results  

* Leave __output_filter__ to default ('$'), indicating that the joined input and inference results be will saved as output.

In [13]:
# content_type / accept and split_type / assemble_with are required to use IO joining feature

sm_transformer = sm_estimator.transformer(1, "ml.m4.xlarge")

sm_transformer.assemble_with = "Line"
sm_transformer.accept = "text/csv"

# start a transform job
input_location = "s3://{}/{}/batch/{}".format(
    bucket, prefix, batch_file
)  # use input data with ID column cause InputFilter will filter it out
sm_transformer.transform(
    input_location,
    split_type="Line",
    content_type="text/csv",
    input_filter="$[1:]",
    join_source="Input",
)
sm_transformer.wait()

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2024-05-23-16-30-02-381
INFO:sagemaker:Creating transform job with name: sagemaker-xgboost-2024-05-23-16-30-03-134


...........................................[34m[2024-05-23:16:37:14:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2024-05-23:16:37:14:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2024-05-23:16:37:14:INFO] nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;[0m
[34mworker_rlimit_nofile 4096;[0m
[34mevents {
  worker_connections 2048;[0m
[34m}[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;
  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }
  server {
    listen 8080 deferred;
    client_max_body_size 0;
    keepalive_timeout 3;
    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redirect off;
      proxy_read_timeout 60s;
      proxy_pass http://gunicorn;
    

Let's inspect the output of the Batch Transform job in S3. It should show the list of tumors identified by their original feature columns and their corresponding probabilities of being malignant.

In [14]:
output_df = get_csv_output_from_s3(sm_transformer.output_path, batch_file)
output_df.head(8)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,849014,19.81,22.15,130.0,1260.0,0.09831,0.1027,0.1479,0.09498,0.1582,...,30.88,186.8,2398.0,0.1512,0.315,0.5372,0.2388,0.2768,0.07615,0.991865
1,852631,17.14,16.4,116.0,912.7,0.1186,0.2276,0.2229,0.1401,0.304,...,21.4,152.4,1461.0,0.1545,0.3949,0.3853,0.255,0.4066,0.1059,0.961398
2,852973,15.3,25.27,102.4,732.4,0.1082,0.1697,0.1683,0.08751,0.1926,...,36.71,149.3,1269.0,0.1641,0.611,0.6335,0.2024,0.4027,0.09876,0.994575
3,854039,16.13,17.88,107.0,807.2,0.104,0.1559,0.1354,0.07752,0.1998,...,27.26,132.7,1261.0,0.1446,0.5804,0.5274,0.1864,0.427,0.1233,0.990722
4,855133,14.99,25.2,95.54,698.8,0.09387,0.05131,0.02398,0.02899,0.1565,...,25.2,95.54,698.8,0.09387,0.05131,0.02398,0.02899,0.1565,0.05504,0.022444
5,855138,13.48,20.82,88.4,559.2,0.1016,0.1255,0.1063,0.05439,0.172,...,26.02,107.3,740.4,0.161,0.4225,0.503,0.2258,0.2807,0.1071,0.831802
6,856106,13.28,20.28,87.32,545.2,0.1041,0.1436,0.09847,0.06158,0.1974,...,28.0,113.1,907.2,0.153,0.3724,0.3664,0.1492,0.3739,0.1027,0.96094
7,857343,11.76,21.6,74.72,427.9,0.08637,0.04966,0.01657,0.01115,0.1495,...,25.72,82.98,516.5,0.1085,0.08615,0.05523,0.03715,0.2433,0.06563,0.01111


create_model(role=role, image_uri=XGBOOST_IMAGE)In summary, we can use newly introduced 3 attributes - __input_filter__, __join_source__, __output_filter__ to 
1. Filter / select useful features from the input dataset. e.g. exclude ID columns.
2. Associate the prediction results with their corresponding input records.
3. Filter the original or joined results before saving to S3. e.g. keep ID and probability columns only.

## Upload the Sagemaker Model created during our training job to the Sagemaker Model Registry

In [15]:
sagemaker = boto3.client("sagemaker")

model_name = job_name
#model_name = "sagemaker-xgboost-2023-08-22-05-28-37-903"
print(model_name)


info = sagemaker.describe_training_job(TrainingJobName=model_name)
model_data = info["ModelArtifacts"]["S3ModelArtifacts"]

primary_container = {"Image": image, "ModelDataUrl": model_data}

# Save our model to the Sagemaker Model Registry
create_model_response = sagemaker.create_model(
    ModelName=model_name, ExecutionRoleArn=role, PrimaryContainer=primary_container
)

print(create_model_response["ModelArn"])

xgb-2024-05-23-16-27-19
arn:aws:sagemaker:us-east-1:652903355321:model/xgb-2024-05-23-16-27-19


In [16]:
model_data

's3://sagemaker-us-east-1-652903355321/DEMO-breast-cancer-prediction-xgboost-highlevel/output/xgb-2024-05-23-16-27-19/xgb-2024-05-23-16-27-19/output/model.tar.gz'

In [17]:
# Inspect Training Job Details
info

{'TrainingJobName': 'xgb-2024-05-23-16-27-19',
 'TrainingJobArn': 'arn:aws:sagemaker:us-east-1:652903355321:training-job/xgb-2024-05-23-16-27-19',
 'ModelArtifacts': {'S3ModelArtifacts': 's3://sagemaker-us-east-1-652903355321/DEMO-breast-cancer-prediction-xgboost-highlevel/output/xgb-2024-05-23-16-27-19/xgb-2024-05-23-16-27-19/output/model.tar.gz'},
 'TrainingJobStatus': 'Completed',
 'SecondaryStatus': 'Completed',
 'HyperParameters': {'eta': '0.2',
  'gamma': '4',
  'max_depth': '5',
  'min_child_weight': '6',
  'num_round': '100',
  'objective': 'binary:logistic',
  'subsample': '0.8',
  'verbosity': '0'},
 'AlgorithmSpecification': {'TrainingImage': '683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.7-1',
  'TrainingInputMode': 'File',
  'MetricDefinitions': [{'Name': 'train:mae',
    'Regex': '.*\\[[0-9]+\\].*#011train-mae:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'},
   {'Name': 'validation:aucpr',
    'Regex': '.*\\[[0-9]+\\].*#011validation-aucpr:([-+]?[0-9]*\

# Create Endpoint Configuration


# Create an endpoint config name. Here we create one based on the date  
# so it we can search endpoints based on creation time.


In [18]:
endpoint_config_name = 'lab4-1-endpoint-config' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())                            
                            
instance_type = 'ml.m5.xlarge'

endpoint_config_response = sagemaker.create_endpoint_config(
    EndpointConfigName=endpoint_config_name, # You will specify this name in a CreateEndpoint request.
    # List of ProductionVariant objects, one for each model that you want to host at this endpoint.
    ProductionVariants=[
        {
            "VariantName": "variant1", # The name of the production variant.
            "ModelName": model_name, 
            "InstanceType": instance_type, # Specify the compute instance type.
            "InitialInstanceCount": 1 # Number of instances to launch initially.
        }
    ]
)

print(f"Created EndpointConfig: {endpoint_config_response['EndpointConfigArn']}")

Created EndpointConfig: arn:aws:sagemaker:us-east-1:652903355321:endpoint-config/lab4-1-endpoint-config2024-05-23-16-38-09


In [19]:
# Deploy our model to real-time endpoint

endpoint_name = 'lab4-1-endpoint' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())                            


create_endpoint_response = sagemaker.create_endpoint(
                                            EndpointName=endpoint_name, 
                                            EndpointConfigName=endpoint_config_name) 

In [20]:
create_endpoint_response

{'EndpointArn': 'arn:aws:sagemaker:us-east-1:652903355321:endpoint/lab4-1-endpoint2024-05-23-16-38-09',
 'ResponseMetadata': {'RequestId': '8069ad70-64b2-48f5-89de-b380f2bdac7a',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '8069ad70-64b2-48f5-89de-b380f2bdac7a',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '102',
   'date': 'Thu, 23 May 2024 16:38:09 GMT'},
  'RetryAttempts': 0}}

In [22]:
# Wait for endpoint to spin up
import time

sagemaker.describe_endpoint(EndpointName=endpoint_name)

while True:
    print("Getting Job Status")
    res = sagemaker.describe_endpoint(EndpointName=endpoint_name)
    state = res["EndpointStatus"]
    
    if state == "InService":
        print("Endpoint in Service")
        break
    elif state == "Creating":
        print("Endpoint still creating...")
        time.sleep(60)
    else:
        print("Endpoint Creation Error - Check Sagemaker Console")
        break

Getting Job Status
Endpoint still creating...
Getting Job Status
Endpoint still creating...
Getting Job Status
Endpoint in Service


In [23]:
# Invoke Endpoint

sagemaker_runtime = boto3.client("sagemaker-runtime", region_name=region)

response = sagemaker_runtime.invoke_endpoint(
                            EndpointName=endpoint_name,
                            ContentType='text/csv',
                            Body=data_batch_noID.to_csv(header=None, index=False).strip('\n').split('\n')[0]
                            )
print(response['Body'].read().decode('utf-8'))

0.9918649792671204



## Part 1: Set Up Model Group
## The Model Group will contain a group of versioned models. Every time we update the model algorithm, input data, features, model hyperparameters, etc. we will add a new model package to this group.

In [24]:
import time
import os
from sagemaker import get_execution_role, session
import boto3
import time

#region = boto3.Session().region_name

#role = get_execution_role()

sm_client = boto3.client('sagemaker', region_name=region)


model_package_group_name = "breast-cancer-prediction-" + str(round(time.time()))
model_package_group_input_dict = {
 "ModelPackageGroupName" : model_package_group_name,
 "ModelPackageGroupDescription" : "This model package group contains models for breast cancer prediction"
}

create_model_package_group_response = sm_client.create_model_package_group(**model_package_group_input_dict)
print('ModelPackageGroup Arn : {}'.format(create_model_package_group_response['ModelPackageGroupArn']))

ModelPackageGroup Arn : arn:aws:sagemaker:us-east-1:652903355321:model-package-group/breast-cancer-prediction-1716482482


In [25]:
response = sm_client.describe_model_package_group(
    ModelPackageGroupName=model_package_group_name
)

In [26]:
# Examine Response Body

response

{'ModelPackageGroupName': 'breast-cancer-prediction-1716482482',
 'ModelPackageGroupArn': 'arn:aws:sagemaker:us-east-1:652903355321:model-package-group/breast-cancer-prediction-1716482482',
 'ModelPackageGroupDescription': 'This model package group contains models for breast cancer prediction',
 'CreationTime': datetime.datetime(2024, 5, 23, 16, 41, 22, 149000, tzinfo=tzlocal()),
 'CreatedBy': {'UserProfileArn': 'arn:aws:sagemaker:us-east-1:652903355321:user-profile/d-mxzxda9simmz/pkhanna',
  'UserProfileName': 'pkhanna',
  'DomainId': 'd-mxzxda9simmz',
  'IamIdentity': {'Arn': 'arn:aws:sts::652903355321:assumed-role/LabRole/SageMaker',
   'PrincipalId': 'AROAZQBASSO4QOOKXIC4C:SageMaker'}},
 'ModelPackageGroupStatus': 'Completed',
 'ResponseMetadata': {'RequestId': 'b1302070-c96b-4174-9a99-5abd51021432',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'b1302070-c96b-4174-9a99-5abd51021432',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '653',
   '

## Part 2: Set Up Model Package
## The Model Package will contain specific details about our current model. Our Model Package should document model deployment information (instance image, model data source i.e. our binary artifact, data source, any pre-processor or post-processor scripts, etc.). 

In [27]:
model_data

's3://sagemaker-us-east-1-652903355321/DEMO-breast-cancer-prediction-xgboost-highlevel/output/xgb-2024-05-23-16-27-19/xgb-2024-05-23-16-27-19/output/model.tar.gz'

In [28]:
model_url = model_data

modelpackage_inference_specification =  {
    "InferenceSpecification": {
      "Containers": [
         {
            "Image": image,
	    "ModelDataUrl": model_url
         }
      ],
      "SupportedContentTypes": [ "text/csv" ],
      "SupportedResponseMIMETypes": [ "text/csv" ],
   }
 }

# Alternatively, you can specify the model source like this:
# modelpackage_inference_specification["InferenceSpecification"]["Containers"][0]["ModelDataUrl"]=model_url

create_model_package_input_dict = {
    "ModelPackageGroupName" : model_package_group_name,
    "ModelPackageDescription" : "Xgboost model to predict breast cancer",
    "ModelApprovalStatus" : "PendingManualApproval"
}
create_model_package_input_dict.update(modelpackage_inference_specification)

In [29]:
create_model_package_input_dict

{'ModelPackageGroupName': 'breast-cancer-prediction-1716482482',
 'ModelPackageDescription': 'Xgboost model to predict breast cancer',
 'ModelApprovalStatus': 'PendingManualApproval',
 'InferenceSpecification': {'Containers': [{'Image': '683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.7-1',
    'ModelDataUrl': 's3://sagemaker-us-east-1-652903355321/DEMO-breast-cancer-prediction-xgboost-highlevel/output/xgb-2024-05-23-16-27-19/xgb-2024-05-23-16-27-19/output/model.tar.gz'}],
  'SupportedContentTypes': ['text/csv'],
  'SupportedResponseMIMETypes': ['text/csv']}}

In [30]:
create_model_package_response = sm_client.create_model_package(**create_model_package_input_dict)
model_package_arn = create_model_package_response["ModelPackageArn"]
print('ModelPackage Version ARN : {}'.format(model_package_arn))

ModelPackage Version ARN : arn:aws:sagemaker:us-east-1:652903355321:model-package/breast-cancer-prediction-1716482482/1


In [31]:
create_model_package_response

{'ModelPackageArn': 'arn:aws:sagemaker:us-east-1:652903355321:model-package/breast-cancer-prediction-1716482482/1',
 'ResponseMetadata': {'RequestId': '6cee0299-8662-439a-a63b-84383aa9e312',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '6cee0299-8662-439a-a63b-84383aa9e312',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '114',
   'date': 'Thu, 23 May 2024 16:41:22 GMT'},
  'RetryAttempts': 0}}

In [32]:
response = sm_client.describe_model_package(
    ModelPackageName=model_package_arn
)

In [33]:
response

{'ModelPackageGroupName': 'breast-cancer-prediction-1716482482',
 'ModelPackageVersion': 1,
 'ModelPackageArn': 'arn:aws:sagemaker:us-east-1:652903355321:model-package/breast-cancer-prediction-1716482482/1',
 'ModelPackageDescription': 'Xgboost model to predict breast cancer',
 'CreationTime': datetime.datetime(2024, 5, 23, 16, 41, 22, 920000, tzinfo=tzlocal()),
 'InferenceSpecification': {'Containers': [{'Image': '683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.7-1',
    'ImageDigest': 'sha256:cf81520a3b695293022793e292cf8bc3732b79231a6ebe1fb308086f6163a875',
    'ModelDataUrl': 's3://sagemaker-us-east-1-652903355321/DEMO-breast-cancer-prediction-xgboost-highlevel/output/xgb-2024-05-23-16-27-19/xgb-2024-05-23-16-27-19/output/model.tar.gz'}],
  'SupportedContentTypes': ['text/csv'],
  'SupportedResponseMIMETypes': ['text/csv']},
 'ModelPackageStatus': 'Completed',
 'ModelPackageStatusDetails': {'ValidationStatuses': [],
  'ImageScanStatuses': []},
 'CertifyForMarketpla

## Part 3: Write the Model Card
## Finally we have our Model Card. The Model Card will contain qualitative details about our current model. The Model Card can contain a lot of information. At a minimum, it should contain details of what the model algorithm is, how the model was trained, what hyperparameters were used to train the model, what the input features for the model are, who the model owner is (you), what problem the model is trying to solve, intended uses of the model, evaluation details of the model, and so on.

In [35]:
#Create a model card for your model in the SageMaker Model Registry
from sagemaker.model_card import (
    ModelCard,
    ModelPackage,
    ModelCardStatusEnum,
)
sagemaker_session = sess
mp_details = ModelPackage.from_model_package_arn(
    model_package_arn=model_package_arn,   
)

model_card_name = "breast-cancer-prediction-model-card"
my_card = ModelCard(
    name=model_card_name,
    status=ModelCardStatusEnum.APPROVED,
    model_package_details=mp_details,
    sagemaker_session=sagemaker_session,
)
my_card.create()

INFO:sagemaker.model_card.model_card:Evaluation details auto-discovery was unsuccessful. ModelMetrics was not found in the given model package. Please create one from scratch with EvaluationJob.
INFO:sagemaker.model_card.model_card:Creating model card with name: breast-cancer-prediction-model-card


'arn:aws:sagemaker:us-east-1:652903355321:model-card/breast-cancer-prediction-model-card'

In [36]:
response = sm_client.describe_model_card(
    ModelCardName=model_card_name,
    #ModelCardVersion=123
)

In [37]:
response

{'ModelCardArn': 'arn:aws:sagemaker:us-east-1:652903355321:model-card/breast-cancer-prediction-model-card',
 'ModelCardName': 'breast-cancer-prediction-model-card',
 'ModelCardVersion': 1,
 'Content': '{"training_details": {"training_job_details": {"training_arn": "arn:aws:sagemaker:us-east-1:652903355321:training-job/xgb-2024-05-23-16-27-19", "training_datasets": [], "training_environment": {"container_image": ["683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.7-1"]}, "training_metrics": [{"name": "validation:logloss", "value": 0.07601000368595123}, {"name": "train:logloss", "value": 0.07249999791383743}], "user_provided_training_metrics": [], "hyper_parameters": [{"name": "eta", "value": "0.2"}, {"name": "gamma", "value": "4"}, {"name": "max_depth", "value": "5"}, {"name": "min_child_weight", "value": "6"}, {"name": "num_round", "value": "100"}, {"name": "objective", "value": "binary:logistic"}, {"name": "subsample", "value": "0.8"}, {"name": "verbosity", "value": "0"

In [38]:
# Delete Endpoint
#endpoint_name

sagemaker.delete_endpoint(EndpointName=endpoint_name)


{'ResponseMetadata': {'RequestId': '7bb3d06c-bada-4b46-a309-bbffaeaeba31',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '7bb3d06c-bada-4b46-a309-bbffaeaeba31',
   'content-type': 'application/x-amz-json-1.1',
   'date': 'Thu, 23 May 2024 16:42:58 GMT',
   'content-length': '0'},
  'RetryAttempts': 0}}