In [25]:
import sys

In [64]:
import xgboost

In [26]:
# !{sys.executable} -m pip install sagemaker-experiments
# !{sys.executable} -m pip install "sagemaker-studio-image-build"

## Prerequistes

In [54]:
from sagemaker import get_execution_role

sagemaker_exeuctionRole = get_execution_role()

print(
    f"""{{
    "Version": "2012-10-17",
    "Statement": [
        {{
            "Effect": "Allow",
            "Action": [
                "codebuild:DeleteProject",
                "codebuild:CreateProject",
                "codebuild:BatchGetBuilds",
                "codebuild:StartBuild"
            ],
            "Resource": "arn:aws:codebuild:*:*:project/sagemaker-studio*"
        }},
        {{
            "Effect": "Allow",
            "Action": "logs:CreateLogStream",
            "Resource": "arn:aws:logs:*:*:log-group:/aws/codebuild/sagemaker-studio*"
        }},
        {{
            "Effect": "Allow",
            "Action": [
                "logs:GetLogEvents",
                "logs:PutLogEvents"
            ],
            "Resource": "arn:aws:logs:*:*:log-group:/aws/codebuild/sagemaker-studio*:log-stream:*"
        }},
        {{
            "Effect": "Allow",
            "Action": "logs:CreateLogGroup",
            "Resource": "*"
        }},
        {{
            "Effect": "Allow",
            "Action": [
                "ecr:CreateRepository",
                "ecr:BatchGetImage",
                "ecr:CompleteLayerUpload",
                "ecr:DescribeImages",
                "ecr:DescribeRepositories",
                "ecr:UploadLayerPart",
                "ecr:ListImages",
                "ecr:InitiateLayerUpload",
                "ecr:BatchCheckLayerAvailability",
                "ecr:PutImage"
            ],
            "Resource": "arn:aws:ecr:*:*:repository/sagemaker-studio*"
        }},
        {{
            "Effect": "Allow",
            "Action": "ecr:GetAuthorizationToken",
            "Resource": "*"
        }},
        {{
            "Effect": "Allow",
            "Action": [
              "s3:GetObject",
              "s3:DeleteObject",
              "s3:PutObject"
              ],
            "Resource": "arn:aws:s3:::sagemaker-*/*"
        }},
        {{
            "Effect": "Allow",
            "Action": [
                "s3:CreateBucket"
            ],
            "Resource": "arn:aws:s3:::sagemaker*"
        }},
        {{
            "Effect": "Allow",
            "Action": [
                "iam:GetRole",
                "iam:ListRoles"
            ],
            "Resource": "*"
        }},
        {{
            "Effect": "Allow",
            "Action": "iam:PassRole",
            "Resource": "{sagemaker_exeuctionRole}",
            "Condition": {{
                "StringLikeIfExists": {{
                    "iam:PassedToService": "codebuild.amazonaws.com"
                }}
            }}
        }}
    ]
}}"""
)

{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": [
                "codebuild:DeleteProject",
                "codebuild:CreateProject",
                "codebuild:BatchGetBuilds",
                "codebuild:StartBuild"
            ],
            "Resource": "arn:aws:codebuild:*:*:project/sagemaker-studio*"
        },
        {
            "Effect": "Allow",
            "Action": "logs:CreateLogStream",
            "Resource": "arn:aws:logs:*:*:log-group:/aws/codebuild/sagemaker-studio*"
        },
        {
            "Effect": "Allow",
            "Action": [
                "logs:GetLogEvents",
                "logs:PutLogEvents"
            ],
            "Resource": "arn:aws:logs:*:*:log-group:/aws/codebuild/sagemaker-studio*:log-stream:*"
        },
        {
            "Effect": "Allow",
            "Action": "logs:CreateLogGroup",
            "Resource": "*"
        },
        {
            "Effect": "Allow",
   

In [28]:
role

'arn:aws:iam::136605741915:role/SageMakerExeutionRole_Custom'

In [29]:
import boto3
import sagemaker

In [30]:
sm_sess = sagemaker.Session()
region = boto3.session.Session().region_name

# Boto clients provide a low-level interface to the AWS services
sageM = boto3.Session().client("sagemaker")

In [31]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from time import sleep, gmtime, strftime
import json
import time

In [32]:
# Import SageMaker Experiments
from sagemaker.analytics import ExperimentAnalytics
from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent
from smexperiments.tracker import Tracker

## S3 buckets and prefixes

In [33]:
rawbucket = sm_sess.default_bucket()

prefix = "sagemaker-modelmonitor"  # use this prefix to store all files pertaining to this workshop.

dataprefix = prefix + "/data"
traindataprefix = prefix + "/train_data"
testdataprefix = prefix + "/test_data"
testdatanolabelprefix = prefix + "/test_data_no_label"
trainheaderprefix = prefix + "/train_headers"

## Read raw data

In [128]:
data = pd.read_excel("default of credit card clients.xls", header=1)
data = data.drop(columns=["ID"])
data.rename(columns={"default payment next month": "Label"}, inplace=True)
lbl = data.Label
data = pd.concat([lbl, data.drop(columns=["Label"])], axis=1)

In [129]:
COLS = data.columns

In [35]:
if not os.path.exists('rawdata/rawdata.csv'):
    !mkdir rawdata
    data.to_csv('rawdata/rawdata.csv', index=None)
else:
    pass

In [36]:
# Upload the raw dataset
raw_data_location = sm_sess.upload_data("rawdata", bucket=rawbucket, key_prefix=dataprefix)
print(raw_data_location)

s3://sagemaker-us-east-2-136605741915/sagemaker-modelmonitor/data


## SageMaker Preprocessing

### Runs a processing job using SKLearnProcessor class from the the SageMaker Python SDK

In [40]:
# Copy the preprocessing code over to the s3 bucket
codeprefix = prefix + "/code"
codeupload = sm_sess.upload_data("preprocessing.py", bucket=rawbucket, key_prefix=codeprefix)
print(codeupload)

s3://sagemaker-us-east-2-136605741915/sagemaker-modelmonitor/code/preprocessing.py


In [138]:
train_data_location = rawbucket + "/" + traindataprefix
test_data_location = rawbucket + "/" + testdataprefix
print("Training data location = {}".format(train_data_location))
print("Test data location = {}".format(test_data_location))

Training data location = sagemaker-us-east-2-136605741915/sagemaker-modelmonitor/train_data
Test data location = sagemaker-us-east-2-136605741915/sagemaker-modelmonitor/test_data


In [139]:
## Use SageMaker Processing with Sk Learn. -- combine data into train and test at this stage if possible.
from sagemaker.sklearn.processing import SKLearnProcessor

sklearn_processor = SKLearnProcessor(framework_version="0.20.0", role=role, instance_type="ml.c4.xlarge", instance_count=1)

INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Defaulting to only available Python version: py3


In [140]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

sklearn_processor.run(
    code=codeupload,
    inputs=[ProcessingInput(source=raw_data_location, destination="/opt/ml/processing/input")],
    outputs=[
        ProcessingOutput(
            output_name="train_data",
            source="/opt/ml/processing/train",
            destination="s3://" + train_data_location,
        ),
        ProcessingOutput(
            output_name="test_data",
            source="/opt/ml/processing/test",
            destination="s3://" + test_data_location,
        ),
        ProcessingOutput(
            output_name="train_data_headers",
            source="/opt/ml/processing/train_headers",
            destination="s3://" + rawbucket + "/" + prefix + "/train_headers",
        ),
    ],
    arguments=["--train-test-split-ratio", "0.2"],
)

preprocessing_job_description = sklearn_processor.jobs[-1].describe()


output_config = preprocessing_job_description["ProcessingOutputConfig"]
for output in output_config["Outputs"]:
    if output["OutputName"] == "train_data":
        preprocessed_training_data = output["S3Output"]["S3Uri"]
    if output["OutputName"] == "test_data":
        preprocessed_test_data = output["S3Output"]["S3Uri"]

INFO:sagemaker:Creating processing-job with name sagemaker-scikit-learn-2022-02-13-03-54-48-111



Job Name:  sagemaker-scikit-learn-2022-02-13-03-54-48-111
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-2-136605741915/sagemaker-modelmonitor/data', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-2-136605741915/sagemaker-modelmonitor/code/preprocessing.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'train_data', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-us-east-2-136605741915/sagemaker-modelmonitor/train_data', 'LocalPath': '/opt/ml/processing/train', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'test_data', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sa

# Part 2: Building the container and training the model

## Image build 

In [156]:
!sm-docker build .

...[Container] 2022/02/13 04:12:33 Waiting for agent ping

[Container] 2022/02/13 04:12:34 Waiting for DOWNLOAD_SOURCE
[Container] 2022/02/13 04:12:37 Phase is DOWNLOAD_SOURCE
[Container] 2022/02/13 04:12:37 CODEBUILD_SRC_DIR=/codebuild/output/src014662302/src
[Container] 2022/02/13 04:12:37 YAML location is /codebuild/output/src014662302/src/buildspec.yml
[Container] 2022/02/13 04:12:37 Setting HTTP client timeout to higher timeout for S3 source
[Container] 2022/02/13 04:12:37 Processing environment variables
[Container] 2022/02/13 04:12:37 No runtime version selected in buildspec.
[Container] 2022/02/13 04:12:37 Moving to directory /codebuild/output/src014662302/src
[Container] 2022/02/13 04:12:37 Configuring ssm agent with target id: codebuild:82476483-8606-47ed-a521-831cf37c09c6
[Container] 2022/02/13 04:12:37 Successfully updated ssm agent configuration
[Container] 2022/02/13 04:12:37 Registering with agent
[Container] 2022/02/13 04:12:37 Phases found in YAML: 3
[Container] 2022/0

## SageMaker Experiment

In [142]:
# Create a SageMaker Experiment

my_experiment = Experiment.create(experiment_name=f"CreditCardDefault-{int(time.time())}",
                                 description = "Predict credit card default from payments data",
                                 sagemaker_boto_client=sageM)

In [143]:
print(my_experiment)

Experiment(sagemaker_boto_client=<botocore.client.SageMaker object at 0x7fb9289660d0>,experiment_name='CreditCardDefault-1644724918',description='Predict credit card default from payments data',tags=None,experiment_arn='arn:aws:sagemaker:us-east-2:136605741915:experiment/creditcarddefault-1644724918',response_metadata={'RequestId': '56eba809-43a5-4a17-a7df-5a3ed4fd395e', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '56eba809-43a5-4a17-a7df-5a3ed4fd395e', 'content-type': 'application/x-amz-json-1.1', 'content-length': '100', 'date': 'Sun, 13 Feb 2022 04:01:58 GMT'}, 'RetryAttempts': 0})


In [144]:
# Start Tracking parameters used in the Pre-processing pipeline.
with Tracker.create(display_name="Preprocessing", sagemaker_boto_client=sageM) as tracker:
    tracker.log_parameters({"train_test_split_ratio": 0.2, "random_state": 0})
    
    # we can log the s3 uri to the dataset we just uploaded
    tracker.log_input(name="ccdefault-raw-dataset", media_type="s3/uri", value=raw_data_location)
    tracker.log_input(name="ccdefault-train-dataset", media_type="s3/uri", value=train_data_location)
    tracker.log_input(name="ccdefault-test-dataset", media_type="s3/uri", value=test_data_location)

In [145]:
account = sm_sess.boto_session.client("sts").get_caller_identity()["Account"]
ecr = boto3.client("ecr")
domain_id = "sagemaker-studio-{}".format(sageM.list_apps()["Apps"][0]["DomainId"])
image_tag = ecr.list_images(repositoryName=domain_id, filter={"tagStatus": "TAGGED"})["imageIds"][0]["imageTag"]

In [146]:
ecr.list_images(repositoryName=domain_id, filter={"tagStatus": "TAGGED"})["imageIds"]

[{'imageDigest': 'sha256:af5fcf446b5c79efdc67a71f20e9b4275d3c407664d4c6c76517d48ff9766b10',
  'imageTag': 'sagemaker-mlops-user'}]

In [147]:
"sagemaker-studio-{}".format(sageM.list_apps()["Apps"][0]["DomainId"])

'sagemaker-studio-d-c3dw70cqdizn'

In [148]:
python_training_inference_image = "{}.dkr.ecr.{}.amazonaws.com/{}:{}".format(account, region, domain_id, image_tag)
python_training_inference_image

'136605741915.dkr.ecr.us-east-2.amazonaws.com/sagemaker-studio-d-c3dw70cqdizn:sagemaker-mlops-user'

In [149]:
tracker.trial_component

TrialComponent(sagemaker_boto_client=<botocore.client.SageMaker object at 0x7fb9289660d0>,trial_component_name='TrialComponent-2022-02-13-040158-cxpz',display_name='Preprocessing',tags=None,trial_component_arn='arn:aws:sagemaker:us-east-2:136605741915:experiment-trial-component/trialcomponent-2022-02-13-040158-cxpz',response_metadata={'RequestId': 'dc97f09f-35a7-4345-b3bd-c696fb4d215f', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'dc97f09f-35a7-4345-b3bd-c696fb4d215f', 'content-type': 'application/x-amz-json-1.1', 'content-length': '129', 'date': 'Sun, 13 Feb 2022 04:01:58 GMT'}, 'RetryAttempts': 0},parameters={'train_test_split_ratio': 0.2, 'random_state': 0},input_artifacts={'ccdefault-raw-dataset': TrialComponentArtifact(value='s3://sagemaker-us-east-2-136605741915/sagemaker-modelmonitor/data',media_type='s3/uri'), 'ccdefault-train-dataset': TrialComponentArtifact(value='sagemaker-us-east-2-136605741915/sagemaker-modelmonitor/train_data',media_type='s3/uri'), 'ccdefau

In [150]:
preprocessing_trial_component = tracker.trial_component

trial_name = f"cc-fraud-training-job-{int(time.time())}"

cc_trial = Trial.create(trial_name=trial_name, experiment_name=my_experiment.experiment_name, sagemaker_boto_client=sageM)

cc_trial.add_trial_component(preprocessing_trial_component)
cc_training_job_name = "cc-training-job-{}".format(int(time.time()))

In [151]:
xgbEstimator = sagemaker.estimator.Estimator(
    python_training_inference_image,
    sagemaker_exeuctionRole,
    instance_count=1,
    instance_type = "ml.m4.xlarge",
    max_run=86400,
    output_path="s3://{}/{}/models".format(rawbucket, prefix),
    sagemaker_sess=sm_sess)

In [152]:
xgbHyperparameters = xgbEstimator.set_hyperparameters(
    max_depth = 5,
    eta = 0.2,
    gamma = 4,
    min_child_weight=6,
    subsample=0.8,
    verbosity=0,
    objective="binary:logistic",
    num_round=100)

In [153]:
xgbEstimator.fit(
    inputs={"training": "s3://" + train_data_location},
    job_name=cc_training_job_name,
    experiment_config={
        "TrialName": cc_trial.trial_name,
        "TrialComponentDisplayName": "Training",},
    wait=True,
)
time.sleep(2)

INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: cc-training-job-1644724919


2022-02-13 04:01:59 Starting - Starting the training job...
2022-02-13 04:02:02 Starting - Launching requested ML instancesProfilerReport-1644724919: InProgress
.

KeyboardInterrupt: 

In [None]:
train_data_location

In [None]:
# Present the Model Lineage as a dataframe
from sagemaker.session import Session

session = boto3.Session()
lineage_table = ExperimentAnalytics(
    sagemaker_session=Session(session, sageM),
    search_expression={
        "Filters": [{"Name": "Parents.TrialName", "Operator": "Equals", "Value": trial_name}]
    },
    sort_by="CreationTime",
    sort_order="Ascending",
)
lineagedf = lineage_table.dataframe()

lineagedf

In [None]:
# get detailed information about a particular trial
sageM.describe_trial_component(TrialComponentName=lineagedf.TrialComponentName[1])

# Inference using the trained model

In [158]:
s3 = boto3.client("s3")

In [159]:
s3.download_file(rawbucket, testdataprefix + "/test_data.csv", "test_data.csv")

In [160]:
newcolorder = (["PAY_AMT1", "BILL_AMT1"] + list(COLS[1:])[:11] + list(COLS[1:])[12:17] + list(COLS[1:])[18:])
test_full = pd.read_csv("test_data.csv", names=["Label"] + newcolorder)
test_full.head()

Unnamed: 0,Label,PAY_AMT1,BILL_AMT1,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,...,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
0,0,-0.341476,0.201175,20000.0,1.0,1.0,2.0,33.0,1.0,2.0,...,17399.0,19057.0,18453.0,19755.0,19288.0,2260.0,0.0,1600.0,0.0,644.0
1,0,-0.136859,0.199594,20000.0,2.0,2.0,2.0,35.0,0.0,0.0,...,19347.0,18600.0,19000.0,19000.0,20000.0,0.0,1000.0,0.0,1000.0,0.0
2,0,-0.284364,0.185736,230000.0,2.0,1.0,1.0,44.0,1.0,-1.0,...,949.0,2864.0,933.0,0.0,0.0,2873.0,933.0,0.0,0.0,0.0
3,0,-0.040569,0.28936,100000.0,1.0,2.0,1.0,42.0,0.0,0.0,...,99998.0,16138.0,17758.0,18774.0,20272.0,2000.0,2000.0,2000.0,2000.0,2000.0
4,0,0.079132,0.186502,150000.0,1.0,1.0,2.0,29.0,-2.0,-2.0,...,6917.0,831.0,6469.0,5138.0,7810.0,833.0,6488.0,5153.0,7833.0,7130.0


In [161]:
test_data_no_label = test_full.drop(columns=["Label"], axis=1)
label = test_full["Label"]
test_data_no_label.to_csv("test_data_no_label.csv", index=False, header=False)
test_data_no_label.shape

(6000, 23)

In [162]:
test_data_nohead_location = sm_sess.upload_data("test_data_no_label.csv", bucket=rawbucket, key_prefix=testdatanolabelprefix)

In [163]:
%%time

sm_transformer = xgbEstimator.transformer(1, "ml.m5.xlarge", accept="text/csv")

# start a transform job
sm_transformer.transform(test_data_nohead_location, split_type="Line", content_type="text/csv")
sm_transformer.wait()

INFO:sagemaker:Creating model with name: sagemaker-studio-d-c3dw70cqdizn-2022-02-13-04-15-18-637
INFO:sagemaker:Creating transform job with name: sagemaker-studio-d-c3dw70cqdizn-2022-02-13-04-15-18-924


.......................[34mStarting the inference server with 4 workers.[0m
[34m[2022-02-13 04:18:53 +0000] [11] [INFO] Starting gunicorn 20.1.0[0m
[34m[2022-02-13 04:18:53 +0000] [11] [INFO] Listening at: unix:/tmp/gunicorn.sock (11)[0m
[34m[2022-02-13 04:18:53 +0000] [11] [INFO] Using worker: gevent[0m
[34m[2022-02-13 04:18:53 +0000] [15] [INFO] Booting worker with pid: 15[0m
[34m[2022-02-13 04:18:53 +0000] [16] [INFO] Booting worker with pid: 16[0m
[34m[2022-02-13 04:18:53 +0000] [17] [INFO] Booting worker with pid: 17[0m
[34m[2022-02-13 04:18:53 +0000] [18] [INFO] Booting worker with pid: 18[0m
[35mStarting the inference server with 4 workers.[0m
[35m[2022-02-13 04:18:53 +0000] [11] [INFO] Starting gunicorn 20.1.0[0m
[35m[2022-02-13 04:18:53 +0000] [11] [INFO] Listening at: unix:/tmp/gunicorn.sock (11)[0m
[35m[2022-02-13 04:18:53 +0000] [11] [INFO] Using worker: gevent[0m
[35m[2022-02-13 04:18:53 +0000] [15] [INFO] Booting worker with pid: 15[0m
[35m[2022

In [164]:
import json
import io
from urllib.parse import urlparse


def get_csv_output_from_s3(s3uri, file_name):
    parsed_url = urlparse(s3uri)
    bucket_name = parsed_url.netloc
    prefix = parsed_url.path[1:]
    s3 = boto3.resource("s3")
    obj = s3.Object(bucket_name, "{}/{}".format(prefix, file_name))
    return obj.get()["Body"].read().decode("utf-8")

In [165]:
output = get_csv_output_from_s3(sm_transformer.output_path, "test_data_no_label.csv.out")
output_df = pd.read_csv(io.StringIO(output), sep=",", header=None)
output_df.head(8)

Unnamed: 0,0
0,1
1,0
2,0
3,0
4,0
5,0
6,0
7,0


In [166]:
from sklearn.metrics import confusion_matrix, accuracy_score


In [167]:
1 - np.unique(data["Label"], return_counts=True)[1][1] / (len(data["Label"]))

0.7787999999999999

In [168]:
print(
    "Baseline Accuracy = {}".format(
        1 - np.unique(data["Label"], return_counts=True)[1][1] / (len(data["Label"]))
    )
)
print("Accuracy Score = {}".format(accuracy_score(label, output_df)))

Baseline Accuracy = 0.7787999999999999
Accuracy Score = 0.826


In [169]:
output_df["Predicted"] = output_df.values
output_df["Label"] = label
confusion_matrix = pd.crosstab(
    output_df["Predicted"],
    output_df["Label"],
    rownames=["Actual"],
    colnames=["Predicted"],
    margins=True,
)
confusion_matrix

Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,4472,813,5285
1,231,484,715
All,4703,1297,6000
