In [39]:
import yaml
import sagemaker
import boto3
import pandas as pd
import os
import json
import time
from sklearn.model_selection import train_test_split
from time import gmtime, strftime
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.session import production_variant
from sagemaker.sklearn.model import SKLearnModel
from sagemaker import Session


In [41]:
SETTING_FILE_PATH = "../config/settings.yaml"
DATA_FOLDER_PATH = "../avazu-ctr-prediction"

# AWS リソース設定
with open(SETTING_FILE_PATH) as file:
    aws_info = yaml.safe_load(file)
        
sess = sagemaker.Session()


role = aws_info['aws']['sagemaker']['role']
bucket = aws_info['aws']['sagemaker']['s3bucket']
region = aws_info['aws']['sagemaker']['region']

sm = boto3.client('sagemaker')
s3 = boto3.client('s3')


In [42]:
# train, validation, test データを用意
df_train = pd.read_csv(os.path.join(DATA_FOLDER_PATH, "train_partial"), dtype="object")
df_train, df_test = train_test_split(df_train, train_size=0.8, random_state=0, shuffle=True)


In [43]:

# S3にアップロード
prefix = 'model-monitoring'

train_file = "train.csv"
test_file = "test.csv"

df_train.to_csv(train_file, index=False)
df_test.to_csv(test_file, index=False)

s3_resource_bucket = boto3.Session().resource("s3").Bucket(bucket)

s3_resource_bucket.Object(os.path.join(prefix, "train", train_file)).upload_file(train_file)
s3_resource_bucket.Object(os.path.join(prefix, "test", test_file)).upload_file(test_file)


In [44]:
output_location = f"s3://{bucket}/{prefix}/output"

s3_train_data = f"s3://{bucket}/{prefix}/train/{train_file}"
s3_test_data = f"s3://{bucket}/{prefix}/test/{test_file}"

In [329]:
timestamp = strftime("%Y%m%d-%H-%M-%S", gmtime())
job_name = "model-training-job" + timestamp

hyperparameters = {"alpha": 0.00001, "eta0": 2.0}
enable_local_mode_training = False

if enable_local_mode_training:
    train_instance_type = "local"
    inputs = {"train": f"file://{train_file}", "test": f"file://{test_file}"}
else:
    train_instance_type = "ml.m5.large"
    inputs = {"train": s3_train_data, "test": s3_test_data}

estimator_parameters = {
    "entry_point": "trainer.py",
    "source_dir": "model",
    "framework_version": "0.23-1",
    "py_version": "py3",
    "instance_type": train_instance_type,
    "instance_count": 1,
    "hyperparameters": hyperparameters,
    "output_path": output_location,
    "role": role,
    "base_job_name": job_name,
}

model_estimator = SKLearn(**estimator_parameters)
model_estimator.fit(inputs)


2022-11-24 10:38:19 Starting - Starting the training job...
2022-11-24 10:38:41 Starting - Preparing the instances for trainingProfilerReport-1669286298: InProgress
.........
2022-11-24 10:40:20 Downloading - Downloading input data...
2022-11-24 10:40:46 Training - Training image download completed. Training in progress..2022-11-24 10:40:50,944 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2022-11-24 10:40:50,947 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2022-11-24 10:40:50,956 sagemaker_sklearn_container.training INFO     Invoking user training script.
2022-11-24 10:40:51,191 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2022-11-24 10:40:51,203 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2022-11-24 10:40:51,215 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2022-11-24 10:40:51,224 sagemaker-trai

In [349]:
    
timestamp = strftime("%Y%m%d-%H-%M-%S", gmtime())
    
model = SKLearnModel(
    role=role,
    model_data=model_estimator.model_data,
    framework_version="0.23-1",
    py_version="py3",
    source_dir="model",
    entry_point="inference.py",
    sagemaker_session=sess
)

model_name = "{}-{}".format("model", timestamp)


sess.create_model(
    model_name,
    role,
    model.prepare_container_def(
        instance_type='ml.t2.medium'
    )
)

'model-20221124-10-57-38'

In [350]:
from sagemaker.model_monitor import DataCaptureConfig

endpoint_name = 'model-monitor-endopoint' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print("EndpointName={}".format(endpoint_name))

data_capture_prefix = "{}/datacapture".format(prefix)
s3_capture_upload_path = "s3://{}/{}".format(bucket, data_capture_prefix)

# データキャプチャーの設定
data_capture_config = DataCaptureConfig(
                        enable_capture = True,
                        sampling_percentage=50,
                        destination_s3_uri=s3_capture_upload_path,
                        kms_key_id=None,
                        capture_options=["REQUEST"],
                        csv_content_types=["text/csv"],
                        json_content_types=["application/json"]
                        )

predictor = model.deploy(initial_instance_count=1,
                instance_type="ml.t2.medium",
                endpoint_name=endpoint_name,
                data_capture_config=data_capture_config)

EndpointName=model-monitor-endopoint2022-11-24-10-57-49
--------!

In [351]:
from sagemaker.predictor import Predictor
import time

predictor = Predictor(endpoint_name=endpoint_name)

In [389]:
runtime = boto3.Session().client('sagemaker-runtime')
model_list = []
prediction_list = []

with open('test_partial.csv') as f:
    for i, line in enumerate(f):
        response = runtime.invoke_endpoint(EndpointName=endpoint_name, 
                                   ContentType='text/csv', 
                                   Body=line,
                                  Accept='application/json')
        if i == 5000:
            break

In [359]:
response_dict = json.loads(response['Body'].read().decode("utf-8"))
print(json.dumps(response_dict, indent=4))

[
    [
        0.9973229030888725,
        0.0026770969111275357
    ]
]


In [360]:
# fileチェック
s3_client = boto3.Session().client('s3')
current_endpoint_capture_prefix = '{}/{}'.format(data_capture_prefix, endpoint_name)
result = s3_client.list_objects(Bucket=bucket, Prefix=current_endpoint_capture_prefix)
capture_files = [capture_file.get("Key") for capture_file in result.get('Contents')]
print("Found Capture Files:")
print("\n ".join(capture_files))


Found Capture Files:
model-monitoring/datacapture/model-monitor-endopoint2022-11-24-10-57-49/AllTraffic/2022/11/24/11/07-53-227-45de1831-ba8c-4224-9f82-39fc4b79f264.jsonl


In [361]:
def get_obj_body(obj_key):
    return s3_client.get_object(Bucket=bucket, Key=obj_key).get("Body").read().decode("utf-8")


capture_file = get_obj_body(capture_files[-1])
print(capture_file[:2000])

{"captureData":{"endpointInput":{"observedContentType":"text/csv","mode":"INPUT","data":"1.0807444605793397e+19,0,14102405,1005,1,856e6d3f,58a89a43,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,66a9be9c,684581ce,1,0,21981,320,50,2532,0,679,100077,48\n","encoding":"CSV"}},"eventMetadata":{"eventId":"4a8a5ed9-6393-432c-8295-35b272ade59f","inferenceTime":"2022-11-24T11:07:53Z"},"eventVersion":"0"}
{"captureData":{"endpointInput":{"observedContentType":"text/csv","mode":"INPUT","data":"1.1031019631453975e+19,0,14102319,1005,0,85f751fd,c4e18dd6,50e219e0,9c13b419,2347f47a,f95efa07,a99f214a,ffaf9bf0,d787e91b,1,0,20128,320,50,2303,2,39,100188,23\n","encoding":"CSV"}},"eventMetadata":{"eventId":"4a74b9e4-b0da-4e7d-bc4e-c3bc9c3a8403","inferenceTime":"2022-11-24T11:07:53Z"},"eventVersion":"0"}
{"captureData":{"endpointInput":{"observedContentType":"text/csv","mode":"INPUT","data":"1.2982952669462444e+19,0,14102305,1002,0,0adb684a,270e16bc,50e219e0,ecad2386,7801e8d9,07d7df22,8ef2151d,852fa91e,61c1b

In [362]:
import json

print(json.dumps(json.loads(capture_file.split("\n")[0]), indent=2))

{
  "captureData": {
    "endpointInput": {
      "observedContentType": "text/csv",
      "mode": "INPUT",
      "data": "1.0807444605793397e+19,0,14102405,1005,1,856e6d3f,58a89a43,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,66a9be9c,684581ce,1,0,21981,320,50,2532,0,679,100077,48\n",
      "encoding": "CSV"
    }
  },
  "eventMetadata": {
    "eventId": "4a8a5ed9-6393-432c-8295-35b272ade59f",
    "inferenceTime": "2022-11-24T11:07:53Z"
  },
  "eventVersion": "0"
}


In [363]:
from sagemaker.model_monitor import DefaultModelMonitor
from sagemaker.model_monitor.dataset_format import DatasetFormat

my_default_monitor = DefaultModelMonitor(
    role=role,
    instance_count=1,
    instance_type="ml.m5.large",
    volume_size_in_gb=20,
    max_runtime_in_seconds=3600,
)
baseline_results_uri = f"s3://{bucket}/{prefix}/model_monitor/results"
my_default_monitor.suggest_baseline(
    baseline_dataset=s3_train_data,
    dataset_format=DatasetFormat.csv(header=True),
    output_s3_uri=baseline_results_uri,
    wait=True
)


Job Name:  baseline-suggestion-job-2022-11-24-11-12-57-404
Inputs:  [{'InputName': 'baseline_dataset_input', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://ctr-prediction/model-monitoring/train/train.csv', 'LocalPath': '/opt/ml/processing/input/baseline_dataset_input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'monitoring_output', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://ctr-prediction/model-monitoring/model_monitor/results', 'LocalPath': '/opt/ml/processing/output', 'S3UploadMode': 'EndOfJob'}}]
..............................2022-11-24 11:17:46,364 - matplotlib.font_manager - INFO - Generating new fontManager, this may take some time...
2022-11-24 11:17:47.669701: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-

<sagemaker.processing.ProcessingJob at 0x11fbfd1f0>

In [385]:

baseline_job = my_default_monitor.latest_baselining_job
schema_df = pd.json_normalize(baseline_job.baseline_statistics().body_dict["features"])

schema_df.head(10)


Unnamed: 0,name,inferred_type,string_statistics.common.num_present,string_statistics.common.num_missing,string_statistics.distinct_count,numerical_statistics.common.num_present,numerical_statistics.common.num_missing,numerical_statistics.mean,numerical_statistics.sum,numerical_statistics.std_dev,numerical_statistics.min,numerical_statistics.max,numerical_statistics.distribution.kll.buckets,numerical_statistics.distribution.kll.sketch.parameters.c,numerical_statistics.distribution.kll.sketch.parameters.k,numerical_statistics.distribution.kll.sketch.data,string_statistics.distribution.categorical.buckets
0,id,String,323432.0,0.0,329471.0,,,,,,,,,,,,
1,click,Integral,,,,323432.0,0.0,0.1686784,54556.0,0.374468,0.0,1.0,"[{'lower_bound': 0.0, 'upper_bound': 0.1, 'cou...",0.64,2048.0,"[[], [], [], [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0....",
2,hour,Integral,,,,323432.0,0.0,14102560.0,4561219000000.0,296.652135,14102100.0,14103023.0,"[{'lower_bound': 14102100.0, 'upper_bound': 14...",0.64,2048.0,"[[], [], [], [14103023.0, 14102100.0, 14102100...",
3,C1,Integral,,,,323432.0,0.0,1004.969,325039100.0,1.098521,1001.0,1012.0,"[{'lower_bound': 1001.0, 'upper_bound': 1002.1...",0.64,2048.0,"[[], [], [], [1012.0, 1002.0, 1002.0, 1002.0, ...",
4,banner_pos,Integral,,,,323432.0,0.0,0.2886356,93354.0,0.506276,0.0,7.0,"[{'lower_bound': 0.0, 'upper_bound': 0.7, 'cou...",0.64,2048.0,"[[], [], [], [2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0....",
5,site_id,String,323432.0,0.0,1994.0,,,,,,,,,,,,
6,site_domain,String,323432.0,0.0,2088.0,,,,,,,,,,,,
7,site_category,String,323432.0,0.0,20.0,,,,,,,,,,,,"[{'value': 'c0dd3be3', 'count': 331}, {'value'..."
8,app_id,String,323432.0,0.0,2080.0,,,,,,,,,,,,
9,app_domain,String,323432.0,0.0,124.0,,,,,,,,,,,,


In [368]:
kll_buckets = schema_df[schema_df.name == 'hour']['numerical_statistics.distribution.kll.buckets'].tolist()[0]
for kll_bucket in kll_buckets:
    print(kll_bucket)

{'lower_bound': 14102100.0, 'upper_bound': 14102192.3, 'count': 33232.0}
{'lower_bound': 14102192.3, 'upper_bound': 14102284.6, 'count': 42312.0}
{'lower_bound': 14102284.6, 'upper_bound': 14102376.9, 'count': 31112.0}
{'lower_bound': 14102376.9, 'upper_bound': 14102469.2, 'count': 26464.0}
{'lower_bound': 14102469.2, 'upper_bound': 14102561.5, 'count': 27056.0}
{'lower_bound': 14102561.5, 'upper_bound': 14102653.8, 'count': 30800.0}
{'lower_bound': 14102653.8, 'upper_bound': 14102746.1, 'count': 25736.0}
{'lower_bound': 14102746.1, 'upper_bound': 14102838.4, 'count': 42344.0}
{'lower_bound': 14102838.4, 'upper_bound': 14102930.7, 'count': 30680.0}
{'lower_bound': 14102930.7, 'upper_bound': 14103023.0, 'count': 33696.0}


In [369]:
constraints_df = pd.json_normalize(baseline_job.suggested_constraints().body_dict["features"])
constraints_df.head(10)

Unnamed: 0,name,inferred_type,completeness,num_constraints.is_non_negative,string_constraints.domains
0,id,String,1.0,,
1,click,Integral,1.0,True,
2,hour,Integral,1.0,True,
3,C1,Integral,1.0,True,
4,banner_pos,Integral,1.0,True,
5,site_id,String,1.0,,
6,site_domain,String,1.0,,
7,site_category,String,1.0,,"[c0dd3be3, 28905ebd, 70fb0e29, a818d37a, 76b29..."
8,app_id,String,1.0,,
9,app_domain,String,1.0,,


In [384]:
from sagemaker.model_monitor import CronExpressionGenerator

monitor_schedule_name = 'ctr-prediction-monitoring'
s3_report_path = f's3://{bucket}/model_monitor/monitoring_report'
my_default_monitor.create_monitoring_schedule(
    monitor_schedule_name=monitor_schedule_name,
    endpoint_input=predictor.endpoint,
    output_s3_uri=s3_report_path,
    statistics=my_default_monitor.baseline_statistics(),
    constraints=my_default_monitor.suggested_constraints(),
    schedule_cron_expression=CronExpressionGenerator.hourly(),
    enable_cloudwatch_metrics=True,
)

The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
It seems that this object was already used to create an Amazon Model Monitoring Schedule. To create another, first delete the existing one using my_monitor.delete_monitoring_schedule().


ValueError: It seems that this object was already used to create an Amazon Model Monitoring Schedule. To create another, first delete the existing one using my_monitor.delete_monitoring_schedule().

In [386]:
desc_schedule_result = my_default_monitor.describe_schedule()
print("Schedule status: {}".format(desc_schedule_result["MonitoringScheduleStatus"]))


Schedule status: Scheduled


In [388]:
client = boto3.client('sagemaker', region_name=region)


mon_executions = client.list_monitoring_executions(MonitoringScheduleName=monitor_schedule_name, MaxResults=1)


for execution_summary in mon_executions['MonitoringExecutionSummaries']:
    job_name = execution_summary['ProcessingJobArn'].split('/')[1]
    
    print("ProcessingJob: {}".format(execution_summary['ProcessingJobArn'].split('/')[1]))
    
    print('MonitoringExecutionStatus: {} \n'.format(execution_summary['MonitoringExecutionStatus']))

    desc_analytics_job_result=client.describe_processing_job(ProcessingJobName=job_name)
    report_uri=desc_analytics_job_result['ProcessingOutputConfig']['Outputs'][0]['S3Output']['S3Uri']
print('Report Uri: {}'.format(report_uri))

ProcessingJob: model-monitoring-202211241200-b02885d6b5025eacaa8e2c97
MonitoringExecutionStatus: InProgress 

Report Uri: s3://ctr-prediction/model_monitor/monitoring_report/model-monitor-endopoint2022-11-24-10-57-49/ctr-prediction-monitoring/2022/11/24/12


In [289]:
!wget https://raw.githubusercontent.com/awslabs/amazon-sagemaker-examples/master/sagemaker_model_monitor/visualization/utils.py
!pip install Jinja2

import os
from IPython.display import HTML, display
from sagemaker.s3 import S3Downloader
from sagemaker.model_monitor import MonitoringExecution
import utils as mu

--2022-11-24 13:19:42--  https://raw.githubusercontent.com/awslabs/amazon-sagemaker-examples/master/sagemaker_model_monitor/visualization/utils.py
raw.githubusercontent.com (raw.githubusercontent.com) をDNSに問いあわせています... 2606:50c0:8003::154, 2606:50c0:8002::154, 2606:50c0:8001::154, ...
raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8003::154|:443 に接続しています... 接続しました。
HTTP による接続要求を送信しました、応答を待っています... 200 OK
長さ: 13024 (13K) [text/plain]
`utils.py.1' に保存中


2022-11-24 13:19:42 (29.9 MB/s) - `utils.py.1' へ保存完了 [13024/13024]



In [303]:
mon_executions = client.list_monitoring_executions(MonitoringScheduleName=monitor_schedule_name, MaxResults=1)

for execution_summary in mon_executions['MonitoringExecutionSummaries']:
    processing_job_arn = execution_summary['ProcessingJobArn']

    execution = MonitoringExecution.from_processing_arn(sagemaker_session=sess, processing_job_arn=processing_job_arn)
    exec_inputs = {inp['InputName']: inp for inp in execution.describe()['ProcessingInputs']}
    exec_results = execution.output.destination

    baseline_statistics_filepath = exec_inputs['baseline']['S3Input']['S3Uri'] if 'baseline' in exec_inputs else None
    execution_statistics_filepath = os.path.join(exec_results, 'statistics.json')
    violations_filepath = os.path.join(exec_results, 'constraint_violations.json')

    baseline_statistics = json.loads(S3Downloader.read_file(baseline_statistics_filepath)) if baseline_statistics_filepath is not None else None
    print(execution_statistics_filepath)
    execution_statistics = json.loads(S3Downloader.read_file(execution_statistics_filepath))
    violations = json.loads(S3Downloader.read_file(violations_filepath))['violations']

mu.show_violation_df(baseline_statistics=baseline_statistics, latest_statistics=execution_statistics, violations=violations)

s3://ctr-prediction/model_monitor/monitoring_report/model-monitor-endopoint2022-11-24-02-57-45/ctr-prediction-monitoring/2022/11/24/05/statistics.json


Unnamed: 0,data_type,completeness,baseline_drift,categorical_values
C1,String,100.00%,,
C14,String,100.00%,,
C15,String,100.00%,,
C16,String,100.00%,,
C17,String,100.00%,,
C18,String,100.00%,,
C19,String,100.00%,,
C20,String,100.00%,,
C21,String,100.00%,,
app_category,String,100.00%,,


In [376]:
from sagemaker import clarify

clarify_processor = clarify.SageMakerClarifyProcessor(
    role=role, instance_count=1, instance_type="ml.m5.xlarge", sagemaker_session=sess
)

In [377]:
bias_report_output_path = "s3://{}/{}/clarify-bias".format(bucket, prefix)
bias_data_config = clarify.DataConfig(
    s3_data_input_path=s3_train_data,
    s3_output_path=bias_report_output_path,
    label="click",
    headers=df_train.columns.to_list(),
    dataset_type="text/csv",
)

In [378]:
model_config = clarify.ModelConfig(
    model_name=model_name,
    instance_type="ml.m5.xlarge",
    instance_count=1,
    accept_type="text/csv",
    content_type="text/csv"
)

In [379]:
predictions_config = clarify.ModelPredictedLabelConfig(probability_threshold=0.5)


In [382]:
bias_config = clarify.BiasConfig(
    label_values_or_threshold=[1], facet_name="banner_pos", facet_values_or_threshold=[0], group_name="C1"
)

In [383]:
clarify_processor.run_bias(
    data_config=bias_data_config,
    bias_config=bias_config,
    model_config=model_config,
    model_predicted_label_config=predictions_config,
    pre_training_methods="all",
    post_training_methods="all",
)


Job Name:  Clarify-Bias-2022-11-24-11-54-13-841
Inputs:  [{'InputName': 'dataset', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://ctr-prediction/model-monitoring/train/train.csv', 'LocalPath': '/opt/ml/processing/input/data', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'analysis_config', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://ctr-prediction/model-monitoring/clarify-bias/analysis_config.json', 'LocalPath': '/opt/ml/processing/input/config', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'analysis_result', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://ctr-prediction/model-monitoring/clarify-bias', 'LocalPath': '/opt/ml/processing/output', 'S3UploadMode': 'EndOfJob'}}]
..............................2022-11-24 11:59:09,266 logging.conf not found when configuring logging, using def

In [None]:
shap_config = clarify.SHAPConfig(baseline=[test_features.iloc[0].values.tolist()],
                                 num_samples=15,
                                 agg_method='mean_abs')

explainability_output_path = 's3://{}/{}/clarify-explainability'.format(bucket, prefix)
explainability_data_config = clarify.DataConfig(s3_data_input_path=train_uri,
                                s3_output_path=explainability_output_path,
                                label='Target',
                                headers=training_data.columns.to_list(),
                                dataset_type='text/csv')

In [None]:
clarify_processor.run_explainability(data_config=explainability_data_config,
                                     model_config=model_config,
                                     explainability_config=shap_config)

In [None]:


df_test.iloc[0].values.tolist()