In [39]:
import yaml
import sagemaker
import boto3
import pandas as pd
import os
import json
import time
from sklearn.model_selection import train_test_split
from time import gmtime, strftime
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.session import production_variant
from sagemaker.sklearn.model import SKLearnModel

In [41]:
# Locations of the project settings file and of the Avazu CTR dataset folder.
SETTING_FILE_PATH = "../config/settings.yaml"
DATA_FOLDER_PATH = "../avazu-ctr-prediction"

# AWS resource settings, parsed from the YAML settings file.
with open(SETTING_FILE_PATH) as settings_stream:
    aws_info = yaml.safe_load(settings_stream)

sess = sagemaker.Session()

# Role, bucket and region are all stored under the aws -> sagemaker section.
sagemaker_settings = aws_info['aws']['sagemaker']
role = sagemaker_settings['role']
bucket = sagemaker_settings['s3bucket']
region = sagemaker_settings['region']

# boto3 clients created without an explicit region (the default configuration applies).
sm = boto3.client('sagemaker')
s3 = boto3.client('s3')


In [42]:
# Prepare the train and test data: read every column as object dtype, then
# split with train_size=0.8 using a fixed seed so the split is repeatable.
# NOTE(review): the file is opened as "train_partial" (no extension) while a
# later cell reads "train_partial.csv" - confirm both files exist.
train_partial_path = os.path.join(DATA_FOLDER_PATH, "train_partial")
df_train, df_test = train_test_split(
    pd.read_csv(train_partial_path, dtype="object"),
    train_size=0.8,
    random_state=0,
    shuffle=True,
)


In [43]:

# Write both splits to local CSV files and upload them to S3.
prefix = 'model-monitoring'

train_file = "train.csv"
test_file = "test.csv"

df_train.to_csv(train_file, index=False)
df_test.to_csv(test_file, index=False)

s3_resource_bucket = boto3.Session().resource("s3").Bucket(bucket)

# S3 object keys always use "/" as the separator. os.path.join would insert
# the OS-specific separator (e.g. "\" on Windows), so the uploaded keys would
# stop matching the s3:// URIs that are assembled from the same parts.
s3_resource_bucket.Object(f"{prefix}/train/{train_file}").upload_file(train_file)
s3_resource_bucket.Object(f"{prefix}/test/{test_file}").upload_file(test_file)


In [44]:
# S3 URIs for the training output and for the two uploaded datasets; all of
# them live under s3://<bucket>/<prefix>.
s3_root = "/".join(["s3:/", bucket, prefix])

output_location = "/".join([s3_root, "output"])
s3_train_data = "/".join([s3_root, "train", train_file])
s3_test_data = "/".join([s3_root, "test", test_file])

In [45]:

# UTC timestamp shared by the training job name and, later, the model name.
timestamp = strftime("%Y%m%d-%H-%M-%S", gmtime())
job_name = "model-training-job" + timestamp

hyperparameters = {"alpha": 0.00001, "eta0": 2.0}
enable_local_mode_training = False

# Local mode trains from the CSV files on disk; otherwise the job runs on an
# ml.m5.large instance and reads the datasets uploaded to S3.
train_instance_type = "local" if enable_local_mode_training else "ml.m5.large"
inputs = (
    {"train": f"file://{train_file}", "test": f"file://{test_file}"}
    if enable_local_mode_training
    else {"train": s3_train_data, "test": s3_test_data}
)

# Keyword arguments for the SKLearn estimator; the training script is
# trainer.py inside the local "model" directory.
estimator_parameters = dict(
    entry_point="trainer.py",
    source_dir="model",
    framework_version="0.23-1",
    py_version="py3",
    instance_type=train_instance_type,
    instance_count=1,
    hyperparameters=hyperparameters,
    output_path=output_location,
    role=role,
    base_job_name=job_name,
)

model_estimator = SKLearn(**estimator_parameters)
model_estimator.fit(inputs)


2022-11-23 06:16:33 Starting - Starting the training job...
2022-11-23 06:16:56 Starting - Preparing the instances for trainingProfilerReport-1669184192: InProgress
.........
2022-11-23 06:18:16 Downloading - Downloading input data...
2022-11-23 06:18:57 Training - Training image download completed. Training in progress.2022-11-23 06:18:56,623 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2022-11-23 06:18:56,626 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2022-11-23 06:18:56,634 sagemaker_sklearn_container.training INFO     Invoking user training script.
2022-11-23 06:18:56,841 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2022-11-23 06:18:56,852 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2022-11-23 06:18:56,863 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2022-11-23 06:18:56,874 sagemaker-train

In [55]:
# Wrap the artifact produced by the training job in an SKLearnModel that
# serves requests through inference.py from the local "model" directory.
model = SKLearnModel(
    entry_point="inference.py",
    source_dir="model",
    framework_version="0.23-1",
    py_version="py3",
    model_data=model_estimator.model_data,
    role=role,
    sagemaker_session=sess,
)

model_name = f"model-{timestamp}"

# Register the model through the session, using the container definition
# prepared for an ml.t2.medium instance. The call's return value is the
# cell output.
container_definition = model.prepare_container_def(instance_type='ml.t2.medium')
sess.create_model(model_name, role, container_definition)

'model-1669184422'

In [118]:
from sagemaker.model_monitor import DataCaptureConfig

# Endpoint name with a UTC timestamp suffix.
endpoint_name = 'model-monitor-endopoint' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print("EndpointName={}".format(endpoint_name))

# Captured traffic is written under s3://<bucket>/<prefix>/datacapture.
data_capture_prefix = "{}/datacapture".format(prefix)
s3_capture_upload_path = "s3://{}/{}".format(bucket, data_capture_prefix)

# Data capture settings: request and response payloads, sampling_percentage=50,
# written to s3_capture_upload_path.
# The variable was previously misspelled "aata_capture_config", so the name
# passed to deploy() below was undefined on a fresh kernel (NameError).
data_capture_config = DataCaptureConfig(
    enable_capture=True,
    sampling_percentage=50,
    destination_s3_uri=s3_capture_upload_path,
    kms_key_id=None,
    capture_options=["REQUEST", "RESPONSE"],
    csv_content_types=["text/csv"],
)

# NOTE(review): a CSV deserializer is combined with accept="application/json";
# the captured responses are JSON, so a JSON deserializer may be what is
# intended - confirm before relying on this predictor's return values.
predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.t2.medium",
    endpoint_name=endpoint_name,
    serializer=sagemaker.serializers.CSVSerializer(),
    deserializer=sagemaker.deserializers.CSVDeserializer(accept="application/json"),
    data_capture_config=data_capture_config,
)

EndpointName=model-monitor-endopoint2022-11-23-11-41-53
----------!

In [119]:
# Display the current endpoint name as the cell output.
endpoint_name

'model-monitor-endopoint2022-11-23-11-41-53'

In [120]:
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer
import time

# Rebind `predictor` to a Predictor addressed by endpoint name. Only a CSV
# serializer is configured here; no deserializer is passed.
predictor = Predictor(endpoint_name=endpoint_name, serializer=CSVSerializer())

In [121]:
# Replay test_partial.csv against the endpoint, one request per data row, so
# that data capture has traffic to record.
with open('test_partial.csv') as f:
    next(f, None)  # the first line is the CSV header, not a record
    for row in f:
        # Strip the line terminator. The previous rstrip(',') never removed
        # anything because every row read from the file ends with a newline.
        payload = row.rstrip('\r\n')
        if not payload:
            continue  # ignore blank lines (e.g. at the end of the file)
        response = predictor.predict(data=payload)


KeyboardInterrupt: 

In [122]:
# Check which data-capture files exist for this endpoint.
s3_client = boto3.Session().client('s3')
current_endpoint_capture_prefix = '{}/{}'.format(data_capture_prefix, endpoint_name)
result = s3_client.list_objects(Bucket=bucket, Prefix=current_endpoint_capture_prefix)
# The response has no "Contents" entry when nothing matches the prefix (for
# example before the first capture file is written); fall back to an empty
# list instead of iterating over None.
capture_files = [capture_entry.get("Key") for capture_entry in result.get('Contents', [])]
print("Found Capture Files:")
print("\n ".join(capture_files))


Found Capture Files:
model-monitoring/datacapture/model-monitor-endopoint2022-11-23-11-41-53/AllTraffic/2022/11/23/11/47-55-236-baa174e1-1950-4509-9d59-e8a79644c9a2.jsonl
 model-monitoring/datacapture/model-monitor-endopoint2022-11-23-11-41-53/AllTraffic/2022/11/23/11/48-55-254-46b01ca8-ee01-43c1-95ca-958ca99c3e33.jsonl
 model-monitoring/datacapture/model-monitor-endopoint2022-11-23-11-41-53/AllTraffic/2022/11/23/11/49-55-274-cb419a3c-73b1-4d96-b57e-c7d980b0b5fd.jsonl
 model-monitoring/datacapture/model-monitor-endopoint2022-11-23-11-41-53/AllTraffic/2022/11/23/11/50-55-281-4bd355ea-71da-4642-8e1c-8b2e51057d01.jsonl


In [123]:
def get_obj_body(obj_key):
    """Return the body of the object stored under ``obj_key`` as UTF-8 text.

    The object is fetched with the notebook-level ``s3_client`` from the
    notebook-level ``bucket``.
    """
    s3_response = s3_client.get_object(Bucket=bucket, Key=obj_key)
    raw_body = s3_response.get("Body").read()
    return raw_body.decode("utf-8")


# Download the last capture file in the listing and preview its first 2000
# characters. Raises IndexError if `capture_files` is empty.
capture_file = get_obj_body(capture_files[-1])
print(capture_file[:2000])

{"captureData":{"endpointInput":{"observedContentType":"text/csv","mode":"INPUT","data":"3.95452645865998e+18,0,14102120,1005,1,e8f79e60,c4342784,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,fe0f7ad9,1617d758,1,0,20312,320,50,1780,0,1711,100077,48\n","encoding":"CSV"},"endpointOutput":{"observedContentType":"application/json","mode":"OUTPUT","data":"{\"result\":[{\"model\":\"modelA\",\"prediction\":0.0027994161195120117}]}\n","encoding":"JSON"}},"eventMetadata":{"eventId":"afb863e6-aeb9-4dd5-ac5a-0ec74c98df08","inferenceTime":"2022-11-23T11:50:55Z"},"eventVersion":"0"}
{"captureData":{"endpointInput":{"observedContentType":"text/csv","mode":"INPUT","data":"3.268118451555648e+18,0,14102315,1005,0,85f751fd,c4e18dd6,50e219e0,d36838b1,0e8616ad,0f2161f8,a99f214a,fae31f99,1f0bc64f,1,0,454,320,50,122,3,1327,-1,15\n","encoding":"CSV"},"endpointOutput":{"observedContentType":"application/json","mode":"OUTPUT","data":"{\"result\":[{\"model\":\"modelA\",\"prediction\":0.0013291389082087726}]}\n",

In [124]:
import json

# Take the first line of the capture file, parse it as JSON and pretty-print
# it with a two-space indent.
print(json.dumps(json.loads(capture_file.split("\n")[0]), indent=2))

{
  "captureData": {
    "endpointInput": {
      "observedContentType": "text/csv",
      "mode": "INPUT",
      "data": "3.95452645865998e+18,0,14102120,1005,1,e8f79e60,c4342784,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,fe0f7ad9,1617d758,1,0,20312,320,50,1780,0,1711,100077,48\n",
      "encoding": "CSV"
    },
    "endpointOutput": {
      "observedContentType": "application/json",
      "mode": "OUTPUT",
      "data": "{\"result\":[{\"model\":\"modelA\",\"prediction\":0.0027994161195120117}]}\n",
      "encoding": "JSON"
    }
  },
  "eventMetadata": {
    "eventId": "afb863e6-aeb9-4dd5-ac5a-0ec74c98df08",
    "inferenceTime": "2022-11-23T11:50:55Z"
  },
  "eventVersion": "0"
}


In [125]:
from sagemaker.model_monitor import DefaultModelMonitor
from sagemaker.model_monitor.dataset_format import DatasetFormat

# Monitor configured with one ml.m5.large instance, a 20 GB volume and a
# 3600-second runtime limit.
monitor_settings = dict(
    role=role,
    instance_count=1,
    instance_type="ml.m5.large",
    volume_size_in_gb=20,
    max_runtime_in_seconds=3600,
)
my_default_monitor = DefaultModelMonitor(**monitor_settings)

# Suggest a baseline from the training CSV (which has a header row) and write
# the results under <prefix>/model_monitor/results; called with wait=True.
baseline_results_uri = f"s3://{bucket}/{prefix}/model_monitor/results"
my_default_monitor.suggest_baseline(
    baseline_dataset=s3_train_data,
    dataset_format=DatasetFormat.csv(header=True),
    output_s3_uri=baseline_results_uri,
    wait=True,
)


Job Name:  baseline-suggestion-job-2022-11-23-11-53-03-276
Inputs:  [{'InputName': 'baseline_dataset_input', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://ctr-prediction/model-monitoring/train/train.csv', 'LocalPath': '/opt/ml/processing/input/baseline_dataset_input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'monitoring_output', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://ctr-prediction/model-monitoring/model_monitor/results', 'LocalPath': '/opt/ml/processing/output', 'S3UploadMode': 'EndOfJob'}}]
..........................

KeyboardInterrupt: 

In [95]:

# Flatten the per-feature statistics of the latest baselining job into a
# DataFrame and show its first ten rows.
baseline_job = my_default_monitor.latest_baselining_job
baseline_feature_stats = baseline_job.baseline_statistics().body_dict["features"]
schema_df = pd.json_normalize(baseline_feature_stats)
schema_df.head(10)


Unnamed: 0,name,inferred_type,string_statistics.common.num_present,string_statistics.common.num_missing,string_statistics.distinct_count,numerical_statistics.common.num_present,numerical_statistics.common.num_missing,numerical_statistics.mean,numerical_statistics.sum,numerical_statistics.std_dev,numerical_statistics.min,numerical_statistics.max,numerical_statistics.distribution.kll.buckets,numerical_statistics.distribution.kll.sketch.parameters.c,numerical_statistics.distribution.kll.sketch.parameters.k,numerical_statistics.distribution.kll.sketch.data,string_statistics.distribution.categorical.buckets
0,id,String,323432.0,0.0,329471.0,,,,,,,,,,,,
1,click,Integral,,,,323432.0,0.0,0.1686784,54556.0,0.374468,0.0,1.0,"[{'lower_bound': 0.0, 'upper_bound': 0.1, 'cou...",0.64,2048.0,"[[], [], [], [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0....",
2,hour,Integral,,,,323432.0,0.0,14102560.0,4561219000000.0,296.652135,14102100.0,14103023.0,"[{'lower_bound': 14102100.0, 'upper_bound': 14...",0.64,2048.0,"[[], [], [], [14103023.0, 14102100.0, 14102100...",
3,C1,Integral,,,,323432.0,0.0,1004.969,325039100.0,1.098521,1001.0,1012.0,"[{'lower_bound': 1001.0, 'upper_bound': 1002.1...",0.64,2048.0,"[[], [], [], [1012.0, 1002.0, 1002.0, 1002.0, ...",
4,banner_pos,Integral,,,,323432.0,0.0,0.2886356,93354.0,0.506276,0.0,7.0,"[{'lower_bound': 0.0, 'upper_bound': 0.7, 'cou...",0.64,2048.0,"[[], [], [], [2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0....",
5,site_id,String,323432.0,0.0,1994.0,,,,,,,,,,,,
6,site_domain,String,323432.0,0.0,2088.0,,,,,,,,,,,,
7,site_category,String,323432.0,0.0,20.0,,,,,,,,,,,,"[{'value': 'c0dd3be3', 'count': 331}, {'value'..."
8,app_id,String,323432.0,0.0,2080.0,,,,,,,,,,,,
9,app_domain,String,323432.0,0.0,124.0,,,,,,,,,,,,


In [105]:
# Print the KLL distribution buckets recorded for the 'hour' feature, one per line.
hour_bucket_cells = schema_df.loc[
    schema_df.name == 'hour',
    'numerical_statistics.distribution.kll.buckets',
]
kll_buckets = hour_bucket_cells.tolist()[0]
for bucket_summary in kll_buckets:
    print(bucket_summary)

{'lower_bound': 14102100.0, 'upper_bound': 14102192.3, 'count': 33232.0}
{'lower_bound': 14102192.3, 'upper_bound': 14102284.6, 'count': 42312.0}
{'lower_bound': 14102284.6, 'upper_bound': 14102376.9, 'count': 31112.0}
{'lower_bound': 14102376.9, 'upper_bound': 14102469.2, 'count': 26464.0}
{'lower_bound': 14102469.2, 'upper_bound': 14102561.5, 'count': 27056.0}
{'lower_bound': 14102561.5, 'upper_bound': 14102653.8, 'count': 30800.0}
{'lower_bound': 14102653.8, 'upper_bound': 14102746.1, 'count': 25736.0}
{'lower_bound': 14102746.1, 'upper_bound': 14102838.4, 'count': 42344.0}
{'lower_bound': 14102838.4, 'upper_bound': 14102930.7, 'count': 30680.0}
{'lower_bound': 14102930.7, 'upper_bound': 14103023.0, 'count': 33696.0}


In [106]:
# Flatten the per-feature constraints suggested by the baselining job into a
# DataFrame and show its first ten rows.
suggested_feature_constraints = baseline_job.suggested_constraints().body_dict["features"]
constraints_df = pd.json_normalize(suggested_feature_constraints)
constraints_df.head(10)

Unnamed: 0,name,inferred_type,completeness,num_constraints.is_non_negative,string_constraints.domains
0,id,String,1.0,,
1,click,Integral,1.0,True,
2,hour,Integral,1.0,True,
3,C1,Integral,1.0,True,
4,banner_pos,Integral,1.0,True,
5,site_id,String,1.0,,
6,site_domain,String,1.0,,
7,site_category,String,1.0,,"[c0dd3be3, 28905ebd, 70fb0e29, a818d37a, 76b29..."
8,app_id,String,1.0,,
9,app_domain,String,1.0,,


In [96]:
from sagemaker.model_monitor import CronExpressionGenerator

# Create an hourly monitoring schedule for the endpoint that checks captured
# traffic against the baseline statistics and suggested constraints.
monitor_schedule_name = 'ctr-prediction-monitoring'
# NOTE(review): unlike the other S3 locations in this notebook, this path does
# not include `prefix` - confirm whether that is intentional.
s3_report_path = f's3://{bucket}/model_monitor/monitoring_report'
my_default_monitor.create_monitoring_schedule(
    monitor_schedule_name=monitor_schedule_name,
    # `Predictor.endpoint` was renamed to `endpoint_name` in sagemaker>=2; the
    # old attribute only works through a deprecation shim that emits a warning.
    endpoint_input=predictor.endpoint_name,
    output_s3_uri=s3_report_path,
    statistics=my_default_monitor.baseline_statistics(),
    constraints=my_default_monitor.suggested_constraints(),
    schedule_cron_expression=CronExpressionGenerator.hourly(),
    enable_cloudwatch_metrics=True,
)

The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [100]:
# Look up the monitoring schedule and report its current status.
desc_schedule_result = my_default_monitor.describe_schedule()
schedule_status = desc_schedule_result["MonitoringScheduleStatus"]
print(f"Schedule status: {schedule_status}")

Schedule status: Scheduled


In [110]:
client = boto3.client('sagemaker', region_name=region)

# Fetch at most one execution summary for the monitoring schedule.
mon_executions = client.list_monitoring_executions(MonitoringScheduleName=monitor_schedule_name, MaxResults=1)

# Stays None when no listed execution has a processing job to describe, so the
# cell no longer fails with a NameError when the schedule has not run yet.
report_uri = None

for execution_summary in mon_executions['MonitoringExecutionSummaries']:
    # The summary may lack a processing-job ARN (the field is optional in the
    # API response); in that case only the status can be reported.
    processing_job_arn = execution_summary.get('ProcessingJobArn')
    if processing_job_arn is not None:
        # Kept in its own variable so the training cell's `job_name` is not overwritten.
        processing_job_name = processing_job_arn.split('/')[1]
        print("ProcessingJob: {}".format(processing_job_name))

    print('MonitoringExecutionStatus: {} \n'.format(execution_summary['MonitoringExecutionStatus']))

    if processing_job_arn is None:
        continue

    # The report location is the S3 URI of the processing job's first output.
    desc_analytics_job_result = client.describe_processing_job(ProcessingJobName=processing_job_name)
    report_uri = desc_analytics_job_result['ProcessingOutputConfig']['Outputs'][0]['S3Output']['S3Uri']
    print('Report Uri: {}'.format(report_uri))

if report_uri is None:
    print('No monitoring report is available yet.')

ProcessingJob: model-monitoring-202211231100-aec4e986d14bd00b0e273376
MonitoringExecutionStatus: Failed 

Report Uri: s3://ctr-prediction/model_monitor/monitoring_report/model-monitor-endopoint2022-11-23-09-21-12/ctr-prediction-monitoring/2022/11/23/11


In [111]:
# Display the raw list_monitoring_executions response as the cell output.
mon_executions


{'MonitoringExecutionSummaries': [{'MonitoringScheduleName': 'ctr-prediction-monitoring',
   'ScheduledTime': datetime.datetime(2022, 11, 23, 20, 0, tzinfo=tzlocal()),
   'CreationTime': datetime.datetime(2022, 11, 23, 20, 0, 57, 296000, tzinfo=tzlocal()),
   'LastModifiedTime': datetime.datetime(2022, 11, 23, 20, 7, 21, 762000, tzinfo=tzlocal()),
   'MonitoringExecutionStatus': 'Failed',
   'ProcessingJobArn': 'arn:aws:sagemaker:ap-northeast-1:547760918250:processing-job/model-monitoring-202211231100-aec4e986d14bd00b0e273376',
   'EndpointName': 'model-monitor-endopoint2022-11-23-09-21-12',
   'FailureReason': 'AlgorithmError: See job logs for more information',
   'MonitoringJobDefinitionName': 'data-quality-job-definition-2022-11-23-10-27-24-231',
   'MonitoringType': 'DataQuality'}],
 'ResponseMetadata': {'RequestId': '069c53a2-e696-495d-9bb0-4d98962ce662',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '069c53a2-e696-495d-9bb0-4d98962ce662',
   'content-type': 'app

In [50]:


# Call the endpoint through the low-level runtime client for every data row of
# test_partial.csv, collecting the serving model name and the prediction.
runtime = boto3.Session().client('sagemaker-runtime')
model_list = []
prediction_list = []

with open('test_partial.csv') as f:
    next(f, None)  # the first line is the CSV header, not a record
    for line in f:
        response = runtime.invoke_endpoint(
            # `model_endpoint_name` was never defined in this notebook; the
            # deployed endpoint's name is held in `endpoint_name`.
            EndpointName=endpoint_name,
            ContentType='text/csv',
            Body=line,
            Accept='application/json',
        )
        # The response body is JSON of the form
        # {"result": [{"model": ..., "prediction": ...}]}; parse it once,
        # directly, instead of routing it through pd.read_csv.
        first_result = json.loads(response['Body'].read())['result'][0]
        # Loop-local names are used so the SKLearnModel bound to `model` is
        # not overwritten by a string.
        model_list.append(first_result['model'])
        prediction_list.append(first_result['prediction'])

In [112]:
# Display the partial training CSV as the cell output.
# NOTE(review): the path is hard-coded here rather than built from DATA_FOLDER_PATH.
pd.read_csv('../avazu-ctr-prediction/train_partial.csv')


Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,1.000009e+18,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,2,15706,320,50,1722,0,35,-1,79
1,1.001579e+19,0,14102100,1005,1,856e6d3f,58a89a43,f028772b,ecad2386,7801e8d9,...,1,0,19772,320,50,2227,0,687,100075,48
2,1.002948e+18,0,14102100,1005,0,85f751fd,c4e18dd6,50e219e0,1779deee,2347f47a,...,1,0,20596,320,50,2161,0,35,-1,157
3,1.004511e+19,0,14102100,1005,0,85f751fd,c4e18dd6,50e219e0,51cedd4e,aefc06bd,...,1,0,19743,320,50,2264,3,427,100000,61
4,1.005990e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15702,320,50,1722,0,35,-1,79
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404285,9.907099e+17,1,14103023,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,22104,320,50,2545,0,431,100084,221
404286,9.927968e+18,0,14103023,1005,0,85f751fd,c4e18dd6,50e219e0,396df801,2347f47a,...,1,0,23866,320,50,2736,0,33,-1,246
404287,9.949412e+17,1,14103023,1005,0,85f751fd,c4e18dd6,50e219e0,d36838b1,0e8616ad,...,1,2,23866,320,50,2736,0,33,100170,246
404288,9.965962e+17,1,14103023,1005,0,85f751fd,c4e18dd6,50e219e0,9c13b419,2347f47a,...,1,0,23160,320,50,2667,0,47,-1,221


In [113]:
# Display the CSV whose rows are sent to the endpoint as the cell output.
pd.read_csv('test_partial.csv')

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,1.169757e+19,0,14102515,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15706,320,50,1722,0,35,-1,79
1,9.861494e+18,0,14102103,1005,0,85f751fd,c4e18dd6,50e219e0,54c5d545,2347f47a,...,1,0,17017,320,50,1873,3,39,-1,23
2,1.080744e+19,0,14102405,1005,1,856e6d3f,58a89a43,f028772b,ecad2386,7801e8d9,...,1,0,21981,320,50,2532,0,679,100077,48
3,1.207080e+19,0,14102702,1005,0,85f751fd,c4e18dd6,50e219e0,d8784af5,2347f47a,...,1,0,17875,320,50,2036,3,47,100034,156
4,1.618278e+18,0,14102102,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15705,320,50,1722,0,35,-1,79
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9994,3.948792e+18,0,14102223,1005,0,85f751fd,c4e18dd6,50e219e0,1779deee,2347f47a,...,1,0,21760,320,50,2502,0,35,-1,221
9995,5.029178e+18,0,14102309,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,22103,320,50,2545,0,39,-1,221
9996,9.721320e+18,0,14102716,1005,1,856e6d3f,58a89a43,f028772b,ecad2386,7801e8d9,...,1,0,19950,320,50,1800,3,167,100081,23
9997,7.447545e+18,0,14102404,1005,0,85f751fd,c4e18dd6,50e219e0,f53417e1,0e8616ad,...,1,2,4407,320,50,394,2,423,100039,15
