In [15]:
import yaml
import sagemaker
import boto3
import json

SETTING_FILE_PATH = "../config/settings.yaml"
DATA_FOLDER_PATH = "../avazu-ctr-prediction"

# AWS リソース設定
with open(SETTING_FILE_PATH) as file:
    aws_info = yaml.safe_load(file)
        
sess = sagemaker.Session()
role = aws_info['aws']['sagemaker']['role']
bucket = aws_info['aws']['sagemaker']['s3bucket']
region = aws_info['aws']['sagemaker']['region']

sm = boto3.client('sagemaker')
s3 = boto3.client('s3')


In [2]:
import os
import pandas as pd

from sklearn.model_selection import train_test_split


# train, validation, test データを用意
df_train = pd.read_csv(os.path.join(DATA_FOLDER_PATH, "train_partial"), dtype="object")
df_train, df_test = train_test_split(df_train, train_size=0.8, random_state=0, shuffle=True)

In [3]:
# S3にアップロード
prefix = 'sagemaker-ab-testing'

train_file = "train.csv"
test_file = "test.csv"

df_train.to_csv(train_file, index=False)
df_test.to_csv(test_file, index=False)

s3_resource_bucket = boto3.Session().resource("s3").Bucket(bucket)

s3_resource_bucket.Object(os.path.join(prefix, "train", train_file)).upload_file(train_file)
s3_resource_bucket.Object(os.path.join(prefix, "test", test_file)).upload_file(test_file)


In [4]:
output_location = f"s3://{bucket}/{prefix}/output"

s3_train_data = f"s3://{bucket}/{prefix}/train/{train_file}"
s3_test_data = f"s3://{bucket}/{prefix}/test/{test_file}"

## model A

In [None]:
from time import gmtime, strftime

from sagemaker.sklearn.estimator import SKLearn

job_name = "modelA-training-job" + strftime("%Y%m%d-%H-%M-%S", gmtime())

hyperparameters = {"alpha": 0.00001, "eta0": 2.0}
enable_local_mode_training = False

if enable_local_mode_training:
    train_instance_type = "local"
    inputs = {"train": f"file://{train_file}", "test": f"file://{test_file}"}
else:
    train_instance_type = "ml.m5.large"
    inputs = {"train": s3_train_data, "test": s3_test_data}

estimator_parameters = {
    "entry_point": "train.py",
    "source_dir": "modelA",
    "framework_version": "0.23-1",
    "py_version": "py3",
    "instance_type": train_instance_type,
    "instance_count": 1,
    "hyperparameters": hyperparameters,
    "output_path": output_location,
    "role": role,
    "base_job_name": job_name,
}

model_a_estimator = SKLearn(**estimator_parameters)
model_a_estimator.fit(inputs)


In [245]:
from sagemaker.sklearn.model import SKLearnModel

modelA = SKLearnModel(
    role=role,
    model_data=model_a_estimator.model_data,
    framework_version="0.23-1",
    py_version="py3",
    source_dir="modelA",
    entry_point="inference.py",
    sagemaker_session=sess
)

model_a_name = "{}-{}".format("modelA", timestamp)


sess.create_model(
    model_a_name,
    role,
    modelA.prepare_container_def(
        instance_type='ml.t2.medium'
    )
)

'modelA-1656375905'

## modelB

In [None]:
from time import gmtime, strftime

from sagemaker.sklearn.estimator import SKLearn

job_name = "modelB-training-job" + strftime("%Y%m%d-%H-%M-%S", gmtime())

hyperparameters = {"alpha": 0.00001, "eta0": 2.0}
enable_local_mode_training = False

if enable_local_mode_training:
    train_instance_type = "local"
    inputs = {"train": f"file://{train_file}", "test": f"file://{test_file}"}
else:
    train_instance_type = "ml.m5.large"
    inputs = {"train": s3_train_data, "test": s3_test_data}

estimator_parameters = {
    "entry_point": "train.py",
    "source_dir": "modelB",
    "framework_version": "0.23-1",
    "py_version": "py3",
    "instance_type": train_instance_type,
    "instance_count": 1,
    "hyperparameters": hyperparameters,
    "output_path": output_location,
    "role": role,
    "base_job_name": job_name,
}

model_b_estimator = SKLearn(**estimator_parameters)
model_b_estimator.fit(inputs)


In [246]:
from sagemaker.sklearn.model import SKLearnModel

modelB = SKLearnModel(
    role=role,
    model_data=model_b_estimator.model_data,
    framework_version="0.23-1",
    py_version="py3",
    source_dir="modelB",
    entry_point="inference.py",
    sagemaker_session=sess
)

model_b_name = "{}-{}".format("modelB", timestamp)

sess.create_model(
    model_b_name,
    role,
    modelB.prepare_container_def(
        instance_type='ml.t2.medium'
    )
)

'modelB-1656375905'

## ABテスト

In [247]:
from sagemaker.session import production_variant
import time

timestamp = "{}".format(int(time.time()))

endpoint_config_name = "{}-{}".format("abtest", timestamp)

modelA_variant = production_variant(
    model_name=model_a_name,
    instance_type="ml.t2.medium",
    initial_instance_count=1,
    variant_name="VariantA",
    initial_weight=50,
)

modelB_variant = production_variant(
    model_name=model_b_name,
    instance_type="ml.t2.medium",
    initial_instance_count=1,
    variant_name="VariantB",
    initial_weight=50,
)


In [248]:
model_ab_endpoint_name = "{}-{}".format("abtest", timestamp)

endpoint_config = sm.create_endpoint_config(
    EndpointConfigName=endpoint_config_name, ProductionVariants=[modelA_variant, modelB_variant]
)

endpoint_response = sm.create_endpoint(EndpointName=model_ab_endpoint_name, EndpointConfigName=endpoint_config_name)

In [249]:
model_ab_endpoint_name

'abtest-1656383806'

In [288]:
runtime = boto3.Session().client('sagemaker-runtime')
model_list = []
prediction_list = []

with open('test_partial.csv') as f:
    for line in f:
        response = runtime.invoke_endpoint(EndpointName=model_ab_endpoint_name, 
                                   ContentType='text/csv', 
                                   Body=line,
                                  Accept='application/json')
        df_pred = pd.read_csv(response['Body'], header=None, delimiter='\t')
        model = json.loads(df_pred[0][0])['result'][0]['model']
        prediction = json.loads(df_pred[0][0])['result'][0]['prediction']
        model_list.append(model)
        prediction_list.append(prediction)

In [289]:
print("[modelA] response:{}, bucket: {}".format(model_list.count('modelA'), model_list.count('modelA') / len(model_list)))
print("[modelB] response:{}, bucket: {}".format(model_list.count('modelB'), model_list.count('modelB') / len(model_list)))

[modelA] response:5073, bucket: 0.5073
[modelB] response:4927, bucket: 0.4927
