# 0. Setup

In [None]:
# Necessary imports
from datetime import date

import sagemaker
from sagemaker.debugger import Rule, rule_configs
from sagemaker.session import TrainingInput

In [None]:
# Resolve the AWS region and the execution role used by the cells below
region = sagemaker.Session().boto_region_name
role = sagemaker.get_execution_role()
print(f"AWS Region: {region}")
print(f"RoleArn: {role}")

In [None]:
# Bucket layout: s3://<bucket>/<prefix>/<kind>/<YYYYMMDD stamp>/
bucket = "aml-project-storage"
model_prefix = "model"
data_folder_prefix = "data"

# The dataset snapshot is pinned; model artifacts are stamped with today's date
dataset_stamp = "20210607"
today_stamp = date.today().strftime("%Y%m%d")

s3_output_location = f"s3://{bucket}/{model_prefix}/xgboost/{today_stamp}/"
print(f"Output: {s3_output_location}")

s3_train_folder = f"s3://{bucket}/{data_folder_prefix}/train/{dataset_stamp}/"
s3_test_folder = f"s3://{bucket}/{data_folder_prefix}/test/{dataset_stamp}/"
s3_validation_folder = (
    f"s3://{bucket}/{data_folder_prefix}/validation/{dataset_stamp}/"
)
print(f"Train: {s3_train_folder}")
print(f"Test: {s3_test_folder}")
print(f"Validation: {s3_validation_folder}")

In [None]:
# Get the training container.
# Version 0.90-1 is pinned on purpose: the latest has problems with the
# aucpr metric.
container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="0.90-1",
)
print(container)

# 1. Training

In [None]:
# Estimator around the XGBoost container retrieved above; model artifacts
# are written under s3_output_location.
xgb_model = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    sagemaker_session=sagemaker.Session(),
    instance_type="ml.m5.xlarge",
    instance_count=1,
    volume_size=5,  # presumably GB of attached storage - confirm in SDK docs
    output_path=s3_output_location,
    # Built-in Debugger rule that produces the XGBoost training report
    rules=[Rule.sagemaker(rule_configs.create_xgboost_report())],
)

In [None]:
# CSV input channels: every object under the S3 prefix is part of the channel
train_input = TrainingInput(
    s3_data=s3_train_folder,
    s3_data_type="S3Prefix",
    content_type="csv",
)
validation_input = TrainingInput(
    s3_data=s3_validation_folder,
    s3_data_type="S3Prefix",
    content_type="csv",
)

In [None]:
# First run - without modifying scale_pos_weight (default 1)
xgb_model.set_hyperparameters(
    objective="binary:logistic",
    eval_metric="aucpr",
    num_round=50,
    max_depth=5,
    eta=0.2,  # learning_rate
)

In [None]:
# Train on the "train" channel and evaluate on "validation"; block until done
xgb_model.fit(
    inputs={"train": train_input, "validation": validation_input},
    wait=True,
)

In [None]:
# Test higher scale_pos_weight, less rounds
xgb_model.set_hyperparameters(
    objective="binary:logistic",
    eval_metric="aucpr",
    num_round=25,
    max_depth=5,
    eta=0.2,  # learning_rate
    scale_pos_weight=1000,  # Balance positive/negative weights
)
xgb_model.fit(
    inputs={"train": train_input, "validation": validation_input},
    wait=True,
)

In [None]:
# Test even higher scale_pos_weight
xgb_model.set_hyperparameters(
    objective="binary:logistic",
    eval_metric="aucpr",
    num_round=25,
    max_depth=5,
    eta=0.2,  # learning_rate
    scale_pos_weight=10000,  # Balance positive/negative weights
)
xgb_model.fit(
    inputs={"train": train_input, "validation": validation_input},
    wait=True,
)

In [None]:
# 2. Hyperparameter Tuning

In [None]:
# Tuning job configuration: Bayesian search that maximizes validation:aucpr,
# capped at 20 training jobs with at most 3 running in parallel.
tuning_job_config = {
    "Strategy": "Bayesian",
    "HyperParameterTuningJobObjective": {
        "MetricName": "validation:aucpr",
        "Type": "Maximize",
    },
    "ResourceLimits": {
        "MaxNumberOfTrainingJobs": 20,
        "MaxParallelTrainingJobs": 3,
    },
    "ParameterRanges": {
        "CategoricalParameterRanges": [],
        # (name, min, max) triples; bounds are passed as strings
        "ContinuousParameterRanges": [
            {"MaxValue": upper, "MinValue": lower, "Name": param}
            for param, lower, upper in (
                ("eta", "0", "1"),
                ("alpha", "0", "2"),
                ("min_child_weight", "1", "10"),
            )
        ],
        "IntegerParameterRanges": [
            {"MaxValue": "10", "MinValue": "1", "Name": "max_depth"},
        ],
    },
}

In [None]:
# Template for every training job the tuner launches. Hyperparameters listed
# under StaticHyperParameters stay fixed; the tuned ones come from the ranges
# in the tuning job configuration.
training_job_definition = {
    "AlgorithmSpecification": {
        "TrainingImage": container,
        "TrainingInputMode": "File",
    },
    # Both channels share the same CSV / S3Prefix settings and differ only in
    # their name and S3 location.
    "InputDataConfig": [
        {
            "ChannelName": channel_name,
            "CompressionType": "None",
            "ContentType": "csv",
            "DataSource": {
                "S3DataSource": {
                    "S3DataDistributionType": "FullyReplicated",
                    "S3DataType": "S3Prefix",
                    "S3Uri": channel_uri,
                }
            },
        }
        for channel_name, channel_uri in (
            ("train", s3_train_folder),
            ("validation", s3_validation_folder),
        )
    ],
    "OutputDataConfig": {"S3OutputPath": s3_output_location},
    "ResourceConfig": {
        "InstanceCount": 2,
        "InstanceType": "ml.m4.xlarge",
        "VolumeSizeInGB": 10,
    },
    "RoleArn": role,
    "StaticHyperParameters": {
        "eval_metric": "aucpr",
        "num_round": "50",
        "objective": "binary:logistic",
        # NOTE(review): rate_drop (dart booster) and tweedie_variance_power
        # (reg:tweedie objective) look unrelated to binary:logistic -
        # presumably left over from a template; confirm before removing.
        "rate_drop": "0.3",
        "tweedie_variance_power": "1.4",
        "scale_pos_weight": "1000",
    },
    "StoppingCondition": {"MaxRuntimeInSeconds": 43200},  # 12 hours
}

In [None]:
# Launch the hyperparameter tuning job through the low-level SageMaker API.
# boto3 is imported in this cell because this is the first place it is used;
# without the import a top-to-bottom run fails here with a NameError.
import boto3

smclient = boto3.Session().client("sagemaker")

# NOTE(review): tuning job names presumably have to be unique per account and
# region, so re-running this cell would need a new name - confirm.
tuning_job_name = "XGBoost-aml-tune-pos-1000"
smclient.create_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuning_job_name,
    HyperParameterTuningJobConfig=tuning_job_config,
    TrainingJobDefinition=training_job_definition,
)

In [None]:
# Deploy the model (Optional)
# Requests sent through the returned predictor are serialized as CSV.
from sagemaker.serializers import CSVSerializer

xgb_predictor = xgb_model.deploy(
    endpoint_name="aml-project-prediction-endpoint",
    instance_type="ml.m5.xlarge",
    initial_instance_count=1,
    serializer=CSVSerializer(),
)

In [None]:
# Create example fraud and normal records (taken from historical data).
# Each record is one CSV row of 12 values.
example_fraud_record = (
    "0.0,7970766.57,7970766.57,0.0,0.0,"
    "0,0,1,0,0,0,0"
)
example_normal_record = (
    "1.12,92321.42,92320.29,0.0,0.0,"
    "0,0,1,0,0,0,0"
)

In [None]:
# Score the normal example; as the cell's last expression, the endpoint
# response is shown as the cell output.
xgb_predictor.predict(example_normal_record)

In [None]:
# Score the fraud example; as the cell's last expression, the endpoint
# response is shown as the cell output.
xgb_predictor.predict(example_fraud_record)

In [None]:
# Delete the endpoint
xgb_predictor.delete_endpoint()

In [None]:
import json

# Alternative way of invoking the endpoint through boto
import boto3

client = boto3.client("sagemaker-runtime")
response = client.invoke_endpoint(
    EndpointName=xgb_predictor.endpoint_name,
    Body=bytes(example_fraud_record, encoding="utf-8"),
    ContentType="text/csv",
)

json.loads(response["Body"].read())