In [None]:
# Necessary imports
from datetime import date

import sagemaker
from sagemaker.debugger import Rule, rule_configs
from sagemaker.session import TrainingInput

In [None]:
# Resolve the AWS region from a default SageMaker session, then the
# execution role; each value is printed right after it is obtained.
default_session = sagemaker.Session()
region = default_session.boto_region_name
print(f"AWS Region: {region}")

role = sagemaker.get_execution_role()
print(f"RoleArn: {role}")

In [None]:
# S3 layout used throughout: s3://<bucket>/<prefix>/<name>/<YYYYMMDD>/
bucket = "aml-project-storage"
model_prefix = "model"
data_folder_prefix = "data"

# The dataset snapshot is pinned to a fixed stamp; model output is
# stamped with the date on which this cell runs.
dataset_stamp = "20210607"
today_stamp = date.today().strftime("%Y%m%d")

# Destination for the model artifacts.
s3_output_location = f"s3://{bucket}/{model_prefix}/xgboost/{today_stamp}/"
print(f"Output: {s3_output_location}")

# One folder per data split, all taken from the same dataset snapshot.
s3_train_folder = f"s3://{bucket}/{data_folder_prefix}/train/{dataset_stamp}/"
s3_test_folder = f"s3://{bucket}/{data_folder_prefix}/test/{dataset_stamp}/"
s3_validation_folder = (
    f"s3://{bucket}/{data_folder_prefix}/validation/{dataset_stamp}/"
)
print(f"Train: {s3_train_folder}")
print(f"Test: {s3_test_folder}")
print(f"Validation: {s3_validation_folder}")

In [None]:
# Look up the XGBoost training image URI for the current region.
# The version is pinned to 0.90-1 because the latest release has
# problems with the aucpr metric.
xgboost_version = "0.90-1"
container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=region,
    version=xgboost_version,
)
print(container)

In [None]:
# Estimator for the XGBoost container: a single ml.m5.xlarge instance,
# with output written under s3_output_location.
training_session = sagemaker.Session()

# Debugger rule built from the XGBoost report rule config.
# NOTE(review): the container is pinned to XGBoost 0.90-1 — confirm that
# this image version supports the Debugger XGBoost report rule.
xgboost_report_rule = Rule.sagemaker(rule_configs.create_xgboost_report())

xgb_model = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    volume_size=5,
    output_path=s3_output_location,
    sagemaker_session=training_session,
    rules=[xgboost_report_rule],
)

In [None]:
# Hyperparameters for a binary classifier evaluated with aucpr.
xgb_hyperparameters = {
    "max_depth": 5,
    "eta": 0.2,  # learning rate
    "objective": "binary:logistic",
    "num_round": 50,
    "eval_metric": "aucpr",
    "scale_pos_weight": 10,  # balance positive/negative weights
}
xgb_model.set_hyperparameters(**xgb_hyperparameters)

In [None]:
# Both channels read CSV data from an S3 prefix; only the folder differs.
csv_channel_options = {"content_type": "csv", "s3_data_type": "S3Prefix"}

train_input = TrainingInput(s3_data=s3_train_folder, **csv_channel_options)
validation_input = TrainingInput(
    s3_data=s3_validation_folder, **csv_channel_options
)

In [None]:
# Start training on the train and validation channels; wait=True keeps
# this cell running until the fit call returns.
data_channels = {"train": train_input, "validation": validation_input}
xgb_model.fit(inputs=data_channels, wait=True)