## Install SageMaker core

This notebook assumes a virtual environment containing sagemaker-core. 

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%env AWS_PROFILE=sagemaker-role

env: AWS_PROFILE=sagemaker-role


In [48]:
DEFAULT_INSTANCE = "ml.m5.xlarge"

In [36]:
from corelab.core.session import CoreLabSession

In [45]:
lab_session = CoreLabSession('xgboost', 'customer-churn', default_folder='core_notebook', create_run_folder=True)
lab_session.print()
core_session = lab_session.core_session

AWS region: eu-central-1
Execution role arn:aws:iam::136548476532:role/service-role/AmazonSageMaker-ExecutionRole-20250902T164316
Output bucket uri: s3://sagemaker-eu-central-1-136548476532/core_notebook/2025-09-08T12-11-23
Framework: xgboost
Project name: customer-churn


In [5]:
from io import StringIO
import pandas as pd

data = core_session.read_s3_file(f"sagemaker-example-files-prod-{lab_session.region}", "datasets/tabular/synthetic/churn.txt")

df = pd.read_csv(StringIO(data))
df

Unnamed: 0,State,Account Length,Area Code,Phone,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,...,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn?
0,PA,163,806,403-2562,no,yes,300,8.162204,3,7.579174,...,4,6.508639,4.065759,100,5.111624,4.928160,6,5.673203,3,True.
1,SC,15,836,158-8416,yes,no,0,10.018993,4,4.226289,...,0,9.972592,7.141040,200,6.436188,3.221748,6,2.559749,8,False.
2,MO,131,777,896-6253,no,yes,300,4.708490,3,4.768160,...,3,4.566715,5.363235,100,5.142451,7.139023,2,6.254157,4,False.
3,WY,75,878,817-5729,yes,yes,700,1.268734,3,2.567642,...,5,2.333624,3.773586,450,3.814413,2.245779,6,1.080692,6,False.
4,WY,146,878,450-4942,yes,no,0,2.696177,3,5.908916,...,3,3.670408,3.751673,250,2.796812,6.905545,4,7.134343,6,True.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,NH,4,787,151-3162,yes,yes,800,10.862632,5,7.250969,...,1,8.026482,4.921314,350,6.748489,4.872570,8,2.122530,9,False.
4996,SD,140,836,351-5993,no,no,0,1.581127,8,3.758307,...,7,1.328827,0.939932,300,4.522661,6.938571,2,4.600473,4,False.
4997,SC,32,836,370-3127,no,yes,700,0.163836,5,4.243980,...,3,2.340554,0.939469,450,5.157898,4.388328,7,1.060340,6,False.
4998,MA,142,776,604-2108,yes,yes,600,2.034454,5,3.014859,...,3,3.470372,6.076043,150,4.362780,7.173376,3,4.871900,7,True.


In [6]:
from sklearn.model_selection import train_test_split

# Phone number is unique - will not add value to classifier
df = df.drop("Phone", axis=1)

# Cast Area Code to non-numeric
df["Area Code"] = df["Area Code"].astype(object)

# Remove one feature from highly corelated pairs
df = df.drop(["Day Charge", "Eve Charge", "Night Charge", "Intl Charge"], axis=1)

# One-hot encode catagorical features into numeric features
model_data = pd.get_dummies(df)
model_data = pd.concat(
    [
        model_data["Churn?_True."],
        model_data.drop(["Churn?_False.", "Churn?_True."], axis=1),
    ],
    axis=1,
)
model_data = model_data.astype(float)

# Split data into train and validation datasets
train_data, validation_data = train_test_split(model_data, test_size=0.33, random_state=42)

# Further split the validation dataset into test and validation datasets.
validation_data, test_data = train_test_split(validation_data, test_size=0.33, random_state=42)

# Remove and store the target column for the test data. This is used for calculating performance metrics after training, on unseen data.
test_target_column = test_data["Churn?_True."]
test_data.drop(["Churn?_True."], axis=1, inplace=True)

# Store all datasets locally
train_data.to_csv("train.csv", header=False, index=False)
validation_data.to_csv("validation.csv", header=False, index=False)
test_data.to_csv("test.csv", header=False, index=False)

# Upload each dataset to S3
s3_train_input = core_session.upload_data("train.csv")
s3_validation_input = core_session.upload_data("validation.csv")
s3_test_input = core_session.upload_data("test.csv")

print("Datasets uploaded to:")
print(s3_train_input)
print(s3_validation_input)
print(s3_test_input)

Datasets uploaded to:
s3://sagemaker-eu-central-1-136548476532/core_notebook/2025-09-08T09-54-35/data/train.csv
s3://sagemaker-eu-central-1-136548476532/core_notebook/2025-09-08T09-54-35/data/validation.csv
s3://sagemaker-eu-central-1-136548476532/core_notebook/2025-09-08T09-54-35/data/test.csv


In [7]:
image = lab_session.retrieve_image('xgboost', '1.7-1')
image

'492215442770.dkr.ecr.eu-central-1.amazonaws.com/sagemaker-xgboost:1.7-1'

In [8]:
lab_session.base_s3_uri

's3://sagemaker-eu-central-1-136548476532/core_notebook/2025-09-08T09-54-35'

## Generic shapes for Training and Tuning jobs

In [19]:
algorithm_spec = AlgorithmSpecification(training_image=image, training_input_mode="File")

channel_train = Channel(
            channel_name="train",
            content_type="csv",
            data_source=DataSource(
                s3_data_source=S3DataSource(
                    s3_data_type="S3Prefix",
                    s3_uri=s3_train_input,
                    s3_data_distribution_type="FullyReplicated")))

channel_validation = Channel(
            channel_name="validation",
            content_type="csv",
            data_source=DataSource(
                s3_data_source=S3DataSource(
                    s3_data_type="S3Prefix",
                    s3_uri=s3_validation_input,
                    s3_data_distribution_type="FullyReplicated")))

output_data_config = OutputDataConfig(s3_output_path=lab_session.base_s3_uri)

training_instance_config = ResourceConfig(instance_type=DEFAULT_INSTANCE, instance_count=1, volume_size_in_gb=30)

In [11]:
import time
from sagemaker_core.resources import TrainingJob
from sagemaker_core.shapes import (
    AlgorithmSpecification,
    Channel,
    DataSource,
    S3DataSource,
    ResourceConfig,
    StoppingCondition,
    OutputDataConfig,
)

# Specify hyperparameters
HYPER_PARAMS = {
    "max_depth": "5",
    "eta": "0.2",
    "gamma": "4",
    "min_child_weight": "6",
    "subsample": "0.8",
    "verbosity": "0",
    "objective": "binary:logistic",
    "num_round": "100",
}

# Create training job.
training_job = TrainingJob.create(
    training_job_name=lab_session.training_job_name,
    hyper_parameters=HYPER_PARAMS,
    algorithm_specification=algorithm_spec,
    role_arn=lab_session.role,
    input_data_config=[
        channel_train,
        channel_validation
    ],
    output_data_config=output_data_config,
    resource_config=training_instance_config,
    stopping_condition=StoppingCondition(max_runtime_in_seconds=600),
)

# Wait for the training job to complete
training_job.wait()

  task: The machine learning task your model package accomplishes. Common machine learning tasks include object detection and image classification. The following tasks are supported by Inference Recommender: "IMAGE_CLASSIFICATION" \| "OBJECT_DETECTION" \| "TEXT_GENERATION" \|"IMAGE_SEGMENTATION" \| "FILL_MASK" \| "CLASSIFICATION" \| "REGRESSION" \| "OTHER". Specify "OTHER" if none of the tasks listed fit your use case.
  billable_time_in_seconds: The billable time in seconds. Billable time refers to the absolute wall-clock time. Multiply BillableTimeInSeconds by the number of instances (InstanceCount) in your training cluster to get the total compute time SageMaker bills you if you run distributed training. The formula is as follows: BillableTimeInSeconds \* InstanceCount . You can calculate the savings from using managed spot training using the formula (1 - BillableTimeInSeconds / TrainingTimeInSeconds) \* 100. For example, if BillableTimeInSeconds is 100 and TrainingTimeInSeconds is 

In [None]:
print("Model artifacts:", training_job.model_artifacts.s3_model_artifacts)
print("Training job ARN:", training_job.training_job_arn)

## Hyper Paramater Tuning

In [20]:
from sagemaker_core.resources import HyperParameterTuningJob
from sagemaker_core.shapes import (
    HyperParameterTuningJobConfig,
    ResourceLimits,
    ParameterRanges,
    AutoParameter,
    Autotune,
    HyperParameterTrainingJobDefinition,
    HyperParameterTuningJobObjective,
    HyperParameterAlgorithmSpecification,
    OutputDataConfig,
    StoppingCondition,
    ResourceConfig,
)

max_runtime_in_seconds = 3600  # Maximum runtime for tuning job.

# Create HyperParameterTrainingJobDefinition object, setting up a general spec 
hyper_parameter_training_job_defintion = HyperParameterTrainingJobDefinition(
    name=lab_session.tuning_job_definition_name,
    role_arn=lab_session.role,
    algorithm_specification=HyperParameterAlgorithmSpecification(
        training_image=image, training_input_mode="File"
    ),
    input_data_config=[
        channel_train,
        channel_validation
    ],
    output_data_config=output_data_config,
    stopping_condition=StoppingCondition(max_runtime_in_seconds=900), # per job
    resource_config=training_instance_config
)

# Create HyperParameterTrainingJobDefinition object, containing information about the tuning job
tuning_job_config = HyperParameterTuningJobConfig(
    strategy="Bayesian",
    hyper_parameter_tuning_job_objective=HyperParameterTuningJobObjective(
        type="Maximize", metric_name="validation:auc"
    ),
    resource_limits=ResourceLimits(
        max_number_of_training_jobs=10,
        max_parallel_training_jobs=5,
        max_runtime_in_seconds=3600, # total timeout
    ),
    training_job_early_stopping_type="Auto",
    parameter_ranges=ParameterRanges(
        auto_parameters=[
            AutoParameter(name="max_depth", value_hint="5"),
            AutoParameter(name="eta", value_hint="0.1"),
            AutoParameter(name="gamma", value_hint="8"),
            AutoParameter(name="min_child_weight", value_hint="2"),
            AutoParameter(name="subsample", value_hint="0.5"),
            AutoParameter(name="num_round", value_hint="50"),
        ]
    ),
)

# Create the tuning job using the 2 configuration objects above
tuning_job = HyperParameterTuningJob.create(
    hyper_parameter_tuning_job_name=lab_session.tuning_job_name,
    autotune=Autotune(mode="Enabled"),
    training_job_definition=hyper_parameter_training_job_defintion,
    hyper_parameter_tuning_job_config=tuning_job_config,
)

tuning_job.wait()

In [26]:
# Get the best training job name
best_job_name = tuning_job.best_training_job.training_job_name
metric = tuning_job.best_training_job.final_hyper_parameter_tuning_job_objective_metric
print(f"Best training job: {best_job_name}")
print("Objective metric:", metric.metric_name, "=", metric.value)
# Get the best training job object
best_training_job = TrainingJob.get(best_job_name)

# Print the hyperparameters
print("Best hyperparameters:")
for param_name, param_value in best_training_job.hyper_parameters.items():
    print(f"  {param_name}: {param_value}")

Best training job: xgboost-tune-2025-09-08T10-16-41-006-d57b2517
Objective metric: validation:auc = 0.9547200202941895
Best hyperparameters:
  _tuning_objective_metric: validation:auc
  eta: 0.72511675067351
  gamma: 0.3297832294245818
  max_depth: 8
  min_child_weight: 22.719358635534608
  num_round: 480
  subsample: 0.6855076401885574


## Create Model

In [46]:
from sagemaker_core.resources import Model
from sagemaker_core.shapes import ContainerDefinition

# model_s3_uri = training_job.model_artifacts.s3_model_artifacts  # Get URI of model artifacts from the training job.
model_s3_uri = best_training_job.model_artifacts.s3_model_artifacts  # Get URI of model artifacts of the best model from the tuning job.

# Create SageMaker model: An image along with the model artifact to use.
customer_churn_model = Model.create(
    model_name=lab_session.model_name,
    primary_container=ContainerDefinition(image=image, model_data_url=model_s3_uri),
    execution_role_arn=lab_session.role,
)

In [50]:
from sagemaker_core.resources import TransformJob
from sagemaker_core.shapes import (
    TransformInput,
    TransformDataSource,
    TransformS3DataSource,
    TransformOutput,
    TransformResources,
)

transform_input = TransformInput(
        data_source=TransformDataSource(
            s3_data_source=TransformS3DataSource(s3_data_type="S3Prefix", s3_uri=s3_test_input)
        ),
        content_type="text/csv")

# Create Transform Job.
transform_job = TransformJob.create(
    transform_job_name=lab_session.training_job_name,
    model_name=lab_session.model_name,
    transform_input=transform_input,
    transform_output=TransformOutput(s3_output_path=lab_session.transform_output_s3_uri),
    transform_resources=TransformResources(
        instance_type=DEFAULT_INSTANCE, instance_count=1
    ),
)

transform_job.wait()