## Install SageMaker core

This notebook assumes a virtual environment containing sagemaker-core. In SageMaker Studio, start a terminal and run `uv pip install -e .` from the git project root:
```
sagemaker-user@default:~/sm-core-lab$ uv pip install -e .
```

In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# project modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Instance type shared by the training resource config, the batch transform
# job and the provisioned endpoint variant created further down.
DEFAULT_INSTANCE = "ml.m5.large"

In [3]:
from corelab.core.session import CoreLabSession
# Open the lab session for the "xgboost" framework / "customer-churn" project.
# Every later cell takes its names, role, region and S3 locations from it.
lab_session = CoreLabSession(
    "xgboost",
    "customer-churn",
    default_folder="core_notebook",
    create_run_folder=True,
    aws_profile="sagemaker-role",
)
lab_session.print()

# Shortcut to the underlying core session used for S3 reads and uploads.
core_session = lab_session.core_session

sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/machiel/Library/Application Support/sagemaker/config.yaml


Couldn't call 'get_role' to get Role ARN from role name machiel-crystalline to get Role path.


falling back to profile: sagemaker-role
AWS region: eu-central-1
Execution role arn:aws:iam::136548476532:role/service-role/AmazonSageMaker-ExecutionRole-20250902T164316
Output bucket uri: s3://sagemaker-eu-central-1-136548476532/core_notebook/2025-10-01T13-50-56
Framework: xgboost
Project name: customer-churn


In [4]:
from sklearn.model_selection import train_test_split
from io import StringIO
import pandas as pd

# Names and knobs used throughout this cell.
TARGET_COLUMN = "Churn?_True."  # one-hot column that marks a churned customer
HOLDOUT_FRACTION = 0.33         # share split off at each of the two split steps
SPLIT_SEED = 42                 # fixed seed so the splits are reproducible

# Load the raw churn dataset from the regional SageMaker example bucket.
data = core_session.read_s3_file(
    f"sagemaker-example-files-prod-{lab_session.region}",
    "datasets/tabular/synthetic/churn.txt",
)

df = (
    pd.read_csv(StringIO(data))
    # Phone number is unique - will not add value to the classifier
    .drop(columns=["Phone"])
    # Cast Area Code to non-numeric so it gets one-hot encoded below
    .astype({"Area Code": object})
    # Remove one feature from each highly correlated pair
    .drop(columns=["Day Charge", "Eve Charge", "Night Charge", "Intl Charge"])
)

# One-hot encode categorical features into numeric features and move the
# target to the first column (the CSVs are written without a header row).
encoded = pd.get_dummies(df)
model_data = pd.concat(
    [
        encoded[TARGET_COLUMN],
        encoded.drop(columns=["Churn?_False.", TARGET_COLUMN]),
    ],
    axis=1,
).astype(float)

# Split data into train and hold-out datasets
train_data, holdout_data = train_test_split(
    model_data, test_size=HOLDOUT_FRACTION, random_state=SPLIT_SEED
)

# Further split the hold-out dataset into validation and test datasets.
validation_data, test_data = train_test_split(
    holdout_data, test_size=HOLDOUT_FRACTION, random_state=SPLIT_SEED
)

# Remove and store the target column for the test data. This is used for
# calculating performance metrics after training, on unseen data.
# A non-mutating drop is used so the frame returned by the split is never
# modified in place.
test_target_column = test_data[TARGET_COLUMN]
test_data = test_data.drop(columns=[TARGET_COLUMN])

# Sanity checks: no rows lost in the splits, target removed from the test features.
assert len(train_data) + len(validation_data) + len(test_data) == len(model_data)
assert TARGET_COLUMN not in test_data.columns

# Store all datasets locally
train_data.to_csv("train.csv", header=False, index=False)
validation_data.to_csv("validation.csv", header=False, index=False)
test_data.to_csv("test.csv", header=False, index=False)

# Upload each dataset to S3
s3_train_input = core_session.upload_data("train.csv")
s3_validation_input = core_session.upload_data("validation.csv")
s3_test_input = core_session.upload_data("test.csv")

print("Datasets uploaded to:")
print(s3_train_input)
print(s3_validation_input)
print(s3_test_input)

Datasets uploaded to:
s3://sagemaker-eu-central-1-136548476532/core_notebook/2025-10-01T13-50-56/data/train.csv
s3://sagemaker-eu-central-1-136548476532/core_notebook/2025-10-01T13-50-56/data/validation.csv
s3://sagemaker-eu-central-1-136548476532/core_notebook/2025-10-01T13-50-56/data/test.csv


In [5]:
# Look up the container image URI for version tag 1.7-1; the same image is
# used for training, tuning and model hosting below.
image = lab_session.retrieve_image("1.7-1")
print("Using image:", image)

Using image: 492215442770.dkr.ecr.eu-central-1.amazonaws.com/sagemaker-xgboost:1.7-1


## Generic shapes for Training and Tuning jobs

Set up configuration for training and tuning jobs

In [6]:
from sagemaker_core.shapes import AlgorithmSpecification, OutputDataConfig, DataSource, S3DataSource, Channel, ResourceConfig

def _csv_channel(name, s3_uri):
    """Build a fully replicated S3-prefix CSV input channel called `name` reading from `s3_uri`."""
    s3_source = S3DataSource(
        s3_data_type="S3Prefix",
        s3_uri=s3_uri,
        s3_data_distribution_type="FullyReplicated",
    )
    return Channel(
        channel_name=name,
        content_type="csv",
        data_source=DataSource(s3_data_source=s3_source),
    )


# Container image and input mode shared by the training job.
algorithm_spec = AlgorithmSpecification(training_image=image, training_input_mode="File")

# Input channels pointing at the CSV files uploaded earlier.
channel_train = _csv_channel("train", s3_train_input)
channel_validation = _csv_channel("validation", s3_validation_input)

# Where job output is written.
output_data_config = OutputDataConfig(s3_output_path=lab_session.jobs_output_s3_uri)

# Compute used per training job (optionally add keep_alive_period_in_seconds=60*60).
training_instance_config = ResourceConfig(
    instance_type=DEFAULT_INSTANCE,
    instance_count=1,
    volume_size_in_gb=30,
)

In [7]:
from sagemaker_core.resources import TrainingJob
from sagemaker_core.shapes import StoppingCondition

# Fixed hyperparameters for the single training run (all values passed as strings).
HYPER_PARAMS = dict(
    max_depth="5",
    eta="0.2",
    gamma="4",
    min_child_weight="6",
    subsample="0.8",
    verbosity="0",
    objective="binary:logistic",
    num_round="100",
)

# Pieces of the request that are easier to read when named.
input_channels = [channel_train, channel_validation]
runtime_limit = StoppingCondition(max_runtime_in_seconds=600)

# Launch the training job with the session's boto session and region.
training_job = TrainingJob.create(
    training_job_name=lab_session.training_job_name,
    hyper_parameters=HYPER_PARAMS,
    algorithm_specification=algorithm_spec,
    role_arn=lab_session.role,
    input_data_config=input_channels,
    output_data_config=output_data_config,
    resource_config=training_instance_config,
    stopping_condition=runtime_limit,
    session=lab_session.core_session.boto_session,
    region=lab_session.region,
)

# Block until the training job has finished.
training_job.wait()

Output()

In [None]:
# Report where the trained model artifacts were written.
artifacts_uri = training_job.model_artifacts.s3_model_artifacts
print("Model artifacts:", artifacts_uri)

# Report the ARN identifying the finished training job.
job_arn = training_job.training_job_arn
print("Training job ARN:", job_arn)

## Assignment 2 - Hyperparameter Tuning (try the assignment first!)

In [1]:
from sagemaker_core.resources import HyperParameterTuningJob
from sagemaker_core.shapes import (
    HyperParameterTuningJobConfig,
    ResourceLimits,
    ParameterRanges,
    AutoParameter,
    Autotune,
    HyperParameterTrainingJobDefinition,
    HyperParameterTuningJobObjective,
    HyperParameterAlgorithmSpecification,
    StoppingCondition,
)

# Same container image and input mode as the single training job.
algo_spec = HyperParameterAlgorithmSpecification(training_image=image, training_input_mode="File")

# Create the HyperParameterTrainingJobDefinition object: the template every
# training job launched by the tuner is built from.
hyper_parameter_training_job_defintion = HyperParameterTrainingJobDefinition(
    role_arn=lab_session.role,
    algorithm_specification=algo_spec,
    input_data_config=[
        channel_train,
        channel_validation
    ],
    output_data_config=output_data_config,
    stopping_condition=StoppingCondition(max_runtime_in_seconds=900),  # per-job timeout
    resource_config=training_instance_config,
    static_hyper_parameters={"objective": "binary:logistic"}
)

# Create the HyperParameterTuningJobConfig object: the tuning strategy,
# objective metric, resource limits and the parameters to search.
tuning_job_config = HyperParameterTuningJobConfig(
    strategy="Bayesian",
    hyper_parameter_tuning_job_objective=HyperParameterTuningJobObjective(
        type="Maximize", metric_name="validation:auc"
    ),
    resource_limits=ResourceLimits(
        max_number_of_training_jobs=10,
        max_parallel_training_jobs=5,
        max_runtime_in_seconds=3600,  # total timeout for the whole tuning job
    ),
    training_job_early_stopping_type="Auto",
    parameter_ranges=ParameterRanges(
        # With Autotune enabled only a hint per parameter is given.
        auto_parameters=[
            AutoParameter(name="max_depth", value_hint="5"),
            AutoParameter(name="eta", value_hint="0.1"),
            AutoParameter(name="gamma", value_hint="8"),
            AutoParameter(name="min_child_weight", value_hint="2"),
            AutoParameter(name="subsample", value_hint="0.5"),
            AutoParameter(name="num_round", value_hint="50"),
        ]
    ),
)

# Create the tuning job using the 2 configuration objects above.
# The session and region are passed explicitly, as for the training job, so
# this cell does not rely on an earlier cell having made the first API call
# with the right credentials.
tuning_job = HyperParameterTuningJob.create(
    hyper_parameter_tuning_job_name=lab_session.tuning_job_name,
    autotune=Autotune(mode="Enabled"),
    training_job_definition=hyper_parameter_training_job_defintion,
    hyper_parameter_tuning_job_config=tuning_job_config,
    session=lab_session.core_session.boto_session,
    region=lab_session.region,
)

# Block until the tuning job has finished.
tuning_job.wait()

In [None]:
# Summary of the winning training job as reported by the tuning job.
best_summary = tuning_job.best_training_job
best_job_name = best_summary.training_job_name
metric = best_summary.final_hyper_parameter_tuning_job_objective_metric
print(f"Best training job: {best_job_name}")
print("Objective metric:", metric.metric_name, "=", metric.value)

# Fetch the full training job resource for the winner.
best_training_job = TrainingJob.get(best_job_name)

# List every hyperparameter the winning job ran with.
print("Best hyperparameters:")
for hp_name, hp_value in best_training_job.hyper_parameters.items():
    print(f"  {hp_name}: {hp_value}")

## Create Model
We assume one model is created/updated. A new model version/name would be set explicitly either as the next production release or a 'candidate'.

In [None]:
from sagemaker_core.resources import Model
from sagemaker_core.shapes import ContainerDefinition
from corelab.core.utils import try_delete

# Model artifacts produced by the single training job.
model_s3_uri = training_job.model_artifacts.s3_model_artifacts
# Alternative: use the artifacts of the best model found by the tuning job.
# model_s3_uri = best_training_job.model_artifacts.s3_model_artifacts

# Remove any model left over under the same name so the create below can reuse it.
try_delete(Model, lab_session.model_name)

# A SageMaker model pairs the container image with the model artifact to serve.
churn_container = ContainerDefinition(image=image, model_data_url=model_s3_uri)
customer_churn_model = Model.create(
    model_name=lab_session.model_name,
    primary_container=churn_container,
    execution_role_arn=lab_session.role,
)

In [None]:
from sagemaker_core.resources import TransformJob
from sagemaker_core.shapes import (
    TransformInput,
    TransformDataSource,
    TransformS3DataSource,
    TransformOutput,
    TransformResources,
)

# Batch input: the uploaded test CSV (target column already removed).
test_s3_source = TransformS3DataSource(s3_data_type="S3Prefix", s3_uri=s3_test_input)
transform_input = TransformInput(
    data_source=TransformDataSource(s3_data_source=test_s3_source),
    content_type="text/csv",
)

# Where predictions are written and what compute runs the job.
transform_output = TransformOutput(s3_output_path=lab_session.transform_output_s3_uri)
transform_resources = TransformResources(instance_type=DEFAULT_INSTANCE, instance_count=1)

# Create the transform job against the model registered above.
# NOTE(review): the job is named with lab_session.training_job_name - confirm
# this reuse is intended rather than a dedicated transform job name.
transform_job = TransformJob.create(
    transform_job_name=lab_session.training_job_name,
    model_name=customer_churn_model.get_name(),
    transform_input=transform_input,
    transform_output=transform_output,
    transform_resources=transform_resources,
)

# Block until the transform job has finished.
transform_job.wait()

## Endpoint creation

We assume one hosted endpoint will exist for the churn model.

In [None]:
from sagemaker_core.resources import Endpoint, EndpointConfig
from sagemaker_core.shapes import ProductionVariant
from corelab.core.utils import try_delete

# Start from a clean slate: drop any endpoint config left under this name.
try_delete(EndpointConfig, lab_session.endpoint_config_name)

# Single variant on one dedicated instance serving the churn model.
all_traffic_variant = ProductionVariant(
    variant_name="AllTraffic",
    model_name=customer_churn_model.get_name(),
    instance_type=DEFAULT_INSTANCE,
    initial_instance_count=1,
)
endpoint_config = EndpointConfig.create(
    endpoint_config_name=lab_session.endpoint_config_name,
    production_variants=[all_traffic_variant],
)

# Likewise drop any endpoint left under this name, then create it from the config.
try_delete(Endpoint, lab_session.endpoint_name)
sagemaker_endpoint = Endpoint.create(
    endpoint_name=lab_session.endpoint_name,
    endpoint_config_name=endpoint_config.get_name(),
)

# Block until the endpoint reports InService.
sagemaker_endpoint.wait_for_status(target_status="InService")

## Provisioned vs Serverless Endpoints

The above creates a **provisioned endpoint** with dedicated instances that are always running. 

**Provisioned Endpoints:**
- ✅ Predictable performance and latency
- ✅ Good for consistent, high-volume traffic
- ❌ Always billing (even when idle)
- ❌ Fixed capacity

**Serverless Endpoints:**
- ✅ Pay only for actual inference time
- ✅ Automatic scaling (0 to max concurrency)
- ✅ No infrastructure management
- ❌ Cold start latency (first request)
- ❌ Limited to 6GB memory max

## Serverless Endpoint

In [None]:
from sagemaker_core.resources import Endpoint, EndpointConfig
from sagemaker_core.shapes import ProductionVariant, ProductionVariantServerlessConfig

# Patch bug in sagemaker core: some snake_case field names need an explicit
# PascalCase mapping. The extra entries are merged into the library's table
# instead of replacing it, so mappings the library already defines are kept.
import sagemaker_core.main.utils as smutils

_EXTRA_SNAKE_TO_PASCAL = {
    "volume_size_in_g_b": "VolumeSizeInGB",
    "volume_size_in_gb": "VolumeSizeInGB",
    "memory_size_in_mb": "MemorySizeInMB",
    "supported_response_mime_types": "SupportedResponseMIMETypes",
}
smutils.SPECIAL_SNAKE_TO_PASCAL_MAPPINGS = {
    **getattr(smutils, "SPECIAL_SNAKE_TO_PASCAL_MAPPINGS", {}),
    **_EXTRA_SNAKE_TO_PASCAL,
}
# end patch

# Drop any serverless endpoint config left under this name.
try_delete(EndpointConfig, lab_session.serverless_endpoint_config_name)

# Create serverless endpoint configuration
serverless_endpoint_config = EndpointConfig.create(
    endpoint_config_name=lab_session.serverless_endpoint_config_name,
    production_variants=[
        ProductionVariant(
            variant_name="ServerlessVariant",
            model_name=customer_churn_model.get_name(),
            serverless_config=ProductionVariantServerlessConfig(
                memory_size_in_mb=2048,    # 2GB memory (valid: 1024, 2048, 3072, 4096, 5120, 6144)
                max_concurrency=10,        # Handle up to 10 concurrent requests
                provisioned_concurrency=1  # Keep 1 instance warm (optional)
            )
        )
    ],
)

# Drop any serverless endpoint left under this name.
try_delete(Endpoint, lab_session.serverless_endpoint_name)

# Create serverless endpoint
serverless_endpoint = Endpoint.create(
    endpoint_name=lab_session.serverless_endpoint_name,
    endpoint_config_name=serverless_endpoint_config.get_name(),
)

# Block until the endpoint reports InService.
serverless_endpoint.wait_for_status(target_status="InService")

## Invoke endpoint!

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

def print_performance_metrics(probs, y, threshold=0.5):
    """Print accuracy, precision, recall and ROC AUC for predicted probabilities.

    Args:
        probs: Predicted probabilities of the positive class. Any array-like is
            accepted: a NumPy array, a pandas Series, a plain list, or nested
            rows of numeric strings (they are converted to a flat float array).
        y: True binary labels, in the same order as ``probs``.
        threshold: Probabilities greater than or equal to this value are
            counted as positive predictions.

    Returns:
        None. The four metrics are printed, one per line.
    """
    # Local import: the notebook does not import numpy at the top level.
    import numpy as np

    # Flatten to a 1-D float array so elementwise comparison works for lists
    # and string values as well as for arrays.
    scores = np.asarray(probs, dtype=float).ravel()
    predictions = (scores >= threshold).astype(int)

    # Compare predictions with the stored target
    accuracy = accuracy_score(y, predictions)
    precision = precision_score(y, predictions)
    recall = recall_score(y, predictions)
    # ROC AUC is threshold-free, so it is computed on the raw scores.
    roc_auc = roc_auc_score(y, scores)

    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"ROC AUC: {roc_auc}")

In [None]:
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import CSVDeserializer

# Create a predictor with automatic CSV handling: requests are serialized to
# CSV and the CSV response is parsed back into Python lists.
# NOTE(review): no sagemaker_session is passed, so the SDK's default session
# is used - confirm it resolves to the same account/region as lab_session.
predictor = Predictor(
  endpoint_name=serverless_endpoint.endpoint_name,
  serializer=CSVSerializer(),
  deserializer=CSVDeserializer()
)

# Send the test features (target column already removed) as a NumPy array.
# CSVSerializer is written for lists / ndarrays, not DataFrames.
result = predictor.predict(test_data.to_numpy())
len(result)

## 🧹 Cleanup Resources

To avoid unnecessary AWS charges, it's important to delete all created resources when you're finished with this experiment.

In [None]:
def _best_effort_delete(label, get_resource, wait=False):
    """Delete one resource, printing the outcome instead of raising.

    Args:
        label: Lower-case description used in the printed messages.
        get_resource: Zero-argument callable returning the resource. It is
            called inside the ``try`` so a resource whose creating cell was
            skipped is reported as an error instead of aborting cleanup.
        wait: When True, block until the deletion has completed.
    """
    try:
        resource = get_resource()
        resource.delete()
        if wait:
            resource.wait_for_delete()
        print(f"✅ {label.capitalize()} deleted")
    except Exception as e:  # deliberate best-effort: report and continue
        print(f"❌ Error deleting {label}: {e}")


# Delete both endpoints (this will take a few minutes)
print("🗑️  Deleting provisioned endpoint...")
_best_effort_delete("provisioned endpoint", lambda: sagemaker_endpoint, wait=True)

# The serverless endpoint keeps provisioned concurrency warm, so it must be
# removed as well to stop charges.
print("🗑️  Deleting serverless endpoint...")
_best_effort_delete("serverless endpoint", lambda: serverless_endpoint, wait=True)

# Delete endpoint configurations
print("\n🗑️  Deleting endpoint configurations...")
_best_effort_delete("provisioned endpoint config", lambda: endpoint_config)
_best_effort_delete("serverless endpoint config", lambda: serverless_endpoint_config)

print("\n✨ Cleanup completed!")
print("\n💰 Note: S3 storage costs will continue until you manually delete the bucket contents")
print(f"    Bucket location: {lab_session.base_s3_uri}")