## Training with a generic model library

Using the scikit-learn image we have more control over the libraries installed and over the training and inference scripts that run inside the container.

In [None]:
# Enable IPython's autoreload extension. Mode 2 re-imports modules before each
# cell runs, so edits to local project code are picked up without restarting
# the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from corelab.core.session import CoreLabSession

# Create the lab session that every later cell reads its names, S3 locations,
# role and region from. The AWS credentials come from the named local profile.
lab_session = CoreLabSession(
    'sklearn',
    'customer-churn',
    default_folder='sklearn_notebook',
    create_run_folder=True,
    aws_profile='sagemaker-role',
)
lab_session.print()

# Short alias for the underlying core session (used for S3 reads and uploads).
core_session = lab_session.core_session

In [None]:
# Instance type handed to the training job's ResourceConfig.
DEFAULT_INSTANCE = 'ml.m5.large'

In [None]:
from sklearn.model_selection import train_test_split
from io import StringIO
import pandas as pd

# Download the synthetic churn dataset from the region-specific example bucket
# and parse it as CSV.
data = core_session.read_s3_file(f"sagemaker-example-files-prod-{lab_session.region}",
                                 "datasets/tabular/synthetic/churn.txt")

df = pd.read_csv(StringIO(data))

# Phone number is unique - will not add value to classifier
df = df.drop(columns=["Phone"])

# Cast Area Code to non-numeric so it is one-hot encoded below instead of
# being treated as a quantity.
df["Area Code"] = df["Area Code"].astype(object)

# Remove one feature from each highly correlated pair
df = df.drop(columns=["Day Charge", "Eve Charge", "Night Charge", "Intl Charge"])

# One-hot encode categorical features into numeric features, then move the
# target column ("Churn?_True.") to the front and drop its redundant complement.
model_data = pd.get_dummies(df)
model_data = pd.concat(
    [
        model_data["Churn?_True."],
        model_data.drop(columns=["Churn?_False.", "Churn?_True."]),
    ],
    axis=1,
)
model_data = model_data.astype(float)

# Split data into train (67%) and a held-out remainder (33%)
train_data, validation_data = train_test_split(model_data, test_size=0.33, random_state=42)

# Further split the remainder into validation (67% of it) and test (33% of it).
validation_data, test_data = train_test_split(validation_data, test_size=0.33, random_state=42)

# Remove and store the target column for the test data. This is used for calculating performance metrics after training, on unseen data.
# A non-mutating drop is used on purpose: `test_data` comes out of
# train_test_split as a slice of `model_data`, and dropping in place on such a
# slice is what triggers pandas' SettingWithCopyWarning.
test_target_column = test_data["Churn?_True."]
test_data = test_data.drop(columns=["Churn?_True."])

# Store all datasets locally (with header row, without the index)
train_data.to_csv("train.csv", header=True, index=False)
validation_data.to_csv("validation.csv", header=True, index=False)
test_data.to_csv("test.csv", header=True, index=False)

# Upload each dataset to S3
s3_train_input = core_session.upload_data("train.csv")
s3_validation_input = core_session.upload_data("validation.csv")
s3_test_input = core_session.upload_data("test.csv")

print("Datasets uploaded to:")
print(s3_train_input)
print(s3_validation_input)
print(s3_test_input)

In [None]:
from sagemaker_core.shapes import AlgorithmSpecification, OutputDataConfig, DataSource, S3DataSource, Channel, \
    ResourceConfig

def _csv_channel(name, s3_uri):
    """Build a fully replicated CSV input channel that reads from an S3 prefix.

    Args:
        name: Channel name as seen by the training container.
        s3_uri: S3 location of the channel's data.

    Returns:
        The configured ``Channel``.
    """
    s3_source = S3DataSource(
        s3_data_type="S3Prefix",
        s3_uri=s3_uri,
        s3_data_distribution_type="FullyReplicated",
    )
    return Channel(
        channel_name=name,
        content_type="csv",
        data_source=DataSource(s3_data_source=s3_source),
    )


# Resolve the container image for version tag '1.2-1' and show which one is used.
image = lab_session.retrieve_image('1.2-1')
print("Docker image:", image)

# The training container receives its input in "File" mode.
algorithm_spec = AlgorithmSpecification(training_image=image, training_input_mode="File")

# One input channel per uploaded dataset.
channel_train = _csv_channel("train", s3_train_input)
channel_validation = _csv_channel("validation", s3_validation_input)

# Where the job writes its output.
output_data_config = OutputDataConfig(s3_output_path=lab_session.jobs_output_s3_uri)

# Single training instance with a 30 GB volume.
# Optional, currently unused: keep_alive_period_in_seconds=60*60
training_instance_config = ResourceConfig(
    instance_type=DEFAULT_INSTANCE,
    instance_count=1,
    volume_size_in_gb=30,
)

In [None]:
from sagemaker_core.main.shapes import StoppingCondition
from sagemaker_core.main.resources import TrainingJob
from sagemaker import fw_utils

# Export the project's dependencies (dev dependencies excluded, no hashes) into
# src/requirements.txt so the file is part of the code bundle uploaded below.
!uv export --format=requirements.txt -o src/requirements.txt --no-dev -q --no-hashes

print("Requirements exported to src/requirements.txt")

# Package the ./src directory (training script + requirements.txt) and upload
# it to the session's training-code location on S3.
upload_destination = lab_session.training_code_upload
uploaded_code = fw_utils.tar_and_upload_dir(
    session=lab_session.core_session.boto_session,  # boto session used for the upload
    bucket=upload_destination.bucket,
    s3_key_prefix=upload_destination.prefix,
    script='train.py',  # Entry point script
    directory='./src'  # Directory containing your code
)
print("Uploaded code:", uploaded_code)

# All hyperparameter values are passed as strings. The two `sagemaker_*` keys
# carry the uploaded script name and the S3 location of the code bundle; the
# remaining keys are presumably consumed by train.py (verify against the script).
hyperparameters = {
    'sagemaker_program': uploaded_code.script_name,  # 'train.py'
    'sagemaker_submit_directory': uploaded_code.s3_prefix,  # S3 URI
    'n_estimators': '100',
    'max_depth': '10',
    'target_column': 'Churn?_True.'
}

# Create training job from the algorithm spec, channels, output config and
# instance config built in the previous cell. Runtime is capped at 600 seconds.
training_job = TrainingJob.create(
    training_job_name=lab_session.training_job_name,
    hyper_parameters=hyperparameters,
    algorithm_specification=algorithm_spec,
    role_arn=lab_session.role,
    input_data_config=[
        channel_train,
        channel_validation
    ],
    output_data_config=output_data_config,
    resource_config=training_instance_config,
    stopping_condition=StoppingCondition(max_runtime_in_seconds=600),
    session=lab_session.core_session.boto_session,
    region=lab_session.region
)

# Wait for the training job to complete
training_job.wait()

In [9]:
from sagemaker_core.resources import Model
from sagemaker_core.shapes import ContainerDefinition
from corelab.core.utils import try_delete

!uv export --format=requirements.txt -o src/requirements.txt --no-dev -q --no-hashes

model_s3_uri = training_job.model_artifacts.s3_model_artifacts  # Get URI of model artifacts from the training job.
# model_s3_uri = best_training_job.model_artifacts.s3_model_artifacts  # Get URI of model artifacts of the best model from the tuning job.

try_delete(Model, lab_session.model_name)

infer_destination = lab_session.inference_code_upload
uploaded_infer = fw_utils.tar_and_upload_dir(
    lab_session.core_session.boto_session,
    infer_destination.bucket,
    infer_destination.prefix,
    script='inference.py', directory='./src')
# Create SageMaker model: An image along with the model artifact to use.
customer_churn_model = Model.create(
    model_name=lab_session.model_name,
    primary_container=ContainerDefinition(image=image, model_data_url=model_s3_uri,
                                          environment={"SAGEMAKER_PROGRAM": "inference.py", "SAGEMAKER_SUBMIT_DIRECTORY": uploaded_infer.s3_prefix}),
    execution_role_arn=lab_session.role,
)
print("Model created:", customer_churn_model.model_arn)

customer-churn-sklearn deleted


Model created: model_name='customer-churn-sklearn' primary_container=ContainerDefinition(container_hostname=<sagemaker_core.main.utils.Unassigned object at 0x10a6a1550>, image='492215442770.dkr.ecr.eu-central-1.amazonaws.com/sagemaker-scikit-learn:1.2-1-cpu-py3', image_config=<sagemaker_core.main.utils.Unassigned object at 0x10a6a1550>, mode='SingleModel', model_data_url='s3://sagemaker-eu-central-1-136548476532/sklearn_notebook/2025-09-10T15-17-18/jobs/sklearn-2025-09-10T15-17-30/output/model.tar.gz', model_data_source=ModelDataSource(s3_data_source=S3ModelDataSource(s3_uri='s3://sagemaker-eu-central-1-136548476532/sklearn_notebook/2025-09-10T15-17-18/jobs/sklearn-2025-09-10T15-17-30/output/model.tar.gz', s3_data_type='S3Object', compression_type='Gzip', model_access_config=<sagemaker_core.main.utils.Unassigned object at 0x10a6a1550>, hub_access_config=<sagemaker_core.main.utils.Unassigned object at 0x10a6a1550>, manifest_s3_uri=<sagemaker_core.main.utils.Unassigned object at 0x10a6a1

In [10]:
from corelab.core.utils import try_delete
from sagemaker_core.resources import Endpoint, EndpointConfig
from sagemaker_core.shapes import ProductionVariant, ProductionVariantServerlessConfig

# Patch a bug in sagemaker-core's snake_case -> PascalCase conversion: fields
# ending in acronyms (GB, MB, MIME) need explicit mappings.
# The entries are merged on top of whatever mapping the installed library
# already ships, rather than replacing it wholesale, so no upstream special
# cases are lost. Re-running the cell is harmless.
import sagemaker_core.main.utils as smutils

smutils.SPECIAL_SNAKE_TO_PASCAL_MAPPINGS = {
    **getattr(smutils, "SPECIAL_SNAKE_TO_PASCAL_MAPPINGS", {}),
    "volume_size_in_g_b": "VolumeSizeInGB",
    "volume_size_in_gb": "VolumeSizeInGB",
    "memory_size_in_mb": "MemorySizeInMB",
    "supported_response_mime_types": "SupportedResponseMIMETypes",
}
# end patch

# Remove an endpoint configuration with the same name left over from an earlier run.
try_delete(EndpointConfig, lab_session.serverless_endpoint_config_name)

# Create serverless endpoint configuration with a single variant that serves
# the model created in the previous cell.
serverless_endpoint_config = EndpointConfig.create(
    endpoint_config_name=lab_session.serverless_endpoint_config_name,
    production_variants=[
        ProductionVariant(
            variant_name="ServerlessVariant",
            model_name=customer_churn_model.get_name(),
            serverless_config=ProductionVariantServerlessConfig(
                memory_size_in_mb=2048,  # 2GB memory (valid: 1024, 2048, 3072, 4096, 5120, 6144)
                max_concurrency=10,  # Handle up to 10 concurrent requests
                provisioned_concurrency=1  # Keep 1 instance warm (optional)
            )
        )
    ],
)

# Remove an endpoint with the same name left over from an earlier run.
try_delete(Endpoint, lab_session.serverless_endpoint_name)

# Create serverless endpoint and block until it is ready to serve requests.
serverless_endpoint = Endpoint.create(
    endpoint_name=lab_session.serverless_endpoint_name,
    endpoint_config_name=serverless_endpoint_config.get_name(),
)
serverless_endpoint.wait_for_status(target_status="InService")

customer-churn-sklearn-serverless-config deleted


Output()

customer-churn-sklearn-serverless-endpoint deleted


Output()

In [1]:
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import CSVDeserializer

# Requests are sent as CSV and CSV responses are parsed back into Python
# values, so no manual encoding or decoding is needed around the endpoint call.
request_serializer = CSVSerializer()
response_deserializer = CSVDeserializer()

predictor = Predictor(
    endpoint_name=serverless_endpoint.endpoint_name,
    serializer=request_serializer,
    deserializer=response_deserializer,
)

# Score the held-out test features (target column already removed); the bare
# expression on the last line displays the response as the cell output.
result = predictor.predict(test_data)
result

sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/machiel/Library/Application Support/sagemaker/config.yaml
