## Config

In [11]:
# Complete SageMaker XGBoost Examples with California Housing Dataset
# Uses three different estimator types, each with its own S3 prefix in a shared bucket

import os
import tempfile

import boto3
import numpy as np
import pandas as pd
import sagemaker
from sagemaker import image_uris
from sagemaker.estimator import Estimator
from sagemaker.sklearn.estimator import SKLearn
from sklearn.datasets import dump_svmlight_file
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

In [2]:
# Set up the SageMaker session, execution role, and region
session = sagemaker.Session()
role = sagemaker.get_execution_role()
region = session.boto_region_name

# Resolve the bucket from the session instead of hard-coding an
# account-specific name, so the notebook also runs in other accounts/regions.
# default_bucket() follows the "sagemaker-<region>-<account-id>" naming scheme.
bucket_base = session.default_bucket()

# Each estimator type gets its own prefix inside the shared bucket.
# (The *_bucket names are kept because later cells reference them.)
builtin_bucket = f's3://{bucket_base}/estimator-builtin'
framework_bucket = f's3://{bucket_base}/estimator-framework'
custom_bucket = f's3://{bucket_base}/estimator-custom'

print(f"Region: {region}")
print(f"Built-in bucket: {builtin_bucket}")
print(f"Framework bucket: {framework_bucket}")
print(f"Custom bucket: {custom_bucket}")

Region: ap-southeast-1
Built-in bucket: s3://sagemaker-ap-southeast-1-215470142970/estimator-builtin
Framework bucket: s3://sagemaker-ap-southeast-1-215470142970/estimator-framework
Custom bucket: s3://sagemaker-ap-southeast-1-215470142970/estimator-custom


In [3]:
# Fetch the California housing data and wrap it in pandas objects
print("Loading California housing dataset...")
housing = fetch_california_housing()
X = pd.DataFrame(data=housing.data, columns=housing.feature_names)
y = pd.Series(data=housing.target, name='target')

# Two-stage split: hold out 20% for test, then carve 25% of the remainder
# off for validation (60/20/20 overall). Both stages use the same seed.
X_rest, X_test, y_rest, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_rest, y_rest, test_size=0.25, random_state=42)

for split_name, split_features in (("Training", X_train), ("Validation", X_val), ("Test", X_test)):
    print(f"{split_name} set size: {split_features.shape}")


Loading California housing dataset...
Training set size: (12384, 8)
Validation set size: (4128, 8)
Test set size: (4128, 8)


## Built-in Estimators

In [4]:
def prepare_data_for_builtin():
    """Write the train/validation splits in libsvm format and upload them to S3.

    Serialises the notebook-level ``X_train``/``y_train`` and ``X_val``/``y_val``
    splits with ``dump_svmlight_file`` and uploads the resulting files to
    ``bucket_base`` under the ``estimator-builtin/data/train`` and
    ``estimator-builtin/data/validation`` key prefixes.

    The files are staged in a temporary directory so that nothing in the
    working directory is overwritten, and the scratch files are removed even
    if serialisation or the upload raises.

    Returns:
        tuple[str, str]: ``(train_s3_path, val_s3_path)`` -- the S3 URIs
        reported by ``session.upload_data`` for the two uploaded files. These
        now point at the objects that were actually written (the validation
        file is uploaded as ``val.libsvm``).
    """
    print("Preparing data for built-in estimator (libsvm format)...")

    with tempfile.TemporaryDirectory() as staging_dir:
        # upload_data derives the object key from the local file name, so the
        # base names are kept identical to the ones used previously.
        train_libsvm = os.path.join(staging_dir, 'train.libsvm')
        val_libsvm = os.path.join(staging_dir, 'val.libsvm')

        # Convert to libsvm format
        dump_svmlight_file(X_train, y_train, train_libsvm)
        dump_svmlight_file(X_val, y_val, val_libsvm)

        # Upload and keep the URIs returned by the SDK rather than rebuilding
        # them by hand (the hand-built validation URI used the wrong file name).
        train_s3_path = session.upload_data(
            path=train_libsvm, bucket=bucket_base, key_prefix='estimator-builtin/data/train'
        )
        val_s3_path = session.upload_data(
            path=val_libsvm, bucket=bucket_base, key_prefix='estimator-builtin/data/validation'
        )

    return train_s3_path, val_s3_path

In [5]:
def run_builtin_estimator():
    """Train XGBoost on the housing data using the SageMaker built-in algorithm image.

    Stages the libsvm data in S3, resolves the XGBoost 1.5-1 container for the
    current region, configures a single ``ml.m5.xlarge`` training job and runs
    it against the ``train`` and ``validation`` channel prefixes.

    Returns:
        Estimator: the estimator object after ``fit`` has returned.
    """
    rule = "=" * 50
    print("\n" + rule)
    print("RUNNING BUILT-IN ALGORITHM ESTIMATOR")
    print(rule)

    # Stage the data; the channels below are addressed by prefix, so the
    # per-file URIs returned by the helper are not needed here.
    prepare_data_for_builtin()

    # Built-in XGBoost container image for this region
    xgb_image = image_uris.retrieve('xgboost', region, version='1.5-1')
    output_location = f'{builtin_bucket}/output'

    xgb_builtin = Estimator(
        image_uri=xgb_image,
        role=role,
        instance_count=1,
        instance_type='ml.m5.xlarge',
        output_path=output_location,
        base_job_name='xgboost-builtin-housing',
    )

    # Regression objective evaluated with RMSE
    xgb_hyperparameters = {
        'objective': 'reg:squarederror',
        'num_round': 100,
        'max_depth': 5,
        'eta': 0.2,
        'subsample': 0.9,
        'colsample_bytree': 0.8,
        'eval_metric': 'rmse',
    }
    xgb_builtin.set_hyperparameters(**xgb_hyperparameters)

    print("Starting training for built-in estimator...")

    # One S3 prefix per input channel
    data_channels = {
        'train': f'{builtin_bucket}/data/train',
        'validation': f'{builtin_bucket}/data/validation',
    }
    xgb_builtin.fit(data_channels)

    print("Built-in model training completed!")
    print(f"Model artifacts saved to: {output_location}")

    return xgb_builtin

In [6]:
# Keep a handle on the trained estimator so later cells can reuse it
# (otherwise it is only reachable through the cell's Out[...] entry).
builtin_estimator = run_builtin_estimator()


RUNNING BUILT-IN ALGORITHM ESTIMATOR
Preparing data for built-in estimator (libsvm format)...


INFO:sagemaker:Creating training-job with name: xgboost-builtin-housing-2025-09-27-11-46-53-483


Starting training for built-in estimator...
2025-09-27 11:46:56 Starting - Starting the training job...
2025-09-27 11:47:09 Starting - Preparing the instances for training...
2025-09-27 11:47:49 Downloading - Downloading the training image......
  from pandas import MultiIndex, Int64Index[0m
[34m[2025-09-27 11:48:52.922 ip-10-0-116-239.ap-southeast-1.compute.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-09-27 11:48:52.944 ip-10-0-116-239.ap-southeast-1.compute.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2025-09-27:11:48:53:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-09-27:11:48:53:INFO] Failed to parse hyperparameter eval_metric value rmse to Json.[0m
[34mReturning the value itself[0m
[34m[2025-09-27:11:48:53:INFO] Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[34mReturning the value itself[0m
[34m[2025-09-27:11:48:53:INFO] No GPUs detected

<sagemaker.estimator.Estimator at 0x7fcf3a53e080>

## Framework Estimator (Scikit-learn)
Using the SageMaker Scikit-learn framework container with a custom training script (`train_framework.py`); the job name and hyperparameters below configure a RandomForest model:

In [7]:
!pip install xgboost scikit-learn pandas joblib

Collecting xgboost
  Downloading xgboost-3.0.5-py3-none-manylinux2014_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-3.0.5-py3-none-manylinux2014_x86_64.whl (4.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/4.9 MB[0m [31m60.0 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-3.0.5


In [8]:
def prepare_data_for_framework():
    """Write the train/validation splits as CSV and upload them for the framework job.

    Each CSV holds the feature columns followed by the ``target`` column, with
    a header row and without the DataFrame index. The files are uploaded to
    ``bucket_base`` under the ``estimator-framework/data/train`` and
    ``estimator-framework/data/validation`` key prefixes.

    The CSVs are staged in a temporary directory so that an existing
    ``train.csv``/``validation.csv`` in the working directory is never
    overwritten or deleted, and the scratch files are removed even if the
    upload raises.

    Returns:
        tuple[str, str]: ``(train_s3_path, val_s3_path)`` -- the S3 URIs
        returned by ``session.upload_data`` for the two uploaded files.
    """
    print("Preparing data for framework estimator (CSV format)...")

    # Append the target as the last column of each split
    train_df = pd.concat([X_train, y_train], axis=1)
    val_df = pd.concat([X_val, y_val], axis=1)

    with tempfile.TemporaryDirectory() as staging_dir:
        # upload_data derives the object key from the local file name, so the
        # base names are kept identical to the ones used previously.
        train_csv = os.path.join(staging_dir, 'train.csv')
        val_csv = os.path.join(staging_dir, 'validation.csv')

        train_df.to_csv(train_csv, index=False)
        val_df.to_csv(val_csv, index=False)

        train_s3_path = session.upload_data(
            path=train_csv, bucket=bucket_base, key_prefix='estimator-framework/data/train'
        )
        val_s3_path = session.upload_data(
            path=val_csv, bucket=bucket_base, key_prefix='estimator-framework/data/validation'
        )

    return train_s3_path, val_s3_path

In [9]:
def run_framework_estimator():
    """Train a model through the SageMaker Scikit-learn framework estimator.

    Stages the CSV data in S3, then launches ``train_framework.py`` in the
    Scikit-learn 1.0-1 container on an ``ml.m5.xlarge`` instance. The
    hyperparameters passed to the script are RandomForest-style settings
    (presumably consumed by ``train_framework.py`` -- the script is not part
    of this notebook).

    Returns:
        SKLearn: the estimator object after ``fit`` has returned.
    """
    rule = "=" * 50
    print("\n" + rule)
    print("RUNNING FRAMEWORK ESTIMATOR (SCIKIT-LEARN)")
    print(rule)

    # Stage the data; the channels below are addressed by prefix, so the
    # per-file URIs returned by the helper are not needed here.
    prepare_data_for_framework()

    output_location = f'{framework_bucket}/output'
    script_hyperparameters = {
        'n-estimators': 150,
        'max-depth': 10,
        'min-samples-split': 5,
        'min-samples-leaf': 2,
        'random-state': 42,
    }

    sklearn_estimator = SKLearn(
        entry_point='train_framework.py',
        role=role,
        instance_type='ml.m5.xlarge',
        framework_version='1.0-1',
        py_version='py3',
        output_path=output_location,
        base_job_name='randomforest-framework-housing',
        hyperparameters=script_hyperparameters,
    )

    print("Starting training for framework estimator...")

    # One S3 prefix per input channel
    data_channels = {
        'train': f'{framework_bucket}/data/train',
        'validation': f'{framework_bucket}/data/validation',
    }
    sklearn_estimator.fit(data_channels)

    print("Framework model training completed!")
    print(f"Model artifacts saved to: {output_location}")

    return sklearn_estimator

In [10]:
# Keep a handle on the trained estimator so later cells can reuse it
# (otherwise it is only reachable through the cell's Out[...] entry).
framework_estimator = run_framework_estimator()


RUNNING FRAMEWORK ESTIMATOR (SCIKIT-LEARN)
Preparing data for framework estimator (CSV format)...


INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker:Creating training-job with name: randomforest-framework-housing-2025-09-27-11-52-54-104


Starting training for framework estimator...
2025-09-27 11:52:55 Starting - Starting the training job...
2025-09-27 11:53:28 Downloading - Downloading input data...
2025-09-27 11:53:48 Downloading - Downloading the training image......
2025-09-27 11:55:00 Training - Training image download completed. Training in progress.
2025-09-27 11:55:00 Uploading - Uploading generated training model[34m2025-09-27 11:54:45,611 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2025-09-27 11:54:45,615 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2025-09-27 11:54:45,618 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2025-09-27 11:54:45,632 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2025-09-27 11:54:45,875 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2025-09-27 11:54:45,87

<sagemaker.sklearn.estimator.SKLearn at 0x7fcf42adaf50>

## Custom Estimator

In [15]:
def prepare_data_for_custom():
    """Write the train/validation splits as CSV and upload them for the custom-container job.

    Each CSV holds the feature columns followed by the ``target`` column, with
    a header row and without the DataFrame index. The files are uploaded to
    ``bucket_base`` under the ``estimator-custom/data/train`` and
    ``estimator-custom/data/validation`` key prefixes.

    The CSVs are staged in a temporary directory so that existing files of the
    same name in the working directory are never overwritten or deleted, and
    the scratch files are removed even if the upload raises.

    Returns:
        tuple[str, str]: ``(train_s3_path, val_s3_path)`` -- the S3 URIs
        returned by ``session.upload_data`` for the two uploaded files.
    """
    print("Preparing data for custom estimator (CSV format)...")

    # Append the target as the last column of each split
    train_df = pd.concat([X_train, y_train], axis=1)
    val_df = pd.concat([X_val, y_val], axis=1)

    with tempfile.TemporaryDirectory() as staging_dir:
        # upload_data derives the object key from the local file name, so the
        # base names are kept identical to the ones used previously.
        train_csv = os.path.join(staging_dir, 'train_custom.csv')
        val_csv = os.path.join(staging_dir, 'validation_custom.csv')

        train_df.to_csv(train_csv, index=False)
        val_df.to_csv(val_csv, index=False)

        train_s3_path = session.upload_data(
            path=train_csv, bucket=bucket_base, key_prefix='estimator-custom/data/train'
        )
        val_s3_path = session.upload_data(
            path=val_csv, bucket=bucket_base, key_prefix='estimator-custom/data/validation'
        )

    return train_s3_path, val_s3_path

In [None]:
# def run_custom_estimator():
#     """Run XGBoost using custom estimator (requires Docker build)"""
#     print("\n" + "="*50)
#     print("RUNNING CUSTOM ESTIMATOR")
#     print("="*50)

#     # Prepare data
#     train_s3, val_s3 = prepare_data_for_custom()

#     # Create training script and Dockerfile
#     with open('train_custom.py', 'w') as f:
#         f.write(train_script_custom)

#     with open('Dockerfile', 'w') as f:
#         f.write(dockerfile_content)

#     print("Created training script and Dockerfile for custom estimator")
#     print("\nTo use the custom estimator, you need to:")
#     print("1. Build the Docker image:")
#     print("   docker build -t custom-xgboost-housing .")
#     print("2. Tag and push to ECR:")
#     print(f"   aws ecr create-repository --repository-name custom-xgboost-housing --region {region}")
#     print("   aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com")
#     print("   docker tag custom-xgboost-housing:latest {account_id}.dkr.ecr.{region}.amazonaws.com/custom-xgboost-housing:latest")
#     print("   docker push {account_id}.dkr.ecr.{region}.amazonaws.com/custom-xgboost-housing:latest")
#     print("3. Then run the custom estimator code below")

#     # For demonstration, here's how you would create the custom estimator
#     # (This assumes the Docker image has been built and pushed to ECR)

#     account_id = boto3.client('sts').get_caller_identity().get('Account')
#     image_uri = f"{account_id}.dkr.ecr.{region}.amazonaws.com/custom-xgboost-housing:latest"

#     print(f"\nCustom estimator code (run after Docker setup):")
#     print(f"""
# custom_estimator = Estimator(
#     image_uri='{image_uri}',
#     role='{role}',
#     instance_count=1,
#     instance_type='ml.m5.xlarge',
#     output_path='{custom_bucket}/output',
#     base_job_name='xgboost-custom-housing',
#     hyperparameters={{
#         'max-depth': 8,
#         'learning-rate': 0.1,
#         'n-estimators': 200,
#         'custom-preprocessing': 'advanced',
#         'feature-engineering': 'advanced'
#     }}
# )

# # Train the model
# custom_estimator.fit({{
#     'train': '{custom_bucket}/data/train',
#     'validation': '{custom_bucket}/data/validation'
# }})
# """)
    
#     return None  # Return None since we can't actually run without Docker setup


In [19]:
def run_custom_estimator():
    """Train with a bring-your-own container pulled from the account's ECR registry.

    Stages the CSV data in S3, builds the image URI for the
    ``custom-xgboost-housing:latest`` repository in the caller's account and
    region, and runs a single ``ml.m5.xlarge`` training job with it.

    Prerequisites (not checked here; job creation fails if they are not met):
        1. The Docker image has been built and pushed to ECR.
        2. The image URI is accessible from this account.
        3. SageMaker has permission to pull from ECR.

    Returns:
        Estimator: the estimator object after ``fit`` has returned.
    """
    rule = "=" * 50
    print("\n" + rule)
    print("RUNNING CUSTOM ESTIMATOR")
    print(rule)

    # Stage the data; the channels below are addressed by prefix, so the
    # per-file URIs returned by the helper are not needed here.
    prepare_data_for_custom()

    # The ECR registry host is derived from the caller's account id and region
    sts_client = boto3.client('sts')
    account_id = sts_client.get_caller_identity().get('Account')
    image_uri = f"{account_id}.dkr.ecr.{region}.amazonaws.com/custom-xgboost-housing:latest"

    print(f"Using custom Docker image: {image_uri}")

    output_location = f'{custom_bucket}/output'
    container_hyperparameters = {
        'max-depth': 8,
        'learning-rate': 0.1,
        'n-estimators': 200,
        'custom-preprocessing': 'advanced',
        'feature-engineering': 'advanced',
    }

    custom_estimator = Estimator(
        image_uri=image_uri,
        role=role,
        instance_count=1,
        instance_type='ml.m5.xlarge',
        output_path=output_location,
        base_job_name='xgboost-custom-housing',
        hyperparameters=container_hyperparameters,
    )

    print("Starting training with custom estimator...")

    # One S3 prefix per input channel
    data_channels = {
        'train': f'{custom_bucket}/data/train',
        'validation': f'{custom_bucket}/data/validation',
    }
    custom_estimator.fit(data_channels)

    print("Custom estimator training completed!")
    print(f"Model artifacts saved to: {output_location}")

    return custom_estimator

In [20]:
# Keep a handle on the trained estimator so later cells can reuse it
# (otherwise it is only reachable through the cell's Out[...] entry).
custom_estimator = run_custom_estimator()


RUNNING CUSTOM ESTIMATOR
Preparing data for custom estimator (CSV format)...


INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker:Creating training-job with name: xgboost-custom-housing-2025-09-27-12-33-00-770


Using custom Docker image: 215470142970.dkr.ecr.ap-southeast-1.amazonaws.com/custom-xgboost-housing:latest
Starting training with custom estimator...


ERROR:sagemaker:Please check the troubleshooting guide for common errors: https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-python-sdk-troubleshooting.html#sagemaker-python-sdk-troubleshooting-create-training-job
