In [None]:
import sagemaker
from sagemaker.sklearn.estimator import SKLearn

# One SageMaker session object is created here; later cells read it by name.
sagemaker_session = sagemaker.Session()

# Ask the session which S3 bucket it treats as its default.
default_bucket = sagemaker_session.default_bucket()
print(f"Default SageMaker bucket: {default_bucket}")


In [None]:
# Configuration for the training job: data location, output location,
# compute settings and the IAM role passed to SageMaker.

# S3 data URI where the training data is stored
S3_TRAIN_DATA_URI = f"s3://{default_bucket}/datasets/california_housing_train.csv"

# Custom output location for model artifacts
MODEL_OUTPUT_PATH = f"s3://{default_bucket}/models/california-housing/"

# Type of instance to use for training
INSTANCE_TYPE = "ml.m5.large"
# Number of instances to use for training
INSTANCE_COUNT = 1

# Account ID of the current caller, looked up through STS on the session's boto session
account_id = sagemaker_session.boto_session.client('sts').get_caller_identity()['Account']
# SageMaker execution role ARN, built from the account ID and a fixed role name.
# NOTE(review): assumes a role named "SageMakerDefaultExecution" exists in this
# account — confirm before running.
SAGEMAKER_ROLE = f"arn:aws:iam::{account_id}:role/SageMakerDefaultExecution"

# Echo each resolved setting exactly once.
print(f"SageMaker Role: {SAGEMAKER_ROLE}")
print(f"Training data URI: {S3_TRAIN_DATA_URI}")
print(f"Model output path: {MODEL_OUTPUT_PATH}")
print(f"Instance type: {INSTANCE_TYPE}")
print(f"Instance count: {INSTANCE_COUNT}")
print(f"Account ID: {account_id}")


In [None]:

# Build the scikit-learn estimator from the settings defined in the config cell.
sklearn_estimator = SKLearn(
    # --- training code and runtime ---
    entry_point="train.py",               # script that holds the training code
    framework_version="1.2-1",            # scikit-learn framework version
    py_version="py3",                     # Python version for the training environment
    # NOTE(review): script_mode does not appear among SKLearn's documented
    # parameters — confirm it is accepted/needed by the installed SDK version.
    script_mode=True,
    # --- compute ---
    instance_type=INSTANCE_TYPE,          # instance type used for training
    instance_count=INSTANCE_COUNT,        # how many instances to launch
    # --- access and session ---
    role=SAGEMAKER_ROLE,                  # IAM role handed to SageMaker
    sagemaker_session=sagemaker_session,  # session used to talk to SageMaker
    # --- output ---
    output_path=MODEL_OUTPUT_PATH,        # S3 location for model artifacts
)

print(f"Model artifacts will be saved to: {sklearn_estimator.output_path}")

In [None]:
# Submit the training job with the dataset on the 'train' channel.
# wait=False asks fit() not to block until the job finishes.
sklearn_estimator.fit(inputs={'train': S3_TRAIN_DATA_URI}, wait=False)
print("Training job started...")

In [None]:
# Recap of the submitted run: input location, output location, and job name.
training_job = sklearn_estimator.latest_training_job
print(f"Training data URI: {S3_TRAIN_DATA_URI}")
print(f"Model output path: {MODEL_OUTPUT_PATH}")
print(f"Training job name: {training_job.name}")


In [None]:
# Fetch the description of the estimator's latest training job, then read
# its 'TrainingJobStatus' field.
job_description = sklearn_estimator.latest_training_job.describe()
job_status = job_description['TrainingJobStatus']
print(f"Current training job status: {job_status}")


In [None]:

# Report the output location configured on the estimator; artifacts are
# expected there only once the training job has completed.
artifact_location = sklearn_estimator.output_path
print(f"Model artifacts will be saved to: {artifact_location}")
