# Environment Setup and Configuration


In [None]:
import sagemaker

# A single SageMaker session backs every lookup in this cell.
sagemaker_session = sagemaker.Session()

# Default S3 bucket reported by the session.
default_bucket = sagemaker_session.default_bucket()

# Ask STS for the caller's identity and keep only the AWS account ID.
caller_identity = sagemaker_session.boto_session.client('sts').get_caller_identity()
account_id = caller_identity['Account']

# Default AWS region of the session; resources will be created there.
region = sagemaker_session.boto_region_name

# Defining Configuration Constants


In [None]:
# --- S3 locations (both built from `default_bucket`) ---
S3_TRAIN_DATA_URI = f"s3://{default_bucket}/datasets/california_housing_train.csv"  # training dataset
MODEL_OUTPUT_PATH = f"s3://{default_bucket}/models/california-housing/"  # trained model artifacts

# --- Permissions ---
# ARN of the IAM role SageMaker uses to access AWS resources (built from `account_id`).
SAGEMAKER_ROLE = f"arn:aws:iam::{account_id}:role/SageMakerDefaultExecution"

# --- Training hardware ---
INSTANCE_TYPE = "ml.m5.large"  # EC2 instance type used for training
INSTANCE_COUNT = 1             # number of instances the training job uses
VOLUME_SIZE_GB = 30            # EBS volume size (GB) attached to the training instance(s)

# Defining the Training Container Image


In [None]:
# Resolve the URI of the scikit-learn training container image.
sklearn_image = sagemaker.image_uris.retrieve(
    framework="sklearn",          # ML framework whose image is requested
    version="1.2-1",              # scikit-learn version of the container
    py_version="py3",             # Python version inside the container
    region=region,                # AWS region the image is looked up for
    instance_type=INSTANCE_TYPE,  # instance type the image must be compatible with
)

# Show the resolved URI so it can be checked before training.
print("Scikit-learn training image URI:", sklearn_image)


# Configuring Your Training Script


In [None]:
from sagemaker.modules.configs import SourceCode

# Training code: the current directory is the source dir, and train.py is the
# main script SageMaker runs for training.
source_code = SourceCode(entry_script="train.py", source_dir=".")


# Setting Up Compute Resources


In [None]:
from sagemaker.modules.configs import Compute

# Hardware settings for ModelTrainer, taken from the configuration constants.
compute_config = Compute(
    volume_size_in_gb=VOLUME_SIZE_GB,  # EBS volume size (GB) per instance
    instance_count=INSTANCE_COUNT,     # how many instances the job launches
    instance_type=INSTANCE_TYPE,       # EC2 instance type used for training
)


# Configuring Model Output Location


In [None]:
from sagemaker.modules.configs import OutputDataConfig

# Output settings for ModelTrainer: results are stored under MODEL_OUTPUT_PATH.
output_config = OutputDataConfig(s3_output_path=MODEL_OUTPUT_PATH)

# Creating the ModelTrainer Instance


In [None]:
from sagemaker.modules.train import ModelTrainer

# Assemble the trainer from the pieces configured in the previous cells.
model_trainer = ModelTrainer(
    # Job identity and permissions
    base_job_name="sklearn-modeltrainer",  # prefix for the SageMaker training job name
    role=SAGEMAKER_ROLE,                   # IAM role SageMaker uses to access AWS resources
    # What runs
    training_image=sklearn_image,          # scikit-learn training container image URI
    source_code=source_code,               # source directory and entry script
    # Where it runs and where results go
    compute=compute_config,                # instance type, count and volume size
    output_data_config=output_config,      # S3 path for model artifacts
)


# Defining Input Data Configuration


In [None]:
from sagemaker.modules.configs import InputData

# Single input channel named "train", fed from the training dataset's S3 URI.
train_channel = InputData(channel_name="train", data_source=S3_TRAIN_DATA_URI)

# train() receives the channels as a list.
input_data = [train_channel]

# Launching the Asynchronous Training Job


In [None]:
# Start the training job asynchronously: wait=False means this call does not
# block until the job finishes.
model_trainer.train(
    wait=False,
    input_data_config=input_data,
)

# Retrieving Training Job Information


In [None]:
# ModelTrainer exposes the job it launched most recently through a private attribute.
latest_job = model_trainer._latest_training_job

# Fail with a clear message if train() has not been called on this trainer yet,
# instead of an AttributeError on None further down.
if latest_job is None:
    raise RuntimeError(
        "No training job has been launched from this ModelTrainer yet; "
        "run the cell that calls model_trainer.train() first."
    )

# The job was started with wait=False, so the status stored on the job object can
# be out of date. Re-fetch the job description so the printed status is current,
# including when this cell is re-run later to check on progress.
latest_job.refresh()

# Print the training job name and status
print(f"Training job name: {latest_job.training_job_name}")
print(f"Training job status: {latest_job.training_job_status}")

# Retrieving Model Artifacts from Completed Jobs


In [None]:
# List completed training jobs, newest first, restricted to jobs whose name
# contains the ModelTrainer base job name.
training_jobs = sagemaker_session.sagemaker_client.list_training_jobs(
    SortBy='CreationTime',
    SortOrder='Descending',
    StatusEquals='Completed',
    NameContains='sklearn-modeltrainer'  # Filter for ModelTrainer jobs
)

# The training job in this notebook is launched asynchronously, so there may be
# no completed job yet. Report that explicitly instead of failing with a bare
# IndexError on the [0] lookup below.
completed_job_summaries = training_jobs['TrainingJobSummaries']
if not completed_job_summaries:
    raise RuntimeError(
        "No completed training job with 'sklearn-modeltrainer' in its name was found. "
        "Wait for the training job to reach the 'Completed' status, then re-run this cell."
    )

# Results are sorted by creation time, descending, so the first entry is the latest job.
TRAINING_JOB_NAME = completed_job_summaries[0]['TrainingJobName']

# Path to test data
TEST_DATA_FILE = "data/california_housing_test.csv"

# Since ModelTrainer doesn't have attach() method,
# we use SageMaker Session to get training job details
training_job_details = sagemaker_session.describe_training_job(TRAINING_JOB_NAME)

# Get model S3 location
model_s3_uri = training_job_details['ModelArtifacts']['S3ModelArtifacts']