# SageMaker Core Pipeline - Data Prep, Training, and Model Creation

This notebook demonstrates how to create a complete ML pipeline using SageMaker Core that includes:
1. Data Processing - Prepare and split the customer churn dataset
2. Model Training - Train an XGBoost model on processed data  
3. Model Creation - Create a deployable SageMaker model from training artifacts

In [None]:
# Enable IPython's autoreload extension. Mode 2 is expected to re-import edited
# modules before each cell runs, so changes to local helper code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Initialize CoreLab Session

In [None]:
from corelab.core.session import CoreLabSession

# Keyword options for the lab session, grouped so they can be tuned in one place.
session_options = {
    "default_folder": "pipeline_notebook",
    "create_run_folder": True,
    "aws_profile": "sagemaker-role",
}

# The two positional arguments are passed through unchanged; their meaning is
# defined by CoreLabSession (presumably algorithm and project name -- confirm).
lab_session = CoreLabSession("xgboost", "customer-churn-pipeline", **session_options)

# Show the resolved session settings for this run.
lab_session.print()

# Keep a handle on the underlying core session exposed by the lab session.
core_session = lab_session.core_session

## Import SageMaker Pipeline Components

Note: Pipeline orchestration uses the SageMaker Python SDK's `sagemaker.workflow` module, not sagemaker-core.

In [None]:
# Pipeline-specific imports from SageMaker SDK
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.steps import ProcessingStep, TrainingStep
from sagemaker.workflow.model_step import ModelStep
from sagemaker.workflow.parameters import (
    ParameterFloat,
    ParameterString
)
from sagemaker.workflow.step_collections import RegisterModel

# Processing imports
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput

# Training imports  
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput

# Model imports
from sagemaker.model import Model
from sagemaker.model_metrics import ModelMetrics, MetricsSource

import json
import time
from datetime import datetime

print("All pipeline modules imported successfully")

## Upload Processing Script

In [None]:
# Stage the preprocessing script (src/preprocessing.py) via the lab session and
# keep the returned S3 URI for the processing step.
code_s3_uri = lab_session.upload_file('src', 'preprocessing.py', 'pipeline-code')

# Input dataset lives in the region-specific public example bucket; pipeline
# outputs go under the session's jobs output prefix.
data_s3_uri = f"s3://sagemaker-example-files-prod-{lab_session.region}/datasets/tabular/synthetic/churn.txt"
pipeline_output_s3_uri = lab_session.jobs_output_s3_uri

# Print all three locations in one call.
location_summary = [
    f"📂 Code S3 URI: {code_s3_uri}",
    f"📁 Data S3 URI: {data_s3_uri}",
    f"📤 Pipeline Output S3 URI: {pipeline_output_s3_uri}",
]
print("\n".join(location_summary))

## Define Pipeline Parameters

Pipeline parameters allow us to customize pipeline executions without modifying the code.

In [None]:
# Runtime-tunable pipeline parameters. Each one can be overridden per execution
# by name; the default applies when no override is supplied.

# --- Processing ---
processing_instance_type = ParameterString(name="ProcessingInstanceType", default_value="ml.m5.large")
train_test_split = ParameterFloat(name="TrainTestSplit", default_value=0.33)

# --- Training ---
training_instance_type = ParameterString(name="TrainingInstanceType", default_value="ml.m5.large")

# These two are declared as string parameters and passed straight into the
# estimator's hyperparameters, alongside the other string-valued hyperparameters.
max_depth = ParameterString(name="MaxDepth", default_value="5")
num_round = ParameterString(name="NumRound", default_value="100")

print("Pipeline parameters defined")

## Step 1: Define Processing Step

This step processes raw data and splits it into train, validation, and test sets.

In [None]:
# Processor that runs the uploaded preprocessing script with python3 inside the
# image returned by lab_session.retrieve_image('1.7-1').
script_processor = ScriptProcessor(
    image_uri=lab_session.retrieve_image('1.7-1'),
    command=["python3"],
    instance_type=processing_instance_type,
    instance_count=1,
    role=lab_session.role,
    # Use the same SDK session as the estimator, the model and the Pipeline
    # itself, so every step is built against one consistent session.
    sagemaker_session=lab_session.sagemaker_session
)

# Processing step: reads the raw churn file and emits three named outputs
# ("train", "validation", "test") under the pipeline output prefix.
step_process = ProcessingStep(
    name="PreprocessCustomerChurnData",
    processor=script_processor,
    inputs=[
        ProcessingInput(
            source=data_s3_uri,
            destination="/opt/ml/processing/input/data"
        )
    ],
    outputs=[
        ProcessingOutput(
            output_name="train",
            source="/opt/ml/processing/output/train",
            destination=f"{pipeline_output_s3_uri}/data/train"
        ),
        ProcessingOutput(
            output_name="validation",
            source="/opt/ml/processing/output/validation",
            destination=f"{pipeline_output_s3_uri}/data/validation"
        ),
        ProcessingOutput(
            output_name="test",
            source="/opt/ml/processing/output/test",
            destination=f"{pipeline_output_s3_uri}/data/test"
        )
    ],
    code=code_s3_uri,
    # Container arguments are strings, so the float parameter is converted with
    # to_string(); the value is still resolved when the pipeline executes.
    job_arguments=["--train-test-split", train_test_split.to_string()]
)

print("✅ Processing step defined")

## Step 2: Define Training Step

This step trains an XGBoost model using the processed data from Step 1.

In [None]:
# Hyperparameters for the training job: max_depth and num_round come from
# pipeline parameters, the rest are fixed string values.
xgboost_hyperparameters = dict(
    max_depth=max_depth,
    eta="0.2",
    gamma="4",
    min_child_weight="6",
    subsample="0.8",
    verbosity="0",
    objective="binary:logistic",
    num_round=num_round,
)

# Estimator for the training job; model artifacts are written under
# <pipeline output>/models.
xgboost_estimator = Estimator(
    image_uri=lab_session.retrieve_image('1.7-1'),
    instance_type=training_instance_type,
    instance_count=1,
    role=lab_session.role,
    output_path=f"{pipeline_output_s3_uri}/models",
    sagemaker_session=lab_session.sagemaker_session,
    hyperparameters=xgboost_hyperparameters,
)

# Referencing the processing step's output properties is what makes training
# depend on (and run after) the processing step.
processed_outputs = step_process.properties.ProcessingOutputConfig.Outputs
training_channels = {
    channel: TrainingInput(
        s3_data=processed_outputs[channel].S3Output.S3Uri,
        content_type="text/csv",
    )
    for channel in ("train", "validation")
}

step_train = TrainingStep(
    name="TrainXGBoostModel",
    estimator=xgboost_estimator,
    inputs=training_channels,
)

print("✅ Training step defined")

## Step 3: Define Model Creation Step

This step creates a SageMaker Model from the trained model artifacts.

In [None]:
# Model backed by the artifacts the training step will produce; the S3 location
# is a step property that is resolved when the pipeline runs.
trained_artifacts = step_train.properties.ModelArtifacts.S3ModelArtifacts
model = Model(
    image_uri=lab_session.retrieve_image('1.7-1'),
    model_data=trained_artifacts,
    role=lab_session.role,
    sagemaker_session=lab_session.sagemaker_session,
)

# NOTE(review): step_args expects model.create() to return step arguments rather
# than create the model immediately -- this assumes lab_session.sagemaker_session
# is a pipeline-aware session; confirm.
create_model_args = model.create(instance_type="ml.m5.large")

step_create_model = ModelStep(
    name="CreateXGBoostModel",
    step_args=create_model_args,
)

print("✅ Model creation step defined")

## Optional: Model Registry Step

Register the model in SageMaker Model Registry for versioning and deployment management.

In [None]:
# Optional metrics attached to the registered model package.
# NOTE(review): no step visible in this notebook writes evaluation/statistics.json;
# confirm the file exists before enabling the registration step.
statistics_source = MetricsSource(
    s3_uri=f"{pipeline_output_s3_uri}/evaluation/statistics.json",
    content_type="application/json",
)
model_metrics = ModelMetrics(model_statistics=statistics_source)

# Instance types and payload formats advertised by the model package.
supported_mime_types = ["text/csv"]
inference_instance_types = ["ml.m5.large", "ml.m5.xlarge"]
transform_instance_types = ["ml.m5.large"]

# Registers the trained model into the "customer-churn-models" package group;
# new versions wait for manual approval.
step_register_model = RegisterModel(
    name="RegisterXGBoostModel",
    estimator=xgboost_estimator,
    model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
    content_types=supported_mime_types,
    response_types=list(supported_mime_types),
    inference_instances=inference_instance_types,
    transform_instances=transform_instance_types,
    model_package_group_name="customer-churn-models",
    approval_status="PendingManualApproval",
    model_metrics=model_metrics,
)

print("✅ Model registration step defined")

## Define the Pipeline

In [None]:
# The name embeds a timestamp, so every run of this cell yields a new pipeline
# name (and therefore a new pipeline when it is later upserted).
run_stamp = datetime.now().strftime('%Y%m%d-%H%M%S')
pipeline_name = f"customer-churn-pipeline-{run_stamp}"

# Parameters that can be overridden when an execution is started.
pipeline_parameters = [
    processing_instance_type,
    train_test_split,
    training_instance_type,
    max_depth,
    num_round,
]

# Steps in the pipeline, listed in the intended processing -> training -> model order.
pipeline_steps = [
    step_process,
    step_train,
    step_create_model,
    # step_register_model  # Uncomment to include model registry
]

pipeline = Pipeline(
    name=pipeline_name,
    parameters=pipeline_parameters,
    steps=pipeline_steps,
    sagemaker_session=lab_session.sagemaker_session,
)

print(f"🚀 Pipeline Name: {pipeline_name}")
print(f"📊 Pipeline Steps: {len(pipeline.steps)}")

## Validate Pipeline Definition

In [None]:
# Build the pipeline definition; this fails early if a step is misconfigured.
# Pipeline.definition() returns the definition as a JSON string, so it must be
# parsed before the steps can be inspected as a dictionary.
pipeline_definition = json.loads(pipeline.definition())
print("Pipeline definition validated successfully!")
print(f"\nPipeline has {len(pipeline_definition['Steps'])} steps:")
for step in pipeline_definition['Steps']:
    print(f"  - {step['Name']}: {step['Type']}")

## Create/Update and Execute Pipeline

In [None]:
# Upsert registers the pipeline if the name is new, or updates the existing
# definition otherwise; it runs under the lab session's role.
pipeline_role_arn = lab_session.role
pipeline.upsert(role_arn=pipeline_role_arn)

print(f"✅ Pipeline '{pipeline_name}' created/updated successfully")

In [None]:
# Explicit values for every pipeline parameter, keyed by parameter name.
# These currently match the defaults declared on the parameters.
execution_parameters = {
    "ProcessingInstanceType": "ml.m5.large",
    "TrainingInstanceType": "ml.m5.large",
    "TrainTestSplit": 0.33,
    "MaxDepth": "5",
    "NumRound": "100",
}

# Kick off an execution of the upserted pipeline.
execution = pipeline.start(parameters=execution_parameters)

print("🚀 Pipeline execution started")
print(f"📝 Execution ARN: {execution.arn}")

# Query the service once for the initial status.
initial_status = execution.describe()['PipelineExecutionStatus']
print(f"📊 Status: {initial_status}")

## Monitor Pipeline Execution

In [None]:
# Poll the execution until it reaches a terminal state, printing the overall
# status and each step's status on every poll.
TERMINAL_STATUSES = {'Succeeded', 'Failed', 'Stopped'}
POLL_INTERVAL_SECONDS = 30

while True:
    description = execution.describe()
    status = description['PipelineExecutionStatus']
    print(f"Pipeline Status: {status}")

    # Report step statuses before checking for termination so the final
    # per-step state is shown as well.
    for step in execution.list_steps():
        print(f"  - {step['StepName']}: {step['StepStatus']}")

    if status in TERMINAL_STATUSES:
        break

    time.sleep(POLL_INTERVAL_SECONDS)
    print("---")

# Only show the success marker when the execution actually succeeded.
if status == 'Succeeded':
    print(f"\n✅ Pipeline execution completed with status: {status}")
else:
    print(f"\n❌ Pipeline execution ended with status: {status}")
    failure_reason = description.get('FailureReason')
    if failure_reason:
        print(f"   Reason: {failure_reason}")

## Retrieve Pipeline Outputs

In [None]:
# Summarize each executed step and surface the ARNs of the resources it created.
execution_steps = execution.list_steps()

for step in execution_steps:
    step_name = step['StepName']
    step_status = step['StepStatus']
    print(f"\nStep: {step_name}")
    print(f"  Status: {step_status}")

    if step_status != 'Succeeded':
        continue

    # Metadata may be missing or shaped differently per step type, so read it
    # defensively instead of indexing directly.
    metadata = step.get('Metadata', {})

    if step_name == 'TrainXGBoostModel' and 'TrainingJob' in metadata:
        training_job_arn = metadata['TrainingJob']['Arn']
        print(f"  Training Job ARN: {training_job_arn}")

    # ModelStep expands into sub-steps named "<name>-CreateModel" (and similar),
    # so match on the prefix rather than the exact ModelStep name.
    elif step_name.startswith('CreateXGBoostModel') and 'Model' in metadata:
        model_arn = metadata['Model']['Arn']
        print(f"  Model ARN: {model_arn}")

## View Pipeline Execution in SageMaker Studio

You can also view and manage your pipeline execution in SageMaker Studio:
1. Open SageMaker Studio
2. Navigate to the Pipelines section
3. Select your pipeline to view execution details, logs, and metrics

## Clean Up Resources (Optional)

In [None]:
# Optional cleanup, kept commented out so that "Run All" never deletes the
# pipeline by accident.
# NOTE(review): this only calls pipeline.delete(); the S3 outputs and any model
# created by the pipeline are not touched here -- confirm whether they also
# need to be removed.
# # Delete the pipeline (uncomment to execute)
# try:
#     pipeline.delete()
#     print(f"✅ Pipeline '{pipeline_name}' deleted")
# except Exception as e:
#     print(f"Error deleting pipeline: {e}")