In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell executes, so edits to source files are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from corelab.core.session import CoreLabSession

# Create the lab session used by every cell below and print it.
lab_session = CoreLabSession(
    'xgboost',
    'customer-churn',
    default_folder='processing_notebook',
    create_run_folder=True,
    aws_profile='sagemaker-role',
)
lab_session.print()

# Shorthand for the core session object held by the lab session.
core_session = lab_session.core_session

In [None]:
# Complete sagemaker-core imports
from sagemaker_core.main.shapes import (
    AppSpecification, 
    ProcessingResources, 
    ProcessingInput, 
    ProcessingClusterConfig,
    ProcessingOutput,
    ProcessingOutputConfig,
    ProcessingStoppingCondition,
    ProcessingS3Input,
    ProcessingS3Output
)
from sagemaker_core.main.resources import ProcessingJob

print("All sagemaker-core modules imported successfully")

In [None]:
import os


def upload_code_and_data():
    """Upload the preprocessing script and collect the S3 locations for the job.

    Prints the current working directory, then calls
    ``lab_session.upload_file('src', 'preprocessing.py', 'prepare-churn-code')``.

    Returns:
        tuple: ``(code, data, output)`` where ``code`` is whatever
        ``lab_session.upload_file`` returned, ``data`` is the S3 URI of the
        synthetic churn dataset in the example-files bucket for
        ``lab_session.region``, and ``output`` is
        ``lab_session.jobs_output_s3_uri``.
    """
    print("cwd:", os.getcwd())
    # Elements are evaluated left to right: upload first, then the two URIs.
    return (
        lab_session.upload_file('src', 'preprocessing.py', 'prepare-churn-code'),
        f"s3://sagemaker-example-files-prod-{lab_session.region}/datasets/tabular/synthetic/churn.txt",
        lab_session.jobs_output_s3_uri,
    )

# Run once here so the resulting locations are displayed as the cell output.
upload_code_and_data()

In [None]:
# Create and run the Processing Job using sagemaker-core
code_s3_uri, data_s3_uri, output_s3_uri = upload_code_and_data()

print(f"📂 Code S3 URI: {code_s3_uri}")
print(f"📁 Data S3 URI: {data_s3_uri}")
print(f"📤 Output S3 URI: {output_s3_uri}")

job_name = lab_session.processing_job_name


def _s3_file_input(input_name, s3_uri, local_path):
    """Build a ProcessingInput for an S3Prefix source delivered in File mode.

    Both inputs of this job share the same settings and differ only in their
    name, S3 URI and container path, so they are built through this helper.
    """
    return ProcessingInput(
        input_name=input_name,
        app_managed=False,
        s3_input=ProcessingS3Input(
            s3_uri=s3_uri,
            local_path=local_path,
            s3_data_type="S3Prefix",
            s3_input_mode="File",
        ),
    )


# Collect the keyword arguments for ProcessingJob.create in one mapping.
job_request = dict(
    processing_job_name=job_name,
    role_arn=lab_session.role,
    session=lab_session.core_session.boto_session,
    region=lab_session.region,
    # The container runs the uploaded script from the "code" input path.
    app_specification=AppSpecification(
        image_uri=lab_session.retrieve_image('1.7-1'),
        container_entrypoint=["python3", "/opt/ml/processing/input/code/preprocessing.py"],
        container_arguments=["--train-test-split", "0.33"],
    ),
    processing_resources=ProcessingResources(
        cluster_config=ProcessingClusterConfig(
            instance_type="ml.m5.large",
            instance_count=1,
            volume_size_in_gb=30,
        )
    ),
    processing_inputs=[
        _s3_file_input("code", code_s3_uri, "/opt/ml/processing/input/code"),
        _s3_file_input("data", data_s3_uri, "/opt/ml/processing/input/data"),
    ],
    # Results are written under a per-job prefix of the shared output URI.
    processing_output_config=ProcessingOutputConfig(
        outputs=[
            ProcessingOutput(
                output_name="processed",
                app_managed=False,
                s3_output=ProcessingS3Output(
                    s3_uri=output_s3_uri + '/' + job_name,
                    local_path="/opt/ml/processing/output",
                    s3_upload_mode="EndOfJob",
                ),
            )
        ]
    ),
    environment={"PYTHONUNBUFFERED": "1"},
    stopping_condition=ProcessingStoppingCondition(
        max_runtime_in_seconds=3600
    ),
)

job = ProcessingJob.create(**job_request)

print(f"🚀 Created Processing Job: {job.processing_job_name}")
print(f"📊 Initial Status: {job.processing_job_status}")

# Wait on the created job before moving on to the next cell.
job.wait()

## Alternative: Using XGBoostProcessor from Standard SageMaker SDK

The same processing job can also be run with the `XGBoostProcessor` from the SageMaker Python SDK, which uploads the script directory for you and installs any dependencies listed in its `requirements.txt`:

In [None]:
# Alternative approach using XGBoostProcessor from standard SageMaker SDK
from sagemaker.xgboost.processing import XGBoostProcessor
# Import the SDK input/output classes under aliases. The plain names
# ProcessingInput / ProcessingOutput are already bound to the sagemaker-core
# shapes imported near the top of this notebook; rebinding them here would
# make the sagemaker-core cell build its inputs from the wrong classes if it
# is re-run after this cell.
from sagemaker.processing import (
    ProcessingInput as SdkProcessingInput,
    ProcessingOutput as SdkProcessingOutput,
)

# Define data locations (reuse from above)
data_s3_uri = f"s3://sagemaker-example-files-prod-{lab_session.region}/datasets/tabular/synthetic/churn.txt"
output_s3_uri = lab_session.jobs_output_s3_uri

print(f"📁 Data S3 URI: {data_s3_uri}")
print(f"📤 Output S3 URI: {output_s3_uri}")

# Create XGBoostProcessor - framework-aware with better dependency handling
xgb_processor = XGBoostProcessor(
    framework_version='1.7-1',
    instance_type="ml.m5.large",
    instance_count=1,
    role=lab_session.role,
    sagemaker_session=lab_session.get_sagemaker_session(),
    volume_size_in_gb=30,
    max_runtime_in_seconds=3600,
    env={"PYTHONUNBUFFERED": "1"}
)

# Run the processing job with framework features.
# The "-xgb" suffix keeps this job's name and output prefix distinct from the
# sagemaker-core job created earlier.
job_name = lab_session.processing_job_name + "-xgb"

xgb_processor.run(
    code="preprocessing.py",         # Entry point script
    source_dir="src/",              # Directory with code and requirements.txt
    job_name=job_name,
    inputs=[
        SdkProcessingInput(
            source=data_s3_uri,
            destination="/opt/ml/processing/input/data"
        )
    ],
    outputs=[
        SdkProcessingOutput(
            output_name="processed",
            source="/opt/ml/processing/output",
            destination=f"{output_s3_uri}/{job_name}"
        )
    ],
    arguments=["--train-test-split", "0.33"],
    wait=True,  # Wait for completion
    logs=True   # Stream logs to notebook
)

print(f"✅ XGBoost Processing job completed: {job_name}")

## Comparison: SageMaker Core vs XGBoostProcessor

**SageMaker Core (ProcessingJob):**
- ✅ More explicit configuration with shapes
- ✅ Fine-grained control over all parameters
- ✅ Type-safe with structured objects
- ✅ Direct API mapping to AWS service
- ❌ More verbose syntax
- ❌ Must call `.wait()` explicitly to block until the job finishes
- ❌ No automatic dependency installation

**XGBoostProcessor (Standard SDK):**
- ✅ Framework-optimized for XGBoost workflows
- ✅ Automatic requirements.txt installation via `source_dir`
- ✅ Built-in log streaming with `logs=True`
- ✅ Automatic wait with `wait=True`
- ✅ Supports `dependencies` parameter for additional files
- ✅ Same XGBoost image as training (consistency)
- ❌ Framework-specific (not general purpose)

**Key Advantage:** XGBoostProcessor handles dependencies automatically, making it ideal for XGBoost-based data processing workflows where you need additional Python packages.