In [None]:
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput
import sagemaker
from sagemaker.experiments.run import Run, load_run
import time
from datetime import datetime

# Last expression of the cell: shows the installed SageMaker SDK version in the output.
sagemaker.__version__

In [None]:
# --- Run configuration: everything a reader might want to tune lives here ---

# Version tag; it is embedded in the trial name built below.
version = "v1"

# SageMaker session and the client object it exposes.
session = sagemaker.Session()
sm = session.sagemaker_client

# Experiment naming. The suffix is a timestamp taken when this cell executes,
# so re-running the cell produces a new trial name.
experiment_name = "team1-index-predictor-data-processing"
trial_suffix = f"{datetime.now():%Y-%m-%d-%H-%M-%S}"
trial_name = "-".join(("data-processing", version, trial_suffix))

# Bucket name and the key prefixes for raw input and processed output.
bucket_name = "team1-index-predictor-bucket"
input_bucket_prefix = "data/raw"
output_bucket_prefix = "data/processed"

# Instance type and count used for the processing job.
processing_instance_type = "ml.t3.medium"
processing_instance_count = 1

In [None]:
# Open an experiment run under the configured experiment and keep its
# experiment config around; it is needed after the `with` block has exited.
with Run(
    sagemaker_session=session,
    experiment_name=experiment_name,
    run_display_name=trial_name,  # display name mirrors the run name
    run_name=trial_name,
) as run:
    experiment_config = run.experiment_config

# Build the processor that will execute the processing script with `python3`
# inside the given container image.
# NOTE(review): the image URI is pinned to an eu-central-1 ECR registry;
# confirm this matches the region the session runs in.
script_processor = ScriptProcessor(
    image_uri="492215442770.dkr.ecr.eu-central-1.amazonaws.com/sagemaker-scikit-learn:1.0-1-cpu-py3",
    command=["python3"],
    instance_type=processing_instance_type,
    instance_count=processing_instance_count,
    role=sagemaker.get_execution_role(),
)

# Launch the processing job.
#
# The raw-data prefix in S3 is mapped to /opt/ml/processing/input inside the
# container, and whatever the script writes to /opt/ml/processing/output is
# mapped to the processed-data prefix in the same bucket.
script_processor.run(
    code="../src/data/processor.py",
    inputs=[
        ProcessingInput(
            source=f"s3://{bucket_name}/{input_bucket_prefix}",
            destination="/opt/ml/processing/input",
        )
    ],
    outputs=[
        ProcessingOutput(
            source="/opt/ml/processing/output",
            destination=f"s3://{bucket_name}/{output_bucket_prefix}",
        )
    ],
    arguments=[
        "--raw_data_filename",
        # Must be an f-string: without the prefix the script receives the
        # literal text "data-{version}.csv" instead of e.g. "data-v1.csv".
        f"/opt/ml/processing/input/data-{version}.csv",
        "--output_path",
        "/opt/ml/processing/output",
        "--version",
        version,
    ],
    # Experiment config captured from the Run opened earlier in this cell.
    experiment_config=experiment_config,
)