In [1]:
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput
import sagemaker
from sagemaker.experiments.run import Run, load_run
import time
from datetime import datetime

# Show the installed SageMaker SDK version in the cell output so the
# environment this notebook was run against is recorded alongside the results.
sagemaker.__version__

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Fetched defaults config from location: /home/sagemaker-user/.config/sagemaker/config.yaml


'2.219.0'

In [2]:
# Dataset version tag; reused in the run name, the S3 prefixes and the
# processing script's --version argument.
version = "v1"

# SageMaker session and its low-level client handle.
session = sagemaker.Session()
sm = session.sagemaker_client

# Experiment tracking: the run name carries a second-resolution local
# timestamp so successive executions of this notebook get distinct names.
experiment_name = "team1-index-predictor-data-processing"
trial_suffix = f"{datetime.now():%Y-%m-%d-%H-%M-%S}"
trial_name = "-".join(("data-processing", version, trial_suffix))

# S3 layout: raw inputs are read from <bucket>/data/raw/<version> and
# processed outputs are written to <bucket>/data/processed/<version>.
bucket_name = "team1-index-predictor-bucket"
input_bucket_prefix = "data/raw"
output_bucket_prefix = "data/processed"

# Compute resources requested for the processing job.
processing_instance_type = "ml.t3.medium"
processing_instance_count = 1

In [3]:
# Run the feature-store processing job as part of a SageMaker Experiments run.
#
# The processor is created and launched INSIDE the `with Run(...)` block so
# the run stays open for the whole (blocking) job and is only closed once
# `script_processor.run` returns or raises. Previously the context exited
# right after reading `run.experiment_config`, i.e. before the job was even
# created, so the run was finalised without covering any of the work.
with Run(
    experiment_name=experiment_name,
    run_name=trial_name,
    run_display_name=trial_name,
    sagemaker_session=session,
) as run:
    # Still exposed as a notebook-level name in case later cells reference it.
    experiment_config = run.experiment_config

    script_processor = ScriptProcessor(
        command=["python3"],
        image_uri="492215442770.dkr.ecr.eu-central-1.amazonaws.com/sagemaker-scikit-learn:1.0-1-cpu-py3",
        role=sagemaker.get_execution_role(),
        instance_count=processing_instance_count,
        instance_type=processing_instance_type,
    )

    # No explicit `experiment_config=` here: per the SageMaker SDK
    # Experiments documentation, a job started inside an active Run context
    # picks up that run's experiment config automatically (passing one
    # explicitly as well only overrides it with the same values).
    script_processor.run(
        code="../../src/data/processor.py",
        inputs=[
            # Raw data for this version is mounted into the container.
            ProcessingInput(
                source=f"s3://{bucket_name}/{input_bucket_prefix}/{version}",
                destination="/opt/ml/processing/input",
            )
        ],
        outputs=[
            # Everything the script writes to the output directory is
            # uploaded to the processed-data prefix for this version.
            ProcessingOutput(
                source="/opt/ml/processing/output",
                destination=f"s3://{bucket_name}/{output_bucket_prefix}/{version}",
            )
        ],
        # Command-line arguments forwarded to processor.py; the paths match
        # the container-side input/output destinations configured above.
        arguments=[
            "--mode",
            "feature_store",
            "--raw_data_filename",
            "/opt/ml/processing/input/data.csv",
            "--output_path",
            "/opt/ml/processing/output",
            "--version",
            version,
            "--feature_group_name",
            "index-predictor-feature-group-v7",
            "--region",
            "eu-central-1",
        ],
    )

INFO:sagemaker:Creating processing-job with name sagemaker-scikit-learn-2024-06-21-18-33-26-241


.

KeyboardInterrupt: 

In [9]:
# List the objects under the processed-data prefix for this version
# (s3://<bucket_name>/<output_bucket_prefix>/<version>/) via the AWS CLI;
# the {…} placeholders are filled in from the notebook variables.
!aws s3 ls s3://{bucket_name}/{output_bucket_prefix}/{version}/

2024-06-17 20:23:48       4245 inference.csv
2024-06-15 10:23:07        118 scaler_params.json
2024-06-18 20:18:01     840029 test-v1.csv
2024-06-17 20:23:48     824975 test.csv
2024-06-18 20:18:01   16084923 train-v1.csv
2024-06-17 20:23:47   15796148 train.csv
2024-06-18 20:18:01     839656 validation-v1.csv
2024-06-17 20:23:48     824561 validation.csv
