### Standalone data-collector processing job — later replaced by the pipeline; kept for debugging outside the pipeline

In [13]:
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput
import sagemaker
from sagemaker.experiments.run import Run, load_run
import time
from datetime import datetime
from sagemaker import image_uris

# Echo the installed SageMaker Python SDK version as the cell's output.
sagemaker.__version__

'2.219.0'

In [14]:
# --- Data version and SageMaker session ---
# `version` is embedded in both the run name and the S3 destination below.
version = "v1"
session = sagemaker.Session()
sm = session.sagemaker_client  # SageMaker client held by the session

# --- Processing job sizing ---
processing_instance_count = 1
processing_instance_type = "ml.t3.medium"

# --- S3 layout: s3://<bucket>/<prefix>/<version> ---
bucket_prefix = "data/raw"
bucket_name = "team1-index-predictor-bucket"
data_s3_url = f"s3://{bucket_name}/{bucket_prefix}/{version}"

# --- Experiment tracking ---
# The run name gets a timestamp suffix taken from datetime.now() (naive local
# time), so each execution of this cell yields a different run name.
experiment_name = "team1-index-predictor-data-collection"
trial_suffix = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
trial_name = f"data-collection-{version}-{trial_suffix}"

In [15]:
# Create the experiment run and keep only its config dict. The Run context is
# closed before the processing job starts; the job is linked to the run through
# the explicit `experiment_config` argument passed to `run()` below.
with Run(
    experiment_name=experiment_name,
    run_name=trial_name,
    run_display_name=trial_name,
    sagemaker_session=session,
) as run:
    experiment_config = run.experiment_config

# scikit-learn container image for the session's region.
sklearn_image_uri = image_uris.retrieve(
    framework="sklearn", region=session.boto_region_name, version="1.2-1"
)

# Pass the notebook's `session` to both the role lookup and the processor so
# every SageMaker call in this cell goes through the same Session (region,
# credentials, default bucket) instead of an implicitly created second one.
script_processor = ScriptProcessor(
    command=["python3"],
    image_uri=sklearn_image_uri,
    role=sagemaker.get_execution_role(sagemaker_session=session),
    instance_count=processing_instance_count,
    instance_type=processing_instance_type,
    sagemaker_session=session,
)

# Run the collector script as a processing job. It takes no inputs; whatever
# ends up under the container's output directory is uploaded to `data_s3_url`.
script_processor.run(
    code="../../src/data/collector.py",
    inputs=[],
    outputs=[
        ProcessingOutput(
            source=f"/opt/ml/processing/output/{bucket_prefix}",
            destination=data_s3_url,
            output_name="raw_data",
        )
    ],
    # Command-line arguments forwarded to collector.py.
    arguments=[
        "--mode",
        "train-val-test",
        "--version",
        version,
    ],
    experiment_config=experiment_config,
)

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3
INFO:sagemaker.image_uris:Defaulting to only supported image scope: cpu.
INFO:sagemaker:Creating processing-job with name sagemaker-scikit-learn-2024-06-20-19-44-48-507


.......................................................................................[34mCollecting yfinance
  Downloading yfinance-0.2.40-py2.py3-none-any.whl.metadata (11 kB)[0m
[34mCollecting pandas>=1.3.0 (from yfinance)
  Downloading pandas-2.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)[0m
[34mCollecting multitasking>=0.0.7 (from yfinance)
  Downloading multitasking-0.0.11-py3-none-any.whl.metadata (5.5 kB)[0m
[34mCollecting lxml>=4.9.1 (from yfinance)
  Downloading lxml-5.2.2-cp38-cp38-manylinux_2_28_x86_64.whl.metadata (3.4 kB)[0m
[34mCollecting platformdirs>=2.0.0 (from yfinance)
  Downloading platformdirs-4.2.2-py3-none-any.whl.metadata (11 kB)[0m
[34mCollecting frozendict>=2.3.4 (from yfinance)
  Downloading frozendict-2.4.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (23 kB)[0m
[34mCollecting peewee>=3.16.2 (from yfinance)
  Downloading peewee-3.17.5.tar.gz (3.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [34]:
!aws logs describe-log-streams --log-group-name /aws/sagemaker/ProcessingJobs --log-stream-name-prefix sagemaker-scikit-learn-2024-06-18-09-11-15-124

{
    "logStreams": [
        {
            "logStreamName": "sagemaker-scikit-learn-2024-06-18-09-11-15-124/algo-1-1718702397",
            "creationTime": 1718702708789,
            "firstEventTimestamp": 1718702702294,
            "lastEventTimestamp": 1718702702294,
            "lastIngestionTime": 1718702708843,
            "uploadSequenceToken": "49039859592256200492953849519668990787830740764849091222",
            "arn": "arn:aws:logs:eu-central-1:567821811420:log-group:/aws/sagemaker/ProcessingJobs:log-stream:sagemaker-scikit-learn-2024-06-18-09-11-15-124/algo-1-1718702397",
            "storedBytes": 0
        }
    ]
}


In [35]:
# Fetch the events of a single log stream from the ProcessingJobs log group.
# The stream name is hard-coded; it matches the "logStreamName" value in this
# notebook's saved describe-log-streams output and must be updated by hand when
# inspecting another job.
!aws logs get-log-events --log-group-name /aws/sagemaker/ProcessingJobs --log-stream-name sagemaker-scikit-learn-2024-06-18-09-11-15-124/algo-1-1718702397

{
    "events": [
        {
            "timestamp": 1718702702294,
            "message": "Collecting yfinance\n  Downloading yfinance-0.2.40-py2.py3-none-any.whl.metadata (11 kB)",
            "ingestionTime": 1718702708843
        },
        {
            "timestamp": 1718702702294,
            "message": "Collecting pandas>=1.3.0 (from yfinance)\n  Downloading pandas-2.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)",
            "ingestionTime": 1718702708843
        },
        {
            "timestamp": 1718702702294,
            "ingestionTime": 1718702708843
        },
        {
            "timestamp": 1718702702294,
            "ingestionTime": 1718702708843
        },
        {
            "timestamp": 1718702702294,
            "message": "Collecting multitasking>=0.0.7 (from yfinance)\n  Downloading multitasking-0.0.11-py3-none-any.whl.metadata (5.5 kB)",
            "ingestionTime": 1718702708843
        },
        {
            "timestamp": 