In [1]:
import boto3
import pandas as pd
import sagemaker
import yaml
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.utils import name_from_base



In [3]:
# Read the project settings (YAML) that hold the SageMaker role and S3 bucket.
SETTING_FILE_PATH = '../settings.yaml'
with open(SETTING_FILE_PATH) as settings_file:
    aws_info = yaml.safe_load(settings_file)

# Both values live under the aws -> sagemaker section of the settings file.
sagemaker_settings = aws_info['aws']['sagemaker']
role, s3bucket = sagemaker_settings['role'], sagemaker_settings['s3bucket']

# Low-level SageMaker API client, used for direct API calls such as
# list_processing_jobs.
sm = boto3.client('sagemaker')

In [6]:
# --- Processing job configuration ---------------------------------------
# Compute resources requested for the job.
processing_instance_type = "ml.t3.medium"
processing_instance_count = 1

# Forwarded to the processing script as a command-line argument.
# NOTE(review): the name says "percentage" but 0.8 looks like a fraction --
# confirm how sklearn-processor.py interprets it.
train_valid_split_percentage = 0.8

# S3 prefixes for the raw input data and the processed output.
input_data_s3_uri = f"s3://{s3bucket}/input/"
output_data_s3_uri = f"s3://{s3bucket}/output/"

# Name for the processing job.
processing_job_name = "ctr-prediction-sklearn-preprocessor"

# Processor that runs a user script inside the scikit-learn container.
processor = SKLearnProcessor(
    role=role,
    framework_version="0.23-1",
    instance_count=processing_instance_count,
    instance_type=processing_instance_type,
    max_runtime_in_seconds=7200,  # 2 hours
)

In [None]:
# Launch the preprocessing job and block until it completes.
#
# SageMaker requires processing job names to be unique within an account and
# region. Submitting the fixed `processing_job_name` as-is makes every re-run
# of this cell (including Restart & Run All) fail because the name is already
# taken, so a timestamp suffix is appended on each execution.
unique_job_name = name_from_base(processing_job_name)

# Input: everything under the S3 input prefix is made available to the script
# at /opt/ml/processing/input inside the container.
processing_inputs = [
    ProcessingInput(
        source=input_data_s3_uri,
        destination="/opt/ml/processing/input",
    ),
]

# Outputs: the train and validation directories written by the script.
# NOTE(review): both outputs upload to the same S3 prefix, so files with the
# same name would collide. Left unchanged because downstream consumers may
# rely on this layout -- confirm and consider separate sub-prefixes.
processing_outputs = [
    ProcessingOutput(
        source="/opt/ml/processing/output/train",
        destination=output_data_s3_uri,
    ),
    ProcessingOutput(
        source="/opt/ml/processing/output/validation",
        destination=output_data_s3_uri,
    ),
]

res = processor.run(
    code="sklearn-processor.py",
    inputs=processing_inputs,
    outputs=processing_outputs,
    arguments=[
        "--train_valid_split_percentage",
        str(train_valid_split_percentage),  # CLI arguments must be strings
    ],
    wait=True,    # block until the job finishes
    logs=False,   # do not stream container logs into the notebook
    job_name=unique_job_name,
    experiment_config=None,
)

In [None]:
# Fetch the processing-job summaries for this account/region and show the
# first one (presumably the most recent -- confirm against the API's default
# sort order).
jobs = sm.list_processing_jobs()
job_summaries = pd.DataFrame(jobs['ProcessingJobSummaries'])
job_summaries.head(1)

In [None]:
# Describe the job most recently launched through this processor object.
latest_job = processor.jobs[-1]
processor_description = latest_job.describe()
processor_description