In [1]:
import pandas as pd
import yaml
import boto3
import sagemaker
from sagemaker.sklearn.processing import ScriptProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput



In [5]:
# Read the project settings (YAML) that hold the SageMaker role and bucket.
SETTING_FILE_PATH = '../settings.yaml'
with open(SETTING_FILE_PATH) as settings_file:
    aws_info = yaml.safe_load(settings_file)

# Both values live under the aws -> sagemaker section of the settings file.
sagemaker_settings = aws_info['aws']['sagemaker']
role = sagemaker_settings['role']
s3bucket = sagemaker_settings['s3bucket']

# SageMaker API client and the region of the default boto3 session.
sm = boto3.client('sagemaker')
region = boto3.Session().region_name

In [None]:
account_id = boto3.client('sts').get_caller_identity().get('Account')

ecr_repository = f'ctr-preprocessor-custom:latest'
image_uri = f'{account_id}.dkr.ecr.{region}.amazonaws.com/{ecr_repository}'
!docker build . -t $image_uri

!aws ecr get-login-password --region $region | docker login --username AWS --password-stdin $account_id.dkr.ecr.$region.amazonaws.com
 

!aws ecr create-repository --repository-name $ecr_repository
 
!docker build -t {ecr_repository} .
!docker tag {ecr_repository} $image_uri
!docker push $image_uri

In [7]:
# Compute resources for the processing job.
processing_instance_type = "ml.m5.xlarge"
processing_instance_count = 1

# Value forwarded to the processing script as --train_valid_split_percentage.
train_valid_split_percentage = 0.8

# S3 prefixes (inside the configured bucket) for job input and output.
input_data_s3_uri = f"s3://{s3bucket}/input/"
output_data_s3_uri = f"s3://{s3bucket}/output/"
processing_job_name = "ctr-prediction-custom-preprocessor"

# Processor that runs a script with python3 inside the custom container image.
processor_settings = {
    "command": ["python3"],
    "image_uri": image_uri,
    "role": role,
    "instance_count": processing_instance_count,
    "instance_type": processing_instance_type,
}
script_processor = ScriptProcessor(**processor_settings)

In [None]:
# Launch the processing job and print its description once it has finished.

# Processing job names must be unique, so reusing the fixed base name makes a
# second run of this cell fail. Append a timestamp to get a fresh name each run.
unique_processing_job_name = sagemaker.utils.name_from_base(processing_job_name)

script_processor.run(
    code="custom-processor.py",
    inputs=[
        ProcessingInput(
            source=input_data_s3_uri,
            destination="/opt/ml/processing/input",
        ),
    ],
    # NOTE(review): both outputs are uploaded to the same S3 prefix; if the
    # script writes identically named files to train/ and validation/ they
    # would collide -- confirm against custom-processor.py.
    outputs=[
        ProcessingOutput(
            source="/opt/ml/processing/output/train",
            destination=output_data_s3_uri,
        ),
        ProcessingOutput(
            source="/opt/ml/processing/output/validation",
            destination=output_data_s3_uri,
        ),
    ],
    arguments=[
        "--train_valid_split_percentage",
        str(train_valid_split_percentage),
    ],
    wait=True,  # block until the job completes
    logs=True,  # stream job logs into the cell output
    job_name=unique_processing_job_name,
    experiment_config=None,
)

# Describe the most recent job started by this processor.
script_processor_job_description = script_processor.jobs[-1].describe()
print(script_processor_job_description)