In [2]:
import pandas as pd
import yaml
import boto3
import sagemaker
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput

pd.options.display.notebook_repr_html = True

In [3]:
SETTING_FILE_PATH = 'settings.yaml'
with open(SETTING_FILE_PATH) as file:
    aws_info = yaml.safe_load(file)
    
sess = sagemaker.Session()
role = aws_info['aws']['sagemaker']['role']
s3bucket = aws_info['aws']['sagemaker']['s3bucket']
region = boto3.Session().region_name
sm = boto3.client('sagemaker')

## データ確認

In [3]:
# S3 からローカルにダウンロード
s3 = boto3.resource('s3')
bucket = s3.Bucket(s3bucket)
bucket.download_file('input/train_partial', 'train_partial')

In [4]:
df = pd.read_csv("train_partial")
df.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,1.000009e+18,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,2,15706,320,50,1722,0,35,-1,79
1,1.001579e+19,0,14102100,1005,1,856e6d3f,58a89a43,f028772b,ecad2386,7801e8d9,...,1,0,19772,320,50,2227,0,687,100075,48
2,1.002948e+18,0,14102100,1005,0,85f751fd,c4e18dd6,50e219e0,1779deee,2347f47a,...,1,0,20596,320,50,2161,0,35,-1,157
3,1.004511e+19,0,14102100,1005,0,85f751fd,c4e18dd6,50e219e0,51cedd4e,aefc06bd,...,1,0,19743,320,50,2264,3,427,100000,61
4,1.00599e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15702,320,50,1722,0,35,-1,79


## SKLearnProcessor

In [6]:
processing_instance_type = "ml.t3.medium"
processing_instance_count = 1
train_valid_split_percentage = 0.8
input_data_s3_uri =  "s3://{}/input/".format(s3bucket)
output_data_s3_uri =  "s3://{}/output/".format(s3bucket)
processing_job_name = "ctr-prediction-sklearn-preprocessor"

processor = SKLearnProcessor(
    framework_version="0.23-1",
    role=role,
    instance_type=processing_instance_type,
    instance_count=processing_instance_count,
    max_runtime_in_seconds=7200,
)

In [7]:
processor.run(
    code="ctr-prediction-preprocessor.py",
    inputs=[ProcessingInput(
                    source=input_data_s3_uri,
                    destination="/opt/ml/processing/input"),
           ],
    outputs=[
        ProcessingOutput(
            source="/opt/ml/processing/output/train",
            destination=output_data_s3_uri),
        ProcessingOutput(
            source="/opt/ml/processing/output/validation",
            destination=output_data_s3_uri),
    ],
     arguments=[
        "--train_valid_split_percentage",
        str(train_valid_split_percentage)],
    wait=True,
    logs=True,
    job_name=processing_job_name,
    experiment_config=None   
)

ValueError: code ctr-prediction-preprocessor.py wasn't found. Please make sure that the file exists.
                    