# Basic SageMaker Processing Script

이 노트북은 다음 [블로그 글](https://aws.amazon.com/blogs/aws/amazon-sagemaker-processing-fully-managed-data-processing-and-model-evaluation/)에 실린 내용을 바탕으로 만들어졌습니다.
SageMaker Processing을 사용하여 훈련(train), 테스트(test) 및 검증(validation) 데이터 세트를 생성하는 매우 기본적인 예를 보여줍니다. SageMaker Processing은 이러한 데이터 세트를 생성하는 데 사용되며, 생성된 데이터 세트는 S3에 다시 기록됩니다.

먼저 `SKLearnProcessor` 객체를 만들어, 사용하고자 하는 scikit-learn 버전과 관리형 인프라 요구 사항을 전달합니다.

In [1]:
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.sklearn.processing import SKLearnProcessor

# Region of the current boto3 session; reused later to build the sample-data S3 URI.
region = boto3.session.Session().region_name

# Execution role handed to the processor below.
role = get_execution_role()

# Processor configured with the scikit-learn framework version and the managed
# infrastructure (instance type and instance count) the job should run on.
sklearn_processor = SKLearnProcessor(
    framework_version="0.20.0",
    role=role,
    instance_type="ml.m5.xlarge",
    instance_count=1,
)

In [2]:
import pandas as pd

# The sample-data bucket name embeds the region, so the URI is built from `region`.
input_data = "s3://sagemaker-sample-data-{}/processing/census/census-income.csv".format(region)

# Only the first 10 rows are loaded to keep this example small.
df = pd.read_csv(input_data, nrows=10)

# index=False: do not persist the row index. Without it the index is written as an
# unnamed first column, which reappears as an extra "Unnamed: 0" column when the
# processing script reads dataset.csv back and then leaks into every output split.
df.to_csv("dataset.csv", index=False)

In [5]:
%%writefile preprocessing-basic.py
import pandas as pd
import os
from sklearn.model_selection import train_test_split

# Preprocessing script: read the input CSV from the container's input directory,
# split it into train / validation / test sets, and write each split as a CSV
# under the container's output directory.

# Read data locally
input_data_path = os.path.join("/opt/ml/processing/input", "dataset.csv")
# Preprocess the data set
df = pd.read_csv(input_data_path)
downsampled = df  # no downsampling is applied; the full frame is used as-is
print("shape of data is:")
print(downsampled.shape)

# Split data set into training, validation, and test:
# 20% is held out for test, then 20% of the remainder for validation.
train, test = train_test_split(downsampled, test_size=0.2)
train, validation = train_test_split(train, test_size=0.2)

output_root = "/opt/ml/processing/output"
# A list of pairs (rather than a dict) keeps the processing order fixed.
splits = [("train", train), ("validation", validation), ("test", test)]

# Create local output directories.
# exist_ok=True makes this a no-op when a directory is already present, and each
# directory is attempted independently so one failure cannot skip the others.
all_directories_ready = True
for split_name, _ in splits:
    try:
        os.makedirs(os.path.join(output_root, split_name), exist_ok=True)
    except OSError as e:
        # Directory could not be created; the write step below will surface the
        # problem as a hard failure if the directory is really unusable.
        print(e)
        print("Could Not Make Directories")
        all_directories_ready = False
if all_directories_ready:
    print("Successfully created directories")

# Save data locally
try:
    for split_name, frame in splits:
        frame.to_csv(os.path.join(output_root, split_name, split_name + ".csv"))
    print("Files Successfully Written")
except Exception as e:
    print("Could Not Write the Files")
    print(e)
    # Re-raise so the script exits non-zero: the output files are the only product
    # of this script, so finishing "successfully" without them would hide the failure.
    raise

print("Finished running processing job")

Writing preprocessing-basic.py


In [6]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

# Local dataset.csv is made available to the script under /opt/ml/processing/input.
job_inputs = [
    ProcessingInput(source="dataset.csv", destination="/opt/ml/processing/input"),
]

# One output per split directory that preprocessing-basic.py writes into.
job_outputs = [
    ProcessingOutput(source="/opt/ml/processing/output/train"),
    ProcessingOutput(source="/opt/ml/processing/output/validation"),
    ProcessingOutput(source="/opt/ml/processing/output/test"),
]

# Launch the processing job with the script written by the previous cell.
sklearn_processor.run(
    code="preprocessing-basic.py",
    # arguments = ['arg1', 'arg2'],
    inputs=job_inputs,
    outputs=job_outputs,
)


Job Name:  sagemaker-scikit-learn-2021-06-01-11-31-40-831
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-ap-northeast-2-806174985048/sagemaker-scikit-learn-2021-06-01-11-31-40-831/input/input-1/dataset.csv', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-ap-northeast-2-806174985048/sagemaker-scikit-learn-2021-06-01-11-31-40-831/input/code/preprocessing-basic.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'output-1', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-ap-northeast-2-806174985048/sagemaker-scikit-learn-2021-06-01-11-31-40-831/output/output-1', 'LocalPath': '/opt/ml/proce

### 실습 예제
