In [3]:
import os
import yaml
import sagemaker
import boto3
import pandas as pd

from sklearn.model_selection import train_test_split


# Relative paths to the YAML settings file and to the dataset folder.
SETTING_FILE_PATH = "../../config/settings.yaml"
DATA_FOLDER_PATH = "../avazu-ctr-prediction"

# AWS resource settings, loaded from the YAML settings file.
with open(SETTING_FILE_PATH) as settings_file:
    aws_info = yaml.safe_load(settings_file)

sess = sagemaker.Session()

# Role, bucket and region are all stored under aws -> sagemaker.
sagemaker_settings = aws_info['aws']['sagemaker']
role = sagemaker_settings['role']
bucket = sagemaker_settings['s3bucket']
region = sagemaker_settings['region']

# boto3 clients created with the default session configuration.
sm = boto3.client('sagemaker')
s3 = boto3.client('s3')


In [4]:
# Prepare the train, validation and test datasets.
# Every column is read with dtype="object" (no numeric type inference).
df_all = pd.read_csv(os.path.join(DATA_FOLDER_PATH, "train"), dtype="object")

# First split: 80% is kept for train + validation, the remaining 20% is test.
df_train_validation, df_test = train_test_split(
    df_all, train_size=0.8, random_state=0, shuffle=True
)
del df_all  # drop the reference to the full frame once it has been split

# Second split: 70% of the kept rows become train, the other 30% validation.
df_train, df_validation = train_test_split(
    df_train_validation, train_size=0.7, random_state=0, shuffle=True
)
del df_train_validation  # only the three final splits are needed from here on


In [None]:
# Save the datasets to the local working directory for use with local mode.
for local_csv_name, split_frame in (
    ('train.csv', df_train),
    ('validation.csv', df_validation),
    ('test.csv', df_test),
):
    # index=False keeps the DataFrame index out of the CSV.
    split_frame.to_csv(local_csv_name, index=False)

In [5]:
# Upload the train / validation / test CSV files to S3.
prefix = 'custom-container-training'

train_file = "train.csv"
validation_file = "validation.csv"
test_file = "test.csv"

s3_resource_bucket = boto3.Session().resource("s3").Bucket(bucket)

# Each split is written to a local CSV (without the DataFrame index) and then
# uploaded under the key "<prefix>/<channel>/<file name>".
#
# The key is built with an explicit "/" instead of os.path.join: S3 object
# keys are plain strings that use "/" as the delimiter, whereas os.path.join
# uses the host OS separator (a backslash on Windows), which would produce
# keys that do not match "/"-separated s3:// URIs for the same objects.
for channel, csv_file, split_frame in (
    ("train", train_file, df_train),
    ("validation", validation_file, df_validation),
    ("test", test_file, df_test),
):
    split_frame.to_csv(csv_file, index=False)
    s3_resource_bucket.Object(f"{prefix}/{channel}/{csv_file}").upload_file(csv_file)


In [6]:
# Common "s3://<bucket>/<prefix>" base shared by every URI below.
s3_base_uri = f"s3://{bucket}/{prefix}"

# S3 URIs of the uploaded train / validation / test CSV files.
s3_train_data = f"{s3_base_uri}/train/{train_file}"
s3_validation_data = f"{s3_base_uri}/validation/{validation_file}"
s3_test_data = f"{s3_base_uri}/test/{test_file}"

# NOTE(review): presumably the destination for training output; confirm
# against the cell that consumes output_location.
output_location = f"{s3_base_uri}/output"