In [1]:
import os
import pandas as pd
import yaml
import io
import boto3
from time import gmtime, strftime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
%matplotlib inline

import sagemaker
import sagemaker.amazon.common as smac
from sagemaker import image_uris


# Show up to 50 columns when a DataFrame is rendered in the notebook.
pd.options.display.max_columns = 50



## AWS設定ファイル・データ読み込み

In [17]:
import yaml
import sagemaker
import boto3

SETTING_FILE_PATH = "../config/settings.yaml"
DATA_FOLDER_PATH = "avazu-ctr-prediction"

# Parse the project settings file; the SageMaker role, bucket and region are
# read from its aws -> sagemaker section below.
with open(SETTING_FILE_PATH) as settings_stream:
    aws_info = yaml.safe_load(settings_stream)

sess = sagemaker.Session()

sagemaker_settings = aws_info['aws']['sagemaker']
role = sagemaker_settings['role']
bucket = sagemaker_settings['s3bucket']
region = sagemaker_settings['region']

# Low-level clients used by the AWS SDK (boto3) variant of the training job.
sm = boto3.client('sagemaker')
s3 = boto3.client('s3')

# Key prefix under which every artifact of this notebook is stored in the bucket.
prefix = 'built-in-algorithm-training'

# Every column is read as a string (dtype="object").
train_csv_path = os.path.join(DATA_FOLDER_PATH, "train")
df_train = pd.read_csv(train_csv_path, dtype="object")


In [71]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Categorical columns that are one-hot encoded and used as model features.
feature_columns = ['C1', 'banner_pos', 'site_category', 'app_category', 'device_type', 'device_conn_type', 'C15', 'C16', 'C18']

# 70% / 30% split into (train + validation) / test, then 80% / 20% split of the
# first part into train / validation.
# NOTE: `df_train` is rebound here, so re-running this cell without re-running
# the data-loading cell splits an already reduced frame.
df_train, df_test = train_test_split(df_train, train_size=0.7, random_state=0, shuffle=True)
df_train, df_validation = train_test_split(df_train, train_size=0.8, random_state=0, shuffle=True)

# Categories that were not seen while fitting are encoded as all-zero rows
# instead of raising an error.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')

# The encoder is fitted on the training split ONLY.
y_train = df_train['click'].to_numpy()
X_train = one_hot_encoder.fit_transform(df_train[feature_columns]).toarray()

y_validation = df_validation['click'].to_numpy()
X_validation = one_hot_encoder.transform(df_validation[feature_columns]).toarray()

# Use transform (not fit_transform) for the test split: refitting on test data
# would rebuild the category -> column mapping, so the test features could have
# a different width/order than the features the model was trained on.
y_test = df_test['click'].to_numpy()
X_test = one_hot_encoder.transform(df_test[feature_columns]).toarray()

# All three splits must share the same encoded feature layout.
assert X_train.shape[1] == X_validation.shape[1] == X_test.shape[1], "feature width differs between splits"



In [100]:
import numpy as np 
import io
import sagemaker.amazon.common as smac

def upload_protobuf_to_s3(data_type: str, X: np.ndarray, y: np.ndarray) -> None:
    """Serialize features and labels and upload them to S3 as one object.

    `X` and `y` are cast to float32, written to an in-memory buffer with
    `smac.write_numpy_to_dense_tensor`, and uploaded to the module-level
    `bucket` under the key `{prefix}/{data_type}/{data_type}.data`.

    Args:
        data_type: Name of the split (e.g. 'train'); used both as the folder
            name and as the stem of the file name.
        X: Feature matrix.
        y: Labels for the rows of `X`.
    """
    file_name = f'{data_type}.data'
    # S3 object keys always use "/" as the separator; os.path.join would
    # produce backslashes on Windows and create a differently named object.
    object_key = "/".join((prefix, data_type, file_name))

    buffer = io.BytesIO()
    smac.write_numpy_to_dense_tensor(buffer, X.astype("float32"), y.astype("float32"))
    # Rewind so the upload reads the buffer from its first byte.
    buffer.seek(0)

    boto3.Session().resource("s3").Bucket(bucket).Object(object_key).upload_fileobj(buffer)
    
    
# Upload every split with the shared helper: (split name, features, labels).
for split_name, split_features, split_labels in (
    ('train', X_train, y_train),
    ('validation', X_validation, y_validation),
    ('test', X_test, y_test),
):
    upload_protobuf_to_s3(split_name, split_features, split_labels)


In [73]:
# File name of the training object; used later to build its s3:// URI.
train_file = "train.data"

# Reuse the shared helper instead of repeating its body inline. It writes to
# {prefix}/train/train.data, i.e. the key that `train_file` refers to.
upload_protobuf_to_s3("train", X_train, y_train)


In [74]:
# File name of the validation object; used later to build its s3:// URI.
# (`s3_validation_data` is no longer assigned a bare object key here: that
# value was never read and the name is defined as a full s3:// URI in the
# job-configuration cell.)
validation_file = "validation.data"

# Reuse the shared helper instead of repeating its body inline. It writes to
# {prefix}/validation/validation.data, i.e. the key `validation_file` refers to.
upload_protobuf_to_s3("validation", X_validation, y_validation)


In [75]:
# File name of the test object; used later to build its s3:// URI.
# (`s3_test_data` is no longer assigned a bare object key here: that value was
# never read and the name is defined as a full s3:// URI in the
# job-configuration cell.)
test_file = "test.data"

# Reuse the shared helper instead of repeating its body inline. It writes to
# {prefix}/test/test.data, i.e. the key `test_file` refers to.
upload_protobuf_to_s3("test", X_test, y_test)


In [105]:
# Training job name: fixed label immediately followed by a UTC timestamp.
job_timestamp = strftime("%Y%m%d-%H-%M-%S", gmtime())
job_name = f"built-in-linear-learner-ctr-prediction-sm-sdk{job_timestamp}"

# Every S3 location used by the training jobs lives under s3://<bucket>/<prefix>.
s3_base_uri = "s3://{}/{}".format(bucket, prefix)
output_location = s3_base_uri + "/output"
s3_train_data = s3_base_uri + "/train/" + train_file
s3_validation_data = s3_base_uri + "/validation/" + validation_file
s3_test_data = s3_base_uri + "/test/" + test_file


## SageMaker SDK を用いた組み込みアルゴリズムの実行

In [None]:
from sagemaker import image_uris
import sagemaker


# Look up the linear-learner training image for the configured region.
container = image_uris.retrieve(framework="linear-learner", region=region)
print(container)

# Generic estimator that runs the built-in algorithm image on one ml.m5.large.
linear = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type="ml.m5.large",
    output_path=output_location,
    sagemaker_session=sess,
)

linear_hyperparameters = {"predictor_type": "binary_classifier", "mini_batch_size": 200}
linear.set_hyperparameters(**linear_hyperparameters)

# One input channel per uploaded split.
input_channels = {
    "train": s3_train_data,
    "validation": s3_validation_data,
    "test": s3_test_data,
}
linear.fit(input_channels, job_name=job_name)


## AWS SDK を用いた組み込みアルゴリズムの実行

In [84]:
# NOTE: with the 17-character timestamp this name is exactly 63 characters long,
# the maximum SageMaker accepts for a training job name - do not lengthen it.
job_name = "built-in-linear-learner-ctr-prediction-aws-sdk" + strftime("%Y%m%d-%H-%M-%S", gmtime())
container = image_uris.retrieve(region=region, framework="linear-learner")


def _s3_channel(channel_name: str, s3_uri: str, distribution: str) -> dict:
    """Build one `InputDataConfig` entry for an uncompressed, unwrapped S3 input.

    Args:
        channel_name: Name of the input channel ("train", "validation", "test").
        s3_uri: S3 URI of the data for this channel.
        distribution: Value for `S3DataDistributionType`.

    Returns:
        A dict in the shape expected by `create_training_job`.
    """
    return {
        "ChannelName": channel_name,
        "DataSource": {
            "S3DataSource": {
                "S3DataType": "S3Prefix",
                "S3Uri": s3_uri,
                "S3DataDistributionType": distribution,
            }
        },
        "CompressionType": "None",
        "RecordWrapperType": "None",
    }


# Request body for `create_training_job` (boto3 SageMaker client).
linear_training_params = {
    "RoleArn": role,
    "TrainingJobName": job_name,
    "AlgorithmSpecification": {"TrainingImage": container, "TrainingInputMode": "File"},
    "ResourceConfig": {"InstanceCount": 1, "InstanceType": "ml.m5.large", "VolumeSizeInGB": 10},
    "InputDataConfig": [
        _s3_channel("train", s3_train_data, "ShardedByS3Key"),
        _s3_channel("validation", s3_validation_data, "FullyReplicated"),
        _s3_channel("test", s3_test_data, "FullyReplicated"),
    ],
    "OutputDataConfig": {"S3OutputPath": output_location},
    # Hyperparameter values must be passed as strings through the low-level API.
    "HyperParameters": {
        "mini_batch_size": "300",
        "predictor_type": "binary_classifier",
        "epochs": "5",
        "num_models": "1",
        # "absolute_loss" is a regression loss and is not valid together with
        # predictor_type="binary_classifier"; use the logistic loss instead.
        "loss": "logistic",
    },
    # Stop the job after at most one hour.
    "StoppingCondition": {"MaxRuntimeInSeconds": 60 * 60},
}

In [None]:
# Submit the training job through the low-level SageMaker API and display the
# service response as the cell output.
sm = boto3.client(service_name="sagemaker")
create_job_response = sm.create_training_job(**linear_training_params)
create_job_response

## LinearLearner を用いた組み込みアルゴリズムの実行

In [89]:
train = linear.record_set(X_train.astype('float32'), labels=y_train.astype('float32'), channel='train')
validation = linear.record_set(X_validation.astype('float32'), labels=y_validation.astype('float32'), channel='validation')
test = linear.record_set(X_test.astype('float32'), labels=y_test.astype('float32'), channel='test')


In [107]:
job_name = "built-in-linear-learner-ctr-prediction" + strftime("%Y%m%d-%H-%M-%S", gmtime())

linear = sagemaker.LinearLearner(
    role=role,
    train_instance_count=1,
    train_instance_type="ml.m5.large",
    output_path=output_location,
    predictor_type="binary_classifier",
    sagemaker_session=sess
)

linear.fit([train, validation, test], mini_batch_size=200, wait=False, job_name=job_name)


train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.
