In [1]:
import os
import pandas as pd
import yaml
import io
import boto3
from time import gmtime, strftime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
%matplotlib inline

import sagemaker
import sagemaker.amazon.common as smac
from sagemaker import image_uris


# Show up to 50 columns when a DataFrame is rendered, so wide frames are not elided.
pd.set_option("display.max_columns", 50)



In [17]:
import yaml
import sagemaker
import boto3

# Paths: the YAML settings file and the local folder holding the training CSV.
SETTING_FILE_PATH = "../config/settings.yaml"
DATA_FOLDER_PATH = "avazu-ctr-prediction"

# safe_load only constructs plain Python objects from the YAML document.
with open(SETTING_FILE_PATH) as file:
    aws_info = yaml.safe_load(file)

# All SageMaker-related settings live under aws -> sagemaker in the settings file.
sagemaker_settings = aws_info['aws']['sagemaker']

sess = sagemaker.Session()
role = sagemaker_settings['role']        # passed as the role / RoleArn of every training job below
bucket = sagemaker_settings['s3bucket']  # bucket that receives the datasets and model output
region = sagemaker_settings['region']    # used when resolving the algorithm image URI

# NOTE(review): `sess` and the boto3 clients are created with the default region of the
# environment, while `region` comes from the settings file -- confirm the two agree.
sm = boto3.client('sagemaker')
s3 = boto3.client('s3')

# Key prefix under which every object of this notebook is stored in the bucket.
prefix = 'built-in-algorithm-training'

# The training CSV is loaded (and subsampled) by the following cell; it is deliberately
# not read here, which avoids parsing the same file twice.


In [62]:
# Read every column as `object`, then keep only the rows whose index label
# leaves a remainder of 1 when divided by 5.
train_csv_path = os.path.join(DATA_FOLDER_PATH, "train_partial")
df_train = (
    pd.read_csv(train_csv_path, dtype="object")
    .loc[lambda frame: frame.index % 5 == 1]
)
df_train.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,device_ip,device_model,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
1,1.001579499778512e+19,0,14102100,1005,1,856e6d3f,58a89a43,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,4375586d,5ec45883,1,0,19772,320,50,2227,0,687,100075,48
6,1.008682414352518e+19,0,14102100,1005,0,85f751fd,c4e18dd6,50e219e0,e2fcccd2,5c5a694b,0f2161f8,4c362c9f,42951c8d,e981565c,1,0,20633,320,50,2374,3,39,-1,23
11,1.0159992652263672e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,0489ce3f,e9b8d8d7,1,0,15705,320,50,1722,0,35,-1,79
16,1.0232234924787683e+19,0,14102100,1005,1,11944c42,1a02dd86,3e814130,ecad2386,7801e8d9,07d7df22,a99f214a,4b2128ef,900981af,1,2,20596,320,50,2161,0,35,-1,157
21,1.030183409073314e+19,1,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,c078a9f4,c6263d8a,1,0,15702,320,50,1722,0,35,-1,79


In [71]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Categorical columns that are one-hot encoded into the feature matrix.
feature_columns = ['C1', 'banner_pos', 'site_category', 'app_category', 'device_type', 'device_conn_type', 'C15', 'C16', 'C18']

# Two-stage split: first 70% / 30% into (train + validation) / test, then the 70% part
# is split 80% / 20% into train / validation.
# NOTE: `df_train` is rebound here, so re-running this cell without re-running the load
# cell would split an already reduced frame.
df_train, df_test = train_test_split(df_train, train_size=0.7, random_state=0, shuffle=True)
df_train, df_validation = train_test_split(df_train, train_size=0.8, random_state=0, shuffle=True)

# handle_unknown='ignore' encodes categories unseen during fit as all-zero rows
# instead of raising, which is what the validation / test transforms rely on.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')

# The encoder is fitted on the training split ONLY ...
y_train = df_train['click'].to_numpy()
X_train = one_hot_encoder.fit_transform(df_train[feature_columns]).toarray()

# ... and merely applied to the other splits, so every matrix shares the same columns.
y_validation = df_validation['click'].to_numpy()
X_validation = one_hot_encoder.transform(df_validation[feature_columns]).toarray()

# Fix: this used fit_transform, which refitted the encoder on the test split and could
# silently change the number / meaning of the columns relative to the training matrix.
y_test = df_test['click'].to_numpy()
X_test = one_hot_encoder.transform(df_test[feature_columns]).toarray()



In [72]:
# Sanity check: the three feature matrices must have the same number of columns.
tuple(matrix.shape for matrix in (X_train, X_validation, X_test))

((5088, 60), (1272, 60), (2726, 60))

In [73]:
train_file = "train.data"

# S3 object keys always use "/" as separator, so the key is built explicitly instead of
# with os.path.join, which follows the separator convention of the local OS.
train_key = f"{prefix}/train/{train_file}"

# Serialise features and labels (both cast to float32) into an in-memory buffer and
# upload it straight from memory; the context manager releases the buffer afterwards.
with io.BytesIO() as buffer:
    smac.write_numpy_to_dense_tensor(buffer, X_train.astype("float32"), y_train.astype("float32"))
    buffer.seek(0)  # rewind so the upload starts at the first byte
    boto3.Session().resource("s3").Bucket(bucket).Object(train_key).upload_fileobj(buffer)


In [74]:
validation_file = "validation.data"

# S3 object keys always use "/" as separator, so the key is built explicitly instead of
# with os.path.join, which follows the separator convention of the local OS.
# (A later cell rebinds s3_validation_data to the full s3:// URI of this object.)
s3_validation_data = f"{prefix}/validation/{validation_file}"

# Serialise features and labels (both cast to float32) into an in-memory buffer and
# upload it straight from memory; the context manager releases the buffer afterwards.
with io.BytesIO() as buffer:
    smac.write_numpy_to_dense_tensor(buffer, X_validation.astype("float32"), y_validation.astype("float32"))
    buffer.seek(0)  # rewind so the upload starts at the first byte
    boto3.Session().resource("s3").Bucket(bucket).Object(s3_validation_data).upload_fileobj(buffer)


In [75]:
test_file = "test.data"

# S3 object keys always use "/" as separator, so the key is built explicitly instead of
# with os.path.join, which follows the separator convention of the local OS.
# (A later cell rebinds s3_test_data to the full s3:// URI of this object.)
s3_test_data = f"{prefix}/test/{test_file}"

# Serialise features and labels (both cast to float32) into an in-memory buffer and
# upload it straight from memory; the context manager releases the buffer afterwards.
with io.BytesIO() as buffer:
    smac.write_numpy_to_dense_tensor(buffer, X_test.astype("float32"), y_test.astype("float32"))
    buffer.seek(0)  # rewind so the upload starts at the first byte
    boto3.Session().resource("s3").Bucket(bucket).Object(s3_test_data).upload_fileobj(buffer)


In [80]:
# A UTC timestamp suffix keeps the training-job name unique between runs.
timestamp = strftime("%Y%m%d-%H-%M-%S", gmtime())
job_name = "built-in-linear-learner-ctr-prediction-sm-sdk" + timestamp

# Every artefact of this notebook lives below s3://<bucket>/<prefix>/.
s3_root = f"s3://{bucket}/{prefix}"
output_location = f"{s3_root}/output"
s3_train_data = f"{s3_root}/train/{train_file}"
s3_validation_data = f"{s3_root}/validation/{validation_file}"
s3_test_data = f"{s3_root}/test/{test_file}"


In [81]:
from sagemaker import image_uris
import sagemaker


# Resolve the Linear Learner training image for the configured region.
container = image_uris.retrieve(framework="linear-learner", region=region)

# Generic Estimator wrapping the built-in algorithm image; one ml.m5.large instance.
linear = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type="ml.m5.large",
    output_path=output_location,
    sagemaker_session=sess,
)
linear.set_hyperparameters(
    predictor_type="binary_classifier",
    mini_batch_size=200,
)

# One input channel per dataset uploaded above; this call waits for the job and streams its logs.
linear.fit(
    inputs={
        "train": s3_train_data,
        "validation": s3_validation_data,
        "test": s3_test_data,
    },
    job_name=job_name,
)


2022-05-24 00:45:51 Starting - Starting the training job...
2022-05-24 00:46:14 Starting - Preparing the instances for trainingProfilerReport-1653353150: InProgress
.........
2022-05-24 00:47:35 Downloading - Downloading input data...
2022-05-24 00:48:19 Training - Downloading the training image...
2022-05-24 00:48:45 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[05/24/2022 00:48:52 INFO 139670839265088] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init_scale': '0.07', 'init_sigma

[34m[2022-05-24 00:49:01.927] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/validation", "epoch": 11, "duration": 521, "num_examples": 7, "num_bytes": 366336}[0m
[34m#metrics {"StartTime": 1653353342.3709505, "EndTime": 1653353342.3710315, "Dimensions": {"Algorithm": "Linear Learner", "Host": "algo-1", "Operation": "training", "epoch": 3, "model": 0}, "Metrics": {"validation_binary_classification_cross_entropy_objective": {"sum": 0.41999681340823386, "count": 1, "min": 0.41999681340823386, "max": 0.41999681340823386}}}[0m
[34m#metrics {"StartTime": 1653353342.3711243, "EndTime": 1653353342.3711593, "Dimensions": {"Algorithm": "Linear Learner", "Host": "algo-1", "Operation": "training", "epoch": 3, "model": 1}, "Metrics": {"validation_binary_classification_cross_entropy_objective": {"sum": 0.4195090779718363, "count": 1, "min": 0.4195090779718363, "max": 0.4195090779718363}}}[0m
[34m#metrics {"StartTime": 1653353342.3712018, "EndTime": 1653353342.371231, "Di

[34m#metrics {"StartTime": 1653353352.2125988, "EndTime": 1653353352.212684, "Dimensions": {"Algorithm": "Linear Learner", "Host": "algo-1", "Operation": "training", "epoch": 8, "model": 0}, "Metrics": {"validation_binary_classification_cross_entropy_objective": {"sum": 0.4185301282870694, "count": 1, "min": 0.4185301282870694, "max": 0.4185301282870694}}}[0m
[34m#metrics {"StartTime": 1653353352.2131653, "EndTime": 1653353352.2132103, "Dimensions": {"Algorithm": "Linear Learner", "Host": "algo-1", "Operation": "training", "epoch": 8, "model": 1}, "Metrics": {"validation_binary_classification_cross_entropy_objective": {"sum": 0.4183990130634428, "count": 1, "min": 0.4183990130634428, "max": 0.4183990130634428}}}[0m
[34m#metrics {"StartTime": 1653353352.2135525, "EndTime": 1653353352.2135975, "Dimensions": {"Algorithm": "Linear Learner", "Host": "algo-1", "Operation": "training", "epoch": 8, "model": 2}, "Metrics": {"validation_binary_classification_cross_entropy_objective": {"sum"

[34m#metrics {"StartTime": 1653353362.2999187, "EndTime": 1653353362.3000004, "Dimensions": {"Algorithm": "Linear Learner", "Host": "algo-1", "Operation": "training", "epoch": 13, "model": 0}, "Metrics": {"validation_binary_classification_cross_entropy_objective": {"sum": 0.4181612542590255, "count": 1, "min": 0.4181612542590255, "max": 0.4181612542590255}}}[0m
[34m#metrics {"StartTime": 1653353362.3001664, "EndTime": 1653353362.3002977, "Dimensions": {"Algorithm": "Linear Learner", "Host": "algo-1", "Operation": "training", "epoch": 13, "model": 1}, "Metrics": {"validation_binary_classification_cross_entropy_objective": {"sum": 0.4180955796871545, "count": 1, "min": 0.4180955796871545, "max": 0.4180955796871545}}}[0m
[34m#metrics {"StartTime": 1653353362.3005476, "EndTime": 1653353362.300571, "Dimensions": {"Algorithm": "Linear Learner", "Host": "algo-1", "Operation": "training", "epoch": 13, "model": 2}, "Metrics": {"validation_binary_classification_cross_entropy_objective": {"s


2022-05-24 00:49:41 Uploading - Uploading generated training model
2022-05-24 00:49:41 Completed - Training job completed
Training seconds: 133
Billable seconds: 133


In [84]:
job_name = "built-in-linear-learner-ctr-prediction-aws-sdk" + strftime("%Y%m%d-%H-%M-%S", gmtime())
container = image_uris.retrieve(region=region, framework="linear-learner")


def _input_channel(name, s3_uri, distribution):
    """Return one InputDataConfig entry for an uncompressed, unwrapped S3-prefix channel.

    Args:
        name: Channel name ("train", "validation" or "test").
        s3_uri: Full s3:// URI of the channel data.
        distribution: S3DataDistributionType ("ShardedByS3Key" or "FullyReplicated").
    """
    return {
        "ChannelName": name,
        "DataSource": {
            "S3DataSource": {
                "S3DataType": "S3Prefix",
                "S3Uri": s3_uri,
                "S3DataDistributionType": distribution,
            }
        },
        "CompressionType": "None",
        "RecordWrapperType": "None",
    }


# Request body for the low-level CreateTrainingJob API (boto3), mirroring the job that
# was run through the SageMaker SDK above.
linear_training_params = {
    "RoleArn": role,
    "TrainingJobName": job_name,
    "AlgorithmSpecification": {"TrainingImage": container, "TrainingInputMode": "File"},
    "ResourceConfig": {"InstanceCount": 1, "InstanceType": "ml.m5.large", "VolumeSizeInGB": 10},
    "InputDataConfig": [
        _input_channel("train", s3_train_data, "ShardedByS3Key"),
        _input_channel("validation", s3_validation_data, "FullyReplicated"),
        _input_channel("test", s3_test_data, "FullyReplicated"),
    ],
    "OutputDataConfig": {"S3OutputPath": output_location},
    # Hyperparameter values must be strings in the low-level API.
    "HyperParameters": {
        "mini_batch_size": "300",
        "predictor_type": "binary_classifier",
        "epochs": "5",
        "num_models": "1",
        # Fix: "absolute_loss" is a regression loss. The Linear Learner hyperparameter
        # reference lists auto / logistic / hinge_loss for binary_classifier.
        "loss": "logistic",
    },
    "StoppingCondition": {"MaxRuntimeInSeconds": 60 * 60},  # one hour
}

In [85]:
# Submit the training job through the low-level SageMaker client; the response
# displayed below carries the ARN of the new job.
sm = boto3.client(service_name="sagemaker")
sm.create_training_job(**linear_training_params)

{'TrainingJobArn': 'arn:aws:sagemaker:ap-northeast-1:547760918250:training-job/built-in-linear-learner-ctr-prediction-aws-sdk20220524-00-58-30',
 'ResponseMetadata': {'RequestId': '496ae18b-11be-4dac-a336-5aae59565e8f',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '496ae18b-11be-4dac-a336-5aae59565e8f',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '143',
   'date': 'Tue, 24 May 2022 00:58:31 GMT'},
  'RetryAttempts': 0}}

In [89]:
job_name = "built-in-linear-learner-ctr-prediction" + strftime("%Y%m%d-%H-%M-%S", gmtime())

# The LinearLearner estimator is created BEFORE record_set() is called: on a fresh
# top-to-bottom run `linear` would otherwise still be the generic Estimator built
# earlier, which has no record_set() method.
# instance_count / instance_type are the current parameter names (the train_* spellings
# are deprecated) and match the generic Estimator cell above.
linear = sagemaker.LinearLearner(
    role=role,
    instance_count=1,
    instance_type="ml.m5.large",
    output_path=output_location,
    predictor_type="binary_classifier",
    sagemaker_session=sess,
)

# Convert the in-memory float32 matrices into one record set per channel.
train = linear.record_set(X_train.astype('float32'), labels=y_train.astype('float32'), channel='train')
validation = linear.record_set(X_validation.astype('float32'), labels=y_validation.astype('float32'), channel='validation')
test = linear.record_set(X_test.astype('float32'), labels=y_test.astype('float32'), channel='test')

# wait=False submits the job without blocking the notebook on its completion.
linear.fit([train, validation, test], mini_batch_size=200, wait=False, job_name=job_name)


Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.
