In [97]:
import os
import pandas as pd
import yaml
import io
import boto3
from time import gmtime, strftime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
%matplotlib inline

import sagemaker
import sagemaker.amazon.common as smac
from sagemaker import image_uris


pd.set_option('display.max_columns', 50)

In [13]:
SETTING_FILE_PATH = "../settings.yaml"
DATA_FOLDER_PATH = "../avazu-ctr-prediction"

with open("../settings.yaml") as file:
    aws_info = yaml.safe_load(file)

In [12]:
sess = sagemaker.Session()
role = aws_info['aws']['sagemaker']['role']
bucket = aws_info['aws']['sagemaker']['s3bucket']
region = boto3.Session().region_name

sm = boto3.client('sagemaker')
featurestore_runtime = boto3.client("sagemaker-featurestore-runtime")
s3 = boto3.client('s3')

In [252]:
df_train = pd.read_csv("../avazu-ctr-prediction/train_partial", dtype="object")
df_train = df_train[df_train.index % 10 ==1]

In [253]:
df_train, df_validation = train_test_split(df_train, train_size=0.8, random_state=42)


In [254]:
feature_columns = ['C1', 'banner_pos', 'site_category', 'app_category', 'device_type', 'device_conn_type', 'C15', 'C16', 'C18']


In [255]:
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')

train_y = df_train['click'].to_numpy()
train_X = one_hot_encoder.fit_transform(df_train[feature_columns]).toarray()

validation_y = df_validation['click'].to_numpy()
validation_X = one_hot_encoder.transform(df_validation[feature_columns]).toarray()


In [128]:
prefix = 'trainer'
train_file = "train.data"

f = io.BytesIO()
smac.write_numpy_to_dense_tensor(f, train_X.astype("float32"), train_y.astype("float32"))
f.seek(0)

boto3.Session().resource("s3").Bucket(bucket).Object(
    os.path.join(prefix, "train", train_file)
).upload_fileobj(f)


In [129]:
validation_file = "validation.data"
s3_validation_data = os.path.join(prefix, "validation", validation_file)

f = io.BytesIO()
smac.write_numpy_to_dense_tensor(f, validation_X.astype("float32"), validation_y.astype("float32"))
f.seek(0)

boto3.Session().resource("s3").Bucket(bucket).Object(
os.path.join(prefix, "validation", validation_file)
).upload_fileobj(f)


In [272]:
job_name = "linear-learner-ctr-prediction-" + strftime("%Y%m%d-%H-%M-%S", gmtime())

output_location = f"s3://{bucket}/{prefix}/output"
s3_train_data = f"s3://{bucket}/{prefix}/train/{train_file}"
s3_validation_data = f"s3://{bucket}/{prefix}/validation/{validation_file}"


In [139]:
container = image_uris.retrieve(region=region, framework="linear-learner")

linear = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type="ml.m5.large",
    output_path=output_location,
    sagemaker_session=sess,
)
linear.set_hyperparameters(feature_dim=75, predictor_type="binary_classifier", mini_batch_size=200, epoch=2)

linear.fit({"train": s3_train_data, "validation": s3_validation_data}, job_name=job_name)


2022-05-17 14:55:33 Starting - Starting the training job...
2022-05-17 14:55:57 Starting - Preparing the instances for trainingProfilerReport-1652799333: InProgress
.........
2022-05-17 14:57:17 Downloading - Downloading input data...
2022-05-17 14:57:58 Training - Downloading the training image...
2022-05-17 14:58:37 Training - Training image download completed. Training in progress.[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[05/17/2022 14:58:31 INFO 139961235474240] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init_scale': '0.07', 'init_sigma'

[34m[2022-05-17 14:58:47.373] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 5, "duration": 6640, "num_examples": 162, "num_bytes": 11255364}[0m
[34m#metrics {"StartTime": 1652799527.374549, "EndTime": 1652799527.3746436, "Dimensions": {"Algorithm": "Linear Learner", "Host": "algo-1", "Operation": "training", "epoch": 1, "model": 0}, "Metrics": {"train_binary_classification_cross_entropy_objective": {"sum": 0.4300748177048582, "count": 1, "min": 0.4300748177048582, "max": 0.4300748177048582}}}[0m
[34m#metrics {"StartTime": 1652799527.374804, "EndTime": 1652799527.374826, "Dimensions": {"Algorithm": "Linear Learner", "Host": "algo-1", "Operation": "training", "epoch": 1, "model": 1}, "Metrics": {"train_binary_classification_cross_entropy_objective": {"sum": 0.4302749780690448, "count": 1, "min": 0.4302749780690448, "max": 0.4302749780690448}}}[0m
[34m#metrics {"StartTime": 1652799527.3749518, "EndTime": 1652799527.374968, "Dimensions": {"Algo

[34m[2022-05-17 14:58:55.196] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 7, "duration": 5737, "num_examples": 162, "num_bytes": 11255364}[0m
[34m#metrics {"StartTime": 1652799535.196217, "EndTime": 1652799535.196426, "Dimensions": {"Algorithm": "Linear Learner", "Host": "algo-1", "Operation": "training", "epoch": 2, "model": 0}, "Metrics": {"train_binary_classification_cross_entropy_objective": {"sum": 0.42924630336879943, "count": 1, "min": 0.42924630336879943, "max": 0.42924630336879943}}}[0m
[34m#metrics {"StartTime": 1652799535.1968808, "EndTime": 1652799535.1969116, "Dimensions": {"Algorithm": "Linear Learner", "Host": "algo-1", "Operation": "training", "epoch": 2, "model": 1}, "Metrics": {"train_binary_classification_cross_entropy_objective": {"sum": 0.4300062991077115, "count": 1, "min": 0.4300062991077115, "max": 0.4300062991077115}}}[0m
[34m#metrics {"StartTime": 1652799535.197267, "EndTime": 1652799535.197291, "Dimensions": {"A

[34m[2022-05-17 14:59:12.622] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 11, "duration": 6392, "num_examples": 162, "num_bytes": 11255364}[0m
[34m#metrics {"StartTime": 1652799552.6221077, "EndTime": 1652799552.6221867, "Dimensions": {"Algorithm": "Linear Learner", "Host": "algo-1", "Operation": "training", "epoch": 4, "model": 0}, "Metrics": {"train_binary_classification_cross_entropy_objective": {"sum": 0.4288011435988527, "count": 1, "min": 0.4288011435988527, "max": 0.4288011435988527}}}[0m
[34m#metrics {"StartTime": 1652799552.6222692, "EndTime": 1652799552.6222851, "Dimensions": {"Algorithm": "Linear Learner", "Host": "algo-1", "Operation": "training", "epoch": 4, "model": 1}, "Metrics": {"train_binary_classification_cross_entropy_objective": {"sum": 0.4297925174310341, "count": 1, "min": 0.4297925174310341, "max": 0.4297925174310341}}}[0m
[34m#metrics {"StartTime": 1652799552.6223233, "EndTime": 1652799552.6223326, "Dimensions": {

[34m[2022-05-17 14:59:21.702] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 13, "duration": 7007, "num_examples": 162, "num_bytes": 11255364}[0m
[34m#metrics {"StartTime": 1652799561.702946, "EndTime": 1652799561.7030146, "Dimensions": {"Algorithm": "Linear Learner", "Host": "algo-1", "Operation": "training", "epoch": 5, "model": 0}, "Metrics": {"train_binary_classification_cross_entropy_objective": {"sum": 0.4286310943461353, "count": 1, "min": 0.4286310943461353, "max": 0.4286310943461353}}}[0m
[34m#metrics {"StartTime": 1652799561.7030847, "EndTime": 1652799561.7030993, "Dimensions": {"Algorithm": "Linear Learner", "Host": "algo-1", "Operation": "training", "epoch": 5, "model": 1}, "Metrics": {"train_binary_classification_cross_entropy_objective": {"sum": 0.42977134408417694, "count": 1, "min": 0.42977134408417694, "max": 0.42977134408417694}}}[0m
[34m#metrics {"StartTime": 1652799561.703132, "EndTime": 1652799561.7031417, "Dimensions": 

[34m[2022-05-17 14:59:29.644] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 15, "duration": 5913, "num_examples": 162, "num_bytes": 11255364}[0m
[34m#metrics {"StartTime": 1652799569.6447923, "EndTime": 1652799569.6448724, "Dimensions": {"Algorithm": "Linear Learner", "Host": "algo-1", "Operation": "training", "epoch": 6, "model": 0}, "Metrics": {"train_binary_classification_cross_entropy_objective": {"sum": 0.4284802704568235, "count": 1, "min": 0.4284802704568235, "max": 0.4284802704568235}}}[0m
[34m#metrics {"StartTime": 1652799569.6449487, "EndTime": 1652799569.644964, "Dimensions": {"Algorithm": "Linear Learner", "Host": "algo-1", "Operation": "training", "epoch": 6, "model": 1}, "Metrics": {"train_binary_classification_cross_entropy_objective": {"sum": 0.4297479009924468, "count": 1, "min": 0.4297479009924468, "max": 0.4297479009924468}}}[0m
[34m#metrics {"StartTime": 1652799569.645002, "EndTime": 1652799569.645012, "Dimensions": {"Al

[34m[2022-05-17 14:59:37.995] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 17, "duration": 6494, "num_examples": 162, "num_bytes": 11255364}[0m
[34m#metrics {"StartTime": 1652799577.9954371, "EndTime": 1652799577.995526, "Dimensions": {"Algorithm": "Linear Learner", "Host": "algo-1", "Operation": "training", "epoch": 7, "model": 0}, "Metrics": {"train_binary_classification_cross_entropy_objective": {"sum": 0.42834567028543225, "count": 1, "min": 0.42834567028543225, "max": 0.42834567028543225}}}[0m
[34m#metrics {"StartTime": 1652799577.995931, "EndTime": 1652799577.9959605, "Dimensions": {"Algorithm": "Linear Learner", "Host": "algo-1", "Operation": "training", "epoch": 7, "model": 1}, "Metrics": {"train_binary_classification_cross_entropy_objective": {"sum": 0.4297263080288905, "count": 1, "min": 0.4297263080288905, "max": 0.4297263080288905}}}[0m
[34m#metrics {"StartTime": 1652799577.996316, "EndTime": 1652799577.9963372, "Dimensions": {

[34m[2022-05-17 14:59:54.376] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 21, "duration": 6279, "num_examples": 162, "num_bytes": 11255364}[0m
[34m#metrics {"StartTime": 1652799594.3764718, "EndTime": 1652799594.3765726, "Dimensions": {"Algorithm": "Linear Learner", "Host": "algo-1", "Operation": "training", "epoch": 9, "model": 0}, "Metrics": {"train_binary_classification_cross_entropy_objective": {"sum": 0.42811967364009124, "count": 1, "min": 0.42811967364009124, "max": 0.42811967364009124}}}[0m
[34m#metrics {"StartTime": 1652799594.3766737, "EndTime": 1652799594.3767219, "Dimensions": {"Algorithm": "Linear Learner", "Host": "algo-1", "Operation": "training", "epoch": 9, "model": 1}, "Metrics": {"train_binary_classification_cross_entropy_objective": {"sum": 0.42967798647673233, "count": 1, "min": 0.42967798647673233, "max": 0.42967798647673233}}}[0m
[34m#metrics {"StartTime": 1652799594.3771303, "EndTime": 1652799594.3771603, "Dimensio

[34m[2022-05-17 15:00:02.545] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 23, "duration": 6297, "num_examples": 162, "num_bytes": 11255364}[0m
[34m#metrics {"StartTime": 1652799602.5457487, "EndTime": 1652799602.545864, "Dimensions": {"Algorithm": "Linear Learner", "Host": "algo-1", "Operation": "training", "epoch": 10, "model": 0}, "Metrics": {"train_binary_classification_cross_entropy_objective": {"sum": 0.42802632776106364, "count": 1, "min": 0.42802632776106364, "max": 0.42802632776106364}}}[0m
[34m#metrics {"StartTime": 1652799602.545991, "EndTime": 1652799602.5460117, "Dimensions": {"Algorithm": "Linear Learner", "Host": "algo-1", "Operation": "training", "epoch": 10, "model": 1}, "Metrics": {"train_binary_classification_cross_entropy_objective": {"sum": 0.42965176315781495, "count": 1, "min": 0.42965176315781495, "max": 0.42965176315781495}}}[0m
[34m#metrics {"StartTime": 1652799602.5460596, "EndTime": 1652799602.5460708, "Dimensio

[34m[2022-05-17 15:00:12.278] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 25, "duration": 7522, "num_examples": 162, "num_bytes": 11255364}[0m
[34m#metrics {"StartTime": 1652799612.2787743, "EndTime": 1652799612.2788622, "Dimensions": {"Algorithm": "Linear Learner", "Host": "algo-1", "Operation": "training", "epoch": 11, "model": 0}, "Metrics": {"train_binary_classification_cross_entropy_objective": {"sum": 0.4279447878961978, "count": 1, "min": 0.4279447878961978, "max": 0.4279447878961978}}}[0m
[34m#metrics {"StartTime": 1652799612.278945, "EndTime": 1652799612.2789617, "Dimensions": {"Algorithm": "Linear Learner", "Host": "algo-1", "Operation": "training", "epoch": 11, "model": 1}, "Metrics": {"train_binary_classification_cross_entropy_objective": {"sum": 0.4296253706653666, "count": 1, "min": 0.4296253706653666, "max": 0.4296253706653666}}}[0m
[34m#metrics {"StartTime": 1652799612.2790072, "EndTime": 1652799612.2790182, "Dimensions": 

[34m[2022-05-17 15:00:28.559] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 29, "duration": 6843, "num_examples": 162, "num_bytes": 11255364}[0m
[34m#metrics {"StartTime": 1652799628.559675, "EndTime": 1652799628.5597675, "Dimensions": {"Algorithm": "Linear Learner", "Host": "algo-1", "Operation": "training", "epoch": 13, "model": 0}, "Metrics": {"train_binary_classification_cross_entropy_objective": {"sum": 0.4278126760447247, "count": 1, "min": 0.4278126760447247, "max": 0.4278126760447247}}}[0m
[34m#metrics {"StartTime": 1652799628.5598571, "EndTime": 1652799628.5598722, "Dimensions": {"Algorithm": "Linear Learner", "Host": "algo-1", "Operation": "training", "epoch": 13, "model": 1}, "Metrics": {"train_binary_classification_cross_entropy_objective": {"sum": 0.42957386621036886, "count": 1, "min": 0.42957386621036886, "max": 0.42957386621036886}}}[0m
[34m#metrics {"StartTime": 1652799628.559915, "EndTime": 1652799628.559926, "Dimensions":

[34m[2022-05-17 15:00:36.128] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 31, "duration": 5564, "num_examples": 162, "num_bytes": 11255364}[0m
[34m#metrics {"StartTime": 1652799636.1285133, "EndTime": 1652799636.128618, "Dimensions": {"Algorithm": "Linear Learner", "Host": "algo-1", "Operation": "training", "epoch": 14, "model": 0}, "Metrics": {"train_binary_classification_cross_entropy_objective": {"sum": 0.42775978597795, "count": 1, "min": 0.42775978597795, "max": 0.42775978597795}}}[0m
[34m#metrics {"StartTime": 1652799636.1287255, "EndTime": 1652799636.1289496, "Dimensions": {"Algorithm": "Linear Learner", "Host": "algo-1", "Operation": "training", "epoch": 14, "model": 1}, "Metrics": {"train_binary_classification_cross_entropy_objective": {"sum": 0.4295464667326175, "count": 1, "min": 0.4295464667326175, "max": 0.4295464667326175}}}[0m
[34m#metrics {"StartTime": 1652799636.1292918, "EndTime": 1652799636.1293392, "Dimensions": {"Algo


2022-05-17 15:01:18 Uploading - Uploading generated training model
2022-05-17 15:01:18 Completed - Training job completed
ProfilerReport-1652799333: NoIssuesFound
Training seconds: 234
Billable seconds: 234


In [161]:
linear_training_params = {
    "RoleArn": role,
    "TrainingJobName": job_name,
    "AlgorithmSpecification": {"TrainingImage": container, "TrainingInputMode": "File"},
    "ResourceConfig": {"InstanceCount": 1, "InstanceType": "ml.m5.large", "VolumeSizeInGB": 10},
    "InputDataConfig": [
        {
            "ChannelName": "train",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": s3_train_data,
                    "S3DataDistributionType": "ShardedByS3Key",
                }
            },
            "CompressionType": "None",
            "RecordWrapperType": "None",
        },
        {
            "ChannelName": "validation",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": s3_validation_data,
                    "S3DataDistributionType": "FullyReplicated",
                }
            },
            "CompressionType": "None",
            "RecordWrapperType": "None",
        },
    ],
    "OutputDataConfig": {"S3OutputPath": output_location},
    "HyperParameters": {
        "feature_dim": "75",
        "mini_batch_size": "300",
        "predictor_type": "binary_classifier",
        "epochs": "5",
        "num_models": "1",
        "loss": "absolute_loss",
    },
    "StoppingCondition": {"MaxRuntimeInSeconds": 60 * 60},
}

In [162]:
sm.create_training_job(**linear_training_params)

{'TrainingJobArn': 'arn:aws:sagemaker:ap-northeast-1:547760918250:training-job/linear-learner-ctr-prediction-20220517-15-26-00',
 'ResponseMetadata': {'RequestId': 'a50b716a-c8e4-41dd-8057-87167eb6bf78',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'a50b716a-c8e4-41dd-8057-87167eb6bf78',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '127',
   'date': 'Tue, 17 May 2022 15:26:00 GMT'},
  'RetryAttempts': 0}}

In [167]:
status = sm.describe_training_job(TrainingJobName=job_name)["TrainingJobStatus"]
status

In [273]:
linear = sagemaker.LinearLearner(
    role=role,
    train_instance_count=1,
    train_instance_type="ml.m5.large",
    output_path=output_location,
    predictor_type="binary_classifier",
    sagemaker_session=sess
)

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [274]:
train = linear.record_set(train_X.astype('float32'), labels=train_y.astype('float32'), channel='train')
validation = linear.record_set(validation_X.astype('float32'), labels=validation_y.astype('float32'), channel='validation')


In [275]:
linear.fit([train, validation], mini_batch_size=200, wait=False, job_name=job_name)



Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


## Script Mode

In [282]:
df_train = pd.read_csv("../avazu-ctr-prediction/train_partial", dtype="object")
df_train = df_train[df_train.index % 10 ==1]
df_train, df_validation = train_test_split(df_train, train_size=0.8, random_state=42)


In [287]:
df_train.to_csv('train.csv', index=False)
df_validation.to_csv('validation.csv', index=False)

In [288]:
prefix = 'trainer_script_mode'
train_file = "train.csv"

s3_resource_bucket = boto3.Session().resource("s3").Bucket(bucket)
s3_resource_bucket.Object(os.path.join(prefix, "train", train_file)).upload_file('train.csv')


In [289]:
prefix = 'trainer_script_mode'
validation_file = "validation.csv"

s3_resource_bucket = boto3.Session().resource("s3").Bucket(bucket)
s3_resource_bucket.Object(os.path.join(prefix, "validation", validation_file)).upload_file('validation.csv')


In [297]:
job_name = "script-mode-ctr-prediction-" + strftime("%Y%m%d-%H-%M-%S", gmtime())

s3_train_data = f"s3://{bucket}/{prefix}/train/{train_file}"
s3_validation_data = f"s3://{bucket}/{prefix}/validation/{validation_file}"


In [302]:
from sagemaker.sklearn.estimator import SKLearn

hyperparameters = {"alpha": 0.00001, "eta0": 2.0}
enable_local_mode_training=False
if enable_local_mode_training:
    train_instance_type = "local"
    inputs = {"train": f"file://{train_dir}", "test": f"file://{test_dir}"}
else:
    train_instance_type = "ml.m5.large"
    inputs = {"train": s3_train_data, "validation": s3_validation_data}

estimator_parameters = {
    "entry_point": "sklearn_script_mode.py",
    "source_dir": "myscript",
    "framework_version": "0.23-1",
    "py_version": "py3",
    "instance_type": train_instance_type,
    "instance_count": 1,
    "hyperparameters": hyperparameters,
    "role": role,
    "base_job_name": job_name,
}

estimator = SKLearn(**estimator_parameters)
estimator.fit(inputs)

2022-05-19 01:46:28 Starting - Starting the training job...
2022-05-19 01:46:52 Starting - Preparing the instances for trainingProfilerReport-1652924787: InProgress
.........
2022-05-19 01:48:17 Downloading - Downloading input data...
2022-05-19 01:48:58 Training - Downloading the training image......
2022-05-19 01:49:53 Training - Training image download completed. Training in progress.[34m2022-05-19 01:49:52,045 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2022-05-19 01:49:52,049 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-05-19 01:49:52,062 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2022-05-19 01:49:52,480 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-05-19 01:49:52,496 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-05-19 01:49:52,516 sage

In [303]:

hyperparameters = {"alpha": 0.00001, "eta0": 2.0}
enable_local_mode_training=False
if enable_local_mode_training:
    train_instance_type = "local"
    inputs = {"train": f"file://{train_dir}", "test": f"file://{test_dir}"}
else:
    train_instance_type = "ml.m5.large"
    inputs = {"train": s3_train_data, "validation": s3_validation_data}

estimator_parameters = {
    "entry_point": "sklearn_script_mode.py",
    "source_dir": "myscript",
    "dependencies": ["my_custom_library"],
    "framework_version": "0.23-1",
    "py_version": "py3",
    "instance_type": train_instance_type,
    "instance_count": 1,
    "hyperparameters": hyperparameters,
    "role": role,
    "base_job_name": job_name,
}

estimator = SKLearn(**estimator_parameters)
estimator.fit(inputs)

2022-05-19 13:26:47 Starting - Starting the training job...
2022-05-19 13:27:04 Starting - Preparing the instances for trainingProfilerReport-1652966803: InProgress
.........
2022-05-19 13:28:40 Downloading - Downloading input data...
2022-05-19 13:29:20 Training - Downloading the training image...
2022-05-19 13:30:01 Training - Training image download completed. Training in progress..[34m2022-05-19 13:30:04,581 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2022-05-19 13:30:04,587 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-05-19 13:30:04,604 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2022-05-19 13:30:05,020 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-05-19 13:30:05,060 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-05-19 13:30:05,087 sagema

# カスタムコンテナ

In [320]:
%%sh

chmod +x my_custom_container/trainer.py

# Specify an algorithm name
algorithm_name=ctr-prediction-custom-container

account=$(aws sts get-caller-identity --query Account --output text)

# Get the region defined in the current configuration (default to us-west-2 if none defined)
region=$(aws configure get region)
# region=${region:-us-west-2}

fullname="${account}.dkr.ecr.${region}.amazonaws.com/${algorithm_name}:latest"
echo $fullname
# If the repository doesn't exist in ECR, create it.

aws ecr describe-repositories --repository-names "${algorithm_name}" > /dev/null 2>&1
if [ $? -ne 0 ]
then
aws ecr create-repository --repository-name "${algorithm_name}" > /dev/null
fi

# Get the login command from ECR and execute it directly

aws ecr get-login-password --region ${region}|docker login --username AWS --password-stdin ${fullname}

# Build the docker image locally with the image name and then push it to ECR
# with the full name.

docker build -t ${algorithm_name} -f my_custom_container/Dockerfile .
docker tag ${algorithm_name} ${fullname}

docker push ${fullname}

547760918250.dkr.ecr.ap-northeast-1.amazonaws.com/ctr-prediction-custom-container:latest
Login Succeeded
The push refers to repository [547760918250.dkr.ecr.ap-northeast-1.amazonaws.com/ctr-prediction-custom-container]
c3c5d90ac7c2: Preparing
cd282fc76b3a: Preparing
e4fda2003244: Preparing
da722eec4a58: Preparing
17f8ed900284: Preparing
b9332077cac5: Preparing
e34cf1ca2414: Preparing
a51c7c8eb570: Preparing
6be90f1a2d3f: Preparing
b9332077cac5: Waiting
e34cf1ca2414: Waiting
a51c7c8eb570: Waiting
6be90f1a2d3f: Waiting
da722eec4a58: Layer already exists
17f8ed900284: Layer already exists
b9332077cac5: Layer already exists
e34cf1ca2414: Layer already exists
a51c7c8eb570: Layer already exists
6be90f1a2d3f: Layer already exists
c3c5d90ac7c2: Pushed
cd282fc76b3a: Pushed
e4fda2003244: Pushed
latest: digest: sha256:859c9f6618766913c31e32115ef60ee91bdef97493fe8eeb85646dffda17f82c size: 2215


#1 [internal] load build definition from Dockerfile
#1 sha256:40a5e41ba93d7f6d2bb8a550f7a3f522717fb2f1f79cfc3340c15bca30ed1934
#1 transferring dockerfile: 496B 0.0s done
#1 DONE 0.0s

#2 [internal] load .dockerignore
#2 sha256:4284d65fc32c9dc0c96e34e2558a755c21784a0c8b970c592a9f9e80e3f64f2e
#2 transferring context: 2B done
#2 DONE 0.0s

#3 [internal] load metadata for docker.io/library/python:3.8-slim-buster
#3 sha256:a82ddfb0a3c3ab3f4e2ebc7582cec39f26df7d1ae41d54f70ea9fe596d7b25c7
#3 ...

#4 [auth] library/python:pull token for registry-1.docker.io
#4 sha256:cd2a5aa972c1ff1a775a4871d830a7ed079fae10cfa9f212eb5d56abee1f2bc6
#4 DONE 0.0s

#3 [internal] load metadata for docker.io/library/python:3.8-slim-buster
#3 sha256:a82ddfb0a3c3ab3f4e2ebc7582cec39f26df7d1ae41d54f70ea9fe596d7b25c7
#3 DONE 1.9s

#5 [1/5] FROM docker.io/library/python:3.8-slim-buster@sha256:234da35e659c02a785a1e9d2002e386bea80f293572f90ad0cf668e8f9084078
#5 sha256:cf4565bb4397b979fc3e1a1eb2630859e81b76a087949de99730973f

In [321]:
from sagemaker.estimator import Estimator

job_name = "script-mode-ctr-prediction-" + strftime("%Y%m%d-%H-%M-%S", gmtime())

hyperparameters = {"alpha": 0.00001, "eta0": 2.0}
enable_local_mode_training=False
if enable_local_mode_training:
    train_instance_type = "local"
    inputs = {"train": f"file://{train_dir}", "test": f"file://{test_dir}"}
else:
    train_instance_type = "ml.m5.large"
    inputs = {"train": s3_train_data, "validation": s3_validation_data}
    
estimator = Estimator(image_uri='547760918250.dkr.ecr.ap-northeast-1.amazonaws.com/ctr-prediction-custom-container:latest',
                      role=role,
                      instance_count=1,
                      instance_type=train_instance_type,
                      hyperparameters=hyperparameters,
                     base_job_name=job_name)

estimator.fit(inputs)

2022-05-20 01:44:12 Starting - Starting the training job...
2022-05-20 01:44:35 Starting - Preparing the instances for trainingProfilerReport-1653011051: InProgress
.........
2022-05-20 01:46:10 Downloading - Downloading input data
2022-05-20 01:46:10 Training - Training image download completed. Training in progress..[34m2022-05-20 01:46:11,999 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-05-20 01:46:12,028 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-05-20 01:46:12,049 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-05-20 01:46:12,067 sagemaker-training-toolkit INFO     Invoking user script[0m
[34mTraining Env:[0m
[34m{
    "additional_framework_parameters": {},
    "channel_input_dirs": {
        "train": "/opt/ml/input/data/train",
        "validation": "/opt/ml/input/data/validation"
    },
    "current_host": "algo-1",
    "

# training-toolkitを使わない

In [336]:
%%sh

chmod +x my_scratch_container/trainer.py

# Specify an algorithm name
algorithm_name=ctr-prediction-scratch-container

account=$(aws sts get-caller-identity --query Account --output text)

# Get the region defined in the current configuration (default to us-west-2 if none defined)
region=$(aws configure get region)
# region=${region:-us-west-2}

fullname="${account}.dkr.ecr.${region}.amazonaws.com/${algorithm_name}:latest"
echo $fullname
# If the repository doesn't exist in ECR, create it.

aws ecr describe-repositories --repository-names "${algorithm_name}" > /dev/null 2>&1
if [ $? -ne 0 ]
then
aws ecr create-repository --repository-name "${algorithm_name}" > /dev/null
fi

# Get the login command from ECR and execute it directly

aws ecr get-login-password --region ${region}|docker login --username AWS --password-stdin ${fullname}

# Build the docker image locally with the image name and then push it to ECR
# with the full name.

docker build -t ${algorithm_name} -f my_scratch_container/Dockerfile .
docker tag ${algorithm_name} ${fullname}

docker push ${fullname}

547760918250.dkr.ecr.ap-northeast-1.amazonaws.com/ctr-prediction-scratch-container:latest
Login Succeeded
The push refers to repository [547760918250.dkr.ecr.ap-northeast-1.amazonaws.com/ctr-prediction-scratch-container]
e028bf015659: Preparing
e4fda2003244: Preparing
da722eec4a58: Preparing
17f8ed900284: Preparing
b9332077cac5: Preparing
e34cf1ca2414: Preparing
a51c7c8eb570: Preparing
6be90f1a2d3f: Preparing
e34cf1ca2414: Waiting
a51c7c8eb570: Waiting
6be90f1a2d3f: Waiting
e4fda2003244: Layer already exists
17f8ed900284: Layer already exists
b9332077cac5: Layer already exists
da722eec4a58: Layer already exists
e34cf1ca2414: Layer already exists
6be90f1a2d3f: Layer already exists
a51c7c8eb570: Layer already exists
e028bf015659: Pushed
latest: digest: sha256:faca32ebf725ab485756a60ef6601761e4e3142db856c78709dfd815af63d9b1 size: 2003


#1 [internal] load build definition from Dockerfile
#1 sha256:354ea2c4118967202a2fc111400db2b1eee8c2860aa4b531fd926e1772a97937
#1 transferring dockerfile: 37B done
#1 DONE 0.0s

#2 [internal] load .dockerignore
#2 sha256:48a3a93fa3bd88f35e9ea29d348365ce427a4a794c36b004f3f14d4dbd22240a
#2 transferring context: 2B done
#2 DONE 0.0s

#3 [internal] load metadata for docker.io/library/python:3.8-slim-buster
#3 sha256:a82ddfb0a3c3ab3f4e2ebc7582cec39f26df7d1ae41d54f70ea9fe596d7b25c7
#3 ...

#4 [auth] library/python:pull token for registry-1.docker.io
#4 sha256:5f87b4b725047833b4d067a58710032b865b43ce29aedb02d5c5c4f75355f451
#4 DONE 0.0s

#3 [internal] load metadata for docker.io/library/python:3.8-slim-buster
#3 sha256:a82ddfb0a3c3ab3f4e2ebc7582cec39f26df7d1ae41d54f70ea9fe596d7b25c7
#3 DONE 1.9s

#5 [1/5] FROM docker.io/library/python:3.8-slim-buster@sha256:234da35e659c02a785a1e9d2002e386bea80f293572f90ad0cf668e8f9084078
#5 sha256:cf4565bb4397b979fc3e1a1eb2630859e81b76a087949de99730973f2846ce

In [337]:
from sagemaker.estimator import Estimator

job_name = "scratch-container-ctr-prediction-" + strftime("%Y%m%d-%H-%M-%S", gmtime())

hyperparameters = {"alpha": 0.00001, "eta0": 2.0}
enable_local_mode_training=True
if enable_local_mode_training:
    train_instance_type = "local"
    inputs = {"train": f"file://local_data/train", "validation": f"file://local_data/validation"}
else:
    train_instance_type = "ml.m5.large"
    inputs = {"train": s3_train_data, "validation": s3_validation_data}
    
estimator = Estimator(image_uri='547760918250.dkr.ecr.ap-northeast-1.amazonaws.com/ctr-prediction-scratch-container:latest',
                      role=role,
                      instance_count=1,
                      instance_type=train_instance_type,
                      hyperparameters=hyperparameters,
                     base_job_name=job_name)

train_config = sagemaker.session.s3_input(
    "s3://ctr-prediction/trainer_script_mode/train/", content_type="text/csv"
)
val_config = sagemaker.session.s3_input(
    "s3://ctr-prediction/trainer_script_mode/validation/", content_type="text/csv"
)

estimator.fit(inputs)
# estimator.fit({"train": train_config, "validation": val_config})


The class sagemaker.session.s3_input has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The class sagemaker.session.s3_input has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


Creating 2gzxrk4wxs-algo-1-xdqps ... 
Creating 2gzxrk4wxs-algo-1-xdqps ... done
Docker Compose is now in the Docker CLI, try `docker compose up`

Attaching to 2gzxrk4wxs-algo-1-xdqps
[36m2gzxrk4wxs-algo-1-xdqps |[0m 
[36m2gzxrk4wxs-algo-1-xdqps |[0m Running training...
[36m2gzxrk4wxs-algo-1-xdqps |[0m {'alpha': '1e-05', 'eta0': '2.0'}
[36m2gzxrk4wxs-algo-1-xdqps |[0m 
[36m2gzxrk4wxs-algo-1-xdqps |[0m Hyperparameters configuration:
[36m2gzxrk4wxs-algo-1-xdqps |[0m 
[36m2gzxrk4wxs-algo-1-xdqps |[0m Input data configuration:
[36m2gzxrk4wxs-algo-1-xdqps |[0m {'train': {'TrainingInputMode': 'File'}, 'validation': {'TrainingInputMode': 'File'}}
[36m2gzxrk4wxs-algo-1-xdqps |[0m 
[36m2gzxrk4wxs-algo-1-xdqps |[0m List of files in train channel: 
[36m2gzxrk4wxs-algo-1-xdqps |[0m /opt/ml/input/data/train/train.csv
[36m2gzxrk4wxs-algo-1-xdqps |[0m 
[36m2gzxrk4wxs-algo-1-xdqps |[0m List of files in validation channel: 
[36m2gzxrk4wxs-algo-1-xdqps |[0m /opt/ml/input/data/

In [329]:
s3_train_data

's3://ctr-prediction/trainer_script_mode/train/train.csv'

In [330]:
s3_validation_data

's3://ctr-prediction/trainer_script_mode/validation/validation.csv'