In [1]:
import os
import pandas as pd
import yaml
import io
import boto3
from time import gmtime, strftime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
%matplotlib inline

import sagemaker
import sagemaker.amazon.common as smac
from sagemaker import image_uris


# Show up to 50 columns when a DataFrame is rendered in the notebook.
pd.options.display.max_columns = 50



In [3]:
import yaml
import sagemaker
import boto3
import pandas as pd

# Location of the YAML settings file and of the local folder the data is read from.
SETTING_FILE_PATH = "../config/settings.yaml"
DATA_FOLDER_PATH = "avazu-ctr-prediction"

# Parse the settings with the safe loader (no arbitrary object construction).
with open(SETTING_FILE_PATH) as settings_file:
    aws_info = yaml.safe_load(settings_file)

sess = sagemaker.Session()

# Everything this notebook needs lives under aws -> sagemaker in the settings file.
sagemaker_settings = aws_info['aws']['sagemaker']
role = sagemaker_settings['role']
bucket = sagemaker_settings['s3bucket']
region = sagemaker_settings['region']
# NOTE(review): `region` is not passed to the session or the clients below, so they
# use whatever default region the environment provides — confirm this is intended.

# Low-level AWS clients.
sm = boto3.client('sagemaker')
s3 = boto3.client('s3')


In [9]:
# Load the "train_partial" file with every column read as dtype "object", then
# split it 70% / 30% into train and test frames (shuffled, fixed seed for
# reproducibility).
df_train, df_test = train_test_split(
    pd.read_csv(os.path.join(DATA_FOLDER_PATH, "train_partial"), dtype="object"),
    train_size=0.7,
    shuffle=True,
    random_state=0,
)


In [11]:
# Local file names for the two splits; the same names are reused as the last
# component of the S3 object keys.
train_file = "train.csv"
test_file = "test.csv"

# Write both splits to the current working directory, without the index column.
df_train.to_csv(train_file, index=False)
df_test.to_csv(test_file, index=False)

# Key prefix under which everything for this notebook is stored in the bucket.
prefix = 'custom-script-training'

s3_resource_bucket = boto3.Session().resource("s3").Bucket(bucket)

# S3 keys always use "/" as the separator. Build them explicitly instead of with
# os.path.join, which would insert "\" on Windows and produce keys that do not
# match the "s3://.../train/..." and "s3://.../test/..." URIs used for training.
train_key = f"{prefix}/train/{train_file}"
test_key = f"{prefix}/test/{test_file}"

s3_resource_bucket.Object(train_key).upload_file(train_file)
s3_resource_bucket.Object(test_key).upload_file(test_file)


In [17]:
# All S3 locations for this notebook share the same bucket/prefix root.
s3_base_uri = f"s3://{bucket}/{prefix}"

# Where training jobs write their output.
output_location = f"{s3_base_uri}/output"

# Input channels: the CSV files uploaded under the train/ and test/ keys.
s3_train_data = f"{s3_base_uri}/train/{train_file}"
s3_test_data = f"{s3_base_uri}/test/{test_file}"

In [20]:
from sagemaker.sklearn.estimator import SKLearn

# `base_job_name` is only a prefix: when no explicit job name is given, the SDK
# appends its own timestamp and trims the prefix so the result fits SageMaker's
# 63-character job-name limit. Putting our own timestamp in the prefix therefore
# gets it cut off mid-value. Keep the prefix static and pass the full,
# timestamped name to fit() so the job is named exactly `job_name`.
base_job_name = "custom-script-ctr-prediction"
job_name = base_job_name + "-" + strftime("%Y%m%d-%H-%M-%S", gmtime())

# Forwarded to the entry-point script as hyperparameters.
hyperparameters = {"alpha": 0.00001, "eta0": 2.0}

# Flip to True to train in SageMaker local mode against the local CSV files
# instead of launching a managed training instance.
enable_local_mode_training = False

if enable_local_mode_training:
    train_instance_type = "local"
    inputs = {"train": f"file://{train_file}", "test": f"file://{test_file}"}
else:
    train_instance_type = "ml.m5.large"
    inputs = {"train": s3_train_data, "test": s3_test_data}

estimator_parameters = {
    "entry_point": "sklearn_script_mode.py",
    "source_dir": "myscript",
    "framework_version": "0.23-1",
    "py_version": "py3",
    "instance_type": train_instance_type,
    "instance_count": 1,
    "hyperparameters": hyperparameters,
    "output_path": output_location,
    "role": role,
    "base_job_name": base_job_name,
}

estimator = SKLearn(**estimator_parameters)
# Explicit job_name: the training job gets this exact name, untruncated.
estimator.fit(inputs, job_name=job_name)

2022-05-24 15:17:16 Starting - Starting the training job...
2022-05-24 15:17:40 Starting - Preparing the instances for trainingProfilerReport-1653405435: InProgress
.........
2022-05-24 15:19:00 Downloading - Downloading input data...
2022-05-24 15:19:41 Training - Downloading the training image.....[34m2022-05-24 15:20:32,946 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2022-05-24 15:20:32,951 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-05-24 15:20:32,968 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2022-05-24 15:20:33,475 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-05-24 15:20:33,494 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-05-24 15:20:33,514 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2

In [21]:
# `base_job_name` is only a prefix: when no explicit job name is given, the SDK
# appends its own timestamp and trims the prefix so the result fits SageMaker's
# 63-character job-name limit. With this longer prefix our own timestamp would
# be almost entirely cut off. Keep the prefix static and pass the full,
# timestamped name to fit() so the job is named exactly `job_name`.
base_job_name = "custom-library-script-ctr-prediction"
job_name = base_job_name + "-" + strftime("%Y%m%d-%H-%M-%S", gmtime())

# Forwarded to the entry-point script as hyperparameters.
hyperparameters = {"alpha": 0.0002, "eta0": 3.0}

# Flip to True to train in SageMaker local mode against the local CSV files
# instead of launching a managed training instance.
enable_local_mode_training = False

if enable_local_mode_training:
    train_instance_type = "local"
    inputs = {"train": f"file://{train_file}", "test": f"file://{test_file}"}
else:
    train_instance_type = "ml.m5.large"
    inputs = {"train": s3_train_data, "test": s3_test_data}

estimator_parameters = {
    "entry_point": "my_library_script_mode.py",
    "source_dir": "myscript",
    # Extra local code shipped with the job alongside the contents of source_dir.
    "dependencies": ["my_custom_library"],
    "framework_version": "0.23-1",
    "py_version": "py3",
    "instance_type": train_instance_type,
    "instance_count": 1,
    "hyperparameters": hyperparameters,
    "output_path": output_location,
    "role": role,
    "base_job_name": base_job_name,
}

estimator = SKLearn(**estimator_parameters)
# Explicit job_name: the training job gets this exact name, untruncated.
estimator.fit(inputs, job_name=job_name)

2022-05-26 01:11:31 Starting - Starting the training job...
2022-05-26 01:11:58 Starting - Preparing the instances for trainingProfilerReport-1653527491: InProgress
.........
2022-05-26 01:13:21 Downloading - Downloading input data...
2022-05-26 01:14:01 Training - Downloading the training image......
2022-05-26 01:15:01 Training - Training image download completed. Training in progress.[34m2022-05-26 01:14:49,344 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2022-05-26 01:14:49,347 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-05-26 01:14:49,364 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2022-05-26 01:14:49,728 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-05-26 01:14:49,748 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-05-26 01:14:49,767 sage