In [1]:
import os
import boto3
import re
import sagemaker
import pandas as pd
import numpy as np
from pathlib import Path
from sagemaker.image_uris import retrieve
from time import gmtime, strftime
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

role = sagemaker.get_execution_role()
region = boto3.Session().region_name

## Setting params

In [2]:
params_cols = ["exp", "imp", "imae", "crude_oil_brent", "gas_henry_hub", "treasury3month", "yuan_dollar_spot_tc", "year", "month"]
params_trainperc = 0.7
# S3 bucket for training data.
# Feel free to specify a different bucket and prefix.
data_bucket = f"x4data"
data_prefix = "linear_regresion"

## Preparing the data

Reading the data

In [3]:
data = pd.read_csv(Path.cwd().parent.joinpath("input", "data4x_log.csv"))
data.head()

Unnamed: 0,date,exp,imp,imae,crude_oil_brent,gas_henry_hub,treasury3month,yuan_dollar_spot_tc,year,month
0,1999-01-01,6.270156,6.29098,52.19728,11.1145,1.849474,4.335263,8.278921,1999,1
1,1999-02-01,6.349934,6.274894,48.747353,10.271579,1.771053,4.438421,8.278137,1999,2
2,1999-03-01,6.37251,6.396669,50.386924,12.511304,1.791739,4.44087,8.279152,1999,3
3,1999-04-01,6.44368,6.1927,47.016367,15.2945,2.15,4.289545,8.279186,1999,4
4,1999-05-01,6.317601,6.250723,48.617007,15.225263,2.26,4.4985,8.278535,1999,5


Viewing Na's values

In [4]:
data.isna().sum()

date                   0
exp                    2
imp                    2
imae                   2
crude_oil_brent        0
gas_henry_hub          0
treasury3month         0
yuan_dollar_spot_tc    0
year                   0
month                  0
dtype: int64

We remove the two last rows observations because we have incomplete data

In [5]:
data2 = data[:(len(data)-2)]
data3 = data2.copy()
data2 = data2.drop("date", axis=1)

In [6]:
data2.isna().sum()

exp                    0
imp                    0
imae                   0
crude_oil_brent        0
gas_henry_hub          0
treasury3month         0
yuan_dollar_spot_tc    0
year                   0
month                  0
dtype: int64

Let's split the data into training and test datasets

In [7]:
rows, train_rows = data2.shape[0], np.ceil(data2.shape[0]*params_trainperc),
val_rows, test_rows = np.ceil((rows-train_rows)/2), data2.shape[0] - (np.ceil((rows-train_rows)/2) + np.ceil(data2.shape[0]*params_trainperc))
print(
      f"The dataset has {rows} rows"
      f"\n"
      f"The train data has {train_rows} rows and ends with {data3.loc[train_rows, ['date']].to_numpy()[0]}"
      f"\n"
      f"The validation data has {val_rows} rows and ends with {data3.loc[(train_rows+test_rows)-1, ['date']].to_numpy()[0]}"
      f"\n"
      f"The test data has {test_rows} rows and ends with {data3.loc[(train_rows+test_rows+val_rows)-1, ['date']].to_numpy()[0]}"
     )

The dataset has 283 rows
The train data has 199.0 rows and ends with 2015-08-01
The validation data has 42.0 rows and ends with 2019-01-01
The test data has 42.0 rows and ends with 2022-07-01


In [8]:
data2.loc[:train_rows].tail()

Unnamed: 0,exp,imp,imae,crude_oil_brent,gas_henry_hub,treasury3month,yuan_dollar_spot_tc,year,month
195,6.660942,7.128043,88.503044,59.524286,2.609048,0.023182,6.200995,2015,4
196,6.741396,7.086064,92.093511,64.075,2.85,0.0165,6.20345,2015,5
197,6.702157,7.070219,92.533938,61.477727,2.783636,0.015,6.2052,2015,6
198,6.63345,7.218925,93.839073,56.561304,2.839565,0.032273,6.20845,2015,7
199,6.575134,7.117701,92.74642,46.515,2.77381,0.071905,6.338252,2015,8


In [9]:
data2.loc[train_rows+1:(train_rows+val_rows)].head()

Unnamed: 0,exp,imp,imae,crude_oil_brent,gas_henry_hub,treasury3month,yuan_dollar_spot_tc,year,month
200,6.559553,7.074705,93.779577,47.623182,2.660909,0.022381,6.367605,2015,9
201,6.656148,7.17272,96.666006,48.43,2.340909,0.015238,6.3505,2015,10
202,6.656873,7.185846,98.425855,44.267619,2.092381,0.124737,6.364037,2015,11
203,6.586283,7.101069,97.873533,38.005455,1.929565,0.227727,6.449095,2015,12
204,6.563484,7.05842,94.535681,30.6995,2.282381,0.255263,6.5726,2016,1


In [10]:
data2.loc[train_rows+1:(train_rows+val_rows)].tail()

Unnamed: 0,exp,imp,imae,crude_oil_brent,gas_henry_hub,treasury3month,yuan_dollar_spot_tc,year,month
237,6.907252,7.357293,106.056533,81.032174,3.275217,2.249091,6.919105,2018,10
238,6.853132,7.330153,108.845482,64.748182,4.091,2.3265,6.936675,2018,11
239,6.747685,7.146274,107.932415,57.362353,4.041667,2.365789,6.883711,2018,12
240,6.714029,7.206761,101.343809,59.409545,3.108571,2.374286,6.786345,2019,1
241,6.789402,7.086702,101.785017,63.9605,2.692632,2.388421,6.736683,2019,2


In [11]:
data2.loc[(train_rows+val_rows)+1:(train_rows+test_rows+val_rows)].head()

Unnamed: 0,exp,imp,imae,crude_oil_brent,gas_henry_hub,treasury3month,yuan_dollar_spot_tc,year,month
242,6.952544,7.267086,105.795411,66.138571,2.948571,2.402381,6.711943,2019,3
243,6.859415,7.181361,99.841015,71.233333,2.647143,2.382857,6.716068,2019,4
244,6.964854,7.24037,103.663481,71.317727,2.639091,2.352727,6.851859,2019,5
245,6.874514,7.115993,103.546382,64.2205,2.3985,2.1705,6.897675,2019,6
246,6.827391,7.205442,102.411182,63.91913,2.36619,2.098636,6.877473,2019,7


In [12]:
data2.loc[(train_rows+val_rows)+1:(train_rows+test_rows+val_rows)].head()

Unnamed: 0,exp,imp,imae,crude_oil_brent,gas_henry_hub,treasury3month,yuan_dollar_spot_tc,year,month
242,6.952544,7.267086,105.795411,66.138571,2.948571,2.402381,6.711943,2019,3
243,6.859415,7.181361,99.841015,71.233333,2.647143,2.382857,6.716068,2019,4
244,6.964854,7.24037,103.663481,71.317727,2.639091,2.352727,6.851859,2019,5
245,6.874514,7.115993,103.546382,64.2205,2.3985,2.1705,6.897675,2019,6
246,6.827391,7.205442,102.411182,63.91913,2.36619,2.098636,6.877473,2019,7


Saving the datasets

In [13]:
with Path.cwd().parent.joinpath("input", "data4x_log_train.csv") as e:
    data2.loc[:train_rows, params_cols].to_csv(e, sep=',', header=False, index=False)
with Path.cwd().parent.joinpath("input", "data4x_log_validation.csv") as e:
    data2.loc[train_rows+1:(train_rows+val_rows), params_cols].to_csv(e, sep=',', header=False, index=False)
with Path.cwd().parent.joinpath("input", "data4x_log_test.csv") as e:
    data2.loc[(train_rows+val_rows)+1:(train_rows+test_rows+val_rows), params_cols].to_csv(e, sep=',', header=False, index=False)

In [14]:
data2.loc[train_rows+1:(train_rows+val_rows)].dtypes

exp                    float64
imp                    float64
imae                   float64
crude_oil_brent        float64
gas_henry_hub          float64
treasury3month         float64
yuan_dollar_spot_tc    float64
year                     int64
month                    int64
dtype: object

Upload the data to the s3

In [15]:
s3 = boto3.Session().resource("s3")

file_train = "data4x_log_train.csv"
file_validation = "data4x_log_validation.csv"
file_test = "data4x_log_test.csv"

file_train_full = str(Path.cwd().parent.joinpath("input", file_train))
file_validation_full = str(Path.cwd().parent.joinpath("input", file_validation))
file_test_full = str(Path.cwd().parent.joinpath("input", file_test))

# downloading the train, test, and validation files from data_bucket
with open(file_train_full,'rb') as f:
    s3.Bucket(data_bucket).Object(f"{data_prefix}/train_csv/{file_train}").upload_fileobj(f)
with open(file_validation_full, 'rb') as f:
    s3.Bucket(data_bucket).Object(f"{data_prefix}/validation_csv/{file_validation}").upload_fileobj(f)
with open(file_test_full,'rb') as f:
    s3.Bucket(data_bucket).Object(f"{data_prefix}/test_csv/{file_test}").upload_fileobj(f)

Creating the chanels, the test will be used in inference

In [16]:
# creating the inputs for the fit() function with the training and validation location
s3_train_data = f"s3://{data_bucket}/{data_prefix}/train_csv/{file_train}"
print(f"training files will be taken from: {s3_train_data}")
s3_validation_data = f"s3://{data_bucket}/{data_prefix}/validation_csv/{file_validation}"
print(f"validation files will be taken from: {s3_validation_data}")
output_location = f"s3://{data_bucket}/{data_prefix}/model/output"
print(f"training artifacts output location: {output_location}")

# generating the session.s3_input() format for fit() accepted by the sdk
train_data = sagemaker.inputs.TrainingInput(
    s3_train_data,
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
    record_wrapping=None,
    compression=None,
)
validation_data = sagemaker.inputs.TrainingInput(
    s3_validation_data,
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
    record_wrapping=None,
    compression=None,
)

training files will be taken from: s3://x4data/linear_regresion/train_csv/data4x_log_train.csv
validation files will be taken from: s3://x4data/linear_regresion/validation_csv/data4x_log_validation.csv
training artifacts output location: s3://x4data/linear_regresion/model/output


## Training model

Set the container

In [17]:
container = retrieve("linear-learner", boto3.Session().region_name, version="1")
print(container)

382416733822.dkr.ecr.us-east-1.amazonaws.com/linear-learner:1


Set the estimator and the hyperparameters

In [18]:
%%time

sess = sagemaker.Session()

job_name = "linear-learner-x" + strftime("%Y%m%d-%H-%M-%S", gmtime())
print("Training job", job_name)

linear = sagemaker.estimator.Estimator(
    container,
    role,
    input_mode="File",
    instance_count=1,
    instance_type="ml.m5.large",
    output_path=output_location,
    sagemaker_session=sess,
)

linear.set_hyperparameters(
    feature_dim=8,
    epochs=16,
    wd=0.01,
    loss="absolute_loss",
    predictor_type="regressor",
    normalize_data=True,
    optimizer="adam",
    mini_batch_size=100,
    lr_scheduler_step=100,
    lr_scheduler_factor=0.99,
    lr_scheduler_minimum_lr=0.0001,
    learning_rate=0.1,
)

Training job linear-learner-x20220925-21-40-11
CPU times: user 129 ms, sys: 12.1 ms, total: 141 ms
Wall time: 152 ms


fitting the data with the channels

In [19]:
%%time
linear.fit(inputs={"train": train_data, "validation": validation_data}, job_name=job_name)

2022-09-25 21:40:11 Starting - Starting the training job...
2022-09-25 21:40:34 Starting - Preparing the instances for trainingProfilerReport-1664142011: InProgress
......
2022-09-25 21:41:35 Downloading - Downloading input data......
2022-09-25 21:42:35 Training - Downloading the training image.........
2022-09-25 21:44:00 Training - Training image download completed. Training in progress.[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[09/25/2022 21:44:03 INFO 140610159101760] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init_scale': '0.07', 'init_

[34m#metrics {"StartTime": 1664142252.0544906, "EndTime": 1664142252.0545335, "Dimensions": {"Algorithm": "Linear Learner", "Host": "algo-1", "Operation": "training", "epoch": 9, "model": 12}, "Metrics": {"validation_absolute_loss_objective": {"sum": 0.11427979242234003, "count": 1, "min": 0.11427979242234003, "max": 0.11427979242234003}}}[0m
[34m#metrics {"StartTime": 1664142252.0546675, "EndTime": 1664142252.0546944, "Dimensions": {"Algorithm": "Linear Learner", "Host": "algo-1", "Operation": "training", "epoch": 9, "model": 13}, "Metrics": {"validation_absolute_loss_objective": {"sum": 0.17288280668712797, "count": 1, "min": 0.17288280668712797, "max": 0.17288280668712797}}}[0m
[34m#metrics {"StartTime": 1664142252.0548255, "EndTime": 1664142252.0548527, "Dimensions": {"Algorithm": "Linear Learner", "Host": "algo-1", "Operation": "training", "epoch": 9, "model": 14}, "Metrics": {"validation_absolute_loss_objective": {"sum": 0.15331393196469262, "count": 1, "min": 0.153313931964


2022-09-25 21:44:37 Uploading - Uploading generated training model
2022-09-25 21:44:37 Completed - Training job completed
Training seconds: 183
Billable seconds: 183
CPU times: user 656 ms, sys: 73.9 ms, total: 730 ms
Wall time: 4min 48s


After training we can deploy the model to an endpoint, where we can do inference (predictions)

In [21]:
%%time
# creating the endpoint out of the trained model
linear_predictor = linear.deploy(initial_instance_count=1, instance_type="ml.m4.xlarge")
print(f"\ncreated endpoint: {linear_predictor.endpoint_name}")

--------!
created endpoint: linear-learner-2022-09-25-21-48-03-788
CPU times: user 124 ms, sys: 9.66 ms, total: 133 ms
Wall time: 4min 1s


## Inference

In [59]:
# configure the predictor to accept to serialize csv input and parse the reponse as json
linear_predictor.serializer = CSVSerializer()
linear_predictor.deserializer = JSONDeserializer()

test_data = [l for l in open(file_test_full, "r")]
# we select the last row 
sample = test_data[len(test_data)-1].split(",")
actual_exp = sample[0]
payload = sample[1:]  # removing actual age from the sample
payload = [s.replace("\n", "") for s in payload]
payload = ",".join(map(str, payload))
# Invoke the predicor and analyise the result
result = linear_predictor.predict(payload)
# extracting the prediction value
result = round(float(result["predictions"][0]["score"]), 2)
# compare
accuracy = str(round(100 - ((abs(float(result) - float(actual_exp)) / float(actual_exp)) * 100), 2))
print(f"Actual exp: {actual_exp}\nPrediction: {result}\nAccuracy: {accuracy}")
# transforming to exp
print(f"The actual exp in millions dollars: {np.exp(pd.to_numeric(actual_exp))}\nPrediction in millions dollars: {np.exp(result)}")

Actual exp: 7.165720018880082
Prediction: 7.0
Accuracy: 97.69
The actual exp in millions dollars: 1294.293180909999
Prediction in millions dollars: 1096.6331584284585


## Delete endpoint

In [61]:
sagemaker.Session().delete_endpoint(linear_predictor.endpoint_name)
print(f"deleted {linear_predictor.endpoint_name} successfully!")

deleted linear-learner-2022-09-25-21-48-03-788 successfully!
