In [12]:
# Run configuration shared by the cells below.
model_name = "bagging"  # label used in the printed report headers
random_state = 42  # seed handed to the estimators, the CV splitter and the search
bayesian_search = True  # True: tune with BayesSearchCV; False: fit the model once as configured

# Echo the switches so the saved output records which mode was run.
print(
    f"model_name: {model_name}",
    f"bayesian_search: {bayesian_search}",
    sep="\n",
)

model_name: bagging
bayesian_search: True


In [13]:
# Get the Amazon SageMaker Boto 3 Client
import sagemaker
import boto3
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.session import s3_input, Session
from sagemaker import get_execution_role

sess = sagemaker.Session()

## Region and SageMaker Client
# Build a single boto3 session and reuse it, so the reported region and the
# SageMaker client are guaranteed to come from the same configuration.
boto_session = boto3.Session()
region = boto_session.region_name
smclient = boto_session.client('sagemaker')
print(f"Region: {region}")

## Get the SageMaker IAM Execution Role
# Get the execution role for the notebook instance.
# This is the IAM role that you created for your notebook instance.
# You pass the role to the tuning job.
# This is the role that SageMaker would use to leverage AWS resources.
role = get_execution_role()
# NOTE(review): the printed ARN embeds the AWS account id; consider clearing
# this cell's output before sharing the notebook.
print(f"Execution role: {role}")

## Specify a Bucket and Data Output Location
# bucket = sess.default_bucket() # Replace with your own bucket name if needed
bucket = 'petprojects'
prefix = 'gscreen'
## AWS Sagemaker: Define path to save models
output_dir = f"s3://{bucket}/{prefix}/output"
print(f"Output folder: {output_dir}")

Region: us-east-1
Execution role: arn:aws:iam::600890512379:role/service-role/AmazonSageMaker-ExecutionRole-20210115T110258
Output folder: s3://petprojects/gscreen/output


In [14]:
import os
import sys
import urllib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib
import time

from sklearn import model_selection
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from sklearn import tree
from sklearn import ensemble

# Top-level package that provides Pipeline (the text before the first dot of
# its module path). str.split always yields at least one element, so a module
# path without a dot is returned unchanged and no exception handling is needed.
skl = Pipeline.__module__.split(".", 1)[0]

# Report the versions of the main libraries; the package objects are looked up
# in sys.modules by name because only submodules/classes were imported above.
print(f"Numpy: {np.__version__}")
print(f"Pandas: {pd.__version__}")
print(f"Matplotlib: {sys.modules[plt.__package__].__version__}")
print(f"Scikit-learn: {sys.modules[skl].__version__}")

Numpy: 1.19.4
Pandas: 1.0.1
Matplotlib: 3.1.3
Scikit-learn: 0.22.1


In [15]:
try:
  from skopt import BayesSearchCV
  print(f"The scikit-optimize package exist.")
except Exception as e:
  print(f"Import scikit-optimize error: {e}")
  print(f"Try to re-install scikit-optimize package")
  ! pip install scikit-optimize

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

print (f"{sys.modules[BayesSearchCV.__module__[:BayesSearchCV.__module__.index('.')]].__version__}")

The scikit-optimize package exist.
0.8.1


In [16]:
# Load preprocessed data from S3
data_dir = "gscreen/data/"  # S3 key prefix of the preprocessed artifacts
s3client = boto3.client('s3')


def load_joblib_from_s3(filename):
    """Download one artifact from S3 and deserialize it with joblib.

    The object ``data_dir + filename`` is fetched from ``bucket`` into the
    current working directory under the same ``filename`` and then loaded.

    Parameters
    ----------
    filename : str
        Name of the artifact, relative to ``data_dir``.

    Returns
    -------
    object
        Whatever object was stored in the joblib file.
    """
    s3client.download_file(bucket, data_dir + filename, filename)
    # NOTE(review): joblib.load unpickles the file, which can execute arbitrary
    # code - only point this at a bucket whose contents you trust.
    with open(filename, "rb") as f:
        return joblib.load(f)


# Feature matrices
X_fit = load_joblib_from_s3('X_fit.joblib.compressed')
X_train = load_joblib_from_s3('X_train.joblib.compressed')
X_val = load_joblib_from_s3('X_val.joblib.compressed')

print(f"X_fit: {X_fit.shape}, {type(X_fit)}\
\nX_train: {X_train.shape}, {type(X_train)}\
\nX_val: {X_val.shape}, {type(X_val)}\n")

# Target vectors
y_fit = load_joblib_from_s3('y_fit.joblib')
y_train = load_joblib_from_s3('y_train.joblib')
y_val = load_joblib_from_s3('y_val.joblib')

print(f"y_fit: {y_fit.shape}, {type(y_fit)}\
\ny_train: {y_train.shape}, {type(y_train)}\
\ny_val: {y_val.shape}, {type(y_val)}\n")

X_fit: (296721, 279), <class 'scipy.sparse.csr.csr_matrix'>
X_train: (296727, 279), <class 'scipy.sparse.csr.csr_matrix'>
X_val: (5000, 279), <class 'scipy.sparse.csr.csr_matrix'>

y_fit: (296721,), <class 'numpy.ndarray'>
y_train: (296727,), <class 'numpy.ndarray'>
y_val: (5000,), <class 'numpy.ndarray'>



In [17]:
#%% Define model parameters for starting tuning
# Base learner that the bagging ensemble replicates.
base_tree = tree.ExtraTreeRegressor(
    criterion="mse",  # {"mse", "friedman_mse", "mae"} default="mse"
    splitter="random",  # {"random", "best"} default="random"
    max_depth=None,  # default=None
    min_samples_split=2,  # default=2
    min_samples_leaf=1,  # default=1
    random_state=random_state,
)

# Starting configuration of the ensemble; when the Bayesian search is enabled
# it overrides the entries that appear in its search space.
model_params = dict(
    base_estimator=base_tree,
    n_estimators=80,
    max_samples=1.0,
    max_features=1.0,
    bootstrap=True,
    bootstrap_features=False,
    oob_score=False,
    n_jobs=None,
    random_state=random_state,
    verbose=3,
)
model = ensemble.BaggingRegressor(**model_params)

In [18]:
#%% ----------------------- Bayesian Optimization ------------------------------
# The core idea of Bayesian Optimization is simple:
# when a region of the space turns out to be good, it should be explored more.
# Dimension types used to describe the space:
#   Real        - continuous hyperparameter space
#   Integer     - discrete hyperparameter space
#   Categorical - categorical hyperparameter space
bayes_space = dict(
    n_estimators=Integer(50, 100),
    max_samples=Real(0.5, 1.0),
    bootstrap=Categorical([True, False]),
)
# Currently excluded from the search (kept for reference):
#   "max_features": Real(0.7, 1.0)
#   "bootstrap_features": Categorical([True, False])

In [19]:
def accuracy(real_rates, predicted_rates):
    """Project metric: mean absolute deviation of real/predicted from 1, in percent.

    A value of 0.0 means every prediction equals the real value; larger values
    mean larger relative deviations.
    """
    # Relative deviation of each real value from its prediction.
    ratio_deviation = real_rates / predicted_rates - 1.0
    mean_abs_deviation = np.average(np.abs(ratio_deviation))
    return mean_abs_deviation * 100.0

def calc_metrics(model, X, y):
    """Print (and return) the error metrics of ``model`` on one data split.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    X : features passed to ``model.predict``
    y : true target values for ``X``

    Returns
    -------
    dict
        Metric label -> value, in the order the metrics are printed.
    """

    from sklearn.metrics import max_error
    from sklearn.metrics import mean_absolute_error
    from sklearn.metrics import mean_squared_error

    y_true = y
    y_pred = model.predict(X)

    metrics = {
        "Max Error": max_error(y_true, y_pred),
        "Mean Absolute Error": mean_absolute_error(y_true, y_pred),
        # Take the square root explicitly instead of passing squared=False:
        # that keyword is not available in every scikit-learn release, while
        # this form gives the same value for a single-output target.
        "Root Mean Squared Error": np.sqrt(mean_squared_error(y_true, y_pred)),
        "Mean Absolute Ratio Error": accuracy(y_true, y_pred),
    }

    for label, value in metrics.items():
        print(f"\t{label}: {value}")

    return metrics

In [20]:
#%% Define custom scorer to evaluate basic models
# `accuracy` is a percentage error: 0 is a perfect fit and larger is worse.
# It therefore has to be registered as a loss (greater_is_better=False);
# the scorer then negates it so that the search, which maximizes the score,
# picks the candidate with the smallest error. Scores reported by the search
# are consequently the negated error.
scorer = make_scorer(
    score_func=accuracy,
    greater_is_better=False,
)
# Define classic cross validation method and params
# 10 splits repeated twice -> 20 fits per evaluated candidate.
cv = model_selection.RepeatedKFold(
    n_splits=10,  #! Dont forget
    n_repeats=2,
    random_state=random_state,
)

In [None]:
#%% ----------------------- Bayesian Optimization ------------------------------
# Empty placeholders so both names exist even when the search is switched off.
bs_bp = {}
bs_be = {}
if bayesian_search:
    tic = time.time()  # start of the search timer (read again when reporting)
    # Settings of the Bayesian search
    search_settings = dict(
        search_spaces=bayes_space,
        n_iter=30,  #! default=50 Number of parameter settings that are sampled.
        scoring=scorer,
        n_jobs=None,  # Number of jobs to run in parallel. -1 = using all processors
        cv=cv,  # default 3-fold cross validation.
        refit=True,
        verbose=3,
        return_train_score=True,
        random_state=random_state,
    )
    bayes_search = BayesSearchCV(model, **search_settings)
    # Run the search; the result of fit() is what the reporting cells read.
    model_bayes_search = bayes_search.fit(X_fit, y_fit)

Fitting 20 folds for each of 1 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


In [None]:
# Print out best parameters
if bayesian_search:
    bs_bp = model_bayes_search.best_params_
    bs_be = model_bayes_search.best_estimator_
    print(f"Bayesian search:\nBest params are:\n {bs_bp}")
    print(f"{bs_be}")
    print(f"\nBest score is:\n{model_bayes_search.best_score_}")

    # Elapsed time since `tic` was set in the search cell. The parts are named
    # minutes/seconds so the builtin min() is not overwritten in the notebook
    # namespace. NOTE(review): the value also includes any idle time between
    # the end of the search and the execution of this cell.
    minutes, seconds = divmod(time.time() - tic, 60)
    print(f"\nBayesian search taken: {int(minutes)}min {int(seconds)}sec")

In [None]:
# Print out results
if bayesian_search:
    # Continue with the best estimator found by the search.
    model = model_bayes_search.best_estimator_
    print(f"{model_name.title()} Bayesian search:")
    # Same report for each split: a header line followed by the metrics.
    for split_title, split_X, split_y in (
        ("TRAIN set:", X_train, y_train),
        ("VALIDATION set:", X_val, y_val),
    ):
        print(split_title)
        calc_metrics(model, X=split_X, y=split_y)

In [None]:
#%% Fit the model w\o parameters searching -------------------------------------
if not bayesian_search:
    # Train the model
    tic = time.time()
    model.fit(X_fit, y_fit)
    # Evaluate time spent. The parts are named minutes/seconds so the builtin
    # min() is not overwritten in the notebook namespace.
    minutes, seconds = divmod(time.time() - tic, 60)
    print(f"Time taken: {int(minutes)}min {int(seconds)}sec")
    print(f"{model}\n")

    # Print out results
    print(f"{model_name.title()} model:")
    print("TRAIN set:")
    calc_metrics(model, X=X_train, y=y_train)
    print("VALIDATION set:")
    calc_metrics(model, X=X_val, y=y_val)