Run many applicable models to select the best candidate combinations of feature set, model, and parameters.

In [None]:
# Path configuration for the notebook.

# where Google Drive gets mounted inside the runtime
mount_dir = '/content/drive'

# project folders, expressed relative to the Drive mount point
drive_data_dir = '/MyDrive/Projects/home-co2-forecast/data/'
drive_mlflow_dir = '/MyDrive/Projects/home-co2-forecast/mlflow/'
drive_model_dir = '/MyDrive/Projects/home-co2-forecast/model/'

# the same folders as absolute paths (mount point + Drive-relative folder)
data_dir = f"{mount_dir}{drive_data_dir}"
mlflow_dir = f"{mount_dir}{drive_mlflow_dir}"
model_dir = f"{mount_dir}{drive_model_dir}"

# scratch folder on the runtime's local disk (not on Drive)
temp_model_dir = '/content/temp_model/'

# MLflow storage: SQLite backend database file and artifact root folder
BACKEND_DB = f"{mlflow_dir}mlflow.db"
ARTIFACT_ROOT = f"{mlflow_dir}artifacts/"

In [None]:
# mount data source
# Mounts Google Drive at mount_dir so that the Drive-backed folders built from
# it (data_dir, mlflow_dir, model_dir) become reachable as ordinary paths.
# The google.colab module is provided by the Colab runtime, so this cell is
# expected to run only there.

from google.colab import drive
drive.mount(mount_dir)

In [None]:
# install libraries

!pip install -q mlflow

In [None]:
# import libraries

import os, mlflow
import pandas as pd
from mlflow.tracking import MlflowClient

In [None]:
# create artifact directories if needed
# makedirs also creates any missing parent folders; exist_ok=True turns the
# call into a no-op when ARTIFACT_ROOT is already present.

os.makedirs(ARTIFACT_ROOT, exist_ok=True)

In [None]:
# create temp directory for assessing model size, if needed
# Each fitted model is pickled into this folder so its on-disk size can be
# measured; exist_ok=True makes re-running the cell harmless.

os.makedirs(temp_model_dir, exist_ok=True)

In [None]:
# set mlflow tracking
# A SQLite database URL is "sqlite:///" followed by the file path. BACKEND_DB
# already starts with "/", so this yields the canonical four-slash absolute
# form ("sqlite:////content/...") rather than a five-slash URL whose database
# path would begin with "//".
mlflow.set_tracking_uri(f"sqlite:///{BACKEND_DB}")


In [None]:
def set_create_experiment(experiment_name, artifact_location):
    """
    Activate the experiment called ``experiment_name``, creating it first if needed.

    Args:
        experiment_name: Name of the MLflow experiment to activate.
        artifact_location: Artifact location used only when the experiment
            has to be created; an existing experiment keeps its own location.

    Returns:
        The MLflow ``Experiment`` entity, both when the experiment already
        existed and when it was just created.
    """
    experiment = mlflow.get_experiment_by_name(experiment_name)
    print(experiment)
    if experiment is None:
        # create_experiment returns only the new experiment's ID, so fetch the
        # entity to keep the return type the same on both paths.
        experiment_id = mlflow.create_experiment(
            experiment_name, artifact_location=artifact_location
        )
        experiment = mlflow.get_experiment(experiment_id)
        print(experiment)
    mlflow.set_experiment(experiment_name)
    return experiment



In [None]:
# set or create experiment
# Makes "model_search_kn" the active experiment so the runs started later in
# this notebook are recorded under it; ARTIFACT_ROOT is passed as the artifact
# location to use if the experiment has to be created.
experiment = set_create_experiment("model_search_kn", artifact_location=ARTIFACT_ROOT)

In [None]:
# List the experiments known to the configured tracking backend. The bare `ex`
# on the last line makes the notebook display the result as the cell output.
ex = mlflow.search_experiments()
ex

In [None]:
# Load the pickled train/test splits from the data directory.
# Files are read in the order X_train, X_test, y_train, y_test.

X_train, X_test, y_train, y_test = (
    pd.read_pickle(f"{data_dir}{split_name}.pkl")
    for split_name in ("X_train", "X_test", "y_train", "y_test")
)


In [None]:
# Sanity check: show the length of X_train (its row count, assuming it is a
# DataFrame) as the cell output.
len(X_train)

In [None]:
# Feature configurations.
#
# Maps a feature-set name to the list of column names that are selected from
# X_train / X_test for that configuration. Only uncommented entries are
# evaluated; the commented-out entries are alternative sets kept for reference.

feature_sets = {
    # "all_features": X_train.columns.tolist(),
    # "subset_1": ['roll10_mean', 'roll10_std',
    #              'roll20_mean', 'roll20_std',
    #              'roll50_mean', 'roll50_std'],
    # "subset_2": ['roll10_mean', 'roll10_std',
    #              'roll20_mean', 'roll20_std',
    #              'roll50_mean', 'roll50_std',
    #              'minute', 'hour', 'dayofweek',
    #              'is_weekend', 'is_holiday', 'day_off'],
    # "subset_3": ['lag_10', 'lag_20', 'lag_30', 'lag_60'],
    "subset_4": [
        # roll{10,20,50}_{mean,std}, ordered by window and then mean before std
        *(
            f"roll{window}_{stat}"
            for window in (10, 20, 50)
            for stat in ("mean", "std")
        ),
        # calendar columns
        "minute",
        "hour",
        "dayofweek",
        "is_weekend",
        "is_holiday",
        "day_off",
        # hr_sin / hr_cos columns
        "hr_sin",
        "hr_cos",
    ],
    # "subset_5": ['roll10_mean', 'roll10_std',
    #              'roll20_mean', 'roll20_std',
    #              'roll50_mean', 'roll50_std',
    #              'dayofweek', 'is_weekend', 'is_holiday', 'day_off',
    #              'hr_sin', 'hr_cos'],
}

In [None]:
# import models

from sklearn.linear_model import (LinearRegression,
                                  ElasticNet,
                                  Ridge,
                                  Lasso,
                                  HuberRegressor)
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (RandomForestRegressor,
                              GradientBoostingRegressor,
                              ExtraTreesRegressor)
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

from sklearn import metrics

In [None]:
# Model search space.
#
# Each entry is a (display name, estimator class, list of keyword-argument
# dicts) tuple; one estimator is built and evaluated per keyword-argument dict.
# Commented-out entries are configurations that are currently disabled.

model_configs = [
    # ("LinearRegression", LinearRegression, [{"fit_intercept": True}]),
    # ("DecisionTreeRegressor", DecisionTreeRegressor, [{"max_depth": 20}]),
    # ("ElasticNet", ElasticNet, [
    #     {"alpha": 0.1, "l1_ratio": 0.5},
    #     {"alpha": 0.5, "l1_ratio": 0.5},
    #     {"alpha": 1.0, "l1_ratio": 0.3},
    # ]),
    # ("Ridge", Ridge, [
    #     {"alpha": 0.1},
    #     {"alpha": 0.5},
    #     {"alpha": 1.0},
    #     {"alpha": 10.0},
    # ]),
    # ("Lasso", Lasso, [
    #     {"alpha": 0.1},
    #     {"alpha": 0.5},
    #     {"alpha": 1.0},
    # ]),
    # ("HuberRegressor", HuberRegressor, [
    #     {"epsilon": 1.35, "alpha": 0.0001},  # default epsilon
    #     {"epsilon": 1.8, "alpha": 0.001},
    # ]),
    # ("RandomForestRegressor", RandomForestRegressor, [
    #     {"n_estimators": 100, "max_depth": 5, "random_state": 42},
    #     {"n_estimators": 200, "max_depth": 10, "random_state": 42},
    # ]),
    # ("GradientBoostingRegressor", GradientBoostingRegressor, [
    #     {"n_estimators": 100, "max_depth": 3, "learning_rate": 0.1, "random_state": 42},
    #     {"n_estimators": 200, "max_depth": 4, "learning_rate": 0.05, "random_state": 42},
    # ]),
    # ("ExtraTreesRegressor", ExtraTreesRegressor, [
    #     # {"n_estimators": 100, "max_depth": 5, "random_state": 42},
    #     {"n_estimators": 200, "max_depth": None, "random_state": 42},
    #     {"n_estimators": 200, "max_depth": 8, "random_state": 42},
    #     {"n_estimators": 200, "max_depth": 12, "random_state": 42},
    #     {"n_estimators": 200, "max_depth": 16, "random_state": 42},
    #     {"n_estimators": 200, "max_depth": 20, "random_state": 42},
    # ]),
    # ("SVR", SVR, [
    #     {"C": 1.0, "kernel": "rbf", "gamma": "scale"},
    #     # {"C": 10.0, "kernel": "linear"},  # too long
    # ]),  # unfeasible computation time
    (
        "KNeighborsRegressor",
        KNeighborsRegressor,
        # n_neighbors 5..14 with uniform weights first, then the same range
        # with distance weights
        [
            {"n_neighbors": neighbor_count, "weights": weighting}
            for weighting in ("uniform", "distance")
            for neighbor_count in range(5, 15)
        ],
    ),
]


In [None]:
# run the experiment and log the results into mlflow
#
# For every feature set x model x parameter combination this cell:
#   1. starts an MLflow run named "<model>_<params>_<feature set>",
#   2. logs the feature set, the model name and the model parameters,
#   3. fits the model on the training data and predicts on the test data,
#   4. logs MSE, RMSE and R^2 computed on the test data,
#   5. pickles the model to a temp file to measure and log its size,
#   6. logs the fitted model itself as a run artifact,
#   7. prints the time spent since the previous run finished.

import pickle

# The size-measurement pickle is overwritten on every run, so the path is fixed.
temp_model_path = temp_model_dir + 'model-curr.pkl'

# Timezone-aware timestamps: converting to local time then never depends on
# whether a naive "now" means UTC or the host's local time.
prev_time = pd.Timestamp.now(tz='UTC')

for fs_name, fs_cols in feature_sets.items():
    Xtr = X_train[fs_cols]
    Xte = X_test[fs_cols]

    for model_name, ModelClass, param_configs in model_configs:
        print(model_name)
        print(ModelClass)
        print(param_configs)
        for params in param_configs:
            run_name = f"{model_name}_{params}_{fs_name}"
            print(params)
            print(run_name)
            with mlflow.start_run(run_name=run_name):
                mlflow.log_param("feature_set", fs_name)
                mlflow.log_param("features", ",".join(fs_cols))
                mlflow.log_param("model", model_name)
                mlflow.log_params(params)

                model = ModelClass(**params)
                model.fit(Xtr, y_train)

                y_pred = model.predict(Xte)
                mse = metrics.mean_squared_error(y_test, y_pred)
                # RMSE is the square root of MSE. Earlier versions of this
                # cell logged the un-rooted MSE under the name "rmse", so MSE
                # is logged as well to keep old and new runs comparable.
                rmse = mse ** 0.5
                r2 = metrics.r2_score(y_test, y_pred)

                mlflow.log_metric("mse", mse)
                mlflow.log_metric("rmse", rmse)
                mlflow.log_metric("r2", r2)

                # Assess size of the model from its pickled size on disk
                with open(temp_model_path, 'wb') as f:
                    pickle.dump(model, f)

                print(temp_model_path)

                file_size = os.path.getsize(temp_model_path)
                print(f"File size: {file_size} bytes")

                mlflow.log_metric("model_size_bytes", file_size)
                mlflow.log_metric("model_size_mb", file_size / (1024 ** 2))

                mlflow.sklearn.log_model(model, name="model")

                current_time = pd.Timestamp.now(tz='UTC')
                current_time_local = current_time.tz_convert('America/Toronto')
                # total_seconds() covers the whole interval; the .seconds
                # attribute drops whole days and wraps around after 24 hours.
                time_diff = int((current_time - prev_time).total_seconds())
                print(f"Seconds elapsed: {time_diff} \n Local time: {current_time_local}\n\n")
                prev_time = current_time
