Read experiment data

In [None]:
# set up paths

# Mount point used for Google Drive inside the Colab runtime.
mount_dir = '/content/drive'

# Project folders relative to the mount point (each one ends with '/').
drive_data_dir = '/MyDrive/Projects/home-co2-forecast/data/'
drive_mlflow_dir = '/MyDrive/Projects/home-co2-forecast/mlflow/'
drive_model_dir = '/MyDrive/Projects/home-co2-forecast/model/'

# Absolute folder paths: the mount point followed by the Drive-relative folder.
data_dir = f"{mount_dir}{drive_data_dir}"
mlflow_dir = f"{mount_dir}{drive_mlflow_dir}"
model_dir = f"{mount_dir}{drive_model_dir}"

# MLflow storage locations inside the mlflow folder.
BACKEND_DB = f"{mlflow_dir}mlflow.db"
ARTIFACT_ROOT = f"{mlflow_dir}artifacts/"

In [None]:
# mount data source

# Mount Google Drive at mount_dir so that the Drive-based project folders
# built from it can be accessed as regular filesystem paths.
from google.colab import drive
drive.mount(mount_dir)

In [None]:
# install libraries

!pip install -q mlflow

In [None]:
# import libraries

import os, mlflow
import pandas as pd
from mlflow.tracking import MlflowClient

In [None]:
# Artifact folder inside the MLflow directory, also published through the
# process environment.
# NOTE(review): confirm that the MLflow version in use actually reads the
# MLFLOW_ARTIFACT_ROOT environment variable.
ARTIFACT_ROOT = f"{mlflow_dir}artifacts/"
os.environ["MLFLOW_ARTIFACT_ROOT"] = ARTIFACT_ROOT

In [None]:
# point to the database

# SQLAlchemy-style SQLite URLs are "sqlite:///" followed by the file path.
# BACKEND_DB is an absolute path (it starts with '/'), so the resulting URI
# has exactly four slashes and no doubled leading slash in the path.
mlflow.set_tracking_uri("sqlite:///" + BACKEND_DB)

In [None]:
# point to the experiment

# Select the experiment by name and keep the returned experiment object;
# its experiment_id is used later to query the runs.
experiment = mlflow.set_experiment(experiment_name="model_parameters_search_weight")
experiment

In [None]:
# initiate client
# Created without arguments.
# NOTE(review): presumably this uses the tracking URI configured through
# mlflow.set_tracking_uri — confirm.
client = MlflowClient()


In [None]:
# Delete the experiment with id "2", but only while it is still active and is
# not the experiment this notebook analyses. The lookup makes the cell safe to
# re-run (Restart & Run All): a second run finds nothing to delete instead of
# raising, and the runs queried below are never removed by accident.
STALE_EXPERIMENT_ID = "2"
active_experiment_ids = {exp.experiment_id for exp in client.search_experiments()}
if (STALE_EXPERIMENT_ID in active_experiment_ids
        and STALE_EXPERIMENT_ID != experiment.experiment_id):
    client.delete_experiment(STALE_EXPERIMENT_ID)

In [None]:
# Query the experiments available through the configured tracking URI.
ex = mlflow.search_experiments()

In [None]:
# Display the experiments held in `ex`.
ex

In [None]:
# get all runs in the experiment

# The search is restricted to the id of the experiment selected with
# set_experiment. The result is used as a pandas DataFrame by the later cells
# (.loc, .nsmallest, column selection).
runs_df = mlflow.search_runs(experiment_ids=[experiment.experiment_id])

In [None]:
# Display the table of runs.
runs_df

In [None]:
# get the best row by r2

# idxmax gives the index label of the highest r2 value; .loc then pulls the
# row of that run.
r2_scores = runs_df["metrics.r2"]
best_row = runs_df.loc[r2_scores.idxmax()]

In [None]:
# Display the run with the highest r2.
best_row

In [None]:
# get 10 best rows with minimal RMSE

# Keep the 10 runs with the lowest RMSE, as the comment and variable names state.
best_10_rmse = runs_df.nsmallest(10, 'metrics.rmse')

# Reduce the table to the columns needed to compare the candidates.
best_10_rmse_df = best_10_rmse[[
    "run_id",
    "metrics.r2",
    "metrics.rmse",
    "params.model",
    "params.feature_set",
    "params.n_estimators",
    "params.max_depth",
    "metrics.model_size_mb",
]]
best_10_rmse_df

In [None]:
# get the single best row with minimal RMSE

best_1_rmse = runs_df.nsmallest(1, 'metrics.rmse')

# Keep only the identifying, metric and parameter columns of that run.
best_1_rmse_df = best_1_rmse.loc[:, [
    "run_id",
    "metrics.r2",
    "metrics.rmse",
    "params.model",
    "params.features",
    "params.feature_set",
    "params.n_estimators",
    "params.max_depth",
]]

# Scalar values taken from the first (and only) row.
best_run_id = best_1_rmse_df["run_id"].iat[0]
best_run_features = best_1_rmse_df["params.features"].iat[0]
best_1_rmse_df

In [None]:
# Split the comma-separated feature string of the best run into a list of
# feature names. Whitespace around each name is stripped and empty entries
# (for example from a trailing comma) are dropped, so "a, b," yields
# ["a", "b"] rather than ["a", " b", ""].
best_run_feature_list = [
    name.strip() for name in best_run_features.split(',') if name.strip()
]
best_run_feature_list

In [None]:
import mlflow.sklearn

# Load the model stored under the "model" artifact path of the best run.
model = mlflow.sklearn.load_model(f"runs:/{best_run_id}/model")

In [None]:
et = model  # or model.named_steps["estimator"]

print("Trees:", et.n_estimators)

# Depth of every fitted tree in the ensemble, collected once and reused for
# both statistics below.
tree_depths = [fitted.tree_.max_depth for fitted in et.estimators_]

print("Avg depth:", sum(tree_depths) / et.n_estimators)
print("Max depth:", max(tree_depths))

In [None]:
# bundle the loaded model together with the feature names of the best run
model_dict = dict(model=model, feature_names=best_run_feature_list)

In [None]:
# Display the model/feature-name bundle.
model_dict

In [None]:
import pickle

# Create the model folder if it does not exist yet; otherwise open() fails
# with FileNotFoundError on a fresh project folder.
os.makedirs(model_dir, exist_ok=True)

# Serialize the model/feature-name bundle with pickle.
with open(model_dir+'model-regressor.pkl', 'wb') as f:
    pickle.dump(model_dict, f)

In [None]:
import joblib

# Save the same model/feature-name bundle a second time, using joblib.
joblib_path = f"{model_dir}model-regressor-jl.pkl"
joblib.dump(model_dict, joblib_path)