In [1]:
import os
import pickle
import click
import mlflow

from mlflow.entities import ViewType
from mlflow.tracking import MlflowClient
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error

# Experiment that holds the hyper-parameter search runs. The tracking server lists
# it as "random_forest_hyperopt" (underscores), so the constant uses that spelling.
HPO_EXPERIMENT_NAME = "random_forest_hyperopt"
# Experiment that receives the re-trained candidate models.
EXPERIMENT_NAME = "random-forest-best-models"
# Hyper-parameters that are cast to int before being handed to RandomForestRegressor.
RF_PARAMS = ['max_depth', 'n_estimators', 'min_samples_split', 'min_samples_leaf', 'random_state']

# 0.0.0.0 is a "listen on every interface" bind address, not a portable destination
# for clients; connect to the local tracking server through loopback instead.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment(EXPERIMENT_NAME)
mlflow.sklearn.autolog()



In [2]:
def load_pickle(filename):
    """Return the object deserialized from the pickle file at ``filename``.

    The file is opened in binary mode and closed again before returning.
    """
    with open(filename, mode="rb") as source:
        payload = pickle.load(source)
    return payload


def train_and_log_model(data_path, params):
    """Re-train a random forest with ``params`` and log validation/test RMSE to MLflow.

    Args:
        data_path: Directory containing ``train.pkl``, ``val.pkl`` and ``test.pkl``;
            each file is unpickled into an ``(X, y)`` pair.
        params: Mapping of hyper-parameter names to values. Every key listed in
            ``RF_PARAMS`` is converted with ``int()``; any other key is forwarded
            to ``RandomForestRegressor`` unchanged. The mapping is not modified.

    Raises:
        KeyError: If ``params`` lacks one of the ``RF_PARAMS`` keys.
        ValueError: If one of those values cannot be converted with ``int()``.
    """
    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))
    X_test, y_test = load_pickle(os.path.join(data_path, "test.pkl"))

    # Work on a copy: the caller passes `run.data.params`, and casting in place
    # would silently rewrite the parameters held by that Run object.
    # The cast happens before the run is opened so that a bad parameter does not
    # leave an empty failed run behind.
    rf_params = dict(params)
    for param in RF_PARAMS:
        rf_params[param] = int(params[param])

    with mlflow.start_run():
        rf = RandomForestRegressor(**rf_params)
        rf.fit(X_train, y_train)

        # Evaluate model on the validation and test sets
        val_rmse = root_mean_squared_error(y_val, rf.predict(X_val))
        mlflow.log_metric("val_rmse", val_rmse)
        test_rmse = root_mean_squared_error(y_test, rf.predict(X_test))
        print(test_rmse)
        mlflow.log_metric("test_rmse", test_rmse)

In [7]:
# Directory holding train.pkl / val.pkl / test.pkl. Set the DATA_PATH environment
# variable to run the notebook elsewhere; the default keeps the original location.
data_path = os.environ.get(
    "DATA_PATH",
    '/home/ubuntu/mlops-zoomcamp/cohorts/2024/02-experiment-tracking/homework/output',
)

In [5]:
# Show the kernel's working directory. Plain Python replaces the bare `pwd`
# automagic, which is not valid outside IPython's single-line shortcut.
os.getcwd()

'/home/ubuntu/mlops-zoomcamp/cohorts/2024/02-experiment-tracking/homework'

In [8]:
# Number of best HPO runs to fetch and re-train.
top_n = 5

In [14]:
# Tracking client used by the cells below to look up experiments and runs.
client = MlflowClient()

In [17]:
# List the experiments known to the tracking server.
client.search_experiments()

[<Experiment: artifact_location='/home/ubuntu/mlops-zoomcamp/cohorts/2024/02-experiment-tracking/homework/artifacts/3', creation_time=1717093065290, experiment_id='3', last_update_time=1717093065290, lifecycle_stage='active', name='random-forest-best-models', tags={}>,
 <Experiment: artifact_location='/home/ubuntu/mlops-zoomcamp/cohorts/2024/02-experiment-tracking/homework/artifacts', creation_time=1717087654894, experiment_id='2', last_update_time=1717087654894, lifecycle_stage='active', name='random_forest_hyperopt', tags={}>,
 <Experiment: artifact_location='/home/ubuntu/mlops-zoomcamp/cohorts/2024/02-experiment-tracking/homework/mlruns/1', creation_time=1717065533975, experiment_id='1', last_update_time=1717065533975, lifecycle_stage='active', name='ny_green_taxi_2023', tags={}>,
 <Experiment: artifact_location='/home/ubuntu/mlops-zoomcamp/cohorts/2024/02-experiment-tracking/homework/mlruns/0', creation_time=1717065533965, experiment_id='0', last_update_time=1717065533965, lifecycl

In [20]:
# Inspect the hyper-parameter search experiment, looked up by its name.
client.get_experiment_by_name('random_forest_hyperopt')

<Experiment: artifact_location='/home/ubuntu/mlops-zoomcamp/cohorts/2024/02-experiment-tracking/homework/artifacts', creation_time=1717087654894, experiment_id='2', last_update_time=1717087654894, lifecycle_stage='active', name='random_forest_hyperopt', tags={}>

In [21]:
# Show only the id of the hyper-parameter search experiment.
client.get_experiment_by_name('random_forest_hyperopt').experiment_id

'2'

In [23]:
# Hyper-parameter search experiment whose best runs are re-trained below.
experiment = client.get_experiment_by_name('random_forest_hyperopt')
if experiment is None:
    # An unknown name yields None rather than an error; fail here with a clear
    # message instead of an AttributeError in a later cell.
    raise RuntimeError("Experiment 'random_forest_hyperopt' was not found on the tracking server")

In [26]:
# Fetch at most `top_n` active runs of the HPO experiment, ordered by their
# logged `rmse` metric in ascending order (best first).
runs = client.search_runs(
    experiment_ids=experiment.experiment_id,
    order_by=["metrics.rmse ASC"],
    max_results=top_n,
    run_view_type=ViewType.ACTIVE_ONLY,
)

In [27]:
# Re-train one model per selected HPO run, using that run's logged parameters.
for run in runs:
    run_id, hpo_params = run.info.run_id, run.data.params
    print(run_id, "top_n_runs")
    train_and_log_model(data_path, hpo_params)

fc982c167f2f4c15b3f3f474bb5d8fe0 top_n_runs




5.567408012462019
d4ccbde73720467784819875836a1bf1 top_n_runs




5.58531221803063
51b9bc505e6541e1b75ba0c54946c7f0 top_n_runs




5.5921322796760755
19e5326e282944d7bd70111ac6dda937 top_n_runs




5.589460017934324
8f3c82d15b69402897e351b97c3e399c top_n_runs




5.5941605655803635


In [29]:
# Experiment named by EXPERIMENT_NAME, i.e. the one activated with
# mlflow.set_experiment and therefore the target of the re-training runs.
experiment_2 = client.get_experiment_by_name(EXPERIMENT_NAME)

In [37]:
# Select the re-trained run with the lowest test RMSE. Only the top hit is needed,
# so the query is capped at one result instead of fetching every run.
best_runs = client.search_runs(
    experiment_ids=experiment_2.experiment_id,
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=1,
    order_by=["metrics.test_rmse ASC"],
)
if not best_runs:
    # Same exception type as indexing an empty result, but with an actionable message.
    raise IndexError(f"No active runs found in experiment {EXPERIMENT_NAME!r}; run the re-training cell first")
best_run = best_runs[0]

In [39]:
best_run_id = best_run.info.run_id
# URI of the "model" artifact directory logged under the best run.
model_uri = f"runs:/{best_run_id}/model"

In [40]:
# Register the best run's model in the MLflow Model Registry under a fixed name.
mlflow.register_model(model_uri=model_uri, name="nyc-green-taxi-2023-RF-regressor")

Successfully registered model 'nyc-green-taxi-2023-RF-regressor'.
2024/05/30 19:12:17 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: nyc-green-taxi-2023-RF-regressor, version 1
Created version '1' of model 'nyc-green-taxi-2023-RF-regressor'.


<ModelVersion: aliases=[], creation_timestamp=1717096337737, current_stage='None', description='', last_updated_timestamp=1717096337737, name='nyc-green-taxi-2023-RF-regressor', run_id='1bd9241a63f64c1f91c1a23025e1aa63', run_link='', source='/home/ubuntu/mlops-zoomcamp/cohorts/2024/02-experiment-tracking/homework/artifacts/3/1bd9241a63f64c1f91c1a23025e1aa63/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='1'>