In [1]:
!mlflow --version

mlflow, version 2.13.0


In [24]:
import os
import pickle
import click
import mlflow
import numpy as np
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.pyll import scope
from mlflow.entities import ViewType
from mlflow.tracking import MlflowClient
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [10]:
def load_pickle(filename: str):
    with open(filename, "rb") as f_in:
        return pickle.load(f_in)

In [15]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")

In [18]:
mlflow.set_experiment("test-experiment-1")
data_path = './output'

mlflow.autolog()

with mlflow.start_run():
    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))

    rf = RandomForestRegressor(max_depth=10, random_state=0)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_val)

    rmse = mean_squared_error(y_val, y_pred, squared=False)

2024/05/29 21:39:24 INFO mlflow.tracking.fluent: Experiment with name 'test-experiment-1' does not exist. Creating a new experiment.
2024/05/29 21:39:24 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


In [19]:
mlflow.search_experiments()

[<Experiment: artifact_location='/workspaces/mlops-zoomcamp/02-mlflow/artifacts/1', creation_time=1717018764105, experiment_id='1', last_update_time=1717018764105, lifecycle_stage='active', name='test-experiment-1', tags={}>,
 <Experiment: artifact_location='/workspaces/mlops-zoomcamp/02-mlflow/artifacts/0', creation_time=1717018638717, experiment_id='0', last_update_time=1717018638717, lifecycle_stage='active', name='Default', tags={}>]

In [26]:
mlflow.set_experiment("random-forest-hyperopt")
data_path = './output'
num_trials = 15


X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))

def objective(params):
    with mlflow.start_run():
        mlflow.log_params(params)
        rf = RandomForestRegressor(**params)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)
        rmse = mean_squared_error(y_val, y_pred, squared=False)
        mlflow.log_metric("rmse", rmse)
    return {'loss': rmse, 'status': STATUS_OK}

search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 1, 20, 1)),
    'n_estimators': scope.int(hp.quniform('n_estimators', 10, 50, 1)),
    'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 10, 1)),
    'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 4, 1)),
    'random_state': 42
}

rstate = np.random.default_rng(42)  # for reproducible results
fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=num_trials,
    trials=Trials(),
    rstate=rstate
)

  0%|                                    | 0/15 [00:00<?, ?trial/s, best loss=?]




  7%|▋          | 1/15 [00:10<02:28, 10.60s/trial, best loss: 5.370086069268862]




 13%|█▍         | 2/15 [00:13<01:19,  6.13s/trial, best loss: 5.370086069268862]




 20%|██▏        | 3/15 [00:16<00:57,  4.81s/trial, best loss: 5.370086069268862]




 27%|██▉        | 4/15 [00:24<01:06,  6.07s/trial, best loss: 5.357490752366866]




 33%|███▋       | 5/15 [00:30<00:58,  5.82s/trial, best loss: 5.357490752366866]




 40%|████▍      | 6/15 [00:42<01:10,  7.89s/trial, best loss: 5.354695072530291]




 47%|█████▏     | 7/15 [00:53<01:12,  9.08s/trial, best loss: 5.354695072530291]




 53%|█████▊     | 8/15 [00:57<00:51,  7.37s/trial, best loss: 5.354695072530291]




 60%|██████▌    | 9/15 [01:05<00:45,  7.65s/trial, best loss: 5.354695072530291]




 67%|██████▋   | 10/15 [01:12<00:37,  7.56s/trial, best loss: 5.354695072530291]




 73%|███████▎  | 11/15 [01:19<00:28,  7.09s/trial, best loss: 5.335419588556921]




 80%|████████  | 12/15 [01:24<00:19,  6.63s/trial, best loss: 5.335419588556921]




 87%|████████▋ | 13/15 [01:28<00:11,  5.72s/trial, best loss: 5.335419588556921]




 93%|█████████▎| 14/15 [01:34<00:05,  5.76s/trial, best loss: 5.335419588556921]




100%|██████████| 15/15 [01:41<00:00,  6.79s/trial, best loss: 5.335419588556921]


{'max_depth': 19.0,
 'min_samples_leaf': 2.0,
 'min_samples_split': 2.0,
 'n_estimators': 11.0}

In [30]:
HPO_EXPERIMENT_NAME = "random-forest-hyperopt"
EXPERIMENT_NAME = "random-forest-best-models"
RF_PARAMS = ['max_depth', 'n_estimators', 'min_samples_split', 'min_samples_leaf', 'random_state']

mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment(EXPERIMENT_NAME)
mlflow.sklearn.autolog()
top_n = 5

In [28]:
def train_and_log_model(data_path, params):
    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))
    X_test, y_test = load_pickle(os.path.join(data_path, "test.pkl"))

    with mlflow.start_run():
        new_params = {}
        for param in RF_PARAMS:
            new_params[param] = int(params[param])

        rf = RandomForestRegressor(**new_params)
        rf.fit(X_train, y_train)

        # Evaluate model on the validation and test sets
        val_rmse = mean_squared_error(y_val, rf.predict(X_val), squared=False)
        mlflow.log_metric("val_rmse", val_rmse)
        test_rmse = mean_squared_error(y_test, rf.predict(X_test), squared=False)
        mlflow.log_metric("test_rmse", test_rmse)

In [31]:
client = MlflowClient()

# Retrieve the top_n model runs and log the models
experiment = client.get_experiment_by_name(HPO_EXPERIMENT_NAME)
runs = client.search_runs(
    experiment_ids=experiment.experiment_id,
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=top_n,
    order_by=["metrics.rmse ASC"]
)
for run in runs:
    train_and_log_model(data_path=data_path, params=run.data.params)

# Select the model with the lowest test RMSE
experiment = client.get_experiment_by_name(EXPERIMENT_NAME)



In [55]:
best_run = client.search_runs(
    experiment_ids=experiment.experiment_id,
    run_view_type=ViewType.ACTIVE_ONLY,
    order_by=["metrics.test_rmse ASC"]
)[0]

run_id = best_run.info.run_id
# Register the best model

mlflow.register_model(
    model_uri=f"runs:/{run_id}/models",
    name='lyrical-mule-285'
)

Successfully registered model 'lyrical-mule-285'.
2024/05/29 22:24:10 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: lyrical-mule-285, version 1
Created version '1' of model 'lyrical-mule-285'.


<ModelVersion: aliases=[], creation_timestamp=1717021450410, current_stage='None', description='', last_updated_timestamp=1717021450410, name='lyrical-mule-285', run_id='cb3ad6b8996f410f9205973e7349c269', run_link='', source='/workspaces/mlops-zoomcamp/02-mlflow/artifacts/3/cb3ad6b8996f410f9205973e7349c269/artifacts/models', status='READY', status_message='', tags={}, user_id='', version='1'>