In [1]:
import os
import pickle
import click

In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [3]:
import mlflow

## Q1. Install MLflow
What's the version that you have?

In [4]:
!mlflow --version

mlflow, version 2.13.0


## Q2. Download and preprocess the data
How many files were saved to OUTPUT_FOLDER?

In [5]:
!bash preprocess_data.sh

In [6]:
from pathlib import Path

In [16]:
# Relative path of the folder whose files are counted for Q2.
output_folder = Path("output")

In [18]:
# Count regular files only: the "*.*" glob would also match directories whose
# name contains a dot and would miss files that have no extension.
number_of_files = sum(1 for entry in output_folder.iterdir() if entry.is_file())
print(f"Total number of files: {number_of_files}")

Total number of files: 4


## Q3. Train a model with autolog
What is the value of the `min_samples_split` parameter?

In [19]:
# Point MLflow at a local SQLite database file and make "nyc-taxi-experiment"
# the active experiment for the runs started below.
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("nyc-taxi-experiment")

<Experiment: artifact_location='/mnt/storage/Courses/mlops-zoomcamp/02-experiment-tracking/mlruns/1', creation_time=1716599938104, experiment_id='1', last_update_time=1716599938104, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [20]:
def load_pickle(filename: str):
    """Deserialize and return the object stored in the pickle file ``filename``.

    The file is opened in binary mode and closed again before returning.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(filename, mode="rb") as source:
        payload = pickle.load(source)
    return payload


# @click.command()
# @click.option(
#     "--data_path",
#     default="./output",
#     help="Location where the processed NYC taxi trip data was saved"
# )
def run_train(data_path: str):
    """Train a random forest on the preprocessed data and track it with MLflow.

    Opens an MLflow run, tags it with the developer name, enables scikit-learn
    autologging, fits ``RandomForestRegressor(max_depth=10, random_state=0)`` on
    the training split, and logs the validation RMSE and the fitted model.

    Args:
        data_path: Directory containing ``train.pkl`` and ``val.pkl``; each file
            must unpickle to an ``(X, y)`` pair.
    """
    with mlflow.start_run():
        mlflow.set_tag("developer", "Manuel Rios")

        mlflow.sklearn.autolog()

        X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
        X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))

        rf = RandomForestRegressor(max_depth=10, random_state=0)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)

        # Take the square root explicitly instead of passing ``squared=False``:
        # that argument is deprecated in scikit-learn 1.4 and removed in 1.6.
        # For a single-target ``y`` the result is the same RMSE.
        rmse = mean_squared_error(y_val, y_pred) ** 0.5

        mlflow.log_metric("rmse", rmse)
        mlflow.sklearn.log_model(rf, artifact_path="models_mlflow")


In [21]:
# Train on the data under ./output; run_train opens one MLflow run for this call.
data_path = "./output"
run_train(data_path)



In [22]:
from mlflow.tracking import MlflowClient

# Same SQLite URI that was passed to mlflow.set_tracking_uri for the training run.
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"

# Client used below to query what was recorded in that store.
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

In [23]:
from mlflow.entities import ViewType

In [29]:
# Look the experiment up by name rather than hard-coding its ID ('1'): IDs
# depend on the order in which experiments were created in the tracking store.
nyc_taxi_experiment = client.get_experiment_by_name("nyc-taxi-experiment")
if nyc_taxi_experiment is None:
    raise RuntimeError("Experiment 'nyc-taxi-experiment' was not found in the tracking store")
runs = client.search_runs(experiment_ids=[nyc_taxi_experiment.experiment_id])


In [43]:
# Read the `min_samples_split` parameter recorded on the first run returned by the search.
first_run_params = runs[0].data.params
min_samples_split = first_run_params["min_samples_split"]
print(f"Random forest regressor min samples split: {min_samples_split}")


Random forest regressor min samples split: 2


## Q4. Launch the tracking server locally
In addition to `backend-store-uri`, what else do you need to pass to properly configure the server?


`mlflow server --backend-store-uri sqlite:///backend.db --default-artifact-root ./artifacts_local`

## Q5. Tune model hyperparameters


In [166]:
! bash hpo.sh

2024/05/25 21:13:00 INFO mlflow.tracking.fluent: Experiment with name 'random-forest-hyperopt' does not exist. Creating a new experiment.















100%|██████████| 15/15 [01:27<00:00,  5.83s/trial, best loss: 5.335419588556921]


In [167]:
# Rebind the URI and client to a tracking server on localhost:5000 (presumably
# the one launched in Q4 -- confirm it is running). From here on `client` no
# longer refers to the SQLite-backed store queried earlier in the notebook.
MLFLOW_TRACKING_URI = "http://127.0.0.1:5000"
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

In [168]:
# List the experiments known to the tracking server; the result is shown as the cell output.
client.search_experiments()

[<Experiment: artifact_location='/mnt/storage/Courses/mlops-zoomcamp/02-experiment-tracking/artifacts_local/1', creation_time=1716693180561, experiment_id='1', last_update_time=1716693180561, lifecycle_stage='active', name='random-forest-hyperopt', tags={}>,
 <Experiment: artifact_location='/mnt/storage/Courses/mlops-zoomcamp/02-experiment-tracking/artifacts_local/0', creation_time=1716693148308, experiment_id='0', last_update_time=1716693148308, lifecycle_stage='active', name='Default', tags={}>]

In [169]:
# Look the experiment up by name rather than hard-coding its ID ('1'): IDs
# depend on the order in which experiments were created on the tracking server.
hyperopt_experiment = client.get_experiment_by_name("random-forest-hyperopt")
if hyperopt_experiment is None:
    raise RuntimeError("Experiment 'random-forest-hyperopt' was not found on the tracking server")
runs = client.search_runs(experiment_ids=[hyperopt_experiment.experiment_id])

In [170]:
# Gather the `rmse` metric of every run, smallest first, and report the lowest one.
rmse_list = sorted(run.data.metrics['rmse'] for run in runs)
print(f"The minimum rmse is: {rmse_list[0]}")

The minimum rmse is: 5.335419588556921


## Q6. Promote the best model to the model registry
What is the test RMSE of the best model?


In [171]:
! bash register_model.sh

2024/05/25 21:14:39 INFO mlflow.tracking.fluent: Experiment with name 'random-forest-best-models' does not exist. Creating a new experiment.
Successfully registered model 'nyc-taxi-regressor'.
2024/05/25 21:15:49 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: nyc-taxi-regressor, version 1
Created version '1' of model 'nyc-taxi-regressor'.


In [201]:
# Follow the first registered model to the run behind the first entry of its
# `latest_versions`, then report the `test_rmse` metric logged on that run.
model = client.search_registered_models()
first_model_versions = model[0].latest_versions
run_id = first_model_versions[0].run_id
run = client.get_run(run_id)
logged_metrics = run.data.metrics
rmse_test = logged_metrics['test_rmse']
print(f"The best test rmse is: {rmse_test}")

The best test rmse is: 5.567408012462019
