In [2]:
!mlflow --version

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (
mlflow, version 2.16.2


Q1: MLflow version: 2.16.2

In [3]:
import os
import pickle
import pandas as pd

from sklearn.feature_extraction import DictVectorizer


def dump_pickle(obj, filename: str):
    """Pickle ``obj`` into ``filename``, opened for binary writing.

    An existing file at ``filename`` is overwritten. The return value is
    whatever the pickler's ``dump`` call returns (``None``).
    """
    with open(filename, "wb") as f_out:
        outcome = pickle.Pickler(f_out).dump(obj)
    return outcome


def read_dataframe(filename: str):
    """Load a trip parquet file and prepare it for feature extraction.

    Steps:
      * add a ``duration`` column: drop-off minus pick-up time, in minutes;
      * keep only trips whose duration is between 1 and 60 minutes inclusive;
      * cast ``PULocationID`` / ``DOLocationID`` to strings so they can be
        used as categorical values.

    Args:
        filename: path of the parquet file; it must contain
            ``lpep_pickup_datetime``, ``lpep_dropoff_datetime``,
            ``PULocationID`` and ``DOLocationID`` columns.

    Returns:
        A new DataFrame with the filtered rows and the added ``duration``
        column.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion (no per-row Python lambda).
    trip_time = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = trip_time.dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the original frame, so the
    # column assignment below does not write into a slice of ``df``.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df


def preprocess(df: pd.DataFrame, dv: DictVectorizer, fit_dv: bool = False):
    """Turn a trips DataFrame into a feature matrix via ``dv``.

    Features are the combined pick-up/drop-off pair ``PU_DO``
    (``PULocationID + '_' + DOLocationID``) and the numeric ``trip_distance``.
    The rows are converted to a list of dicts and passed to ``dv``.

    The input ``df`` is not modified: the ``PU_DO`` feature is built on a
    derived frame instead of being written back into the caller's DataFrame.

    Args:
        df: frame with string ``PULocationID`` / ``DOLocationID`` columns and
            a ``trip_distance`` column.
        dv: vectorizer exposing ``fit_transform`` and ``transform``.
        fit_dv: when True, fit ``dv`` on this data (``fit_transform``);
            otherwise only ``transform`` with the already fitted ``dv``.

    Returns:
        Tuple ``(X, dv)`` with the vectorizer output and the same ``dv``.
    """
    categorical = ['PU_DO']
    numerical = ['trip_distance']

    # df[numerical] yields a new frame and assign() returns another one,
    # so the caller's DataFrame never gains the PU_DO column.
    features = df[numerical].assign(
        PU_DO=df['PULocationID'] + '_' + df['DOLocationID']
    )
    dicts = features[categorical + numerical].to_dict(orient='records')

    X = dv.fit_transform(dicts) if fit_dv else dv.transform(dicts)
    return X, dv


def run_data_prep(raw_data_path: str, dest_path: str, dataset: str = "green"):
    """Build and pickle the train / validation / test sets.

    Reads the 2023-01 (train), 2023-02 (validation) and 2023-03 (test)
    ``{dataset}_tripdata_*.parquet`` files from ``raw_data_path``, fits a
    ``DictVectorizer`` on the training month only, and writes ``dv.pkl`` plus
    one ``(X, y)`` tuple per split (``train.pkl``, ``val.pkl``, ``test.pkl``)
    into ``dest_path``, creating that folder if needed.
    """
    parquet_names = (
        f"{dataset}_tripdata_2023-01.parquet",
        f"{dataset}_tripdata_2023-02.parquet",
        f"{dataset}_tripdata_2023-03.parquet",
    )
    # Generator unpacking keeps the train -> val -> test read order.
    df_train, df_val, df_test = (
        read_dataframe(os.path.join(raw_data_path, name))
        for name in parquet_names
    )

    # The regression target is the trip duration column
    y_train, y_val, y_test = (
        frame['duration'].values for frame in (df_train, df_val, df_test)
    )

    # Only the training split fits the vectorizer; the others reuse it
    X_train, dv = preprocess(df_train, DictVectorizer(), fit_dv=True)
    X_val, _ = preprocess(df_val, dv, fit_dv=False)
    X_test, _ = preprocess(df_test, dv, fit_dv=False)

    os.makedirs(dest_path, exist_ok=True)

    # Vectorizer first, then one (features, target) tuple per split
    artifacts = (
        ("dv.pkl", dv),
        ("train.pkl", (X_train, y_train)),
        ("val.pkl", (X_val, y_val)),
        ("test.pkl", (X_test, y_test)),
    )
    for file_name, payload in artifacts:
        dump_pickle(payload, os.path.join(dest_path, file_name))

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [6]:
# Preprocess the "green" parquet files found under data/ and write the pickles to ./output.
run_data_prep('data/', './output', 'green')

In [7]:
!ls ./output

dv.pkl	test.pkl  train.pkl  val.pkl


Q2: 4

In [21]:
import mlflow
import os
import pickle

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


# Record runs in the local SQLite file mlflow.db, under the
# "mlflow-homework-2" experiment.
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("mlflow-homework-2")


def load_pickle(filename: str):
    """Return the object unpickled from ``filename`` (opened in binary mode).

    Only use this on files you trust: unpickling can execute arbitrary code.
    """
    with open(filename, "rb") as f_in:
        loaded = pickle.load(f_in)
    return loaded


def run_train(data_path: str = './output'):
    """Train a random forest on the prepared data inside one MLflow run.

    Enables scikit-learn autologging, loads ``train.pkl`` and ``val.pkl``
    from ``data_path``, fits ``RandomForestRegressor(max_depth=10,
    random_state=0)`` and logs the validation RMSE as the ``rmse`` metric.

    Args:
        data_path: folder containing the pickled ``(X, y)`` tuples.
    """
    mlflow.sklearn.autolog()
    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))

    with mlflow.start_run():
        rf = RandomForestRegressor(max_depth=10, random_state=0)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)

        # sqrt(MSE) instead of mean_squared_error(..., squared=False): the
        # `squared` keyword was deprecated in scikit-learn 1.4 and removed in
        # 1.6, while this form works on every version (single-output target).
        rmse = mean_squared_error(y_val, y_pred) ** 0.5
        # The value used to be computed and then dropped; record it on the run.
        mlflow.log_metric("rmse", rmse)

In [22]:
# Train once with the default data folder (./output); the run is recorded in MLflow.
run_train()



In [27]:
from mlflow import MlflowClient


# Look up the "mlflow-homework-2" experiment, list its runs, and show the
# min_samples_split parameter recorded on the first run of the result.
client = MlflowClient()
experiment = client.get_experiment_by_name("mlflow-homework-2")

entity = mlflow.search_runs(
    experiment_ids=[experiment.experiment_id],
)
run_id = entity["run_id"][0]
client.get_run(run_id).data.params['min_samples_split']

'2'

Q3: 2

Q4: '--default-artifact-root ./mlruns'

In [30]:
import os
import pickle
import mlflow
import numpy as np
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.pyll import scope
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


# Record runs in the local SQLite file mlflow.db, under the
# "mlflow-homework-2-q5" experiment used for the hyperparameter search.
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("mlflow-homework-2-q5")


def load_pickle(filename: str):
    """Read ``filename`` as bytes and return the object unpickled from it.

    Only use this on files you trust: unpickling can execute arbitrary code.
    """
    with open(filename, "rb") as f_in:
        raw_bytes = f_in.read()
    return pickle.loads(raw_bytes)


def run_optimization(data_path: str = './output', num_trials: int = 15):
    """Tune random-forest hyperparameters with hyperopt, one MLflow run per trial.

    Loads ``train.pkl`` and ``val.pkl`` from ``data_path`` and runs a TPE
    search over ``max_depth``, ``n_estimators``, ``min_samples_split`` and
    ``min_samples_leaf`` (``random_state`` fixed to 42). Every trial logs its
    parameters and the validation RMSE (metric ``rmse``) to MLflow.

    Args:
        data_path: folder containing the pickled ``(X, y)`` tuples.
        num_trials: number of hyperopt evaluations (``max_evals``).
    """
    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))

    def objective(params):
        """Fit one forest with ``params`` and report its validation RMSE as the loss."""
        with mlflow.start_run():
            mlflow.log_params(params)
            rf = RandomForestRegressor(**params)
            rf.fit(X_train, y_train)
            y_pred = rf.predict(X_val)
            # sqrt(MSE) instead of squared=False: the `squared` keyword was
            # deprecated in scikit-learn 1.4 and removed in 1.6.
            rmse = mean_squared_error(y_val, y_pred) ** 0.5
            mlflow.log_metric("rmse", rmse)

        return {'loss': rmse, 'status': STATUS_OK}

    # quniform samples floats on a step-1 grid; scope.int casts them to int
    # before they reach RandomForestRegressor.
    search_space = {
        'max_depth': scope.int(hp.quniform('max_depth', 1, 20, 1)),
        'n_estimators': scope.int(hp.quniform('n_estimators', 10, 50, 1)),
        'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 10, 1)),
        'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 4, 1)),
        'random_state': 42
    }

    # Seeded generator so the sequence of sampled configurations is repeatable.
    rstate = np.random.default_rng(42)
    fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=num_trials,
        trials=Trials(),
        rstate=rstate
    )

2024/10/12 07:00:31 INFO mlflow.tracking.fluent: Experiment with name 'mlflow-homework-2-q5' does not exist. Creating a new experiment.


In [31]:
# Run the hyperparameter search with its defaults (./output data, 15 trials).
run_optimization()

  0%|                                                            | 0/15 [00:00<?, ?trial/s, best loss=?]





  7%|██▎                                | 1/15 [00:24<05:49, 24.97s/trial, best loss: 5.370086069268862]






 13%|████▋                              | 2/15 [00:32<03:10, 14.65s/trial, best loss: 5.370086069268862]






 20%|███████                            | 3/15 [00:39<02:14, 11.25s/trial, best loss: 5.370086069268862]






 27%|█████████▎                         | 4/15 [00:55<02:23, 13.09s/trial, best loss: 5.357490752366866]






 33%|███████████▋                       | 5/15 [01:07<02:05, 12.60s/trial, best loss: 5.357490752366866]






 40%|██████████████                     | 6/15 [01:31<02:28, 16.49s/trial, best loss: 5.354695072530291]






 47%|████████████████▎                  | 7/15 [01:54<02:28, 18.55s/trial, best loss: 5.354695072530291]






 53%|██████████████████▋                | 8/15 [02:02<01:47, 15.37s/trial, best loss: 5.354695072530291]






 60%|█████████████████████              | 9/15 [02:20<01:37, 16.23s/trial, best loss: 5.354695072530291]






 67%|██████████████████████▋           | 10/15 [02:36<01:20, 16.15s/trial, best loss: 5.354695072530291]






 73%|████████████████████████▉         | 11/15 [02:49<01:00, 15.22s/trial, best loss: 5.335419588556921]






 80%|███████████████████████████▏      | 12/15 [03:01<00:42, 14.17s/trial, best loss: 5.335419588556921]






 87%|█████████████████████████████▍    | 13/15 [03:09<00:24, 12.25s/trial, best loss: 5.335419588556921]






 93%|███████████████████████████████▋  | 14/15 [03:21<00:12, 12.12s/trial, best loss: 5.335419588556921]






100%|██████████████████████████████████| 15/15 [03:37<00:00, 14.50s/trial, best loss: 5.335419588556921]





In [61]:
from mlflow.entities import ViewType

# Find the hyperparameter-search run with the lowest logged rmse metric.
client = MlflowClient()
experiment = client.get_experiment_by_name("mlflow-homework-2-q5")

# experiment_ids is documented as a list of IDs, so wrap the single ID
# (same form as the earlier search_runs call in this notebook).
entity = mlflow.search_runs(
    experiment_ids=[experiment.experiment_id],
    run_view_type=ViewType.ACTIVE_ONLY,
    order_by=["metrics.rmse ASC"]
)
# Rows are sorted by ascending rmse, so the first one is the best run.
run = client.get_run(entity.run_id[0])
run.data.metrics['rmse']

5.335419588556921

Q5: 5.335

In [72]:
import os
import pickle
import mlflow

from mlflow.entities import ViewType
from mlflow.tracking import MlflowClient
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Experiment whose hyperparameter-search runs are read back by run_register_model.
HPO_EXPERIMENT_NAME = "mlflow-homework-2-q5"
# Experiment that receives the retrained candidate models.
EXPERIMENT_NAME = "random-forest-best-models"
# Parameters copied from each search run; train_and_log_model casts every one to int.
RF_PARAMS = ['max_depth', 'n_estimators', 'min_samples_split', 'min_samples_leaf', 'random_state']

# Record runs in the local SQLite file mlflow.db under EXPERIMENT_NAME, with
# scikit-learn autologging switched on for everything that follows.
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment(EXPERIMENT_NAME)
mlflow.sklearn.autolog()


def load_pickle(filename):
    """Open ``filename`` in binary mode and return the object unpickled from it.

    Only use this on files you trust: unpickling can execute arbitrary code.
    """
    with open(filename, "rb") as f_in:
        reader = pickle.Unpickler(f_in)
        return reader.load()


def train_and_log_model(data_path, params):
    """Retrain a random forest with ``params`` and log val/test RMSE to MLflow.

    Loads ``train.pkl``, ``val.pkl`` and ``test.pkl`` from ``data_path``,
    fits a ``RandomForestRegressor`` inside a new MLflow run and logs the
    ``val_rmse`` and ``test_rmse`` metrics.

    Args:
        data_path: folder containing the pickled ``(X, y)`` tuples.
        params: mapping holding at least the keys in ``RF_PARAMS``; each value
            is converted with ``int()`` before being passed to the model.
    """
    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))
    X_test, y_test = load_pickle(os.path.join(data_path, "test.pkl"))

    with mlflow.start_run():
        # Keep only the forest hyperparameters and cast each one to int.
        new_params = {param: int(params[param]) for param in RF_PARAMS}

        rf = RandomForestRegressor(**new_params)
        rf.fit(X_train, y_train)

        # sqrt(MSE) instead of squared=False: the `squared` keyword was
        # deprecated in scikit-learn 1.4 and removed in 1.6.
        val_rmse = mean_squared_error(y_val, rf.predict(X_val)) ** 0.5
        mlflow.log_metric("val_rmse", val_rmse)
        test_rmse = mean_squared_error(y_test, rf.predict(X_test)) ** 0.5
        mlflow.log_metric("test_rmse", test_rmse)


def run_register_model(data_path: str = './output', top_n: int = 5):
    """Retrain the best search runs and register the one with the lowest test RMSE.

    1. Fetch the ``top_n`` active runs of ``HPO_EXPERIMENT_NAME`` with the
       lowest ``rmse`` metric.
    2. Retrain each of them with ``train_and_log_model``; the new runs are
       logged under the currently active experiment.
    3. Pick the run of ``EXPERIMENT_NAME`` with the lowest ``test_rmse`` and
       register its ``model`` artifact as ``"best-model"``.

    Step 3 searches the whole experiment, so runs from earlier invocations
    take part in the comparison as well.

    Args:
        data_path: folder containing the pickled datasets.
        top_n: number of search runs to retrain.
    """
    client = MlflowClient()

    # Top-N candidates from the hyperparameter search, best rmse first.
    hpo_experiment = client.get_experiment_by_name(HPO_EXPERIMENT_NAME)
    candidate_runs = client.search_runs(
        experiment_ids=[hpo_experiment.experiment_id],
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=top_n,
        order_by=["metrics.rmse ASC"]
    )
    for candidate in candidate_runs:
        train_and_log_model(data_path=data_path, params=candidate.data.params)

    # Only the single best run is needed, so request one result instead of
    # fetching top_n rows and discarding all but the first.
    best_experiment = client.get_experiment_by_name(EXPERIMENT_NAME)
    best_run = client.search_runs(
        experiment_ids=[best_experiment.experiment_id],
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=1,
        order_by=["metrics.test_rmse ASC"]
    )[0]

    # NOTE(review): assumes the run has a "model" artifact (presumably written
    # by sklearn autologging) - confirm if autologging is ever disabled.
    model_uri = f"runs:/{best_run.info.run_id}/model"
    mlflow.register_model(model_uri, name="best-model")



In [73]:
# Retrain the top search runs and register the best one, using the defaults (./output, top 5).
run_register_model()

Successfully registered model 'best-model'.
Created version '1' of model 'best-model'.


In [80]:
# Resolve version '1' of the registered "best-model" to the run it came from
# and show that run's test_rmse metric.
client = MlflowClient()

mv = client.get_model_version(name='best-model', version='1')
run = client.get_run(run_id=mv.run_id)
run.data.metrics['test_rmse']

5.567408012462019

Q6: 5.567