## Package imports

In [1]:
import os

import mlflow

## Q1. Install MLflow

In [2]:
# Print the installed MLflow version; its output is the answer to Q1.
!mlflow --version

mlflow, version 1.26.1


**Q1**: 1.26.1

## Q2. Download and preprocess the data

In [3]:
# Show the working directory; the relative paths used in the cells below resolve against it.
!pwd

/home/ubuntu/Projects/mlops/week_2


In [4]:
# Create the input_data and models folders; -p makes this a no-op when they already exist.
!mkdir -p input_data
!mkdir -p models

In [5]:
!wget https://s3.amazonaws.com/nyc-tlc/trip+data/green_tripdata_2021-01.parquet -O input_data/green_tripdata_2021-01.parquet
!wget https://s3.amazonaws.com/nyc-tlc/trip+data/green_tripdata_2021-02.parquet -O input_data/green_tripdata_2021-02.parquet
!wget https://s3.amazonaws.com/nyc-tlc/trip+data/green_tripdata_2021-03.parquet -O input_data/green_tripdata_2021-03.parquet

--2022-05-30 00:16:01--  https://s3.amazonaws.com/nyc-tlc/trip+data/green_tripdata_2021-01.parquet
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.15.22
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.15.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1333519 (1,3M) [binary/octet-stream]
Saving to: ‘input_data/green_tripdata_2021-01.parquet’


2022-05-30 00:16:03 (1,65 MB/s) - ‘input_data/green_tripdata_2021-01.parquet’ saved [1333519/1333519]

--2022-05-30 00:16:03--  https://s3.amazonaws.com/nyc-tlc/trip+data/green_tripdata_2021-02.parquet
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.15.22
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.15.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1145679 (1,1M) [binary/octet-stream]
Saving to: ‘input_data/green_tripdata_2021-02.parquet’


2022-05-30 00:16:04 (1,42 MB/s) - ‘input_data/green_tripdata_2021-02.parquet’ saved [1145679/1145679]

--2022-05-

In [6]:
# Run the preprocessing script on the files in input_data and write its outputs to ./data.
!python preprocess_data.py --raw_data_path input_data --dest_path ./data

In [7]:
# Count the regular files (sub-directories excluded) found in ./data.
data_folder_path = os.path.join(os.getcwd(), 'data')
files = list(
    filter(
        lambda entry: os.path.isfile(os.path.join(data_folder_path, entry)),
        os.listdir(data_folder_path),
    )
)

print(f"There are {len(files)} files")

There are 4 files


**Q2**: 4

## Q3. Train a model with autolog

The code of `train.py` is as follows:

In [None]:
import argparse
import os
import joblib

import mlflow
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


def load_pickle(filename: str):
    """Deserialize and return the object stored in ``filename`` using ``joblib.load``.

    NOTE(review): joblib files are pickle-based, so only load files that this
    project produced itself.
    """
    loaded = joblib.load(filename)
    return loaded


def run(data_path):
    """Fit a random forest on the training split and print its validation RMSE.

    ``data_path`` is the directory that holds ``train.pkl`` and ``valid.pkl``;
    each file is unpacked into a ``(features, target)`` pair. The fit and the
    evaluation happen inside a single MLflow run.
    """
    train_file = os.path.join(data_path, "train.pkl")
    valid_file = os.path.join(data_path, "valid.pkl")
    X_train, y_train = load_pickle(train_file)
    X_valid, y_valid = load_pickle(valid_file)

    with mlflow.start_run():
        model = RandomForestRegressor(max_depth=10, random_state=0)
        model.fit(X_train, y_train)
        predictions = model.predict(X_valid)

        # The RMSE is only printed here; there is no explicit mlflow.log_metric call.
        rmse = mean_squared_error(y_valid, predictions, squared=False)
        print(f"RMSE is {round(rmse, 2)}")


if __name__ == '__main__':
    # Command-line entry point: read the data location, configure MLflow, train.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--data_path",
        default="./data",
        help="the location where the processed NYC taxi trip data was saved."
    )
    args = parser.parse_args()

    # In order to access the UI, it is necessary to run the MLFlow server:
    #   mlflow ui --backend-store-uri sqlite:///mlflow.db --default-artifact-root ./mlruns
    # Use the public top-level API; `mlflow.mlflow` is not a documented attribute.
    mlflow.set_tracking_uri("sqlite:///mlflow.db")
    mlflow.set_experiment("02-experiment-tracking")
    # Autologging must be switched on before run() fits the model.
    mlflow.sklearn.autolog()

    run(args.data_path)

![title](./Q3.png)

**Q3**: 17

## Q4. Launch the tracking server locally

The tracking server should be run by means of the following command:

`!mlflow ui --backend-store-uri sqlite:///mlflow.db --default-artifact-root ./mlruns`

**Q4**: `--default-artifact-root`

## Q5. Tune the hyperparameters of the model

The code of `hpo.py` is as follows:

In [None]:
import argparse
import os
import joblib

import mlflow
import numpy as np
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.pyll import scope
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Point the client at the tracking server on localhost:5000 and select the
# experiment that the runs started in this script are recorded under.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("random-forest-hyperopt")


def load_pickle(filename: str):
    """Deserialize and return the object stored in ``filename`` using ``joblib.load``.

    NOTE(review): joblib files are pickle-based, so only load files that this
    project produced itself.
    """
    loaded = joblib.load(filename)
    return loaded


def run(data_path, num_trials):
    """Search random-forest hyperparameters with hyperopt's TPE algorithm.

    ``data_path`` is the directory that holds ``train.pkl`` and ``valid.pkl``.
    ``num_trials`` is passed to ``fmin`` as ``max_evals``. Every evaluated
    parameter set is recorded as its own MLflow run with a ``model`` tag, the
    parameters, and the validation ``rmse``.
    """
    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_valid, y_valid = load_pickle(os.path.join(data_path, "valid.pkl"))

    # Integer-valued ranges: quniform samples in steps of 1, scope.int casts the result.
    search_space = dict(
        max_depth=scope.int(hp.quniform('max_depth', 1, 20, 1)),
        n_estimators=scope.int(hp.quniform('n_estimators', 10, 50, 1)),
        min_samples_split=scope.int(hp.quniform('min_samples_split', 2, 10, 1)),
        min_samples_leaf=scope.int(hp.quniform('min_samples_leaf', 1, 4, 1)),
        random_state=42,
    )

    def objective(params):
        """Train with ``params``, log the trial to MLflow, and return its loss."""
        with mlflow.start_run():
            mlflow.set_tag("model", "RandomForestRegressor")
            mlflow.log_params(params)

            forest = RandomForestRegressor(**params)
            forest.fit(X_train, y_train)
            predictions = forest.predict(X_valid)
            rmse = mean_squared_error(y_valid, predictions, squared=False)
            mlflow.log_metric("rmse", rmse)

        return dict(loss=rmse, status=STATUS_OK)

    fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=num_trials,
        trials=Trials(),
        rstate=np.random.default_rng(42),  # for reproducible results
    )


if __name__ == '__main__':
    # Command-line entry point: parse the options and start the hyperparameter search.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--data_path",
        default="./data",
        help="the location where the processed NYC taxi trip data was saved."
    )
    parser.add_argument(
        "--max_evals",
        default=50,
        # Without an explicit type a value given on the command line would be
        # handed to the optimizer as a string instead of an integer.
        type=int,
        help="the number of parameter evaluations for the optimizer to explore."
    )
    args = parser.parse_args()

    run(args.data_path, args.max_evals)

![title](./Q5.png)

**Q5**: 6.628

## Q6. Promote the best model to the model registry

The code of `register_model.py` is as follows:

In [None]:
import argparse
import os
import joblib

import mlflow
from hyperopt import hp, space_eval
from hyperopt.pyll import scope
from mlflow.entities import ViewType
from mlflow.tracking import MlflowClient
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Experiment whose runs are read as candidates (the name hpo.py logs under).
HPO_EXPERIMENT_NAME = "random-forest-hyperopt"
# Experiment made active below and later searched for the run to register.
EXPERIMENT_NAME = "random-forest-best-models"

# Module-level setup: tracking server address, active experiment, sklearn autologging.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment(EXPERIMENT_NAME)
mlflow.sklearn.autolog()

# Same search-space definition as in hpo.py; train_and_log_model passes it to
# space_eval together with the parameters read back from a logged run.
SPACE = {
    'max_depth': scope.int(hp.quniform('max_depth', 1, 20, 1)),
    'n_estimators': scope.int(hp.quniform('n_estimators', 10, 50, 1)),
    'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 10, 1)),
    'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 4, 1)),
    'random_state': 42
}


def load_pickle(filename: str):
    """Deserialize and return the object stored in ``filename`` using ``joblib.load``.

    NOTE(review): joblib files are pickle-based, so only load files that this
    project produced itself.
    """
    loaded = joblib.load(filename)
    return loaded


def train_and_log_model(data_path, params):
    """Train a random forest with ``params`` and log validation and test RMSE.

    ``data_path`` is the directory that holds ``train.pkl``, ``valid.pkl`` and
    ``test.pkl``. ``params`` is resolved against ``SPACE`` with ``space_eval``
    before it is passed to the estimator. Everything after loading the data
    happens inside a single MLflow run.
    """
    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_valid, y_valid = load_pickle(os.path.join(data_path, "valid.pkl"))
    X_test, y_test = load_pickle(os.path.join(data_path, "test.pkl"))

    with mlflow.start_run():
        resolved_params = space_eval(SPACE, params)
        model = RandomForestRegressor(**resolved_params)
        model.fit(X_train, y_train)

        # Score the fitted model on the validation split first, then on the test split.
        evaluation_sets = (
            ("valid_rmse", X_valid, y_valid),
            ("test_rmse", X_test, y_test),
        )
        for metric_name, features, target in evaluation_sets:
            score = mean_squared_error(target, model.predict(features), squared=False)
            mlflow.log_metric(metric_name, score)


def run(data_path, log_top):
    """Retrain the best hyperopt candidates and register the best resulting model.

    Steps:
      1. Fetch the ``log_top`` runs with the lowest validation ``rmse`` from the
         ``HPO_EXPERIMENT_NAME`` experiment.
      2. Retrain each candidate with ``train_and_log_model``, which logs
         ``valid_rmse`` and ``test_rmse`` to the ``EXPERIMENT_NAME`` experiment.
      3. Pick the run with the lowest ``test_rmse`` from that experiment and
         register its model as ``nyc-taxi-regressor``.

    ``data_path`` is the directory with the pickled data splits.
    """
    client = MlflowClient()

    # retrieve the top_n model runs and log the models to MLflow
    hpo_experiment = client.get_experiment_by_name(HPO_EXPERIMENT_NAME)
    candidate_runs = client.search_runs(
        experiment_ids=hpo_experiment.experiment_id,
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=log_top,
        order_by=["metrics.rmse ASC"]
    )
    # The loop variable is deliberately not called `run`, which is this function's name.
    for candidate in candidate_runs:
        train_and_log_model(data_path=data_path, params=candidate.data.params)

    # select the model with the lowest test RMSE
    best_models_experiment = client.get_experiment_by_name(EXPERIMENT_NAME)
    best_run = client.search_runs(
        experiment_ids=best_models_experiment.experiment_id,
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=1,
        # Without an explicit ordering the single returned run is not
        # guaranteed to be the one with the lowest test RMSE.
        order_by=["metrics.test_rmse ASC"]
    )[0]
    print(best_run)

    # register the best model
    run_id = best_run.info.run_id
    # sklearn autologging stores the fitted estimator under the "model" artifact path.
    model_uri = f"runs:/{run_id}/model"
    print(f"Model URI is {model_uri}")
    mlflow.register_model(model_uri=model_uri, name="nyc-taxi-regressor")


if __name__ == '__main__':
    # Command-line entry point: collect the two options and hand them to run().
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--data_path",
        help="the location where the processed NYC taxi trip data was saved.",
        default="./data",
    )
    cli.add_argument(
        "--top_n",
        help="the top 'top_n' models will be evaluated to decide which model to promote.",
        type=int,
        default=5,
    )
    options = cli.parse_args()

    run(options.data_path, options.top_n)

![title](./Q6a.png)

![title](./Q6b.png)

**Q6**: 6.55