### Question 1

In [1]:
# Report the version of the MLflow CLI installed in this environment.
!mlflow --version

mlflow, version 2.22.0


### Question 2

In [6]:
# Download the green taxi trip parquet files for 2023-01 through 2023-03 into ./data,
# keeping each file's original name.
!wget https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-01.parquet -O ./data/green_tripdata_2023-01.parquet
!wget https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-02.parquet -O ./data/green_tripdata_2023-02.parquet
!wget https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-03.parquet -O ./data/green_tripdata_2023-03.parquet

--2024-05-27 20:55:04--  https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-01.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 13.32.192.2, 13.32.192.116, 13.32.192.124, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|13.32.192.2|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1427002 (1.4M) [binary/octet-stream]
Saving to: ‘./data/green_tripdata_2023-01.parquet’


2024-05-27 20:55:05 (74.1 MB/s) - ‘./data/green_tripdata_2023-01.parquet’ saved [1427002/1427002]

--2024-05-27 20:55:05--  https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-02.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 13.32.192.2, 13.32.192.124, 13.32.192.116, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|13.32.192.2|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1533740 (1.5M) [binary/octet-stream

Running preprocess_data.py wrote 4 files to the output folder:
- dv.pkl
- test.pkl
- train.pkl
- val.pkl

### Question 3

In [7]:
import pickle

def load_pickle(filename: str):
    """Read a pickled object from disk and return it.

    Args:
        filename: Path of the pickle file to open (read in binary mode).

    Returns:
        Whatever object was serialized into the file.

    Note:
        Unpickling can execute arbitrary code, so only load files you trust.
    """
    with open(filename, mode="rb") as handle:
        payload = pickle.load(handle)
    return payload

In [17]:
# Each pickle holds an (X, y) pair; load the train, validation and test splits in that order.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = [
    load_pickle(path)
    for path in ("./output/train.pkl", "./output/val.pkl", "./output/test.pkl")
]

In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error
import mlflow 

In [None]:

# Track runs in the local SQLite database and group them under one experiment.
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("nyc-taxi-experiment")

# Turn on MLflow autologging before any model is fitted.
mlflow.autolog()

with mlflow.start_run():
    # Baseline forest: depth capped at 10, fixed seed for repeatable results.
    rf = RandomForestRegressor(max_depth=10, random_state=42).fit(X_train, y_train)

    # Score the fitted model on the validation split.
    y_pred = rf.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)

2025/05/19 00:30:48 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


### Question 4

This is the command for launching a tracking server with a SQLite database as the backend store and an `artifacts` folder as the artifact store:

`mlflow server --backend-store-uri sqlite:///mlflow.db --default-artifact-root ./artifacts`

### Question 5

In [39]:
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe   

def objective(params):
    """Train one random-forest candidate and report its validation RMSE.

    Each call opens its own MLflow run, logs the candidate's hyperparameters
    and records the resulting error under the "rmse" metric.

    Args:
        params: Keyword arguments passed straight to RandomForestRegressor
            (one sample drawn from the hyperopt search space).

    Returns:
        dict with 'loss' set to the RMSE on (X_val, y_val) and 'status' set
        to STATUS_OK.
    """
    with mlflow.start_run():
        # The estimator trained here is a random forest, so the tag must say so
        # (it previously read "xgboost", mislabelling every run).
        mlflow.set_tag("model", "random-forest")
        mlflow.log_params(params)

        rf = RandomForestRegressor(**params)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)
        rmse = root_mean_squared_error(y_val, y_pred)
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}


In [40]:
from hyperopt.pyll import scope
# Ranges explored for each tuned hyperparameter: sampled with hp.quniform at
# step 1 and cast to int, using the hyperparameter name as the hyperopt label.
search_space = {
    **{
        name: scope.int(hp.quniform(name, low, high, 1))
        for name, (low, high) in (
            ("max_depth", (1, 20)),
            ("n_estimators", (10, 50)),
            ("min_samples_split", (2, 10)),
            ("min_samples_leaf", (1, 4)),
        )
    },
    # Not searched: the same seed is passed to every trial.
    "random_state": 42,
}

In [None]:
import numpy as np 
# Seeded generator handed to hyperopt so the search is reproducible.
rstate = np.random.default_rng(42)

# All trials started by `objective` are recorded under this experiment.
mlflow.set_experiment("random-forest-hyperopt")

# Run 15 TPE-guided evaluations of `objective` over `search_space`;
# the call's return value is the cell's displayed result.
fmin(fn=objective, space=search_space, algo=tpe.suggest,
     max_evals=15, trials=Trials(), rstate=rstate)

  0%|          | 0/15 [00:00<?, ?trial/s, best loss=?]

### Question 6

In [13]:
from mlflow.tracking import MlflowClient
# Client used below to look up experiments, search runs and set a registry alias;
# constructed with default arguments.
client = MlflowClient() 


In [15]:
# Fetch the experiment that the hyperopt search logged its trials to.
experiment = client.get_experiment_by_name("random-forest-hyperopt")

In [25]:
# Select the hyperopt trials to retrain: active runs only, no extra filter,
# sorted by the logged "rmse" metric in ascending order and capped at 5 results.
hopt_best_runs = client.search_runs(
    experiment_ids=experiment.experiment_id,
    filter_string="",
    run_view_type=mlflow.entities.ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.rmse ASC"]
)

In [33]:
mlflow.set_experiment("random-forest-best-models")
mlflow.autolog()

# The hyperparameters tuned by the hyperopt search; all of them are integers.
# Only these are fed back into the estimator: a source run may carry additional
# logged params (for example non-integer estimator settings recorded by
# autologging) that int() cannot parse.
RF_PARAMS = (
    "max_depth",
    "n_estimators",
    "min_samples_split",
    "min_samples_leaf",
    "random_state",
)

# Retrain each selected trial in its own run and record validation and test error.
for run in hopt_best_runs:
    with mlflow.start_run():

        print(f"Run ID: {run.info.run_id}")
        print(f"Parameters: {run.data.params}")
        # The "rmse" metric of a hyperopt trial was computed on the validation split.
        print(f"Validation RMSE: {run.data.metrics['rmse']}")

        # Logged params come back as strings; cast the tuned ones back to int.
        model_params = {
            name: int(run.data.params[name])
            for name in RF_PARAMS
            if name in run.data.params
        }

        model = RandomForestRegressor(**model_params)
        model.fit(X_train, y_train)

        val_rmse = root_mean_squared_error(y_val, model.predict(X_val))
        mlflow.log_metric("val_rmse", val_rmse)

        test_rmse = root_mean_squared_error(y_test, model.predict(X_test))
        mlflow.log_metric("test_rmse", test_rmse)
    

2025/05/20 17:35:54 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


Run ID: 81125942d9234d5c85487dbca5e01ead
Parameters: {'max_depth': '19', 'min_samples_leaf': '2', 'min_samples_split': '2', 'n_estimators': '11', 'random_state': '42'}
Train RMSE: 5.567408012462019
Run ID: a036ef09c6154e158944fb1ba94f67c4
Parameters: {'max_depth': '15', 'min_samples_leaf': '2', 'min_samples_split': '3', 'n_estimators': '40', 'random_state': '42'}
Train RMSE: 5.58531221803063
Run ID: a2043acaa5fb45a39e43464324f6ab24
Parameters: {'max_depth': '14', 'min_samples_leaf': '3', 'min_samples_split': '4', 'n_estimators': '26', 'random_state': '42'}
Train RMSE: 5.589460017934324
Run ID: 83f7991303d548c1aeee521d50a25fd9
Parameters: {'max_depth': '20', 'min_samples_leaf': '1', 'min_samples_split': '9', 'n_estimators': '19', 'random_state': '42'}
Train RMSE: 5.5921322796760755
Run ID: 007e8f577fd542ff9441f1df431cc7d3
Parameters: {'max_depth': '14', 'min_samples_leaf': '2', 'min_samples_split': '6', 'n_estimators': '23', 'random_state': '42'}
Train RMSE: 5.5941605655803635


In [34]:
# Fetch the experiment holding the retrained candidate models.
best_model_experiment = client.get_experiment_by_name("random-forest-best-models")

In [36]:
# Pick the single active run with the lowest logged "test_rmse";
# the result is a list with at most one entry.
best_test_rmse_run = client.search_runs(
    experiment_ids=best_model_experiment.experiment_id,
    filter_string="",
    run_view_type=mlflow.entities.ViewType.ACTIVE_ONLY,
    max_results=1,
    order_by=["metrics.test_rmse ASC"]
)

In [None]:
# Register the "model" artifact of the lowest-test_rmse run under a fixed registry name.
# Indexing [0] assumes the search above returned at least one run.
mlflow.register_model(
    f"runs:/{best_test_rmse_run[0].info.run_id}/model",
    "nyc-taxi-tuned-regressor-model"
)

Successfully registered model 'nyc-taxi-tuned-regressor-model'.
Created version '1' of model 'nyc-taxi-tuned-regressor-model'.


<ModelVersion: aliases=[], creation_timestamp=1747777902845, current_stage='None', description=None, last_updated_timestamp=1747777902845, name='nyc-taxi-tuned-regressor-model', run_id='c5ce4c987c684e7cb228030d236e7f1a', run_link=None, source='file:///c:/Users/khanm375/Documents/mlops/02-experiment-tracking/mlruns/4/c5ce4c987c684e7cb228030d236e7f1a/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [38]:
# Point the "Production" alias at version 1 of the registered model.
# NOTE(review): the version number is hard-coded; if the model is registered again
# a different version may be the intended target — confirm before re-running.
client.set_registered_model_alias(name="nyc-taxi-tuned-regressor-model", alias="Production", version=1)