## Homework 2

In [1]:
import mlflow
import pandas as pd
import os

In [2]:
# Local folder that the downloaded monthly trip parquet files are written to.
TAXI_DATA_FOLDER = "taxi_data"

### Q1 mlflow version

In [3]:
# Ask the MLflow command-line tool for its installed version.
!mlflow --version

mlflow, version 2.22.0


### Q2 Downloading data

In [4]:
def read_dataframe(filename):
    """Load a trip-data file into a DataFrame.

    Parameters
    ----------
    filename : str
        Path or URL of the file. The extension selects the reader:
        ``.csv`` or ``.parquet``.

    Returns
    -------
    pandas.DataFrame
        For CSV input, the ``lpep_pickup_datetime`` and
        ``lpep_dropoff_datetime`` columns are converted with
        ``pd.to_datetime``. Parquet input is returned exactly as read.

    Raises
    ------
    ValueError
        If ``filename`` ends with neither ``.csv`` nor ``.parquet``.
    """
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)

        # CSV carries timestamps as plain text, so parse them explicitly.
        df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])
        df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])
        return df

    if filename.endswith('.parquet'):
        return pd.read_parquet(filename)

    # Fail loudly instead of falling through with `df` never assigned.
    raise ValueError(
        f"Unsupported file type for {filename!r}: expected a .csv or .parquet file"
    )

In [5]:
# Download the January-March 2023 trip files and keep a local copy of each
# under TAXI_DATA_FOLDER (January -> df_train, February -> df_val, March -> df_test).
TRIP_DATA_URL = 'https://d37ci6vzurychx.cloudfront.net/trip-data'
TRIP_FILES = [f'green_tripdata_2023-{month:02d}.parquet' for month in (1, 2, 3)]

# DataFrame.to_parquet does not create missing directories, so make sure the
# destination exists before writing (no-op when it is already there).
os.makedirs(TAXI_DATA_FOLDER, exist_ok=True)

df_train, df_val, df_test = (
    read_dataframe(f'{TRIP_DATA_URL}/{file_name}') for file_name in TRIP_FILES
)

for df_month, file_name in zip((df_train, df_val, df_test), TRIP_FILES):
    df_month.to_parquet(os.path.join(TAXI_DATA_FOLDER, file_name))

In [6]:
!python preprocess_data.py --raw_data_path taxi_data --dest_path ./output

How many files are saved to the `./output` folder?

In [9]:
# Count the entries that the preprocessing step left in ./output.
n_output_files = len(os.listdir('./output'))
print(f"output folder contains {n_output_files} files")

output folder contains 4 files


### Q3 Train a model with autolog

In [10]:
# Run the training script; the cell after this one reads the logged
# min_samples_split parameter back from MLflow (presumably logged by this script).
!python train.py 

What is the value of the min_samples_split parameter?

In [19]:
# Look up the min_samples_split parameter logged for the Q3 experiment.
Q3_EXPERIMENT_NAME = "nyc-taxi-experiment_hw2_q3"

# get_experiment_by_name returns None (it does not raise) for an unknown name.
q3_experiment = mlflow.get_experiment_by_name(Q3_EXPERIMENT_NAME)
if q3_experiment is None:
    raise ValueError(
        f"Experiment '{Q3_EXPERIMENT_NAME}' not found under the current tracking URI; "
        "has train.py been run?"
    )
experiment_id = q3_experiment.experiment_id

# mlflow.search_runs returns a DataFrame with one row per run.
runs = mlflow.search_runs(experiment_ids=[experiment_id])
if runs.empty:
    raise ValueError(f"Experiment '{Q3_EXPERIMENT_NAME}' contains no runs")

# Take the first row returned, as before.
min_samples_split = runs['params.min_samples_split'].iloc[0]
print(f"min_samples_split = {min_samples_split}")

min_samples_split = 2


### Q4 Launch the tracking server locally

In [25]:
# Point this notebook's MLflow client at the SQLite backend store.
mlflow.set_tracking_uri("sqlite:///mlflow_hw2.db")
# Command for launching the tracking server against the same database file.
# Left commented out — presumably because it would block the cell while the
# server runs; start it from a separate terminal instead (TODO confirm).
#!mlflow server --backend-store-uri sqlite:///mlflow_hw2.db --default-artifact-root ./artifacts --host 127.0.0.1 --port 5000


In [21]:
# If experiments were accidentally deleted, uncomment the lines below to restore them.
#from mlflow.tracking import MlflowClient

#mlflow.set_tracking_uri("sqlite:///mlflow_hw2.db")
#client = MlflowClient()
#deleted_experiments = client.search_experiments(view_type=mlflow.entities.ViewType.DELETED_ONLY)
#for experiment in deleted_experiments:
#    print(f"ID: {experiment.experiment_id}, Name: {experiment.name}, Lifecycle Stage: {experiment.lifecycle_stage}")
#    client.restore_experiment(experiment.experiment_id)

### Q5 Tune model hyperparameters

In [26]:
# Run the hyperparameter-optimisation script; its progress bar reports the
# best loss found so far after each of the 15 trials.
!python hpo.py


  0%|          | 0/15 [00:00<?, ?trial/s, best loss=?]
  7%|▋         | 1/15 [00:11<02:47, 11.97s/trial, best loss: 5.370086069268862]
 13%|█▎        | 2/15 [00:12<01:08,  5.26s/trial, best loss: 5.370086069268862]
 20%|██        | 3/15 [00:13<00:38,  3.25s/trial, best loss: 5.370086069268862]
 27%|██▋       | 4/15 [00:20<00:53,  4.82s/trial, best loss: 5.357490752366866]
 33%|███▎      | 5/15 [00:24<00:44,  4.42s/trial, best loss: 5.357490752366866]
 40%|████      | 6/15 [00:36<01:04,  7.17s/trial, best loss: 5.354695072530291]
 47%|████▋     | 7/15 [00:49<01:10,  8.86s/trial, best loss: 5.354695072530291]
 53%|█████▎    | 8/15 [00:50<00:45,  6.57s/trial, best loss: 5.354695072530291]
 60%|██████    | 9/15 [00:58<00:41,  6.91s/trial, best loss: 5.354695072530291]
 67%|██████▋   | 10/15 [01:04<00:33,  6.74s/trial, best loss: 5.354695072530291]
 73%|███████▎  | 11/15 [01:09<00:24,  6.17s/trial, best loss: 5.335419588556921]
 80%|████████  | 12/15 [01:13<00:16,  5.50s/trial, best loss: 

What's the best validation RMSE that you got?

In [29]:
from mlflow.tracking import MlflowClient

HPO_EXPERIMENT_NAME = "random-forest-hyperopt"
client = MlflowClient()

# get_experiment_by_name returns None (it does not raise) for an unknown name.
experiment = client.get_experiment_by_name(HPO_EXPERIMENT_NAME)
if experiment is None:
    raise ValueError(
        f"Experiment '{HPO_EXPERIMENT_NAME}' not found under the current tracking URI; "
        "has hpo.py been run?"
    )

# Fetch only the single run with the lowest validation RMSE.
top_runs = client.search_runs(
    experiment_ids=[experiment.experiment_id],  # the API expects a list of IDs
    max_results=1,
    order_by=["metrics.rmse ASC"],
)
if not top_runs:
    raise ValueError(f"Experiment '{HPO_EXPERIMENT_NAME}' contains no runs")
best_run = top_runs[0]

print("Best rmse: ", best_run.data.metrics["rmse"])


Best rmse:  5.335419588556921


### Q6 Promote the best model to the model registry

What is the test RMSE of the best model?

In [35]:
from mlflow.tracking import MlflowClient

EXPERIMENT_NAME = "random-forest-best-models"
client = MlflowClient()

# get_experiment_by_name returns None (it does not raise) for an unknown name,
# which is what happens if this cell runs before the experiment has been created.
experiment = client.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    raise ValueError(
        f"Experiment '{EXPERIMENT_NAME}' not found under the current tracking URI; "
        "has register_model.py been run?"
    )

# Only the best run is used, so ask for a single result sorted by test RMSE.
top_runs = client.search_runs(
    experiment_ids=[experiment.experiment_id],
    order_by=["metrics.test_rmse ASC"],
    max_results=1,
)
if not top_runs:
    raise ValueError(f"Experiment '{EXPERIMENT_NAME}' contains no runs")
best_run = top_runs[0]

print("Best test rmse: ", best_run.data.metrics["test_rmse"])


Best test rmse:  5.567408012462019


In [33]:
# Run the model-registration script. Its output links to runs on the tracking
# server at http://127.0.0.1:5000, so the server presumably must be running first.
!python register_model.py

🏃 View run unruly-grub-292 at: http://127.0.0.1:5000/#/experiments/2/runs/d4594d2fa64c474993e1e48096cad18b
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2
🏃 View run ambitious-shad-270 at: http://127.0.0.1:5000/#/experiments/2/runs/1bf4f17bc3914edd8112cacf52015e7c
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2
🏃 View run resilient-hound-142 at: http://127.0.0.1:5000/#/experiments/2/runs/4038926c3817460e96f0d86fe62b23da
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2
🏃 View run debonair-robin-458 at: http://127.0.0.1:5000/#/experiments/2/runs/97140c9ef24c44dc95479c501ad5578c
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2
🏃 View run welcoming-doe-670 at: http://127.0.0.1:5000/#/experiments/2/runs/3b4a57de880c419b8d4af079f75b7a56
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2


Successfully registered model 'nyc-taxi-random-forest-best-rmse'.
2025/05/22 00:56:38 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: nyc-taxi-random-forest-best-rmse, version 1
Created version '1' of model 'nyc-taxi-random-forest-best-rmse'.
