In [1]:
import os
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

import mlflow
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials  # metrics
from hyperopt.pyll import scope

### Question 1

In [2]:
# Report the version of the installed MLflow command-line tool.
!mlflow --version

mlflow, version 2.3.2


### Question 2

In [3]:
# Make the directory to hold the data if it doesn't exist
# https://www.tutorialspoint.com/python/os_makedirs.htm
os.makedirs(os.path.dirname('./data/'), mode=0o755, exist_ok=True)

# remove files if they are already there, then get files
# !rm ./data/yellow_tripdata_2022-01.parquet
# !rm ./data/yellow_tripdata_2022-02.parquet

# windows
# https://www.freecodecamp.org/news/how-to-check-if-a-file-exists-in-python/
if os.path.isfile('./data/green_tripdata_2022-01.parquet') and \
    os.path.isfile('./data/green_tripdata_2022-02.parquet') and \
    os.path.isfile('./data/green_tripdata_2022-03.parquet'):
        os.remove("./data/green_tripdata_2022-01.parquet")
        os.remove("./data/green_tripdata_2022-02.parquet")
        os.remove("./data/green_tripdata_2022-03.parquet")
    
!wget https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2022-01.parquet -P ./data
!wget https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2022-02.parquet -P ./data
!wget https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2022-03.parquet -P ./data    

--2023-05-17 20:17:38--  https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2022-01.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 13.224.208.121, 13.224.208.35, 13.224.208.131, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|13.224.208.121|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1254291 (1.2M) [binary/octet-stream]
Saving to: './data/green_tripdata_2022-01.parquet'

     0K .......... .......... .......... .......... ..........  4% 9.27M 0s
    50K .......... .......... .......... .......... ..........  8% 47.6M 0s
   100K .......... .......... .......... .......... .......... 12% 18.1M 0s
   150K .......... .......... .......... .......... .......... 16% 15.4M 0s
   200K .......... .......... .......... .......... .......... 20% 92.7M 0s
   250K .......... .......... .......... .......... .......... 24% 23.0M 0s
   300K .......... .......... .......... .......... ........

In [4]:
# Run the preprocessing script, passing ./data as the raw data path and
# ./output as the destination path.
!python preprocess_data.py --raw_data_path ./data --dest_path ./output

In [5]:
# List the contents of the output folder.
# NOTE: `dir` is the Windows shell command; on Linux/macOS use `ls output` instead.
!dir output

 Volume in drive C has no label.
 Volume Serial Number is 08A3-CF2D

 Directory of C:\Users\nimz\Documents\mlops_zoomcamp\week2_experiment_tracking\homework\output

05/17/2023  08:07 PM    <DIR>          .
05/17/2023  08:07 PM    <DIR>          ..
05/17/2023  08:17 PM           153,660 dv.pkl
05/17/2023  08:17 PM         2,632,817 test.pkl
05/17/2023  08:17 PM         2,146,163 train.pkl
05/17/2023  08:17 PM         2,336,393 val.pkl
               4 File(s)      7,269,033 bytes
               2 Dir(s)  360,015,130,624 bytes free


### Question 3

In [6]:
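# To browse the runs, start the MLflow UI from a separate terminal
# (the command keeps running for as long as the server is up):
# !mlflow ui --backend-store-uri sqlite:///mlflow.db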

In [7]:
# set the MLFlow URI to our backend
mlflow.set_tracking_uri('sqlite:///mlflow.db')

# set up to assign/append runs to our experiment (and create if it doesn't exist)
mlflow.set_experiment('nyc_taxi_homework_week2')

<Experiment: artifact_location='file:///C:/Users/nimz/Documents/mlops_zoomcamp/week2_experiment_tracking/homework/mlruns/1', creation_time=1684368721173, experiment_id='1', last_update_time=1684368721173, lifecycle_stage='active', name='nyc_taxi_homework_week2', tags={}>

In [9]:
# def run_train(data_path: str):
#     # set the MLFlow URI to our backend
#     mlflow.set_tracking_uri('sqlite:///mlflow.db')

#     # set up to assign/append runs to our experiment (and create if it doesn't exist)
#     mlflow.set_experiment('nyc_taxi_homework_week2')    
    
#     # Turn on autologging
#     # https://mlflow.org/docs/latest/tracking.html#automatic-logging
#     # https://mlflow.org/docs/latest/python_api/mlflow.sklearn.html#mlflow.sklearn.autolog
#     mlflow.sklearn.autolog()

#     with mlflow.start_run():
#         X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
#         X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))

#         rf = RandomForestRegressor(max_depth=10, random_state=0)
#         rf.fit(X_train, y_train)
#         y_pred = rf.predict(X_val)

#         rmse = mean_squared_error(y_val, y_pred, squared=False)
#         print(f'RMSE: {rmse}')

In [15]:
# Run the training script, passing ./output as the data path; the RMSE it
# prints is shown in the cell output.
!python train.py --data_path ./output

RMSE: 2.453983836538874




### Question 4

In [3]:
# updated objective function
def objective(trial):
    """Fit one random forest with trial-suggested hyperparameters and return its RMSE.

    A new MLflow run is opened for the call; the chosen parameters and the
    resulting RMSE are logged to it.

    ``X_train``, ``y_train``, ``X_val`` and ``y_val`` are read from the
    notebook's global namespace and must be defined before this function is
    called.

    Args:
        trial: Object providing ``suggest_int(name, low, high, step=...)``
            (presumably an Optuna ``Trial`` -- confirm against the caller).

    Returns:
        The root mean squared error of the model's predictions on ``X_val``
        measured against ``y_val``.
    """
    with mlflow.start_run():
        # `step` is passed by keyword: Optuna 3.x made it keyword-only and
        # warns when it is given positionally.
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 10, 50, step=1),
            'max_depth': trial.suggest_int('max_depth', 1, 20, step=1),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 10, step=1),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 4, step=1),
            'random_state': 42,
            'n_jobs': -1
        }

        # log parameters
        mlflow.log_params(params)

        rf = RandomForestRegressor(**params)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)

        # Take the square root of the MSE ourselves instead of passing
        # `squared=False`, which scikit-learn deprecated in 1.4 and removed
        # in 1.6. Equivalent for a single-target y (assumed here -- TODO confirm).
        rmse = mean_squared_error(y_val, y_pred) ** 0.5

        # Log RMSE metric
        mlflow.log_metric('rmse', rmse)

        return rmse

#### Open the MLflow UI at http://127.0.0.1:5000/#/experiments/ and select the `random-forest-hyperopt` experiment to see the RMSE values for the 10 runs