In [11]:
import pandas as pd
import numpy as np
import pickle
import pyarrow

from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import root_mean_squared_error

In [4]:
import mlflow

# Tracking server address and experiment name used by every run in this notebook.
MLFLOW_TRACKING_URI = "http://127.0.0.1:5000"
MLFLOW_EXPERIMENT_NAME = "green_taxi_models_zm"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
# Kept as the last expression so the cell still displays the Experiment repr.
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

<Experiment: artifact_location='gs://mlflow-backend-models/1', creation_time=1728346178908, experiment_id='1', last_update_time=1728346178908, lifecycle_stage='active', name='green_taxi_models_zm', tags={}>

In [8]:
def read_dataframe(filename:str) -> pd.DataFrame:
    """Load a trip parquet file and return the cleaned trips.

    Adds a ``duration`` column (drop-off minus pick-up, in minutes), keeps
    only trips strictly longer than 1 minute and at most 60 minutes, and
    converts the ``PULocationID`` / ``DOLocationID`` columns to strings.

    Parameters
    ----------
    filename : str
        Path passed straight to ``pd.read_parquet``.

    Returns
    -------
    pd.DataFrame
        An independent (copied) frame containing the filtered rows.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion instead of a per-row lambda.
    trip_time = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df['duration'] = trip_time.dt.total_seconds() / 60

    # Explicit copy: the column assignment below must write to an independent
    # frame, not to a slice of the unfiltered one (avoids chained assignment).
    df = df[(df.duration > 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    return df

def prepare_dictionaries(df: pd.DataFrame):
    """Turn a trips frame into a list of per-row feature dictionaries.

    A combined ``PU_DO`` column ("<pickup id>_<dropoff id>") is written onto
    ``df`` itself, so the caller's frame gains that column. Each returned
    record holds exactly two keys: ``PU_DO`` and ``trip_distance``.
    """
    pickup_id = df['PULocationID'].astype(str)
    dropoff_id = df['DOLocationID'].astype(str)
    df['PU_DO'] = pickup_id + '_' + dropoff_id

    # Categorical feature first, numerical feature second.
    feature_columns = ['PU_DO', 'trip_distance']
    return df[feature_columns].to_dict(orient='records')

In [10]:
# Name of the regression target column (added by read_dataframe, in minutes).
target = 'duration'

# One month is used for fitting, the following month for validation.
df_train = read_dataframe('../data/green_tripdata_2023-01.parquet')
df_valid = read_dataframe('../data/green_tripdata_2023-02.parquet')

# Targets as plain arrays, features as lists of per-row dictionaries.
y_train, y_valid = df_train[target].values, df_valid[target].values
dict_train, dict_val = prepare_dictionaries(df_train), prepare_dictionaries(df_valid)

In [None]:
# Train a random forest on the prepared dictionaries and record the
# parameters, validation RMSE, model and fitted vectorizer in one MLflow run.
with mlflow.start_run():
    params = dict(max_depth=20, n_estimators=100, min_samples_leaf=10, random_state=0)
    mlflow.log_params(params)

    dv = DictVectorizer()
    # n_jobs is passed separately, so it is not part of the logged params.
    model = RandomForestRegressor(**params, n_jobs=-1)

    # Fit the vectorizer on the training dictionaries only; the validation
    # dictionaries are transformed with the already-fitted vocabulary.
    X_train = dv.fit_transform(dict_train)
    model.fit(X_train, y_train)

    X_val = dv.transform(dict_val)
    y_pred = model.predict(X_val)

    # Documented argument order is (y_true, y_pred).
    rmse = root_mean_squared_error(y_valid, y_pred)
    print(params, rmse)
    mlflow.log_metric('rmse', rmse)

    mlflow.sklearn.log_model(model, artifact_path="model")

    # A single path variable keeps the file that is written and the file that
    # is logged in sync (previously the logged name contained a ',' typo).
    vectorizer_path = 'dict_vectorizer.bin'
    with open(vectorizer_path, 'wb') as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact(vectorizer_path)
    