In [11]:
import pandas as pd
import numpy as np
import pickle
import pyarrow

from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import root_mean_squared_error
from sklearn.pipeline import make_pipeline

In [39]:
import warnings
# Install a blanket "ignore" filter: with no message/category/module arguments
# this suppresses every warning raised after this cell runs, including
# deprecation notices and pandas warnings.
# NOTE(review): consider narrowing this to the specific categories that are
# noisy so genuine problems stay visible.
warnings.filterwarnings("ignore")

In [12]:
import mlflow

# Tracking server address and the experiment that collects this notebook's runs.
MLFLOW_TRACKING_URI = "http://127.0.0.1:5000"
EXPERIMENT_NAME = "green_taxi_model_new"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
# Kept as the final expression so the cell still displays the Experiment object.
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='gs://mlflow-backend-storage-models/1', creation_time=1728422822438, experiment_id='1', last_update_time=1728422822438, lifecycle_stage='active', name='green_taxi_model_new', tags={}>

In [13]:
import mlflow
import os
from google.oauth2 import service_account

# Location of the GCP service-account key file.
# An already-exported GOOGLE_APPLICATION_CREDENTIALS takes precedence so the
# notebook is not tied to one machine's directory layout; the original
# hard-coded path is kept as the fallback, so behaviour is unchanged when the
# variable is not set.
GCP_KEY_PATH = os.environ.get(
    'GOOGLE_APPLICATION_CREDENTIALS',
    '/home/MERHAWI/arrival_time/Arrival_time_estimation/mlflow_cld/my_gcp_k.json',
)

# Set up credentials with proper scopes.
# NOTE(review): `credentials` is not referenced again in the cells visible here;
# presumably the GCS artifact store authenticates through the environment
# variable set below instead - confirm before removing.
credentials = service_account.Credentials.from_service_account_file(
    GCP_KEY_PATH,
    scopes=['https://www.googleapis.com/auth/cloud-platform']
)

# Configure MLflow
mlflow.set_tracking_uri("http://127.0.0.1:5000")
# Export the key path for any library in this process that reads the variable.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = GCP_KEY_PATH

In [14]:
def read_dataframe(filename: str) -> pd.DataFrame:
    """Load a trip parquet file and keep only trips of plausible duration.

    Adds a ``duration`` column (minutes, float) computed as
    ``lpep_dropoff_datetime - lpep_pickup_datetime``, keeps rows where
    ``1 < duration <= 60``, and casts ``PULocationID`` / ``DOLocationID``
    to strings.

    Args:
        filename: Path passed straight to ``pd.read_parquet``.

    Returns:
        A new, independent DataFrame holding the filtered rows.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion (replaces a per-row lambda).
    trip_time = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df['duration'] = trip_time.dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the frame that was read, so the
    # column assignment below writes to a real frame rather than to a slice
    # (the original triggered SettingWithCopyWarning here).
    df = df[(df.duration > 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    return df

def prepare_dictionaries(df: pd.DataFrame):
    """Build one feature dict per row for a DictVectorizer-style consumer.

    Each dict has two keys: ``PU_DO`` (pickup and drop-off location ids joined
    with an underscore) and ``trip_distance``.

    Side effect: the ``PU_DO`` column is added to (or overwritten on) the
    DataFrame that was passed in.

    Args:
        df: Frame with ``PULocationID``, ``DOLocationID`` and ``trip_distance``.

    Returns:
        A list of ``{'PU_DO': ..., 'trip_distance': ...}`` records, one per row.
    """
    pickup_id = df['PULocationID'].astype(str)
    dropoff_id = df['DOLocationID'].astype(str)
    df['PU_DO'] = pickup_id + '_' + dropoff_id

    feature_columns = ['PU_DO', 'trip_distance']
    return df[feature_columns].to_dict(orient='records')

In [15]:
# Column used as the regression target.
target = 'duration'

# Training split: January 2023 trips.
df_train = read_dataframe('../data/green_tripdata_2023-01.parquet')
y_train = df_train[target].values
dict_train = prepare_dictionaries(df_train)

# Validation split: February 2023 trips.
df_valid = read_dataframe('../data/green_tripdata_2023-02.parquet')
y_valid = df_valid[target].values
dict_val = prepare_dictionaries(df_valid)

In [40]:
# Train a DictVectorizer + RandomForest pipeline inside one MLflow run, logging
# the hyper-parameters, the validation RMSE and the fitted pipeline.
with mlflow.start_run():
    params = dict(max_depth=20, n_estimators=100, min_samples_leaf=10, random_state=0)
    mlflow.log_params(params)

    # Vectoriser and regressor travel together so the logged model accepts the
    # raw feature dictionaries directly.
    pipeline = make_pipeline(
        DictVectorizer(),
        RandomForestRegressor(**params, n_jobs=-1)
    )

    pipeline.fit(dict_train, y_train)
    y_pred = pipeline.predict(dict_val)

    # scikit-learn's signature is (y_true, y_pred); the arguments were swapped.
    # Plain RMSE is symmetric, so the logged value is unchanged.
    rmse = root_mean_squared_error(y_valid, y_pred)
    print(params, rmse)
    mlflow.log_metric('rmse', rmse)

    mlflow.sklearn.log_model(pipeline, artifact_path="model")

{'max_depth': 20, 'n_estimators': 100, 'min_samples_leaf': 10, 'random_state': 0} 5.396698280020155


2024/10/08 22:58:06 INFO mlflow.tracking._tracking_service.client: 🏃 View run valuable-boar-435 at: http://127.0.0.1:5000/#/experiments/1/runs/d4c48152bad344bd9bb991a7e7608484.
2024/10/08 22:58:06 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.


In [27]:
import mlflow
from mlflow import MlflowClient

In [29]:


# Run whose artifacts are fetched in the following cells.
# NOTE(review): this id is hard-coded and differs from the run id printed in the
# training cell's log output (d4c48152...), so it presumably points at an
# earlier run - confirm it is the intended one.
RUN_ID = "ba222bda45b94132b337915e4fa8226d"
# Address of the MLflow tracking server the client talks to.
MLFLOW_TRACKING_URI="http://127.0.0.1:5000"

# Client bound explicitly to the tracking server above.
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

In [31]:
# Fetch the 'dict_vectorizer.bin' artifact of RUN_ID; the returned value is kept
# in `path` and opened as a local file in the next cell.
# NOTE(review): the training cell visible in this notebook only logs the sklearn
# pipeline under "model" - confirm that RUN_ID's run really contains this artifact.
path = client.download_artifacts(run_id=RUN_ID, path= 'dict_vectorizer.bin')

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

In [32]:
# Deserialize the downloaded artifact into `dv`.
# SECURITY: pickle.load can execute arbitrary code; only use it on artifacts
# that come from a tracking server / bucket you trust.
with open(path, 'rb') as artifact_file:
    dv = pickle.load(artifact_file)

In [33]:
# Bare expression: shows the unpickled object (presumably a DictVectorizer,
# judging by the artifact name) as the cell output.
dv