In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression

In [2]:
import pickle

In [3]:
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [4]:
import mlflow

mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("nyc_taxi_duration_first_exp")

<Experiment: artifact_location='/workspaces/Arrival_time_estimation/mlflow/mlruns/1', creation_time=1724119947795, experiment_id='1', last_update_time=1724119947795, lifecycle_stage='active', name='nyc_taxi_duration_first_exp', tags={}>

### data summary
- There are 68211 total records, after filtering by duration > 1 & <=60 there will be 65924 records, which is 96% of the data

In [5]:
def read_and_preprocess(filename):
    df = pd.read_parquet(filename)
    
    df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)
    df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
    
    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    
    df['duration'] = df['duration'].apply(lambda td: td.total_seconds()/60)
    df['PU_DO'] = df['PULocationID'].astype(str) + '_' + df['DOLocationID'].astype(str)
    
    #categorical = ['PU_DO']#'DOLocationID',]
    categorical = ['PULocationID','DOLocationID']#'DOLocationID',]
    numerical = ['trip_distance']
    
    df[categorical]= df[categorical].astype(str)
    df = df[(df.duration > 1) & (df.duration <=60)]
    return df

In [6]:
df_train = read_and_preprocess('../data/green_tripdata_2023-01.parquet')
df_valid = read_and_preprocess('../data/green_tripdata_2023-02.parquet')

In [7]:
len(df_train),len(df_valid)

(65924, 62547)

### Training pipeline

In [8]:
dv = DictVectorizer()

categorical = ['PULocationID','DOLocationID']
numerical = ['trip_distance']

train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_valid[categorical + numerical].to_dict(orient='records')
x_val = dv.transform(val_dict)

In [9]:

target = 'duration'
y_train = df_train[target].values
y_valid = df_valid[target].values

In [10]:

#training
Lr = LinearRegression()
Lr.fit(X_train,y_train)
#prediction
y_pred = Lr.predict(x_val)
#evaluation
mean_squared_error(y_valid,y_pred, squared=False)

7.352385182267624

##### dump model to pickle

In [None]:
with open('../models/lin_Reg.bin', 'wb') as f_out:
    pickle.dump((dv,Lr), f_out)

## with Lasso Reg

In [11]:
#With Lasso
with mlflow.start_run():
    
    mlflow.set_tag("developer","Merhawi")
    
    mlflow.log_param("train_data_path","../data/green_tripdata_2023-01.parquet")
    mlflow.log_param("valid_data_path","../data/green_tripdata_2023-02.parquet")
    
    alpha = 0.01
    mlflow.log_param("alpha",alpha)
    
    lr = Lasso(alpha)
    lr.fit(X_train,y_train)
    #prediction
    y_pred = lr.predict(x_val)
    #evaluation
    RMSE = mean_squared_error(y_valid,y_pred, squared=False)
    mlflow.log_metric("Rmse",RMSE)

In [12]:
#With Ridge
LR = Ridge()
LR.fit(X_train,y_train)
#prediction
y_pred = LR.predict(x_val)
#evaluation
mean_squared_error(y_valid,y_pred, squared=False)

7.382906622848506

In [20]:
import xgboost as xgb

from hyperopt import fmin,tpe,hp,STATUS_OK,Trials
from hyperopt.pyll import scope

In [21]:
train = xgb.DMatrix(X_train, label = y_train)
valid = xgb.DMatrix(x_val, label = y_valid)

In [28]:
def objective(params):
    
    with mlflow.start_run():
        
        mlflow.set_tag('model', 'xgboost')
        mlflow.log_params(params)
        booster, evaluation_history = xgb.train(
            params = params,
            dtrain = train,
            num_boost_round = 1000,
            evals = [(valid, "validation")],
            early_stopping_rounds = 50,
        ),
        
        # Print the type of the returned object
        print("Type of booster:", type(booster))
        
        y_pred = booster.predict(valid),

        rmse = mean_squared_error(y_val, y_pred, squared=False),
        mlflow.log_metric("rmse",rmse)
        
    return {"loss": rmse, "status": STATUS_OK}

In [29]:
search_space = {
    "max_depth": scope.int(hp.quniform('max_depth', 4, 100, 1)),
    "learning_rate": hp.loguniform("learning_rate", -3, 0), # this range means exp(-3),exp(0) -> [0.05,1.0]
    "reg_alpha": hp.loguniform("reg_alpha", -5, -1),
    "reg_lambda": hp.loguniform("reg_lambda", -6,-1),
    "min_child_weight": hp.loguniform("min_child_weight", -1,3),
    "objective": 'reg:linear',
    "seed":42,
    }

best_result = fmin(
    fn = objective,
    space = search_space,
    algo = tpe.suggest,
    max_evals = 50,
    trials = Trials()
)

  0%|                                                             | 0/50 [00:00<?, ?trial/s, best loss=?]




[0]	validation-rmse:7.02681                                                                              
[1]	validation-rmse:5.96116                                                                              
[2]	validation-rmse:5.52508                                                                              
[3]	validation-rmse:5.37705                                                                              
[4]	validation-rmse:5.34013                                                                              
[5]	validation-rmse:5.32870                                                                              
[6]	validation-rmse:5.33465                                                                              
[7]	validation-rmse:5.34122                                                                              
[8]	validation-rmse:5.34829                                                                              
[9]	validation-rmse:5.35679                   

job exception: not enough values to unpack (expected 2, got 1)



  0%|                                                             | 0/50 [00:07<?, ?trial/s, best loss=?]


ValueError: not enough values to unpack (expected 2, got 1)