In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression

In [2]:
import pickle

In [3]:
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [4]:
import mlflow

# Tracking backend (a SQLite file referenced by a relative URI) and the
# experiment selected for the runs started later in this notebook.
tracking_uri = "sqlite:///mlflow.db"
experiment_name = "nyc_taxi_duration_first_exp"

mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='/workspaces/Arrival_time_estimation/mlflow/mlruns/1', creation_time=1724119947795, experiment_id='1', last_update_time=1724119947795, lifecycle_stage='active', name='nyc_taxi_duration_first_exp', tags={}>

### data summary
- The January 2023 file contains 68,211 records in total. After keeping only trips with a duration greater than 1 minute and at most 60 minutes, 65,924 records remain, which is roughly 96.6% of the data.

In [5]:
def read_and_preprocess(filename, min_duration=1, max_duration=60):
    """Read a trip-record parquet file and prepare it for duration modelling.

    The frame is expected to contain the columns ``lpep_pickup_datetime``,
    ``lpep_dropoff_datetime``, ``PULocationID`` and ``DOLocationID``.

    Processing steps:
      1. Parse the pickup / drop-off columns as datetimes.
      2. Add ``duration``: drop-off minus pickup, expressed in minutes.
      3. Add ``PU_DO``: the two location IDs joined as ``"<PU>_<DO>"``.
      4. Cast ``PULocationID`` and ``DOLocationID`` to strings so they can be
         used as categorical features.
      5. Keep only rows with ``min_duration < duration <= max_duration``.

    Parameters
    ----------
    filename : str or path-like
        Location of the parquet file, passed straight to ``pd.read_parquet``.
    min_duration : float, default 1
        Exclusive lower bound on the trip duration, in minutes.
    max_duration : float, default 60
        Inclusive upper bound on the trip duration, in minutes.

    Returns
    -------
    pandas.DataFrame
        The filtered trips, including the derived ``duration`` and ``PU_DO``
        columns.
    """
    df = pd.read_parquet(filename)

    df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])
    df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])

    # Vectorised replacement for a row-wise ``apply(td.total_seconds() / 60)``.
    elapsed = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = elapsed.dt.total_seconds() / 60

    df['PU_DO'] = df['PULocationID'].astype(str) + '_' + df['DOLocationID'].astype(str)

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    in_range = (df['duration'] > min_duration) & (df['duration'] <= max_duration)
    # Return an independent copy so that callers can add or modify columns
    # without pandas warning about writing to a slice of the unfiltered frame.
    return df.loc[in_range].copy()

In [6]:
# January 2023 is used for fitting, February 2023 is held out for validation.
train_path = '../data/green_tripdata_2023-01.parquet'
valid_path = '../data/green_tripdata_2023-02.parquet'

df_train = read_and_preprocess(train_path)
df_valid = read_and_preprocess(valid_path)

In [7]:
# Number of rows left in each split after the duration filter.
df_train.shape[0], df_valid.shape[0]

(65924, 62547)

### Training pipeline

In [8]:
# Columns fed to the model: the two location IDs plus the trip distance.
categorical = ['PULocationID','DOLocationID']
numerical = ['trip_distance']
feature_columns = categorical + numerical

dv = DictVectorizer()

# The vectorizer is fitted on the training records only; the validation
# records are transformed with that already-fitted vectorizer.
train_dict = df_train[feature_columns].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_valid[feature_columns].to_dict(orient='records')
x_val = dv.transform(val_dict)

In [9]:

# Regression target: the trip duration column, as plain arrays per split.
target = 'duration'
y_train, y_valid = (frame[target].values for frame in (df_train, df_valid))

In [10]:

# Baseline: plain linear regression on the vectorised features.
Lr = LinearRegression()
Lr.fit(X_train, y_train)

# Predict durations for the validation split.
y_pred = Lr.predict(x_val)

# Validation RMSE. Taken as sqrt(MSE) because the ``squared=False`` keyword of
# ``mean_squared_error`` is deprecated in scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(y_valid, y_pred))

7.352385182267624

##### dump model to pickle

In [None]:
# Store the fitted vectorizer and the linear model together as one tuple, so
# both can be restored from a single file.
model_path = '../models/lin_Reg.bin'
with open(model_path, 'wb') as f_out:
    pickle.dump((dv, Lr), f_out)

## with Lasso Reg

In [11]:
# Lasso regression, tracked as a single MLflow run.
with mlflow.start_run():

    mlflow.set_tag("developer", "Merhawi")

    # Record which files the model was fitted and evaluated on.
    mlflow.log_param("train_data_path", "../data/green_tripdata_2023-01.parquet")
    mlflow.log_param("valid_data_path", "../data/green_tripdata_2023-02.parquet")

    alpha = 0.01
    mlflow.log_param("alpha", alpha)

    lr = Lasso(alpha=alpha)
    lr.fit(X_train, y_train)

    # Predict durations for the validation split.
    y_pred = lr.predict(x_val)

    # Validation RMSE. Taken as sqrt(MSE) because the ``squared=False`` keyword
    # of ``mean_squared_error`` is deprecated in scikit-learn 1.4 and removed
    # in 1.6.
    RMSE = np.sqrt(mean_squared_error(y_valid, y_pred))

    # Logged under the lowercase key "rmse" (previously "Rmse") so that this
    # run shares its metric name with the XGBoost runs of this notebook.
    mlflow.log_metric("rmse", RMSE)

In [12]:
# Ridge regression with the estimator's default settings.
LR = Ridge()
LR.fit(X_train, y_train)

# Predict durations for the validation split.
y_pred = LR.predict(x_val)

# Validation RMSE. Taken as sqrt(MSE) because the ``squared=False`` keyword of
# ``mean_squared_error`` is deprecated in scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(y_valid, y_pred))

7.382906622848506

In [17]:
import xgboost as xgb

from hyperopt import fmin,tpe,hp,STATUS_OK,Trials
from hyperopt.pyll import scope

In [18]:
# Wrap the feature matrices and their targets in XGBoost's DMatrix container,
# which is what ``xgb.train`` takes as ``dtrain`` and inside ``evals``.
train = xgb.DMatrix(data=X_train, label=y_train)
valid = xgb.DMatrix(data=x_val, label=y_valid)

### error hint
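- Accidentally adding a comma (for example at the end of a line) turns the value into a tuple and can lead to an error containing "got tuple".
- If that error shows up, inspect the lines carefully for stray commas.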

In [19]:

def objective(params):
    """Train one XGBoost model inside an MLflow run and report its RMSE.

    Used as the objective function handed to hyperopt's ``fmin``.

    Parameters
    ----------
    params : dict
        XGBoost training parameters for this trial. They are logged to MLflow
        and passed unchanged to ``xgb.train``.

    Returns
    -------
    dict
        ``{"loss": <validation RMSE>, "status": STATUS_OK}``.
    """
    with mlflow.start_run():

        mlflow.set_tag('model', 'xgboost')
        mlflow.log_params(params)

        # Up to 1000 boosting rounds, stopping once the validation metric has
        # not improved for 50 consecutive rounds.
        booster = xgb.train(
            params = params,
            dtrain = train,
            num_boost_round = 1000,
            evals = [(valid, "validation")],
            early_stopping_rounds = 50
        )

        y_pred = booster.predict(valid)

        # sqrt(MSE) rather than ``squared=False``: that keyword is deprecated
        # in scikit-learn 1.4 and removed in 1.6.
        rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
        mlflow.log_metric("rmse", rmse)

    return {"loss": rmse, "status": STATUS_OK}

In [None]:
# Hyperparameter search space. ``hp.loguniform(label, low, high)`` samples
# values between exp(low) and exp(high).
search_space = {
    "max_depth": scope.int(hp.quniform('max_depth', 4, 100, 1)),
    "learning_rate": hp.loguniform("learning_rate", -3, 0),       # exp(-3)..exp(0) -> about [0.05, 1.0]
    "reg_alpha": hp.loguniform("reg_alpha", -5, -1),              # about [0.0067, 0.37]
    "reg_lambda": hp.loguniform("reg_lambda", -6, -1),            # about [0.0025, 0.37]
    "min_child_weight": hp.loguniform("min_child_weight", -1, 3), # about [0.37, 20.1]
    # 'reg:linear' is a deprecated alias; 'reg:squarederror' is the current
    # name of the same squared-error regression objective.
    "objective": 'reg:squarederror',
    "seed": 42
    }

# Minimise the validation RMSE returned by ``objective`` over 5 trials.
best_result = fmin(
    fn = objective,
    space = search_space,
    algo = tpe.suggest,
    max_evals = 5,
    trials = Trials()
)

In [22]:
# Fixed hyperparameters for the final XGBoost model.
params = {
    "learning_rate": 0.2787670044958393,
    "max_depth": 6,
    "min_child_weight": 6.387869017513505,
    # 'reg:linear' is a deprecated alias; 'reg:squarederror' is the current
    # name of the same squared-error regression objective.
    "objective": 'reg:squarederror',
    "reg_alpha": 0.25974368153512745,
    "reg_lambda": 0.06449934239324491,
    "seed": 42
}

# Enable MLflow's XGBoost autologging before training so that the run below is
# recorded without explicit log_* calls.
mlflow.xgboost.autolog()

# Up to 1000 boosting rounds, stopping once the validation metric has not
# improved for 50 consecutive rounds.
booster = xgb.train(
    params = params,
    dtrain = train,
    num_boost_round = 1000,
    evals = [(valid, "validation")],
    early_stopping_rounds = 50
)

2024/08/21 02:24:54 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '93d2948f6bd6490abe0c2c21472ef695', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


[0]	validation-rmse:7.76462
[1]	validation-rmse:6.79370
[2]	validation-rmse:6.21753
[3]	validation-rmse:5.87489
[4]	validation-rmse:5.66942
[5]	validation-rmse:5.55009
[6]	validation-rmse:5.47538
[7]	validation-rmse:5.42258
[8]	validation-rmse:5.39095
[9]	validation-rmse:5.36653
[10]	validation-rmse:5.34390
[11]	validation-rmse:5.32953
[12]	validation-rmse:5.31951
[13]	validation-rmse:5.30785
[14]	validation-rmse:5.29930




[15]	validation-rmse:5.28284
[16]	validation-rmse:5.27629
[17]	validation-rmse:5.26893
[18]	validation-rmse:5.26123
[19]	validation-rmse:5.25664
[20]	validation-rmse:5.24870
[21]	validation-rmse:5.24493
[22]	validation-rmse:5.24065
[23]	validation-rmse:5.23583
[24]	validation-rmse:5.22627
[25]	validation-rmse:5.22208
[26]	validation-rmse:5.21554
[27]	validation-rmse:5.20436
[28]	validation-rmse:5.20077
[29]	validation-rmse:5.19770
[30]	validation-rmse:5.19448
[31]	validation-rmse:5.18821
[32]	validation-rmse:5.18399
[33]	validation-rmse:5.17918
[34]	validation-rmse:5.17713
[35]	validation-rmse:5.17544
[36]	validation-rmse:5.17274
[37]	validation-rmse:5.16828
[38]	validation-rmse:5.16460
[39]	validation-rmse:5.15823
[40]	validation-rmse:5.15547
[41]	validation-rmse:5.15187
[42]	validation-rmse:5.14769
[43]	validation-rmse:5.14179
[44]	validation-rmse:5.13933
[45]	validation-rmse:5.13816
[46]	validation-rmse:5.13609
[47]	validation-rmse:5.13146
[48]	validation-rmse:5.13098
[49]	validatio

