In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression

In [3]:
import pickle
import xgboost as xgb

from hyperopt import fmin,tpe,hp,STATUS_OK,Trials
from hyperopt.pyll import scope

In [4]:
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [5]:
import mlflow

# Run metadata goes to the SQLite database named in the URI; all runs in this
# notebook are grouped under a single experiment.
TRACKING_URI = "sqlite:///mlflow.db"
EXPERIMENT_NAME = "nyc_taxi_duration_first_exp"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='/workspaces/Arrival_time_estimation/mlflow/mlruns/1', creation_time=1724119947795, experiment_id='1', last_update_time=1724119947795, lifecycle_stage='active', name='nyc_taxi_duration_first_exp', tags={}>

### data summary
- There are 68,211 total records; after keeping only trips with a duration > 1 and <= 60 minutes, 65,924 records remain, which is about 96.6% of the data.

In [6]:
def read_and_preprocess(filename):
    """Load a green-taxi trip parquet file and derive the modelling columns.

    Parameters
    ----------
    filename : str or path-like
        Location of the parquet file; passed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        Only the trips whose ``duration`` lies in the interval (1, 60] minutes,
        with these columns added or converted:

        * ``duration`` - drop-off minus pick-up time, in minutes (float).
        * ``PU_DO`` - ``"<PULocationID>_<DOLocationID>"`` as a string.
        * ``PULocationID`` / ``DOLocationID`` - cast to ``str``.
    """
    df = pd.read_parquet(filename)

    df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])
    df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])

    # Vectorised conversion to minutes. Unlike a row-wise ``apply`` this also
    # produces a float column for an empty frame, so the numeric filter below
    # keeps working when a file contains no rows.
    trip_time = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = trip_time.dt.total_seconds() / 60

    df['PU_DO'] = df['PULocationID'].astype(str) + '_' + df['DOLocationID'].astype(str)

    # Location IDs are treated as categories, hence the cast to string.
    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    in_range = (df['duration'] > 1) & (df['duration'] <= 60)
    return df[in_range]

In [7]:
# January 2023 is the training month, February 2023 the validation month.
train_path = '../data/green_tripdata_2023-01.parquet'
valid_path = '../data/green_tripdata_2023-02.parquet'

df_train, df_valid = (read_and_preprocess(path) for path in (train_path, valid_path))

In [8]:
# Row counts of the filtered training and validation frames.
df_train.shape[0], df_valid.shape[0]

(65924, 62547)

### Training pipeline

In [9]:
# Model inputs: the two location IDs (cast to strings during preprocessing)
# plus the numeric trip distance.
categorical = ['PULocationID','DOLocationID']
numerical = ['trip_distance']
feature_columns = categorical + numerical

dv = DictVectorizer()

# The vectorizer is fitted on the training records only ...
train_dict = df_train[feature_columns].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

# ... and the fitted mapping is reused unchanged for the validation records.
val_dict = df_valid[feature_columns].to_dict(orient='records')
x_val = dv.transform(val_dict)

In [10]:

# Regression target (trip duration in minutes) for both splits.
target = 'duration'
y_train, y_valid = (frame[target].values for frame in (df_train, df_valid))

In [11]:

# Baseline: ordinary least squares on the vectorized features.
Lr = LinearRegression()
Lr.fit(X_train, y_train)

# Predict the validation month.
y_pred = Lr.predict(x_val)

# RMSE computed as sqrt(MSE). The `squared=False` shortcut of
# mean_squared_error is deprecated and rejected by recent scikit-learn
# releases, while this form works on every version.
np.sqrt(mean_squared_error(y_valid, y_pred))

7.352385182267624

##### dump model to pickle

In [None]:
# Store the fitted vectorizer and the linear model together as one tuple so
# they can be restored as a pair.
fitted_pair = (dv, Lr)
with open('../models/lin_Reg.bin', 'wb') as model_file:
    pickle.dump(fitted_pair, model_file)

## with Lasso Reg

In [14]:
# Lasso regression, tracked as a single MLflow run.
with mlflow.start_run():

    mlflow.set_tag("developer","Merhawi")

    mlflow.log_param("train_data_path","../data/green_tripdata_2023-01.parquet")
    mlflow.log_param("valid_data_path","../data/green_tripdata_2023-02.parquet")

    alpha = 0.01
    mlflow.log_param("alpha",alpha)

    lr = Lasso(alpha=alpha)
    lr.fit(X_train,y_train)

    # Predict the validation month.
    y_pred = lr.predict(x_val)

    # RMSE as sqrt(MSE); `squared=False` is deprecated/removed in recent
    # scikit-learn releases.
    RMSE = np.sqrt(mean_squared_error(y_valid, y_pred))
    mlflow.log_metric("Rmse",RMSE)

    # Attach the model trained in THIS run. The previous version uploaded
    # ../models/lin_Reg.bin, which contains the LinearRegression model, so the
    # logged artifact did not match the logged alpha/RMSE.
    lasso_path = "../models/lasso_reg.bin"
    with open(lasso_path, "wb") as f_out:
        pickle.dump((dv, lr), f_out)

    mlflow.log_artifact(local_path=lasso_path, artifact_path="models_pickle")

In [17]:
# Ridge regression with scikit-learn's default regularisation strength.
LR = Ridge()
LR.fit(X_train, y_train)

# Predict the validation month.
y_pred = LR.predict(x_val)

# RMSE as sqrt(MSE); `squared=False` is deprecated/removed in recent
# scikit-learn releases.
np.sqrt(mean_squared_error(y_valid, y_pred))

7.382906622848506

In [12]:
# Wrap each split's feature matrix and target in an XGBoost DMatrix.
train, valid = (
    xgb.DMatrix(features, label=labels)
    for features, labels in ((X_train, y_train), (x_val, y_valid))
)

### error hint
- Simply adding stray commas between lines can cause an error containing "got tuple".
- Carefully inspect the lines

In [19]:

def objective(params):
    """Train one XGBoost model for a hyperopt trial and report its validation RMSE.

    Every call opens its own MLflow run, tags it ``model=xgboost`` and logs the
    trial's parameters together with the resulting ``rmse`` metric. Training
    and evaluation use the notebook-level ``train`` / ``valid`` DMatrix objects
    and the ``y_valid`` array.

    Parameters
    ----------
    params : dict
        XGBoost training parameters for this trial.

    Returns
    -------
    dict
        ``{"loss": rmse, "status": STATUS_OK}``.
    """
    with mlflow.start_run():

        mlflow.set_tag('model', 'xgboost')
        mlflow.log_params(params)

        # Up to 1000 boosting rounds, stopping once the validation score has
        # not improved for 50 consecutive rounds.
        booster = xgb.train(
            params = params,
            dtrain = train,
            num_boost_round = 1000,
            evals = [(valid, "validation")],
            early_stopping_rounds = 50
        )

        y_pred = booster.predict(valid)

        # RMSE as sqrt(MSE); `squared=False` is deprecated/removed in recent
        # scikit-learn releases.
        rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
        mlflow.log_metric("rmse",rmse)

    return {"loss": rmse, "status": STATUS_OK}

In [None]:
# Hyperparameter ranges sampled by hyperopt. The loguniform bounds are given in
# log space, e.g. learning_rate is drawn from [exp(-3), exp(0)] ~ [0.05, 1.0].
search_space = {
    "max_depth": scope.int(hp.quniform('max_depth', 4, 100, 1)),
    "learning_rate": hp.loguniform("learning_rate", -3, 0),
    "reg_alpha": hp.loguniform("reg_alpha", -5, -1),
    "reg_lambda": hp.loguniform("reg_lambda", -6,-1),
    "min_child_weight": hp.loguniform("min_child_weight", -1,3),
    # 'reg:linear' is a deprecated alias; 'reg:squarederror' is the current
    # name for the same squared-error regression objective.
    "objective": 'reg:squarederror',
    "seed":42
    }

# Minimise the validation RMSE returned by `objective` with the TPE algorithm.
best_result = fmin(
    fn = objective,
    space = search_space,
    algo = tpe.suggest,
    max_evals = 5,
    trials = Trials()
)

In [None]:
# Hard-coded hyperparameters used to retrain XGBoost with autologging.
params = {
    "learning_rate": 0.2787670044958393,
    "max_depth":6,
    "min_child_weight": 6.387869017513505,
    # 'reg:linear' is a deprecated alias of the same squared-error objective.
    "objective" :'reg:squarederror',
    "reg_alpha" : 0.25974368153512745,
    "reg_lambda": 0.06449934239324491,
    "seed": 42
}

mlflow.xgboost.autolog()

# Train inside an explicit run. Without it, autologging opens and closes its
# own run around xgb.train, and the later mlflow.log_metric call implicitly
# starts a second run that is never ended - so the RMSE lands in a different
# run and the next `mlflow.start_run()` in the notebook fails.
with mlflow.start_run():
    booster = xgb.train(
        params = params,
        dtrain = train,
        num_boost_round = 1000,
        evals = [(valid, "validation")],
        early_stopping_rounds = 50
    )
    y_pred = booster.predict(valid)

    # RMSE as sqrt(MSE); `squared=False` is deprecated/removed in recent
    # scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
    mlflow.log_metric("rmse",rmse)

In [25]:
import os

# The next run logs its parameters, metric and model by hand, so XGBoost
# autologging is switched off again.
mlflow.xgboost.autolog(disable=True)

# Local folder that receives the pickled preprocessor.
os.makedirs("models", exist_ok=True)

In [None]:
# Final XGBoost run: parameters, metric, preprocessor and model are all logged
# explicitly.
with mlflow.start_run():
    best_params = {
        "learning_rate": 0.2787670044958393,
        "max_depth":6,
        "min_child_weight": 6.387869017513505,
        # 'reg:linear' is a deprecated alias of the same squared-error objective.
        "objective" :'reg:squarederror',
        "reg_alpha" : 0.25974368153512745,
        "reg_lambda": 0.06449934239324491,
        "seed": 42
    }

    mlflow.log_params(best_params)

    booster = xgb.train(
        params = best_params,
        dtrain = train,
        num_boost_round = 1000,
        evals = [(valid, "validation")],
        early_stopping_rounds = 50
    )
    y_pred = booster.predict(valid)

    # RMSE as sqrt(MSE); `squared=False` is deprecated/removed in recent
    # scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
    mlflow.log_metric("rmse",rmse)

    # The fitted DictVectorizer is stored next to the model so that raw
    # records can be transformed the same way at prediction time.
    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact("models/preprocessor.b", artifact_path= "preprocessor")

    mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")

In [32]:
# Validate the logged model before deployment by loading it back from its run.
run_id = '5cde0dbba36e4d55b9d72b2f88c561c2'
logged_model = f'runs:/{run_id}/models_mlflow'

# First as a generic python-function (PyFunc) model.
loaded_model = mlflow.pyfunc.load_model(logged_model)

In [28]:
# Display the summary of the loaded PyFunc model.
loaded_model

mlflow.pyfunc.loaded_model:
  artifact_path: models_mlflow
  flavor: mlflow.xgboost
  run_id: 5cde0dbba36e4d55b9d72b2f88c561c2

In [31]:
# Load the same logged model again, this time as a native XGBoost object.
xgboost_model = mlflow.xgboost.load_model(model_uri=logged_model)

In [30]:
# Display the object returned by mlflow.xgboost.load_model.
xgboost_model

<xgboost.core.Booster at 0x793aaeae6f10>

In [33]:
# Score the validation DMatrix with the reloaded booster.
y_pre_xgb = xgboost_model.predict(data=valid)

In [34]:
# Peek at the first ten predictions from the reloaded model.
y_pre_xgb[:10]

array([24.479042, 22.26141 , 29.449394, 18.56988 , 13.519386, 12.868946,
       10.607654,  8.320804, 13.055486,  8.200915], dtype=float32)

## with Gradient boosting regressor

In [1]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

In [None]:
# Only arguments that GradientBoostingRegressor actually accepts are kept.
# "lambda" is not a constructor argument (it raised a TypeError), and "alpha"
# is the quantile used by the huber/quantile losses rather than an L1 penalty,
# so the XGBoost regularisation values that had been copied here are dropped.
gradient_b_params = {
    "learning_rate": 0.2787670044958393,
    "max_depth": 6,
    "min_samples_split": 6,
    "random_state": 42
}

mlflow.sklearn.autolog()

with mlflow.start_run():
    params = gradient_b_params
    model_type = 'gradient_boosting'

    mlflow.set_tag('model', model_type)
    mlflow.log_params(params)

    # Train and evaluate on the matrices built earlier in the notebook
    # (X_train / y_train / x_val); the previous names train_X, train_y and
    # valid_X were never defined.
    model = GradientBoostingRegressor(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(x_val)

    # RMSE as sqrt(MSE); `squared=False` is deprecated/removed in recent
    # scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
    mlflow.log_metric("rmse", rmse)

# A bare `return` is a SyntaxError outside a function; display the score instead.
rmse

#     with open("models/preprocessor.b", "wb")as f_out:
#         pickle.dump(dv, f_out)

#     mlflow.log_artifact("models/preprocessor.b", artifact_path= "preprocessor")

#     mlflow.sklearn.log_model(booster, artifact_path="models_mlflow")
