# Set up the environment and import libraries

In [1]:
# Print the version of the `python` executable found on the shell PATH
!python --version

Python 3.9.19


In [2]:
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

#from sklearn.metrics import mean_squared_error
from sklearn.metrics import root_mean_squared_error

In [4]:
import pickle
from pathlib import Path

In [5]:
import mlflow

# Run metadata is stored in a SQLite database addressed by this URI
TRACKING_URI = "sqlite:///mlflow.db"
mlflow.set_tracking_uri(TRACKING_URI)

# MLflow will assign all the runs started below to this experiment
EXPERIMENT_NAME = "nyc-taxi-experiment"
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='/workspaces/mlops-zoomcamp-2024/02-experiment-tracking/mlruns/1', creation_time=1716042712622, experiment_id='1', last_update_time=1716042712622, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

# Read in data

In [6]:
# Green-taxi trip records in parquet format.
# January 2021 is used for training and February 2021 for validation.
path_to_data_green_jan_2021 = './data/green_tripdata_2021-01.parquet'
path_to_data_green_feb_2021 = './data/green_tripdata_2021-02.parquet'

In [7]:
# Load the January 2021 trip records into a DataFrame
df = pd.read_parquet(path_to_data_green_jan_2021)

In [8]:
# Preview the first two rows to see which columns are available
df.head(2)

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2021-01-01 00:15:56,2021-01-01 00:19:52,N,1.0,43,151,1.0,1.01,5.5,0.5,0.5,0.0,0.0,,0.3,6.8,2.0,1.0,0.0
1,2,2021-01-01 00:25:59,2021-01-01 00:34:44,N,1.0,166,239,1.0,2.53,10.0,0.5,0.5,2.81,0.0,,0.3,16.86,1.0,1.0,2.75


# Pre-process the data

In [9]:
# Parse the pickup and dropoff columns as datetimes
df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])
df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])

# Trip duration in minutes, computed with the vectorised .dt accessor
# rather than a Python lambda applied row by row
trip_time = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
df['duration'] = trip_time.dt.total_seconds() / 60

# Keep only trips that lasted between 1 and 60 minutes (inclusive).
# .copy() makes the filtered result an independent frame, so later column
# assignments on `df` do not raise SettingWithCopyWarning.
df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

# Feature Engineering

In [10]:
# Specify categorical and numerical variables
categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']

# Cast the location IDs to strings so they are handled as categories.
# Rebinding `df` to the frame returned by astype() avoids assigning columns
# into a filtered slice, which is what triggers SettingWithCopyWarning.
df = df.astype({column: str for column in categorical})

In [11]:
# Target values the model will learn to predict
target = 'duration'
y_train = df[target].values

# DictVectorizer consumes a list of dictionaries (one per row),
# so turn the selected feature columns into records first
feature_columns = categorical + numerical
train_dicts = df[feature_columns].to_dict(orient='records')

# Learn the feature mapping and build the training feature matrix in one step
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

# Fit Linear Regression model

In [12]:
# Train a plain linear regression; fit() returns the fitted estimator
lr = LinearRegression().fit(X_train, y_train)

# Predict on the same data the model was trained on
y_pred = lr.predict(X_train)

In [13]:
# Visualise predictions vs actual data
# sns.distplot(y_pred, label='prediction')
# sns.distplot(y_train, label='actual')

# plt.legend()
# plt.show();

# Evaluate the fitted model

In [14]:
# Calculate the error on the training set.
# root_mean_squared_error is used in place of the deprecated call:
# mean_squared_error(y_train, y_pred, squared=False)

root_mean_squared_error(y_train, y_pred)

# The training RMSE is ~9.8 minutes

9.827368941909368

# Automating the Pre-processing Step

In [15]:
def read_dataframe(filename):
    """Load a trip-record parquet file and prepare it for modelling.

    Parameters
    ----------
    filename : str
        Path of the parquet file; passed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        The trips that lasted between 1 and 60 minutes (inclusive), with an
        added ``duration`` column in minutes and the ``PULocationID`` /
        ``DOLocationID`` columns cast to strings.
    """
    df = pd.read_parquet(filename)

    # Parse the pickup and dropoff columns as datetimes
    df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])
    df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])

    # Trip duration in minutes, computed with the vectorised .dt accessor
    # rather than a Python lambda applied row by row
    trip_time = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = trip_time.dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the original frame so the column
    # assignment below does not raise SettingWithCopyWarning
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

    # Location IDs are categories, not quantities: store them as strings
    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

# Model Validation

In [16]:
# January 2021 becomes the training set, February 2021 the validation set
df_train, df_val = (
    read_dataframe(path)
    for path in (path_to_data_green_jan_2021, path_to_data_green_feb_2021)
)

In [17]:
# Add the same pickup/drop-off combination feature to both splits
for trips in (df_train, df_val):
    trips['PU_DO'] = trips['PULocationID'] + '_' + trips['DOLocationID']

In [18]:
# The combined pickup/drop-off feature is used here instead of
# ['PULocationID', 'DOLocationID']
categorical = ['PU_DO']
numerical = ['trip_distance']
feature_columns = categorical + numerical

train_dicts = df_train[feature_columns].to_dict(orient='records')
val_dicts = df_val[feature_columns].to_dict(orient='records')

dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

# Only transform() is called on the validation data: it must be encoded with
# the feature mapping learned from the training data, so that both matrices
# share the same columns. Calling fit again would learn a different mapping.
X_val = dv.transform(val_dicts)

In [19]:
# Both splits are scored against the same target column
target = 'duration'
y_train, y_val = (df_train[target].values, df_val[target].values)

In [20]:
# Train a linear regression on the training features
lr = LinearRegression().fit(X_train, y_train)

# Predict trip durations for the validation set
y_pred = lr.predict(X_val)

# Validation RMSE; root_mean_squared_error is used in place of the deprecated
# mean_squared_error(y_val, y_pred, squared=False)
root_mean_squared_error(y_val, y_pred)

7.758715203341164

In [21]:
# Save the fitted vectorizer together with the model, as a single tuple
model_path = Path('models/lin_reg.bin')

# open() raises FileNotFoundError when the target directory is missing,
# so make sure it exists before writing
model_path.parent.mkdir(parents=True, exist_ok=True)

# wb mode = write binary
with open(model_path, 'wb') as f_out:
    pickle.dump((dv, lr), f_out)

In [22]:
# Train a Lasso (L1-regularised) linear regression with alpha = 0.01
lr = Lasso(alpha=0.01).fit(X_train, y_train)

# Predict trip durations for the validation set
y_pred = lr.predict(X_val)

# Validation RMSE; root_mean_squared_error is used in place of the deprecated
# mean_squared_error(y_val, y_pred, squared=False)
root_mean_squared_error(y_val, y_pred)

11.167275941179728

In [23]:
# Let us now start using MLflow to track our experiments

with mlflow.start_run():

    # Start logging the information about this run with MLflow
    mlflow.set_tag("developer", "Timur Musin")

    # Log the same path variables that were used to load the data, so the
    # recorded paths cannot drift from the files that were actually read
    mlflow.log_param("train-data-path", path_to_data_green_jan_2021)
    mlflow.log_param("valid-data-path", path_to_data_green_feb_2021)

    # `alpha` is the single source of truth for the regularisation strength:
    # the value logged to MLflow is the value the model is fitted with
    alpha = 0.1
    mlflow.log_param("alpha", alpha)

    # Fit the Lasso Linear Regression model
    lr = Lasso(alpha=alpha)
    lr.fit(X_train, y_train)

    # Create predictions on the validation set and log the resulting error
    y_pred = lr.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)


# Training and Optimising an xgboost with MLflow

In [24]:
# See 02-experiment-tracking-with-mlflow.md for more details

import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [25]:
# Wrap the training and validation data in xgboost's DMatrix format

train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)

In [26]:
# Define the objective function that we are going to optimise
def objective(params):
    """Train one xgboost model with ``params`` and report its validation RMSE.

    Every call opens its own MLflow run, tags it with ``model=xgboost`` and
    logs both the parameters and the resulting ``rmse`` metric.

    params: dictionary of xgboost training parameters

    Returns a dictionary with the RMSE under ``'loss'`` and ``STATUS_OK``
    under ``'status'``.
    """

    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)

        model = xgb.train(
            params=params,
            dtrain=train,
            # Upper bound on the number of boosting rounds
            num_boost_round=1000,
            # The validation set is evaluated after every round
            evals=[(valid, 'validation')],
            # Stop once the validation error has not improved for 50 rounds
            early_stopping_rounds=50,
        )

        validation_rmse = root_mean_squared_error(y_val, model.predict(valid))
        mlflow.log_metric("rmse", validation_rmse)

    return {'loss': validation_rmse, 'status': STATUS_OK}

In [27]:
# Define the search space
# Search space is the ranges within which we are going to search for the optimal parameters
# Check the documentation for hyperopt for more details

search_space = {
    # quniform samples floats in steps of 1; scope.int converts them to integers
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0), # values between exp(-3) and exp(0) - [0.05, 1]
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    # 'reg:linear' is a deprecated alias of 'reg:squarederror'
    'objective': 'reg:squarederror',
    'seed': 42
}

In [28]:
# Once the objective function and search space have been defined, we pass this information to the fmin function

# best_result = fmin(
#     fn=objective,
#     space=search_space,
#     algo=tpe.suggest,
#     max_evals=50,
#     trials=Trials()
# )

In [27]:
# The lowest rmse is obtained with the following set of parameters:
# learning_rate 0.055706076730278956
# max_depth 51
# min_child_weight 1.6187547995227178
# objective reg:linear
# reg_alpha 0.32274544719931225
# reg_lambda 0.04790325639568342
# seed 42

In [28]:
# Params of the best model.
# Numeric hyperparameters are written as numbers rather than strings, and the
# deprecated 'reg:linear' alias is replaced by 'reg:squarederror'.

params = {
    'learning_rate': 0.055706076730278956,
    'max_depth': 51,
    'min_child_weight': 1.6187547995227178,
    'objective': 'reg:squarederror',
    'reg_alpha': 0.32274544719931225,
    'reg_lambda': 0.04790325639568342,
    'seed': 42,
}

# We want to train the model once again with these params.
# We also want to save the model and log the training results with MLflow.
# To do this, we can either use the logging process as above,
# or with some frameworks (e.g., scikit-learn, tensorflow and keras, etc.) we can use Autologging
# (check the documentation for a full list of frameworks that support autologging)

mlflow.xgboost.autolog()

booster = xgb.train(
    params=params, # Parameters used to train the xgboost model
    dtrain=train, # Training data
    num_boost_round=1000, # Upper bound on the number of boosting rounds
    evals=[(valid, 'validation')], # The validation set is evaluated after every boosting round
    early_stopping_rounds=50 # Stop once the validation error has not improved for 50 rounds
)

2024/05/18 19:54:43 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'afff25af3f044ed2baec05f930a563c5', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


[0]	validation-rmse:11.75905
[1]	validation-rmse:11.33769
[2]	validation-rmse:10.94704
[3]	validation-rmse:10.58454
[4]	validation-rmse:10.25026
[5]	validation-rmse:9.94004
[6]	validation-rmse:9.65475
[7]	validation-rmse:9.39031
[8]	validation-rmse:9.14682
[9]	validation-rmse:8.92313
[10]	validation-rmse:8.71716
[11]	validation-rmse:8.52882
[12]	validation-rmse:8.35583
[13]	validation-rmse:8.19850
[14]	validation-rmse:8.05287
[15]	validation-rmse:7.92051
[16]	validation-rmse:7.79938
[17]	validation-rmse:7.68868
[18]	validation-rmse:7.58764
[19]	validation-rmse:7.49653
[20]	validation-rmse:7.41266
[21]	validation-rmse:7.33450
[22]	validation-rmse:7.26341
[23]	validation-rmse:7.19916
[24]	validation-rmse:7.13992
[25]	validation-rmse:7.08688
[26]	validation-rmse:7.03710
[27]	validation-rmse:6.99287
[28]	validation-rmse:6.95160
[29]	validation-rmse:6.91478
[30]	validation-rmse:6.88000
[31]	validation-rmse:6.84913
[32]	validation-rmse:6.82114
[33]	validation-rmse:6.79443
[34]	validation-rms

