In [1]:
import os
import pickle

import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
def read_dataframe(filename):
    """Load a taxi-trip file and prepare it for trip-duration modelling.

    Parameters
    ----------
    filename : str
        Path to a ``.csv`` or ``.parquet`` file containing at least the
        columns ``tpep_pickup_datetime``, ``tpep_dropoff_datetime``,
        ``PULocationID`` and ``DOLocationID``.

    Returns
    -------
    pandas.DataFrame
        A new frame with an added ``duration`` column (minutes), restricted
        to trips lasting between 1 and 60 minutes inclusive, and with the
        two location-id columns converted to strings.

    Raises
    ------
    ValueError
        If ``filename`` ends with neither ``.csv`` nor ``.parquet``.
    """
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)
        # CSV holds the timestamps as text, so parse them explicitly.
        df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])
        df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
    elif filename.endswith('.parquet'):
        # NOTE(review): assumes the parquet file already stores the two
        # timestamp columns with a datetime dtype -- confirm for new sources.
        df = pd.read_parquet(filename)
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on `df`; fail early with a clear message instead.
        raise ValueError(
            f"Unsupported file type for {filename!r}: expected '.csv' or '.parquet'"
        )

    # Vectorized timedelta -> minutes (replaces a row-by-row .apply).
    trip_length = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    df['duration'] = trip_length.dt.total_seconds() / 60

    # .copy() makes the filtered frame independent of the original, so the
    # column assignment below does not raise SettingWithCopyWarning.
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

    # Location ids are categories, not quantities: store them as strings.
    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [3]:
# Load the two monthly trip files through the shared loader; the later cells
# fit on the January frame and validate on the February frame.
jan_df, feb_df = map(
    read_dataframe,
    (
        'data/yellow_tripdata_2022-01.parquet',
        'data/yellow_tripdata_2022-02.parquet',
    ),
)

In [4]:
# Columns used as model inputs and the column being predicted.
categorical = ['PULocationID', 'DOLocationID']
target = 'duration'

# The vectorizer is fitted on the January records only and then re-used,
# unchanged, to transform the February records.
dv = DictVectorizer()
X_train = dv.fit_transform(jan_df[categorical].to_dict(orient='records'))
X_val = dv.transform(feb_df[categorical].to_dict(orient='records'))

# Targets as plain arrays, aligned row-for-row with the matrices above.
Y_train = jan_df[target].values
Y_val = feb_df[target].values

In [5]:
# Baseline model: linear regression fitted on the vectorized January features.
lr = LinearRegression()
lr.fit(X=X_train, y=Y_train)

In [8]:
# RMSE of the baseline on the data it was fitted on (training error).
Y_pred = lr.predict(X_train)

# Take the square root explicitly rather than passing `squared=False`:
# that keyword is deprecated in recent scikit-learn releases, while
# sqrt(MSE) gives the same value on every version.
np.sqrt(mean_squared_error(Y_train, Y_pred))

6.986191065500608

## Saving the model and preprocessor using MLflow

In [5]:
# Point MLflow at the local SQLite tracking store, then select the experiment
# that the runs below are recorded under.
mlflow.set_tracking_uri(uri='sqlite:///mlflow.db')
mlflow.set_experiment(experiment_name='nyc-taxi-experiment1')

2023/05/24 12:48:13 INFO mlflow.tracking.fluent: Experiment with name 'nyc-taxi-experiment1' does not exist. Creating a new experiment.


<Experiment: artifact_location='/home/moses/Documents/mlops-zoomcamp/week2/mlruns/1', creation_time=1684928893026, experiment_id='1', last_update_time=1684928893026, lifecycle_stage='active', name='nyc-taxi-experiment1', tags={}>

In [6]:
# Wrap the feature matrices and their targets in XGBoost's DMatrix container,
# which is what xgb.train consumes.
train = xgb.DMatrix(data=X_train, label=Y_train)
valid = xgb.DMatrix(data=X_val, label=Y_val)

In [12]:
# Train an XGBoost regressor inside an MLflow run and log its parameters,
# validation RMSE, the fitted DictVectorizer and the model itself.
with mlflow.start_run():
    params = {
        'learning_rate': 0.6620269443945731,
        'max_depth': 75,
        'min_child_weight': 0.6681121434468128,
        # 'reg:linear' is the deprecated alias of the squared-error
        # objective; use the current name for the same loss.
        'objective': 'reg:squarederror',
        'reg_alpha': 0.009835858863989126,
        'reg_lambda': 0.23315101108258487,
        'seed': 42,
    }

    mlflow.log_params(params)

    # Training stops early once the validation metric has not improved
    # for 5 consecutive rounds.
    booster = xgb.train(
        params=params,
        dtrain=train,
        num_boost_round=100,
        evals=[(valid, 'validation')],
        early_stopping_rounds=5,
    )

    y_pred = booster.predict(valid)
    # sqrt(MSE) rather than `squared=False`, which is deprecated in recent
    # scikit-learn releases; float() hands MLflow a plain Python number.
    rmse = float(np.sqrt(mean_squared_error(Y_val, y_pred)))
    mlflow.log_metric("Rmse", rmse)

    # The output directory may not exist on a fresh checkout.
    os.makedirs('model', exist_ok=True)
    with open('model/preprocessor.b', 'wb') as f_out:
        pickle.dump(dv, f_out)

    # The vectorizer is stored next to the model because new data has to go
    # through the same transformation before it can be scored.
    mlflow.log_artifact('model/preprocessor.b', artifact_path='preprocessor')

    mlflow.xgboost.log_model(booster, artifact_path='models_mlflow')



[0]	validation-rmse:9.16360
[1]	validation-rmse:7.05314
[2]	validation-rmse:6.46828
[3]	validation-rmse:5.99469
[4]	validation-rmse:5.87734
[5]	validation-rmse:5.81240
[6]	validation-rmse:5.77935
[7]	validation-rmse:5.52859
[8]	validation-rmse:5.52441
[9]	validation-rmse:5.51751
[10]	validation-rmse:5.50425
[11]	validation-rmse:5.47031
[12]	validation-rmse:5.46145
[13]	validation-rmse:5.45956
[14]	validation-rmse:5.45825
[15]	validation-rmse:5.45657
[16]	validation-rmse:5.43942
[17]	validation-rmse:5.43861
[18]	validation-rmse:5.43531
[19]	validation-rmse:5.43527
[20]	validation-rmse:5.43297
[21]	validation-rmse:5.43116
[22]	validation-rmse:5.43089
[23]	validation-rmse:5.42923
[24]	validation-rmse:5.42920
[25]	validation-rmse:5.42796
[26]	validation-rmse:5.42780
[27]	validation-rmse:5.42733
[28]	validation-rmse:5.42704
[29]	validation-rmse:5.42701
[30]	validation-rmse:5.42696
[31]	validation-rmse:5.42635
[32]	validation-rmse:5.42620
[33]	validation-rmse:5.42606
[34]	validation-rmse:5.4

## Saving the model as an artifact

In [24]:

# Train a short XGBoost run and store the pickled booster as a plain MLflow
# artifact (as opposed to logging it through the mlflow.xgboost flavor).
with mlflow.start_run():
    params = {
        'learning_rate': 0.6620269443945731,
        'max_depth': 75,
        'min_child_weight': 0.6681121434468128,
        # 'reg:linear' is the deprecated alias of the squared-error
        # objective; use the current name for the same loss.
        'objective': 'reg:squarederror',
        'reg_alpha': 0.009835858863989126,
        'reg_lambda': 0.23315101108258487,
        'seed': 42,
    }

    mlflow.log_params(params)

    booster = xgb.train(
        params=params,
        dtrain=train,
        num_boost_round=10,
        evals=[(valid, 'validation')],
        early_stopping_rounds=5,
    )

    y_pred = booster.predict(valid)
    # sqrt(MSE) rather than `squared=False`, which is deprecated in recent
    # scikit-learn releases; float() hands MLflow a plain Python number.
    rmse = float(np.sqrt(mean_squared_error(Y_val, y_pred)))
    mlflow.log_metric("Rmse", rmse)

    # The output directory may not exist on a fresh checkout.
    os.makedirs('model', exist_ok=True)
    # NOTE(review): the file is named 'lin_model.bin' but holds the XGBoost
    # booster; the path is kept as-is so existing consumers keep working.
    with open('model/lin_model.bin', 'wb') as f_out:
        pickle.dump(booster, f_out)

    mlflow.log_artifact('model/lin_model.bin', artifact_path='xgboost_model')


[0]	validation-rmse:9.16360
[1]	validation-rmse:7.05314
[2]	validation-rmse:6.46828
[3]	validation-rmse:5.99469
[4]	validation-rmse:5.87734
[5]	validation-rmse:5.81240
[6]	validation-rmse:5.77935
[7]	validation-rmse:5.52859
[8]	validation-rmse:5.52441
[9]	validation-rmse:5.51751
