In [1]:
import pickle

import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

In [2]:
## Number 1: report the installed MLflow version

print('The mlflow version is:', mlflow.__version__)

The mlflow version is: 2.3.2


In [2]:
def read_dataframe(filename):
    """Load a yellow-taxi trip file and prepare it for duration modelling.

    Parameters
    ----------
    filename : str
        Path to a ``.csv`` or ``.parquet`` file that contains the
        ``tpep_pickup_datetime``, ``tpep_dropoff_datetime``,
        ``PULocationID`` and ``DOLocationID`` columns.

    Returns
    -------
    pandas.DataFrame
        Only the trips lasting between 1 and 60 minutes (inclusive), with an
        added ``duration`` column (minutes) and both location-ID columns
        cast to ``str``.

    Raises
    ------
    ValueError
        If ``filename`` ends with neither ``.csv`` nor ``.parquet``.
    """
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)

        # CSV carries the timestamps as text, so parse them explicitly.
        df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])
        df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
    elif filename.endswith('.parquet'):
        df = pd.read_parquet(filename)
    else:
        # Without this branch `df` would be unbound and the next statement
        # would fail with a confusing UnboundLocalError.
        raise ValueError(
            f"Unsupported file type for {filename!r}: expected '.csv' or '.parquet'"
        )

    # Trip length in minutes, computed vectorized instead of row by row.
    trip_length = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    df['duration'] = trip_length.dt.total_seconds() / 60

    # Copy the filtered rows so the column assignment below writes to an
    # independent frame (no SettingWithCopyWarning, no ambiguous view write).
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [3]:
# Load two consecutive months of trips; downstream cells fit on January
# and validate on February.
jan_df = read_dataframe('data/yellow_tripdata_2022-01.parquet')
feb_df = read_dataframe('data/yellow_tripdata_2022-02.parquet')

In [5]:
# Peek at the first rows of the prepared January frame.
jan_df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,duration
0,1,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.8,1.0,N,142,236,1,14.5,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0,17.816667
1,1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.1,1.0,N,236,42,1,8.0,0.5,0.5,4.0,0.0,0.3,13.3,0.0,0.0,8.4
2,2,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,1.0,N,166,166,1,7.5,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0,8.966667
3,2,2022-01-01 00:25:21,2022-01-01 00:35:23,1.0,1.09,1.0,N,114,68,2,8.0,0.5,0.5,0.0,0.0,0.3,11.8,2.5,0.0,10.033333
4,2,2022-01-01 00:36:48,2022-01-01 01:14:20,1.0,4.3,1.0,N,68,163,1,23.5,0.5,0.5,3.0,0.0,0.3,30.3,2.5,0.0,37.533333


In [4]:
# Shared vectorizer: `convert(..., fit=True)` fits it, every other `convert`
# call reuses the fitted vocabulary through `transform`.
dv = DictVectorizer()

def convert(jan_df, fit=False):
    """Turn a trip frame into a feature matrix and a target vector.

    The pickup and drop-off location IDs are converted to per-row dicts and
    encoded with the module-level ``dv`` vectorizer.

    Parameters
    ----------
    jan_df : pandas.DataFrame
        Frame holding ``PULocationID``, ``DOLocationID`` and ``duration``.
        Despite the name, any frame with those columns is accepted.
    fit : bool, default False
        When true, ``dv`` is fitted on this frame before transforming it;
        otherwise the already-fitted ``dv`` is only applied.

    Returns
    -------
    tuple
        ``(features, labels)``: the vectorizer output and the ``duration``
        values.
    """
    feature_columns = ['PULocationID', 'DOLocationID']
    records = jan_df[feature_columns].to_dict(orient='records')

    # Pick the vectorizer method once, then apply it to the records.
    vectorize = dv.fit_transform if fit else dv.transform
    features = vectorize(records)

    labels = jan_df['duration'].values

    return features, labels

In [5]:
X_train, Y_train = convert(jan_df, fit=True)
X_val, Y_val = convert(feb_df)

In [6]:
# Point MLflow at a SQLite tracking store (`backend.db`, relative path).
mlflow.set_tracking_uri('sqlite:///backend.db')

In [23]:
# List the experiments currently registered in the tracking store.
mlflow.search_experiments()

[<Experiment: artifact_location='/home/moses/Documents/mlops-zoomcamp/week2/mlruns/1', creation_time=1685128064165, experiment_id='1', last_update_time=1685128064165, lifecycle_stage='active', name='nyc-taxi-experiment4', tags={}>,
 <Experiment: artifact_location='/home/moses/Documents/mlops-zoomcamp/week2/model/0', creation_time=1685126439645, experiment_id='0', last_update_time=1685126439645, lifecycle_stage='active', name='Default', tags={}>]

In [18]:
# Make this the active experiment for the runs below; MLflow creates it
# when no experiment with that name exists yet.
mlflow.set_experiment('nyc-taxi-experiment4')

2023/05/26 20:07:44 INFO mlflow.tracking.fluent: Experiment with name 'nyc-taxi-experiment4' does not exist. Creating a new experiment.


<Experiment: artifact_location='/home/moses/Documents/mlops-zoomcamp/week2/mlruns/1', creation_time=1685128064165, experiment_id='1', last_update_time=1685128064165, lifecycle_stage='active', name='nyc-taxi-experiment4', tags={}>

In [7]:
# Optional run tags, currently disabled.
# mlflow.set_tag("Developer", "Moses Daudu")
# mlflow.set_tag("model", "xgboost")
# mlflow.set_tag("section", 'third sect')

# Wrap the vectorized features and their targets in XGBoost DMatrix objects:
# `train` is passed to xgb.train as dtrain, `valid` is used for evaluation.
train = xgb.DMatrix(X_train, label=Y_train)
valid = xgb.DMatrix(X_val, label=Y_val)


## Automatic Logging with MLflow

In [21]:
# Hyperparameters passed to xgb.train and logged to MLflow as-is.
params = {
    'learning_rate' : 0.6620269443945731,
    'max_depth' : 75,
    'min_child_weight' : 0.6681121434468128,
    # 'reg:linear' is the deprecated alias of this objective.
    'objective' : 'reg:squarederror',
    'reg_alpha' : 0.009835858863989126,
    'reg_lambda' : 0.23315101108258487,
    'seed' : 42
}

with mlflow.start_run():

    mlflow.log_params(params)

    # NOTE(review): with a single boosting round the early-stopping patience
    # of 5 can never trigger; raise num_boost_round for a real training run.
    booster = xgb.train(
        params=params,
        dtrain=train,
        num_boost_round=1,
        evals=[(valid, 'validation')],
        early_stopping_rounds=5
    )

    y_pred = booster.predict(valid)
    # RMSE as sqrt(MSE): the `squared=False` keyword is deprecated in
    # scikit-learn 1.4 and removed in 1.6, this form works on every version.
    rmse = np.sqrt(mean_squared_error(Y_val, y_pred))
    mlflow.log_metric('Rmse-xgb', rmse)

    # Keep the fitted vectorizer next to the booster so that predictions can
    # be reproduced from raw location IDs.
    with open('xgb_model.bin', 'wb') as f_out:
        pickle.dump((dv, booster), f_out)

    mlflow.xgboost.log_model(booster, 'artifact-model')


[0]	validation-rmse:9.16360


In [13]:
# Re-score the current `booster` on the validation DMatrix.
# NOTE(review): the saved output of this cell does not match the
# validation RMSE printed by the training run above, so it appears to come
# from an earlier booster; re-run after training to refresh it.
y_pred = booster.predict(valid)
# sqrt(MSE) instead of `squared=False`, which newer scikit-learn removed.
rmse = np.sqrt(mean_squared_error(Y_val, y_pred))
print(rmse)

5.877335480827809


In [20]:
# Switch XGBoost autologging off; the training cell logs its params, metric
# and model through explicit mlflow calls.
mlflow.xgboost.autolog(disable=True)

In [None]:

# TODO: hyperparameter search is not implemented yet.
# The previous placeholder called `fmin()` with no arguments although no
# `fmin` is imported anywhere in this notebook, so the cell raised NameError
# on Restart & Run All. Presumably hyperopt's `fmin` was intended (confirm);
# import it in the top import cell and supply an objective and search space
# before enabling this:
# booster = fmin(...)