In [2]:
!python -V

Python 3.12.11


In [3]:
import pandas as pd

In [4]:
import pickle

In [5]:
import seaborn as sns
import matplotlib.pyplot as plt

In [6]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

In [7]:
import mlflow


# Store run metadata in a local SQLite file and group every run in this
# notebook under a single named experiment.
mlflow.set_tracking_uri(uri="sqlite:///mlflow.db")
mlflow.set_experiment(experiment_name="nyc-taxi-experiment")

2025/07/23 12:02:33 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/07/23 12:02:33 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.


<Experiment: artifact_location='/workspaces/ai-infra-journey/mlops-training/mlruns/1', creation_time=1753266959502, experiment_id='1', last_update_time=1753266959502, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [8]:
def read_dataframe(filename):
    """Load a trip parquet file and prepare it for duration modelling.

    Steps performed:
      1. Read the parquet file at ``filename``.
      2. Add a ``duration`` column: drop-off minus pickup time, in minutes.
      3. Keep only trips whose duration is between 1 and 60 minutes (inclusive).
      4. Cast ``PULocationID`` and ``DOLocationID`` to strings.

    Parameters
    ----------
    filename : str or path-like
        Location of the parquet file; passed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        An independent (copied) frame holding the filtered trips, with the
        original columns plus ``duration``. The original row index is kept.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion (no per-row Python lambda).
    trip_time = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df['duration'] = trip_time.dt.total_seconds() / 60

    # Take an explicit copy of the filtered rows so the column assignment
    # below writes to an independent frame instead of a slice of the
    # unfiltered one (which triggers pandas' chained-assignment warning).
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [9]:
# Peek at the raw column types of the January file before any preprocessing.
df_sample = pd.read_parquet(path='./data/green_tripdata_2025-01.parquet')
df_sample.dtypes

VendorID                          int32
lpep_pickup_datetime     datetime64[us]
lpep_dropoff_datetime    datetime64[us]
store_and_fwd_flag               object
RatecodeID                      float64
PULocationID                      int32
DOLocationID                      int32
passenger_count                 float64
trip_distance                   float64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
ehail_fee                       float64
improvement_surcharge           float64
total_amount                    float64
payment_type                    float64
trip_type                       float64
congestion_surcharge            float64
cbd_congestion_fee              float64
dtype: object

In [10]:
# January is the training split, February the validation split.
df_train, df_val = map(
    read_dataframe,
    ('./data/green_tripdata_2025-01.parquet', './data/green_tripdata_2025-02.parquet'),
)

In [11]:
# Inspect column types after preprocessing: read_dataframe casts the two
# location ID columns to strings and adds the `duration` column.
df_train.dtypes

VendorID                          int32
lpep_pickup_datetime     datetime64[us]
lpep_dropoff_datetime    datetime64[us]
store_and_fwd_flag               object
RatecodeID                      float64
PULocationID                     object
DOLocationID                     object
passenger_count                 float64
trip_distance                   float64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
ehail_fee                       float64
improvement_surcharge           float64
total_amount                    float64
payment_type                    float64
trip_type                       float64
congestion_surcharge            float64
cbd_congestion_fee              float64
duration                        float64
dtype: object

In [12]:
# Row counts of the training and validation splits after filtering.
df_train.shape[0], df_val.shape[0]

(46307, 44218)

In [13]:
# Combine pickup and drop-off zone into a single "<pickup>_<dropoff>" key so
# each origin/destination pair becomes one categorical value.
df_train['PU_DO'] = df_train['PULocationID'].str.cat(df_train['DOLocationID'], sep='_')
df_val['PU_DO'] = df_val['PULocationID'].str.cat(df_val['DOLocationID'], sep='_')

In [14]:
# Feature set: the combined pickup/drop-off key is the only categorical
# feature (the separate PULocationID / DOLocationID columns are not used),
# and trip distance is the only numerical feature.
categorical = ['PU_DO']
numerical = ['trip_distance']

# Turn every row of each split into a {column: value} record.
train_dicts = df_train[[*categorical, *numerical]].to_dict(orient='records')
val_dicts = df_val[[*categorical, *numerical]].to_dict(orient='records')

# Fit the vectorizer on the training records only; the validation records are
# transformed with the mapping learned from training.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [15]:
# Regression target: trip duration in minutes, as a plain array per split.
target = 'duration'
y_train, y_val = (split[target].values for split in (df_train, df_val))

In [16]:
# Baseline model: plain linear regression on the vectorized features.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_val)

# The displayed value is the mean squared error (minutes squared); no square
# root is applied here, so it is not an RMSE.
mean_squared_error(y_true=y_val, y_pred=y_pred)

37.19629658380478

In [17]:
# Persist the fitted vectorizer together with the model as one (dv, lr) tuple
# so both can be restored from a single file.
with open('models/lin_reg.bin', 'wb') as model_file:
    pickle.dump((dv, lr), model_file)

In [18]:
# Train a Lasso model and record its parameters, validation RMSE and a model
# artifact as one MLflow run.
with mlflow.start_run():

    mlflow.set_tag("developer", "cristian")

    # Log the files that df_train / df_val were actually read from.
    mlflow.log_param("train-data-path", "./data/green_tripdata_2025-01.parquet")
    mlflow.log_param("valid-data-path", "./data/green_tripdata_2025-02.parquet")

    alpha = 0.1
    mlflow.log_param("alpha", alpha)
    lr = Lasso(alpha=alpha)
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_val)
    # mean_squared_error returns the MSE; take the square root so the value
    # stored under the "rmse" metric really is a root mean squared error.
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    mlflow.log_metric("rmse", rmse)

    # NOTE(review): in notebook order, models/lin_reg.bin holds the
    # LinearRegression model pickled earlier, not the Lasso model fitted in
    # this run - confirm this is the artifact that should be attached.
    mlflow.log_artifact(local_path="models/lin_reg.bin", artifact_path="models_pickle")

In [19]:
import xgboost as xgb

In [20]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

  import pkg_resources


In [21]:
# Wrap the feature matrices and targets in XGBoost's DMatrix container.
train = xgb.DMatrix(data=X_train, label=y_train)
valid = xgb.DMatrix(data=X_val, label=y_val)

In [22]:
def objective(params):
    """Train one XGBoost model for a hyperopt trial and report its validation RMSE.

    Each call opens its own MLflow run, tags it with ``model=xgboost``, logs
    ``params``, trains on the module-level ``train`` DMatrix while evaluating
    on ``valid`` (up to 1000 boosting rounds, early stopping after 50 rounds),
    and logs the validation error under the ``rmse`` metric.

    Parameters
    ----------
    params : dict
        XGBoost training parameters; logged to MLflow and passed to
        ``xgb.train`` unchanged.

    Returns
    -------
    dict
        ``{'loss': <validation RMSE>, 'status': STATUS_OK}`` as expected by
        ``hyperopt.fmin``.
    """
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )
        y_pred = booster.predict(valid)
        # mean_squared_error returns the MSE; take the square root so the
        # logged "rmse" is a real RMSE, on the same scale as the
        # validation-rmse values XGBoost prints during training.
        rmse = mean_squared_error(y_val, y_pred) ** 0.5
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

In [23]:
# Hyperparameter search space for the XGBoost objective.
# hp.loguniform(label, low, high) samples exp(uniform(low, high)), so the
# bounds below are exponents, not raw values.
search_space = {
    # Integer tree depth between 4 and 100.
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    # Roughly 0.05 .. 1.0
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    # Roughly 0.0067 .. 0.37
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    # Roughly 0.0025 .. 0.37
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    # Roughly 0.37 .. 20
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    # 'reg:linear' is a deprecated alias of squared-error regression; use the
    # current name so XGBoost stops emitting a deprecation warning per trial.
    'objective': 'reg:squarederror',
    'seed': 42
}

# Run 50 TPE-guided trials; each trial trains a model via `objective`.
# NOTE(review): no `rstate` is passed, so the sampled hyperparameters are not
# reproducible between runs even though the XGBoost seed is fixed.
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:8.54239                           
[1]	validation-rmse:8.05524                           
[2]	validation-rmse:7.63739                           
[3]	validation-rmse:7.28044                           
[4]	validation-rmse:6.97577                           
[5]	validation-rmse:6.71820                           
[6]	validation-rmse:6.49992                           
[7]	validation-rmse:6.31736                           
[8]	validation-rmse:6.16367                           
[9]	validation-rmse:6.03462                           
[10]	validation-rmse:5.92636                          
[11]	validation-rmse:5.83684                          
[12]	validation-rmse:5.76148                          
[13]	validation-rmse:5.69905                          
[14]	validation-rmse:5.64654                          
[15]	validation-rmse:5.60341                          
[16]	validation-rmse:5.56584                          
[17]	validation-rmse:5.53522                          
[18]	valid

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:5.94527                                                     
[1]	validation-rmse:5.57952                                                     
[2]	validation-rmse:5.50564                                                     
[3]	validation-rmse:5.48484                                                     
[4]	validation-rmse:5.47778                                                     
[5]	validation-rmse:5.46762                                                     
[6]	validation-rmse:5.44529                                                     
[7]	validation-rmse:5.43668                                                     
[8]	validation-rmse:5.43057                                                     
[9]	validation-rmse:5.42842                                                     
[10]	validation-rmse:5.42729                                                    
[11]	validation-rmse:5.42233                                                    
[12]	validation-rmse:5.42124

  self.starting_round = model.num_boosted_rounds()



[1]	validation-rmse:8.39988                                                     
[2]	validation-rmse:8.09741                                                     
[3]	validation-rmse:7.82386                                                     
[4]	validation-rmse:7.57838                                                     
[5]	validation-rmse:7.35643                                                     
[6]	validation-rmse:7.15866                                                     
[7]	validation-rmse:6.97963                                                     
[8]	validation-rmse:6.81938                                                     
[9]	validation-rmse:6.68028                                                     
[10]	validation-rmse:6.55147                                                    
[11]	validation-rmse:6.43921                                                    
[12]	validation-rmse:6.34204                                                    
[13]	validation-rmse:6.25099

  self.starting_round = model.num_boosted_rounds()



[1]	validation-rmse:5.55259                                                     
[2]	validation-rmse:5.54570                                                     
[3]	validation-rmse:5.53363                                                     
[4]	validation-rmse:5.52468                                                     
[5]	validation-rmse:5.51474                                                     
[6]	validation-rmse:5.51477                                                     
[7]	validation-rmse:5.50754                                                     
[8]	validation-rmse:5.50008                                                     
[9]	validation-rmse:5.49756                                                     
[10]	validation-rmse:5.49299                                                    
[11]	validation-rmse:5.48869                                                    
[12]	validation-rmse:5.48364                                                    
[13]	validation-rmse:5.48172

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:5.49069                                                     
[1]	validation-rmse:5.44936                                                     
[2]	validation-rmse:5.43301                                                     
[3]	validation-rmse:5.42686                                                     
[4]	validation-rmse:5.41454                                                     
[5]	validation-rmse:5.40510                                                     
[6]	validation-rmse:5.40620                                                     
[7]	validation-rmse:5.40795                                                     
[8]	validation-rmse:5.40326                                                     
[9]	validation-rmse:5.39859                                                     
[10]	validation-rmse:5.39716                                                    
[11]	validation-rmse:5.39605                                                    
[12]	validation-rmse:5.39152

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:8.27900                                                     
[1]	validation-rmse:7.62950                                                     
[2]	validation-rmse:7.12540                                                     
[3]	validation-rmse:6.73815                                                     
[4]	validation-rmse:6.44439                                                     
[5]	validation-rmse:6.22310                                                     
[6]	validation-rmse:6.05703                                                     
[7]	validation-rmse:5.93205                                                     
[8]	validation-rmse:5.83845                                                     
[9]	validation-rmse:5.76826                                                     
[10]	validation-rmse:5.71403                                                    
[11]	validation-rmse:5.67232                                                    
[12]	validation-rmse:5.63907

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:8.79714                                                     
[1]	validation-rmse:8.51058                                                     
[2]	validation-rmse:8.24550                                                     
[3]	validation-rmse:8.00029                                                     
[4]	validation-rmse:7.77430                                                     
[5]	validation-rmse:7.56578                                                     
[6]	validation-rmse:7.37369                                                     
[7]	validation-rmse:7.19685                                                     
[8]	validation-rmse:7.03500                                                     
[9]	validation-rmse:6.88662                                                     
[10]	validation-rmse:6.75074                                                    
[11]	validation-rmse:6.62673                                                    
[12]	validation-rmse:6.51320

  self.starting_round = model.num_boosted_rounds()



[1]	validation-rmse:7.08265                                                     
[2]	validation-rmse:6.53963                                                     
[3]	validation-rmse:6.19425                                                     
[4]	validation-rmse:5.97143                                                     
[5]	validation-rmse:5.82887                                                     
[6]	validation-rmse:5.73222                                                     
[7]	validation-rmse:5.67239                                                     
[8]	validation-rmse:5.62503                                                     
[9]	validation-rmse:5.59644                                                     
[10]	validation-rmse:5.57734                                                    
[11]	validation-rmse:5.55916                                                    
[12]	validation-rmse:5.54846                                                    
[13]	validation-rmse:5.54134

  self.starting_round = model.num_boosted_rounds()



[1]	validation-rmse:7.55123                                                     
[2]	validation-rmse:7.03788                                                     
[3]	validation-rmse:6.64713                                                     
[4]	validation-rmse:6.36120                                                     
[5]	validation-rmse:6.15305                                                     
[6]	validation-rmse:5.99697                                                     
[7]	validation-rmse:5.88102                                                     
[8]	validation-rmse:5.79178                                                     
[9]	validation-rmse:5.73019                                                     
[10]	validation-rmse:5.68011                                                    
[11]	validation-rmse:5.64778                                                    
[12]	validation-rmse:5.61445                                                    
[13]	validation-rmse:5.59215

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:8.81865                                                     
[1]	validation-rmse:8.55133                                                     
[2]	validation-rmse:8.30329                                                     
[3]	validation-rmse:8.07323                                                     
[4]	validation-rmse:7.86002                                                     
[5]	validation-rmse:7.66419                                                     
[6]	validation-rmse:7.48216                                                     
[7]	validation-rmse:7.31480                                                     
[8]	validation-rmse:7.15901                                                     
[9]	validation-rmse:7.01605                                                     
[10]	validation-rmse:6.88484                                                    
[11]	validation-rmse:6.76258                                                    
[12]	validation-rmse:6.65068

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:5.91657                                                      
[1]	validation-rmse:5.52421                                                      
[2]	validation-rmse:5.46294                                                      
[3]	validation-rmse:5.44474                                                      
[4]	validation-rmse:5.43129                                                      
[5]	validation-rmse:5.40574                                                      
[6]	validation-rmse:5.40191                                                      
[7]	validation-rmse:5.39137                                                      
[8]	validation-rmse:5.39176                                                      
[9]	validation-rmse:5.39146                                                      
[10]	validation-rmse:5.39079                                                     
[11]	validation-rmse:5.38439                                                     
[12]	validation-

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:8.80063                                                      
[1]	validation-rmse:8.51722                                                      
[2]	validation-rmse:8.25606                                                      
[3]	validation-rmse:8.01598                                                      
[4]	validation-rmse:7.79732                                                      
[5]	validation-rmse:7.59275                                                      
[6]	validation-rmse:7.40721                                                      
[7]	validation-rmse:7.23810                                                      
[8]	validation-rmse:7.07849                                                      
[9]	validation-rmse:6.93835                                                      
[10]	validation-rmse:6.80963                                                     
[11]	validation-rmse:6.68951                                                     
[12]	validation-

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:8.36441                                                      
[1]	validation-rmse:7.75664                                                      
[2]	validation-rmse:7.26460                                                      
[3]	validation-rmse:6.86631                                                      
[4]	validation-rmse:6.54960                                                      
[5]	validation-rmse:6.29624                                                      
[6]	validation-rmse:6.09628                                                      
[7]	validation-rmse:5.93791                                                      
[8]	validation-rmse:5.81175                                                      
[9]	validation-rmse:5.71487                                                      
[10]	validation-rmse:5.63947                                                     
[11]	validation-rmse:5.57962                                                     
[12]	validation-

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:8.81609                                                      
[1]	validation-rmse:8.54752                                                      
[2]	validation-rmse:8.29768                                                      
[3]	validation-rmse:8.06706                                                      
[4]	validation-rmse:7.85466                                                      
[5]	validation-rmse:7.65713                                                      
[6]	validation-rmse:7.47390                                                      
[7]	validation-rmse:7.30809                                                      
[8]	validation-rmse:7.15233                                                      
[9]	validation-rmse:7.01243                                                      
[10]	validation-rmse:6.88150                                                     
[11]	validation-rmse:6.76433                                                     
[12]	validation-

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:8.69425                                                      
[1]	validation-rmse:8.32440                                                      
[2]	validation-rmse:7.99375                                                      
[3]	validation-rmse:7.70081                                                      
[4]	validation-rmse:7.43756                                                      
[5]	validation-rmse:7.20482                                                      
[6]	validation-rmse:6.99849                                                      
[7]	validation-rmse:6.81530                                                      
[8]	validation-rmse:6.65131                                                      
[9]	validation-rmse:6.51293                                                      
[10]	validation-rmse:6.39100                                                     
[11]	validation-rmse:6.28041                                                     
[12]	validation-

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:7.23643                                                      
[1]	validation-rmse:6.26884                                                      
[2]	validation-rmse:5.79235                                                      
[3]	validation-rmse:5.57151                                                      
[4]	validation-rmse:5.46176                                                      
[5]	validation-rmse:5.41047                                                      
[6]	validation-rmse:5.38068                                                      
[7]	validation-rmse:5.36778                                                      
[8]	validation-rmse:5.35698                                                      
[9]	validation-rmse:5.35207                                                      
[10]	validation-rmse:5.34881                                                     
[11]	validation-rmse:5.34715                                                     
[12]	validation-

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:8.71665                                                      
[1]	validation-rmse:8.36401                                                      
[2]	validation-rmse:8.04503                                                      
[3]	validation-rmse:7.75799                                                      
[4]	validation-rmse:7.50008                                                      
[5]	validation-rmse:7.26884                                                      
[6]	validation-rmse:7.06202                                                      
[7]	validation-rmse:6.87732                                                      
[8]	validation-rmse:6.71275                                                      
[9]	validation-rmse:6.56640                                                      
[10]	validation-rmse:6.43666                                                     
[11]	validation-rmse:6.32101                                                     
[12]	validation-

  self.starting_round = model.num_boosted_rounds()



[7]	validation-rmse:7.47581                                                      
[8]	validation-rmse:7.33405                                                      
[9]	validation-rmse:7.20363                                                      
[10]	validation-rmse:7.08184                                                     
[11]	validation-rmse:6.97062                                                     
[12]	validation-rmse:6.86825                                                     
[13]	validation-rmse:6.77343                                                     
[14]	validation-rmse:6.68631                                                     
[15]	validation-rmse:6.60611                                                     
[16]	validation-rmse:6.53248                                                     
[17]	validation-rmse:6.46474                                                     
[18]	validation-rmse:6.40145                                                     
[19]	validation-

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:7.76915                                                      
[1]	validation-rmse:6.90271                                                      
[2]	validation-rmse:6.36049                                                      
[3]	validation-rmse:6.02899                                                      
[4]	validation-rmse:5.82804                                                      
[5]	validation-rmse:5.70645                                                      
[6]	validation-rmse:5.63134                                                      
[7]	validation-rmse:5.58279                                                      
[8]	validation-rmse:5.55380                                                      
[9]	validation-rmse:5.53313                                                      
[10]	validation-rmse:5.51732                                                     
[11]	validation-rmse:5.50632                                                     
[12]	validation-

  self.starting_round = model.num_boosted_rounds()



[2]	validation-rmse:5.64954                                                      
[3]	validation-rmse:5.60493                                                      
[4]	validation-rmse:5.59219                                                      
[5]	validation-rmse:5.58145                                                      
[6]	validation-rmse:5.57526                                                      
[7]	validation-rmse:5.56985                                                      
[8]	validation-rmse:5.56487                                                      
[9]	validation-rmse:5.55803                                                      
[10]	validation-rmse:5.55520                                                     
[11]	validation-rmse:5.55259                                                     
[12]	validation-rmse:5.54847                                                     
[13]	validation-rmse:5.54396                                                     
[14]	validation-

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:8.50259                                                      
[1]	validation-rmse:7.99144                                                      
[2]	validation-rmse:7.56103                                                      
[3]	validation-rmse:7.19832                                                      
[4]	validation-rmse:6.89449                                                      
[5]	validation-rmse:6.64023                                                      
[6]	validation-rmse:6.43121                                                      
[7]	validation-rmse:6.25641                                                      
[8]	validation-rmse:6.11110                                                      
[9]	validation-rmse:5.99187                                                      
[10]	validation-rmse:5.89422                                                     
[11]	validation-rmse:5.81042                                                     
[12]	validation-

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:8.62308                                                      
[1]	validation-rmse:8.19646                                                      
[2]	validation-rmse:7.82092                                                      
[3]	validation-rmse:7.49198                                                      
[4]	validation-rmse:7.20457                                                      
[5]	validation-rmse:6.95450                                                      
[6]	validation-rmse:6.73659                                                      
[7]	validation-rmse:6.54796                                                      
[8]	validation-rmse:6.38528                                                      
[9]	validation-rmse:6.24473                                                      
[10]	validation-rmse:6.12435                                                     
[11]	validation-rmse:6.02095                                                     
[12]	validation-

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:8.54176                                                      
[1]	validation-rmse:8.05316                                                      
[2]	validation-rmse:7.63277                                                      
[3]	validation-rmse:7.27317                                                      
[4]	validation-rmse:6.96595                                                      
[5]	validation-rmse:6.70510                                                      
[6]	validation-rmse:6.48509                                                      
[7]	validation-rmse:6.30049                                                      
[8]	validation-rmse:6.14512                                                      
[9]	validation-rmse:6.01394                                                      
[10]	validation-rmse:5.90395                                                     
[11]	validation-rmse:5.81367                                                     
[12]	validation-

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:7.14478                                                      
[1]	validation-rmse:6.20776                                                      
[2]	validation-rmse:5.77404                                                      
[3]	validation-rmse:5.57759                                                      
[4]	validation-rmse:5.49129                                                      
[5]	validation-rmse:5.44133                                                      
[6]	validation-rmse:5.41195                                                      
[7]	validation-rmse:5.39600                                                      
[8]	validation-rmse:5.38755                                                      
[9]	validation-rmse:5.38282                                                      
[10]	validation-rmse:5.37853                                                     
[11]	validation-rmse:5.37529                                                     
[12]	validation-

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:8.40797                                                      
[1]	validation-rmse:7.82870                                                      
[2]	validation-rmse:7.35172                                                      
[3]	validation-rmse:6.96238                                                      
[4]	validation-rmse:6.64546                                                      
[5]	validation-rmse:6.39113                                                      
[6]	validation-rmse:6.18501                                                      
[7]	validation-rmse:6.02127                                                      
[8]	validation-rmse:5.88912                                                      
[9]	validation-rmse:5.78539                                                      
[10]	validation-rmse:5.69887                                                     
[11]	validation-rmse:5.63214                                                     
[12]	validation-

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:8.08889                                                      
[1]	validation-rmse:7.33484                                                      
[2]	validation-rmse:6.78196                                                      
[3]	validation-rmse:6.38690                                                      
[4]	validation-rmse:6.10251                                                      
[5]	validation-rmse:5.89743                                                      
[6]	validation-rmse:5.75533                                                      
[7]	validation-rmse:5.65237                                                      
[8]	validation-rmse:5.58261                                                      
[9]	validation-rmse:5.53153                                                      
[10]	validation-rmse:5.49387                                                     
[11]	validation-rmse:5.46560                                                     
[12]	validation-

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:7.25228                                                      
[1]	validation-rmse:6.29331                                                      
[2]	validation-rmse:5.82524                                                      
[3]	validation-rmse:5.60467                                                      
[4]	validation-rmse:5.50120                                                      
[5]	validation-rmse:5.44910                                                      
[6]	validation-rmse:5.42166                                                      
[7]	validation-rmse:5.40471                                                      
[8]	validation-rmse:5.39284                                                      
[9]	validation-rmse:5.38550                                                      
[10]	validation-rmse:5.37988                                                     
[11]	validation-rmse:5.37630                                                     
[12]	validation-

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:8.16816                                                      
[1]	validation-rmse:7.45801                                                      
[2]	validation-rmse:6.92788                                                      
[3]	validation-rmse:6.53873                                                      
[4]	validation-rmse:6.25542                                                      
[5]	validation-rmse:6.05098                                                      
[6]	validation-rmse:5.90427                                                      
[7]	validation-rmse:5.80033                                                      
[8]	validation-rmse:5.72434                                                      
[9]	validation-rmse:5.66882                                                      
[10]	validation-rmse:5.62661                                                     
[11]	validation-rmse:5.59428                                                     
[12]	validation-

  self.starting_round = model.num_boosted_rounds()



[9]	validation-rmse:5.71634                                                      
[10]	validation-rmse:5.70588                                                     
[11]	validation-rmse:5.69171                                                     
[12]	validation-rmse:5.68145                                                     
[13]	validation-rmse:5.67286                                                     
[14]	validation-rmse:5.66849                                                     
[15]	validation-rmse:5.66648                                                     
[16]	validation-rmse:5.66277                                                     
[17]	validation-rmse:5.66068                                                     
[18]	validation-rmse:5.65866                                                     
[19]	validation-rmse:5.65646                                                     
[20]	validation-rmse:5.65443                                                     
[21]	validation-

  self.starting_round = model.num_boosted_rounds()



[9]	validation-rmse:5.70824                                                      
[10]	validation-rmse:5.69828                                                     
[11]	validation-rmse:5.68554                                                     
[12]	validation-rmse:5.67817                                                     
[13]	validation-rmse:5.67309                                                     
[14]	validation-rmse:5.66744                                                     
[15]	validation-rmse:5.66517                                                     
[16]	validation-rmse:5.66224                                                     
[17]	validation-rmse:5.65926                                                     
[18]	validation-rmse:5.65521                                                     
[19]	validation-rmse:5.65017                                                     
[20]	validation-rmse:5.64798                                                     
[21]	validation-

  self.starting_round = model.num_boosted_rounds()



[6]	validation-rmse:5.58198                                                      
[7]	validation-rmse:5.57670                                                      
[8]	validation-rmse:5.57266                                                      
[9]	validation-rmse:5.56964                                                      
[10]	validation-rmse:5.56163                                                     
[11]	validation-rmse:5.55654                                                     
[12]	validation-rmse:5.55084                                                     
[13]	validation-rmse:5.54595                                                     
[14]	validation-rmse:5.54270                                                     
[15]	validation-rmse:5.53773                                                     
[16]	validation-rmse:5.53378                                                     
[17]	validation-rmse:5.53069                                                     
[18]	validation-

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:6.67544                                                      
[1]	validation-rmse:5.83579                                                      
[2]	validation-rmse:5.56019                                                      
[3]	validation-rmse:5.47155                                                      
[4]	validation-rmse:5.43764                                                      
[5]	validation-rmse:5.42009                                                      
[6]	validation-rmse:5.41369                                                      
[7]	validation-rmse:5.40807                                                      
[8]	validation-rmse:5.40326                                                      
[9]	validation-rmse:5.38876                                                      
[10]	validation-rmse:5.38803                                                     
[11]	validation-rmse:5.38298                                                     
[12]	validation-

  self.starting_round = model.num_boosted_rounds()



[8]	validation-rmse:5.59544                                                      
[9]	validation-rmse:5.59153                                                      
[10]	validation-rmse:5.58630                                                     
[11]	validation-rmse:5.58132                                                     
[12]	validation-rmse:5.57836                                                     
[13]	validation-rmse:5.57076                                                     
[14]	validation-rmse:5.56611                                                     
[15]	validation-rmse:5.56197                                                     
[16]	validation-rmse:5.55687                                                     
[17]	validation-rmse:5.55304                                                     
[18]	validation-rmse:5.54633                                                     
[19]	validation-rmse:5.54494                                                     
[20]	validation-

  self.starting_round = model.num_boosted_rounds()



[1]	validation-rmse:5.58759                                                      
[2]	validation-rmse:5.56931                                                      
[3]	validation-rmse:5.56358                                                      
[4]	validation-rmse:5.56254                                                      
[5]	validation-rmse:5.55919                                                      
[6]	validation-rmse:5.55312                                                      
[7]	validation-rmse:5.54690                                                      
[8]	validation-rmse:5.54068                                                      
[9]	validation-rmse:5.53726                                                      
[10]	validation-rmse:5.53169                                                     
[11]	validation-rmse:5.53241                                                     
[12]	validation-rmse:5.53070                                                     
[13]	validation-

  self.starting_round = model.num_boosted_rounds()



[10]	validation-rmse:5.65853                                                     
[11]	validation-rmse:5.65336                                                     
[12]	validation-rmse:5.64771                                                     
[13]	validation-rmse:5.64466                                                     
[14]	validation-rmse:5.63856                                                     
[15]	validation-rmse:5.63644                                                     
[16]	validation-rmse:5.63361                                                     
[17]	validation-rmse:5.62962                                                     
[18]	validation-rmse:5.62475                                                     
[19]	validation-rmse:5.62084                                                     
[20]	validation-rmse:5.61719                                                     
[21]	validation-rmse:5.61487                                                     
[22]	validation-

  self.starting_round = model.num_boosted_rounds()



[1]	validation-rmse:6.60494                                                      
[2]	validation-rmse:6.10067                                                      
[3]	validation-rmse:5.83388                                                      
[4]	validation-rmse:5.68648                                                      
[5]	validation-rmse:5.60916                                                      
[6]	validation-rmse:5.56588                                                      
[7]	validation-rmse:5.53789                                                      
[8]	validation-rmse:5.52103                                                      
[9]	validation-rmse:5.50831                                                      
[10]	validation-rmse:5.49750                                                     
[11]	validation-rmse:5.48982                                                     
[12]	validation-rmse:5.48218                                                     
[13]	validation-

  self.starting_round = model.num_boosted_rounds()



[2]	validation-rmse:5.60686                                                      
[3]	validation-rmse:5.58166                                                      
[4]	validation-rmse:5.57610                                                      
[5]	validation-rmse:5.56318                                                      
[6]	validation-rmse:5.55600                                                      
[7]	validation-rmse:5.54879                                                      
[8]	validation-rmse:5.54385                                                      
[9]	validation-rmse:5.53886                                                      
[10]	validation-rmse:5.52529                                                     
[11]	validation-rmse:5.51885                                                     
[12]	validation-rmse:5.51666                                                     
[13]	validation-rmse:5.51158                                                     
[14]	validation-

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:7.92177                                                      
[1]	validation-rmse:7.11449                                                      
[2]	validation-rmse:6.56086                                                      
[3]	validation-rmse:6.20394                                                      
[4]	validation-rmse:5.96332                                                      
[5]	validation-rmse:5.81915                                                      
[6]	validation-rmse:5.70798                                                      
[7]	validation-rmse:5.64476                                                      
[8]	validation-rmse:5.59595                                                      
[9]	validation-rmse:5.56180                                                      
[10]	validation-rmse:5.53847                                                     
[11]	validation-rmse:5.51595                                                     
[12]	validation-

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:6.92880                                                      
[1]	validation-rmse:5.99517                                                      
[2]	validation-rmse:5.62952                                                      
[3]	validation-rmse:5.49550                                                      
[4]	validation-rmse:5.44004                                                      
[5]	validation-rmse:5.41575                                                      
[6]	validation-rmse:5.40296                                                      
[7]	validation-rmse:5.39274                                                      
[8]	validation-rmse:5.38837                                                      
[9]	validation-rmse:5.38735                                                      
[10]	validation-rmse:5.37743                                                     
[11]	validation-rmse:5.37321                                                     
[12]	validation-

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:8.00936                                                      
[1]	validation-rmse:7.22735                                                      
[2]	validation-rmse:6.68240                                                      
[3]	validation-rmse:6.31113                                                      
[4]	validation-rmse:6.05900                                                      
[5]	validation-rmse:5.89117                                                      
[6]	validation-rmse:5.77885                                                      
[7]	validation-rmse:5.70252                                                      
[8]	validation-rmse:5.64659                                                      
[9]	validation-rmse:5.60496                                                      
[10]	validation-rmse:5.57781                                                     
[11]	validation-rmse:5.55607                                                     
[12]	validation-

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:5.52067                                                      
[1]	validation-rmse:5.46271                                                      
[2]	validation-rmse:5.44232                                                      
[3]	validation-rmse:5.43453                                                      
[4]	validation-rmse:5.42351                                                      
[5]	validation-rmse:5.41804                                                      
[6]	validation-rmse:5.41517                                                      
[7]	validation-rmse:5.41051                                                      
[8]	validation-rmse:5.39707                                                      
[9]	validation-rmse:5.39720                                                      
[10]	validation-rmse:5.39660                                                     
[11]	validation-rmse:5.39671                                                     
[12]	validation-

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:7.65604                                                      
[1]	validation-rmse:6.73735                                                      
[2]	validation-rmse:6.17903                                                      
[3]	validation-rmse:5.83993                                                      
[4]	validation-rmse:5.64602                                                      
[5]	validation-rmse:5.53081                                                      
[6]	validation-rmse:5.46379                                                      
[7]	validation-rmse:5.42388                                                      
[8]	validation-rmse:5.39544                                                      
[9]	validation-rmse:5.37871                                                      
[10]	validation-rmse:5.36545                                                     
[11]	validation-rmse:5.35552                                                     
[12]	validation-

  self.starting_round = model.num_boosted_rounds()



[2]	validation-rmse:5.66804                                                      
[3]	validation-rmse:5.59668                                                      
[4]	validation-rmse:5.56902                                                      
[5]	validation-rmse:5.55089                                                      
[6]	validation-rmse:5.54553                                                      
[7]	validation-rmse:5.53905                                                      
[8]	validation-rmse:5.53017                                                      
[9]	validation-rmse:5.52547                                                      
[10]	validation-rmse:5.52222                                                     
[11]	validation-rmse:5.51715                                                     
[12]	validation-rmse:5.51518                                                     
[13]	validation-rmse:5.51342                                                     
[14]	validation-

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:8.30682                                                      
[1]	validation-rmse:7.66648                                                      
[2]	validation-rmse:7.15526                                                      
[3]	validation-rmse:6.75239                                                      
[4]	validation-rmse:6.43831                                                      
[5]	validation-rmse:6.18951                                                      
[6]	validation-rmse:5.99874                                                      
[7]	validation-rmse:5.85025                                                      
[8]	validation-rmse:5.73647                                                      
[9]	validation-rmse:5.64881                                                      
[10]	validation-rmse:5.58234                                                     
[11]	validation-rmse:5.53090                                                     
[12]	validation-

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:6.02930                                                      
[1]	validation-rmse:5.64772                                                      
[2]	validation-rmse:5.57807                                                      
[3]	validation-rmse:5.56310                                                      
[4]	validation-rmse:5.56439                                                      
[5]	validation-rmse:5.56307                                                      
[6]	validation-rmse:5.56298                                                      
[7]	validation-rmse:5.55732                                                      
[8]	validation-rmse:5.55415                                                      
[9]	validation-rmse:5.55149                                                      
[10]	validation-rmse:5.54772                                                     
[11]	validation-rmse:5.54857                                                     
[12]	validation-

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:7.49655                                                      
[1]	validation-rmse:6.55966                                                      
[2]	validation-rmse:6.03225                                                      
[3]	validation-rmse:5.74724                                                      
[4]	validation-rmse:5.58559                                                      
[5]	validation-rmse:5.50002                                                      
[6]	validation-rmse:5.44764                                                      
[7]	validation-rmse:5.41895                                                      
[8]	validation-rmse:5.39826                                                      
[9]	validation-rmse:5.38550                                                      
[10]	validation-rmse:5.37869                                                     
[11]	validation-rmse:5.37546                                                     
[12]	validation-

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:6.97721                                                      
[1]	validation-rmse:6.07962                                                      
[2]	validation-rmse:5.72460                                                      
[3]	validation-rmse:5.58751                                                      
[4]	validation-rmse:5.52678                                                      
[5]	validation-rmse:5.49920                                                      
[6]	validation-rmse:5.48191                                                      
[7]	validation-rmse:5.47245                                                      
[8]	validation-rmse:5.46488                                                      
[9]	validation-rmse:5.45977                                                      
[10]	validation-rmse:5.43801                                                     
[11]	validation-rmse:5.43638                                                     
[12]	validation-

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:5.75498                                                      
[1]	validation-rmse:5.58055                                                      
[2]	validation-rmse:5.57356                                                      
[3]	validation-rmse:5.57065                                                      
[4]	validation-rmse:5.55884                                                      
[5]	validation-rmse:5.55850                                                      
[6]	validation-rmse:5.55064                                                      
[7]	validation-rmse:5.55412                                                      
[8]	validation-rmse:5.54219                                                      
[9]	validation-rmse:5.54292                                                      
[10]	validation-rmse:5.54297                                                     
[11]	validation-rmse:5.54526                                                     
[12]	validation-

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:8.22835                                                      
[1]	validation-rmse:7.53908                                                      
[2]	validation-rmse:7.00722                                                      
[3]	validation-rmse:6.59668                                                      
[4]	validation-rmse:6.28883                                                      
[5]	validation-rmse:6.05546                                                      
[6]	validation-rmse:5.87785                                                      
[7]	validation-rmse:5.74966                                                      
[8]	validation-rmse:5.65002                                                      
[9]	validation-rmse:5.58115                                                      
[10]	validation-rmse:5.52802                                                     
[11]	validation-rmse:5.48583                                                     
[12]	validation-

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:6.21220                                                      
[1]	validation-rmse:5.58431                                                      
[2]	validation-rmse:5.45106                                                      
[3]	validation-rmse:5.41573                                                      
[4]	validation-rmse:5.39142                                                      
[5]	validation-rmse:5.38298                                                      
[6]	validation-rmse:5.37776                                                      
[7]	validation-rmse:5.37356                                                      
[8]	validation-rmse:5.37124                                                      
[9]	validation-rmse:5.36899                                                      
[10]	validation-rmse:5.36021                                                     
[11]	validation-rmse:5.36117                                                     
[12]	validation-

In [24]:
# Switch MLflow's XGBoost autologging off: the run in the next cell logs its
# parameters, metric, preprocessor and model with explicit mlflow calls.
mlflow.xgboost.autolog(disable=True)

In [None]:
# Train one XGBoost model with the hard-coded `best_params` and record the
# parameters, validation RMSE, fitted preprocessor and booster in a single
# MLflow run.
with mlflow.start_run():

    train = xgb.DMatrix(X_train, label=y_train)
    valid = xgb.DMatrix(X_val, label=y_val)

    # NOTE(review): presumably the winning trial of the hyperparameter search
    # run earlier in the notebook -- confirm against the MLflow UI.
    best_params = {
        'learning_rate': 0.09585355369315604,
        'max_depth': 30,
        'min_child_weight': 1.060597050922164,
        # 'reg:linear' is the deprecated alias of the squared-error objective;
        # use the current name so XGBoost stops warning about it.
        'objective': 'reg:squarederror',
        'reg_alpha': 0.018060244040060163,
        'reg_lambda': 0.011658731377413597,
        'seed': 42
    }

    mlflow.log_params(best_params)

    # Up to 1000 boosting rounds; stop once the validation metric has not
    # improved for 50 consecutive rounds.
    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=1000,
        evals=[(valid, 'validation')],
        early_stopping_rounds=50
    )

    y_pred = booster.predict(valid)
    # mean_squared_error returns the MSE; take the square root so the value
    # logged as "rmse" really is an RMSE and is comparable with the
    # `validation-rmse` figures XGBoost prints during training.
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    mlflow.log_metric("rmse", rmse)

    # Persist the fitted DictVectorizer next to the model so the same feature
    # encoding can be reapplied at inference time.
    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)
    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

    mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")

  self.starting_round = model.num_boosted_rounds()


[0]	validation-rmse:8.56745
[1]	validation-rmse:8.10033
[2]	validation-rmse:7.69739
[3]	validation-rmse:7.35105
[4]	validation-rmse:7.05239
[5]	validation-rmse:6.80020
[6]	validation-rmse:6.58162
[7]	validation-rmse:6.39726
[8]	validation-rmse:6.24197
[9]	validation-rmse:6.11059
[10]	validation-rmse:6.00193
[11]	validation-rmse:5.90700
[12]	validation-rmse:5.82752
[13]	validation-rmse:5.76123
[14]	validation-rmse:5.70306
[15]	validation-rmse:5.65560
[16]	validation-rmse:5.61619
[17]	validation-rmse:5.58065
[18]	validation-rmse:5.55157
[19]	validation-rmse:5.52734
[20]	validation-rmse:5.50858
[21]	validation-rmse:5.48872
[22]	validation-rmse:5.47379
[23]	validation-rmse:5.45957
[24]	validation-rmse:5.44705
[25]	validation-rmse:5.43706
[26]	validation-rmse:5.42713
[27]	validation-rmse:5.41891
[28]	validation-rmse:5.41283
[29]	validation-rmse:5.40819
[30]	validation-rmse:5.40244
[31]	validation-rmse:5.39854
[32]	validation-rmse:5.39446
[33]	validation-rmse:5.39004
[34]	validation-rmse:5.3

[103]	validation-rmse:5.34937
[104]	validation-rmse:5.34942
[105]	validation-rmse:5.34926
[106]	validation-rmse:5.34906
[107]	validation-rmse:5.34844
[108]	validation-rmse:5.34842
[109]	validation-rmse:5.34862
[110]	validation-rmse:5.34826
[111]	validation-rmse:5.34813
[112]	validation-rmse:5.34771
[113]	validation-rmse:5.34770
[114]	validation-rmse:5.34757
[115]	validation-rmse:5.34715
[116]	validation-rmse:5.34679
[117]	validation-rmse:5.34671
[118]	validation-rmse:5.34628
[119]	validation-rmse:5.34616
[120]	validation-rmse:5.34638
[121]	validation-rmse:5.34591
[122]	validation-rmse:5.34571
[123]	validation-rmse:5.34564
[124]	validation-rmse:5.34567
[125]	validation-rmse:5.34552
[126]	validation-rmse:5.34560
[127]	validation-rmse:5.34526
[128]	validation-rmse:5.34533
[129]	validation-rmse:5.34547
[130]	validation-rmse:5.34589
[131]	validation-rmse:5.34586
[132]	validation-rmse:5.34585
[133]	validation-rmse:5.34582
[134]	validation-rmse:5.34554
[135]	validation-rmse:5.34515
[136]	vali

  xgb_model.save_model(model_data_path)


: 

In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import LinearSVR

# Let MLflow autolog each scikit-learn estimator's parameters and model, then
# fit several regressors with default settings, one MLflow run per estimator.
mlflow.sklearn.autolog()

for model_class in (RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, LinearSVR):

    with mlflow.start_run():

        # NOTE(review): these logged paths name 2021 CSV files, while the data
        # load visible at the top of the notebook reads
        # ./data/green_tripdata_2025-01.parquet -- confirm they match the files
        # actually used to build X_train / X_val.
        mlflow.log_param("train-data-path", "./data/green_tripdata_2021-01.csv")
        mlflow.log_param("valid-data-path", "./data/green_tripdata_2021-02.csv")
        mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

        mlmodel = model_class()
        mlmodel.fit(X_train, y_train)

        y_pred = mlmodel.predict(X_val)
        # mean_squared_error returns the MSE; take the square root so the
        # metric stored under "rmse" is a true root-mean-squared error.
        rmse = mean_squared_error(y_val, y_pred) ** 0.5
        mlflow.log_metric("rmse", rmse)
        