In [10]:
# Shell escape: print the version of the `python` executable found on PATH.
!python -V

Python 3.13.2


In [11]:
import pandas as pd

In [12]:
import pickle

In [13]:
import xgboost as xgb

In [14]:
#import seaborn as sns
#import matplotlib.pyplot as plt
#import numpy as np

In [15]:
from sklearn.feature_extraction import DictVectorizer
#from sklearn.linear_model import LinearRegression
#from sklearn.linear_model import Lasso
#from sklearn.linear_model import Ridge
from sklearn.metrics import root_mean_squared_error

In [16]:
import mlflow


In [17]:
# Address of the MLflow tracking server that runs in this notebook report to.
MLFLOW_TRACKING_URI = "http://localhost:5000"
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)


In [18]:
# Name of the MLflow experiment under which runs from this notebook are grouped.
EXPERIMENT_NAME = "nyc-taxi-experiment"
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='mlflow-artifacts:/989498194840554431', creation_time=1748702153340, experiment_id='989498194840554431', last_update_time=1748702153340, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [19]:
def read_dataframe(filename):
    """Load a parquet file of taxi trips and prepare it for duration modelling.

    The trip duration in minutes is derived from the pickup/dropoff timestamps,
    only trips lasting between 1 and 60 minutes (inclusive) are kept, the
    location IDs are cast to strings, and a combined ``PU_DO`` feature of the
    form ``"<PULocationID>_<DOLocationID>"`` is added.

    Timestamp columns may carry either the ``lpep_`` prefix (green-taxi trip
    files) or the ``tpep_`` prefix (yellow-taxi trip files).

    Parameters
    ----------
    filename : str or path-like
        Location of the parquet file; passed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        The filtered trips with the extra ``duration`` and ``PU_DO`` columns.

    Raises
    ------
    KeyError
        If the file has neither an ``lpep_*`` nor a ``tpep_*`` pair of
        pickup/dropoff timestamp columns.
    """
    df = pd.read_parquet(filename)

    # Pick whichever pickup/dropoff timestamp pair this file provides.
    for prefix in ('lpep', 'tpep'):
        pickup_col = f'{prefix}_pickup_datetime'
        dropoff_col = f'{prefix}_dropoff_datetime'
        if pickup_col in df.columns and dropoff_col in df.columns:
            break
    else:
        raise KeyError(
            "expected 'lpep_*' or 'tpep_*' pickup/dropoff datetime columns, "
            f"found: {list(df.columns)}"
        )

    # Vectorized timedelta -> minutes conversion (no per-row Python lambda).
    df['duration'] = (df[dropoff_col] - df[pickup_col]).dt.total_seconds() / 60

    # .copy() makes the filtered frame independent of the original, so the
    # column assignments below do not trigger SettingWithCopyWarning.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    df["PU_DO"] = df["PULocationID"] + "_" + df["DOLocationID"]

    return df

In [20]:
#import xgboost as xgb
#from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
#from hyperopt.pyll import scope

In [21]:
#mlflow.set_tracking_uri("sqlite:///mlflow1.db")
#mlflow.set_experiment("nyc-taxi-experiment_1")

In [None]:
# Source files: the 2021-02 file is used for validation, the 2021-01 file for training.
val_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-02.parquet'
train_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet'

df_val = read_dataframe(val_url)
df_train = read_dataframe(train_url)


In [23]:
#df_val = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet')
#df_val['duration'] = (df_val['lpep_dropoff_datetime'] - df_val['lpep_pickup_datetime']).dt.total_seconds() / 60
#df_val = df_val[(df_val['duration'] >= 1) & (df_val['duration'] <= 60)]

In [24]:
#df = pd.read_parquet('./data/green_tripdata_2021-01.parquet')

#df = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet')

#df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
#df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

#df = df[(df.duration >= 1) & (df.duration <= 60)]

#df["weekday"] = df.lpep_pickup_datetime.dt.weekday
#df["hour"] =  df.lpep_pickup_datetime.dt.hour

#categorical = ['PULocationID', 'DOLocationID',  'weekday', 'hour']
#numerical = ['trip_distance']

#df[categorical] = df[categorical].astype(str)

In [25]:
# Feature column names: one categorical (combined pickup/dropoff) and one numerical.
categorical, numerical = ["PU_DO"], ["trip_distance"]

In [26]:
def _as_records(frame):
    """Return the selected feature columns of ``frame`` as a list of per-row dicts."""
    return frame[categorical + numerical].to_dict(orient='records')


# Fit the vectorizer on the training rows only, then reuse the fitted
# vocabulary to encode the validation rows.
dv = DictVectorizer()

train_dicts = _as_records(df_train)
X_train = dv.fit_transform(train_dicts)

val_dicts = _as_records(df_val)
X_val = dv.transform(val_dicts)

In [27]:
# Regression target: the 'duration' column of each frame, as a plain array.
target = 'duration'
y_train, y_val = (frame[target].values for frame in (df_train, df_val))

In [28]:
# Wrap each (features, labels) pair in an XGBoost DMatrix: training first, then validation.
train, valid = (
    xgb.DMatrix(features, label=labels)
    for features, labels in ((X_train, y_train), (X_val, y_val))
)

In [33]:
from pathlib import Path
# Relative directory "models"; the training cell writes models/preprocessor.b into it.
model_path = Path("models")
# exist_ok=True keeps this cell re-runnable when the directory already exists.
model_path.mkdir(exist_ok=True)

In [34]:
# Train one XGBoost model inside an MLflow run and log its parameters, the
# validation RMSE, the fitted DictVectorizer and the model itself.
with mlflow.start_run():

    # Rebuild the DMatrix inputs from the vectorized features and targets.
    train = xgb.DMatrix(X_train, label=y_train)
    valid = xgb.DMatrix(X_val, label=y_val)

    best_params = {
        'max_depth': 30,
        'learning_rate': 0.09585,
        'reg_lambda': 0.011074980286498087,
        'reg_alpha': 0.018788520719314586,
        'min_child_weight': 1.06,
        # 'reg:linear' is the deprecated alias of squared-error regression;
        # 'reg:squarederror' is its current name and avoids the warning.
        'objective': 'reg:squarederror',
        'seed': 42
    }

    mlflow.log_params(best_params)

    # NOTE: early_stopping_rounds (50) exceeds num_boost_round (30), so early
    # stopping can never trigger in this configuration; raise num_boost_round
    # if early stopping is meant to take effect.
    model = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=30,
        evals=[(valid, 'valid')],
        early_stopping_rounds=50
    )

    # Score on the validation set and record the error with the run.
    y_pred = model.predict(valid)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    # Pickle the fitted DictVectorizer and attach it to the run as an artifact.
    preprocessor_file = "models/preprocessor.b"
    with open(preprocessor_file, "wb") as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact(preprocessor_file, artifact_path="preprocessor")
    mlflow.xgboost.log_model(model, artifact_path="models_mlflow")


  self.starting_round = model.num_boosted_rounds()


[0]	valid-rmse:11.44485
[1]	valid-rmse:10.77207
[2]	valid-rmse:10.18367
[3]	valid-rmse:9.67408
[4]	valid-rmse:9.23177
[5]	valid-rmse:8.84820
[6]	valid-rmse:8.51888
[7]	valid-rmse:8.23602
[8]	valid-rmse:7.99331
[9]	valid-rmse:7.78717
[10]	valid-rmse:7.61043
[11]	valid-rmse:7.45941
[12]	valid-rmse:7.33034
[13]	valid-rmse:7.22036
[14]	valid-rmse:7.12731
[15]	valid-rmse:7.04780
[16]	valid-rmse:6.97979
[17]	valid-rmse:6.92262
[18]	valid-rmse:6.87250
[19]	valid-rmse:6.82886
[20]	valid-rmse:6.79104
[21]	valid-rmse:6.75956
[22]	valid-rmse:6.73145
[23]	valid-rmse:6.70684
[24]	valid-rmse:6.68504
[25]	valid-rmse:6.66615
[26]	valid-rmse:6.64891
[27]	valid-rmse:6.63431
[28]	valid-rmse:6.62083
[29]	valid-rmse:6.60913


  xgb_model.save_model(model_data_path)


🏃 View run fearless-whale-258 at: http://localhost:5000/#/experiments/989498194840554431/runs/56c58a7a78904f7cafeae9b89b52a671
🧪 View experiment at: http://localhost:5000/#/experiments/989498194840554431
