In [1]:
# Open this notebook with `uv run --with jupyter --with cuml-cu12 jupyter notebook` to make it work.
# This requires the NVIDIA CUDA toolkit to be installed.
# Loading the cuml.accel IPython extension is best-effort: every failure below is
# reported with print() and swallowed, so the remaining cells still run.
try:
    %load_ext cuml.accel
except ImportError:
    # The extension could not be imported at all.
    print("cuml.accel extension not available. Using CPU instead.")
except Exception as e:
    # Any other load failure (the recorded output shows a libcudart.so.12 dlopen error).
    print(f"An error occurred while loading cuml.accel: {e}")

An error occurred while loading cuml.accel: Failed to dlopen libcudart.so.12


In [2]:
from pathlib import Path

import polars as pl

from e2e_taxi_ride_duration_prediction.ingestion import get_nyc_taxi_data

SEED = 42

# Make streaming the default engine so collecting the data does not run out of memory.
pl.Config.set_engine_affinity("streaming")

# The kernel may be started inside notebooks/ or one level above it;
# in the first case the project root is the parent directory.
_cwd = Path().resolve()
PROJECT_ROOT = _cwd.parent if _cwd.name == "notebooks" else _cwd

# NOTE(review): presumably (year, month) bounds of the months to load -
# confirm the exact semantics against get_nyc_taxi_data.
start, end = (2025, 1), (2025, 3)

lf = get_nyc_taxi_data(start=start, end=end, root=PROJECT_ROOT)

In [3]:
# Materialise only the first rows of the lazy frame for a quick look at the raw columns.
lf.head().collect()

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,cbd_congestion_fee
i32,datetime[μs],datetime[μs],i64,f64,i64,str,i32,i32,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
2,2007-12-05 18:45:00,2007-12-05 19:02:00,1,3.0,1,"""N""",142,234,2,17.0,1.0,0.5,0.0,0.0,1.0,22.75,2.5,0.0,0.75
2,2009-01-01 00:19:34,2009-01-01 01:10:21,6,10.77,1,"""N""",138,239,2,52.7,5.0,0.5,0.0,6.94,1.0,70.39,2.5,1.75,0.0
2,2024-12-31 20:47:55,2024-12-31 20:54:00,2,1.72,1,"""N""",48,246,1,9.3,1.0,0.5,2.86,0.0,1.0,17.16,2.5,0.0,0.0
2,2024-12-31 20:52:50,2024-12-31 21:09:34,2,5.05,1,"""N""",249,262,1,23.3,1.0,0.5,4.0,0.0,1.0,32.3,2.5,0.0,0.0
2,2024-12-31 20:54:50,2024-12-31 21:30:18,2,1.39,1,"""N""",246,48,1,28.2,1.0,0.5,6.64,0.0,1.0,39.84,2.5,0.0,0.0


In [4]:
# Trip duration in minutes: dropoff minus pickup, taken as whole seconds and divided by 60.
trip_elapsed = pl.col("tpep_dropoff_datetime") - pl.col("tpep_pickup_datetime")
lf = lf.with_columns(duration=trip_elapsed.dt.total_seconds() / 60)

In [5]:
# Summary statistics with extra tail percentiles (1%, 5%, 95%, 99%) on top of the quartiles.
lf.describe(percentiles=[0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99])

statistic,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,cbd_congestion_fee,duration
str,f64,str,str,f64,f64,f64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",1.1198026e7,"""11198026""","""11198026""",8.934277e6,1.1198026e7,8.934277e6,"""8934277""",1.1198026e7,1.1198026e7,1.1198026e7,1.1198026e7,1.1198026e7,1.1198026e7,1.1198026e7,1.1198026e7,1.1198026e7,1.1198026e7,8.934277e6,8.934277e6,1.1198026e7,1.1198026e7
"""null_count""",0.0,"""0""","""0""",2.263749e6,0.0,2.263749e6,"""2263749""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.263749e6,2.263749e6,0.0,0.0
"""mean""",1.803277,"""2025-02-17 03:02:19.286455""","""2025-02-17 03:17:49.412614""",1.288654,6.179361,2.453865,,163.314009,162.527806,0.976855,17.241468,1.256459,0.479046,2.849041,0.444748,0.957046,25.668077,2.225126,0.127256,0.520258,15.502103
"""std""",0.482393,,,0.735222,581.823197,11.513329,,65.414264,69.830883,0.723082,261.991719,1.851653,0.134748,3.76522,2.005737,0.271624,262.310249,0.905474,0.481509,0.358708,31.075321
"""min""",1.0,"""2007-12-05 18:45:00""","""2007-12-05 19:02:00""",0.0,0.0,1.0,"""N""",1.0,1.0,0.0,-1807.6,-9.25,-0.5,-220.0,-142.17,-1.0,-1832.85,-2.5,-1.75,-0.75,-51472.316667
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""50%""",2.0,"""2025-02-17 14:56:00""","""2025-02-17 15:11:05""",1.0,1.72,1.0,,161.0,162.0,1.0,12.8,0.0,0.5,2.25,0.0,1.0,20.25,2.5,0.0,0.75,12.133333
"""75%""",2.0,"""2025-03-11 08:14:55""","""2025-03-11 08:32:07""",1.0,3.24,1.0,,233.0,234.0,1.0,20.5,2.5,0.5,3.9,0.0,1.0,28.42,2.5,0.0,0.75,18.95
"""95%""",2.0,"""2025-03-27 22:47:43""","""2025-03-27 23:02:39""",3.0,11.63,2.0,,249.0,255.0,2.0,51.3,5.0,0.5,10.0,6.94,1.0,73.14,2.5,1.75,0.75,37.183333
"""99%""",2.0,"""2025-03-31 01:13:47""","""2025-03-31 01:32:07""",4.0,19.34,99.0,,263.0,263.0,4.0,71.3,7.5,0.5,17.05,6.94,1.0,102.11,2.5,1.75,0.75,61.483333


In [6]:
# Keep only trips whose duration lies strictly between 0 and 60 minutes (both ends excluded).
lf = lf.filter(pl.col("duration").is_between(0, 60, closed="none"))

In [7]:
# Resolve the schema of the lazy query to inspect column names and dtypes.
lf.collect_schema()

Schema([('VendorID', Int32),
        ('tpep_pickup_datetime', Datetime(time_unit='us', time_zone=None)),
        ('tpep_dropoff_datetime', Datetime(time_unit='us', time_zone=None)),
        ('passenger_count', Int64),
        ('trip_distance', Float64),
        ('RatecodeID', Int64),
        ('store_and_fwd_flag', String),
        ('PULocationID', Int32),
        ('DOLocationID', Int32),
        ('payment_type', Int64),
        ('fare_amount', Float64),
        ('extra', Float64),
        ('mta_tax', Float64),
        ('tip_amount', Float64),
        ('tolls_amount', Float64),
        ('improvement_surcharge', Float64),
        ('total_amount', Float64),
        ('congestion_surcharge', Float64),
        ('Airport_fee', Float64),
        ('cbd_congestion_fee', Float64),
        ('duration', Float64)])

In [8]:
categorical_columns = [
    "VendorID",
    "RatecodeID",
    "store_and_fwd_flag",
    "PULocationID",
    "DOLocationID",
    "payment_type",
]

# Convert every ID-like column to Categorical, going through a string cast first
# because several of them are stored as integers. One multi-column expression
# replaces a separate with_columns call per column.
lf = lf.with_columns(
    pl.col(categorical_columns).cast(pl.Utf8).cast(pl.Categorical)  # type: ignore
)

In [9]:
# Combined route feature: "<pickup zone>_<dropoff zone>", stored as a Categorical.
pickup_zone = pl.col("PULocationID")
dropoff_zone = pl.col("DOLocationID")
route_key = (pickup_zone + "_" + dropoff_zone).cast(pl.Categorical)
lf = lf.with_columns(pickup_dropoff_pair=route_key)

In [10]:
# Preview the first rows again to check the derived duration, the categorical casts
# and the new pickup_dropoff_pair column.
lf.head().collect()

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,cbd_congestion_fee,duration,pickup_dropoff_pair
cat,datetime[μs],datetime[μs],i64,f64,cat,cat,cat,cat,cat,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,cat
"""2""",2007-12-05 18:45:00,2007-12-05 19:02:00,1,3.0,"""1""","""N""","""142""","""234""","""2""",17.0,1.0,0.5,0.0,0.0,1.0,22.75,2.5,0.0,0.75,17.0,"""142_234"""
"""2""",2009-01-01 00:19:34,2009-01-01 01:10:21,6,10.77,"""1""","""N""","""138""","""239""","""2""",52.7,5.0,0.5,0.0,6.94,1.0,70.39,2.5,1.75,0.0,50.783333,"""138_239"""
"""2""",2024-12-31 20:47:55,2024-12-31 20:54:00,2,1.72,"""1""","""N""","""48""","""246""","""1""",9.3,1.0,0.5,2.86,0.0,1.0,17.16,2.5,0.0,0.0,6.083333,"""48_246"""
"""2""",2024-12-31 20:52:50,2024-12-31 21:09:34,2,5.05,"""1""","""N""","""249""","""262""","""1""",23.3,1.0,0.5,4.0,0.0,1.0,32.3,2.5,0.0,0.0,16.733333,"""249_262"""
"""2""",2024-12-31 20:54:50,2024-12-31 21:30:18,2,1.39,"""1""","""N""","""246""","""48""","""1""",28.2,1.0,0.5,6.64,0.0,1.0,39.84,2.5,0.0,0.0,35.466667,"""246_48"""


In [11]:
# The raw files contain pickups far outside the requested months, so each split is
# bounded on both sides. Intervals are half-open: lower bound included, upper excluded.
pickup_ts = pl.col("tpep_pickup_datetime")
train_begin = pl.datetime(2025, 1, 1)
test_begin = pl.datetime(2025, 3, 1)
test_stop = pl.datetime(2025, 4, 1)

lf_train = lf.filter(pickup_ts.is_between(train_begin, test_begin, closed="left"))
lf_test = lf.filter(pickup_ts.is_between(test_begin, test_stop, closed="left"))

In [12]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import (
    mean_absolute_error,
    mean_absolute_percentage_error,
    r2_score,
    root_mean_squared_error,
)

baseline_features = ["pickup_dropoff_pair", "trip_distance"]
target = "duration"

# Collect each split exactly once, with features and target side by side.
# Collecting features and target in separate queries would execute the whole lazy
# pipeline twice per split and would rely on both executions returning rows in the
# same order; taking X and y from one frame keeps them aligned by construction.
model_columns = [*baseline_features, target]
train_df = lf_train.select(model_columns).collect()
test_df = lf_test.select(model_columns).collect()

dict_vectorizer = DictVectorizer()

# Fit the vectorizer on the training rows only; the test rows are transformed
# with the feature vocabulary learned from the training split.
train_dicts = train_df.select(baseline_features).to_dicts()
X_train = dict_vectorizer.fit_transform(train_dicts)

test_dicts = test_df.select(baseline_features).to_dicts()
X_test = dict_vectorizer.transform(test_dicts)

# 1-D target arrays, taken from the same frames as the feature dicts.
y_train = train_df.select(target).to_numpy().ravel()
y_test = test_df.select(target).to_numpy().ravel()

In [14]:
# Baseline: ordinary least squares on the vectorized features.
lin_reg = LinearRegression(n_jobs=-1)
lin_reg.fit(X_train, y_train)

y_pred = lin_reg.predict(X_test)

# Score the held-out predictions with each metric in turn.
# mape is computed for completeness but is not part of the printed summary.
mae, mape, rmse, r2 = (
    metric(y_test, y_pred)
    for metric in (
        mean_absolute_error,
        mean_absolute_percentage_error,
        root_mean_squared_error,
        r2_score,
    )
)

print(
    f"MAE: {mae:.2f} minutes",
    f"RMSE: {rmse:.2f} minutes",
    f"R2: {r2:.2f}",
    sep="\n",
)

MAE: 3.88 minutes
RMSE: 5.54 minutes
R2: 0.70


In [16]:
from xgboost import XGBRegressor

# Gradient-boosted model trained on the GPU with the same X_train / y_train matrices.
# `random_state` is the declared seed parameter of the scikit-learn style wrapper;
# `seed=` is not in its signature and only reaches the booster via **kwargs.
xgb_reg = XGBRegressor(device="cuda", random_state=SEED)
xgb_reg.fit(X_train, y_train)

# NOTE(review): the recorded output contains a device-mismatch warning for this call,
# presumably because X_test is host data while the booster lives on the GPU.
# Predictions are still produced; confirm whether the fallback is acceptable.
y_pred_xgb = xgb_reg.predict(X_test)

# Same metrics as for the linear baseline; mape_xgb is computed but not printed.
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
mape_xgb = mean_absolute_percentage_error(y_test, y_pred_xgb)
rmse_xgb = root_mean_squared_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

print(f"XGB MAE: {mae_xgb:.2f} minutes")
print(f"XGB RMSE: {rmse_xgb:.2f} minutes")
print(f"XGB R2: {r2_xgb:.2f}")

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


XGB MAE: 3.71 minutes
XGB RMSE: 5.38 minutes
XGB R2: 0.72
