In [10]:
import polars as pl
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.graph_objects as go

import plotly.io as pio
import uber_style as ub


# Register the Plotly template exposed by the local `uber_style` module under the
# name "uber" and make it the default template for figures.
pio.templates["uber"] = ub.uber_style_template
pio.templates.default = "uber"
# --- CONFIGURATION ---
# NOTE(review): PATH_RAW_FILE, PATH_PROC_FILE and MODEL_SEED are not referenced by
# any cell visible in this notebook; the later cells define and use RAW_FILE_PATH,
# PROCESSED_FILE_PATH and SEED instead. Confirm whether this cell is still needed
# for anything other than its imports.

# Path to a RAW file (e.g., Jan 2024)
PATH_RAW_FILE = r"X:\Programming\Python\Projects\Data processing\TLC NYC datasets\HVFHV subsets 2019-2025\fhvhv_tripdata_2024-01.parquet"

# Path to a PROCESSED Sample (Your 1% sample file)
PATH_PROC_FILE = r"X:\Programming\Python\Projects\Data processing\TLC NYC datasets\HVFHV subsets 2019-2025 - Samples\tlc_sample_2024_processed.parquet"

MODEL_SEED = 105
# Keep it fast for demo.
# NOTE: the next configuration cell reassigns SAMPLE_SIZE to 1_000_000, so this
# value is not the one used by the modelling cells when run top to bottom.
SAMPLE_SIZE = 100_000  # Keep it fast for demo

# Scratch helpers (left disabled): inspect the column names of the raw file.
# raw_file = pl.scan_parquet(PATH_RAW_FILE)
# raw_file.collect_schema().names()

# Scratch helpers (left disabled): inspect the column count/names of the processed sample.
# sample_processed_file = pl.scan_parquet(PATH_PROC_FILE)
# print(len(sample_processed_file.collect_schema().names()))
# print(sample_processed_file.collect_schema().names())



In [12]:
import polars as pl
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from plotly.subplots import make_subplots
import plotly.io as pio
import uber_style as ub


# Register the Plotly template exposed by the local `uber_style` module under the
# name "uber" and make it the default template for figures.
pio.templates["uber"] = ub.uber_style_template
pio.templates.default = "uber"

# --- CONFIGURATION ---
# Point to a RAW file (Not processed)
RAW_FILE_PATH = r"X:\Programming\Python\Projects\Data processing\TLC NYC datasets\HVFHV subsets 2019-2025\fhvhv_tripdata_2024-01.parquet"
# Number of rows requested from DataFrame.sample(n=...) in both the raw and the
# processed modelling cells. Overrides the 100_000 set in the earlier config cell.
SAMPLE_SIZE = 1_000_000
# Shared seed: passed to DataFrame.sample, train_test_split and LGBMRegressor below.
SEED = 105



In [4]:
# --- 1. DATA LOADING (RAW) ---
# Baseline experiment: fit a LightGBM regressor on the untouched trip file using
# only columns that are available before the trip starts, then store its metrics
# and test-set predictions for the comparison cells further down.
print("📦 Loading Raw Data...")
lf_raw = pl.scan_parquet(RAW_FILE_PATH)

# Feature Selection (Available BEFORE trip)
# We strictly exclude trip_miles/time, tolls, tips, etc.
raw_features = [
    "hvfhs_license_num",
    "dispatching_base_num",  # High cardinality, but available
    "PULocationID",
    "DOLocationID",
    "pickup_datetime",  # Need to extract components
    "wav_request_flag",
    "shared_request_flag",
]
target = "base_passenger_fare"

# Select and filter on the LazyFrame, then materialise the valid rows.
raw_valid = (
    lf_raw.select(raw_features + [target])
    .filter(pl.col(target) > 0)  # Basic validity
    .collect()
)

# DataFrame.sample without replacement raises when n exceeds the number of rows,
# so cap the request at what is actually available. When enough rows exist this
# draws exactly SAMPLE_SIZE rows with the same seed as before.
n_raw_rows = min(SAMPLE_SIZE, raw_valid.height)

df_raw = (
    raw_valid.sample(n=n_raw_rows, seed=SEED)
    .with_columns([
        # Basic Extraction (The limit of "Raw" analysis)
        pl.col("pickup_datetime").dt.hour().alias("hour"),
        pl.col("pickup_datetime").dt.weekday().alias("dow"),
        # Cast Flags
        pl.col("wav_request_flag").cast(pl.String).cast(pl.Categorical),
        pl.col("shared_request_flag").cast(pl.String).cast(pl.Categorical),
        pl.col("hvfhs_license_num").cast(pl.Categorical),
        pl.col("dispatching_base_num").cast(pl.Categorical),
        pl.col("PULocationID").cast(pl.Categorical),  # Treat ID as Cat
        pl.col("DOLocationID").cast(pl.Categorical),
    ])
    .drop("pickup_datetime")  # Drop original timestamp
    .to_pandas()
)

# The full filtered frame is no longer needed once the sample is taken; drop the
# reference so the kernel does not keep it alive for the rest of the session.
del raw_valid

# --- 2. TRAINING (RAW) ---
print("🚀 Training Raw Model (LightGBM)...")

X_raw = df_raw.drop(columns=[target])
y_raw = df_raw[target]

# 80/20 hold-out split, seeded for reproducibility.
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X_raw, y_raw, test_size=0.2, random_state=SEED)

# LGBM Pipeline
# These hyperparameters are mirrored in the processed-model cell so the two runs
# differ only in their input features.
model_raw = lgb.LGBMRegressor(
    n_estimators=500, learning_rate=0.1, num_leaves=31, random_state=SEED, n_jobs=-1, verbose=-1
)

model_raw.fit(X_train_r, y_train_r)

# --- 3. EVALUATION (RAW) ---
y_pred_r = model_raw.predict(X_test_r)

metrics_raw = {
    "MAE": mean_absolute_error(y_test_r, y_pred_r),
    # RMSE is derived from MSE by taking the square root.
    "RMSE": np.sqrt(mean_squared_error(y_test_r, y_pred_r)),
    "R2": r2_score(y_test_r, y_pred_r),
}

print("\n📊 RAW MODEL RESULTS:")
print(f"   MAE:  ${metrics_raw['MAE']:.2f}")
print(f"   RMSE: ${metrics_raw['RMSE']:.2f}")
print(f"   R2:    {metrics_raw['R2']:.4f}")

# Save for comparison later
# Both stores are keyed by model label; the processed cell adds its own entry.
results_store = {"Raw": metrics_raw}
predictions_store = {"Raw": (y_test_r, y_pred_r)}


📦 Loading Raw Data...
🚀 Training Raw Model (LightGBM)...

📊 RAW MODEL RESULTS:
   MAE:  $12.61
   RMSE: $19.81
   R2:    0.0512


In [8]:
# --- CONFIGURATION ---
PROCESSED_FILE_PATH = r"X:\Programming\Python\Projects\Data processing\TLC NYC datasets\HVFHV subsets 2019-2025 - Samples\tlc_sample_2024_processed.parquet"

# --- 1. DATA LOADING (PROCESSED) ---
# Second experiment: same target and model settings as the raw cell, but trained
# on the engineered feature set. Requires `metrics_raw`, `results_store` and
# `predictions_store` from the raw-model cell to already exist.
print("📦 Loading Processed Data...")
lf_proc = pl.scan_parquet(PROCESSED_FILE_PATH)

# --- 2. FEATURE SELECTION (The Strict Filter) ---
# We explicitly list ONLY features known before the trip starts.

# A. Spatial (Geometric & Categorical)
# Note: 'straight_line_dist_km' is allowed (It's just Map Math, not Odometer)
feat_spatial = [
    "PULocationID",
    "DOLocationID",
    "pickup_borough",
    "dropoff_borough",
    "borough_flow_type",
    "trip_archetype",
    "straight_line_dist_km",
    "bearing_degrees",
]

# B. Temporal (Cyclical & Cultural)
# We use the Pre-Calculated engineered features
feat_temporal = [
    "cyclical_hour_sin",
    "cyclical_hour_cos",
    "cyclical_day_sin",
    "cyclical_day_cos",
    "cyclical_month_sin",
    "cyclical_month_cos",
    "cultural_day_type",
    "time_of_day_bin",
]

# C. Weather (Context)
# Forecasts are known.
feat_weather = ["temp", "is_bad_weather", "is_extreme_weather", "weather_state", "visibility_status"]

# D. Request Flags
feat_flags = ["wav_request_flag", "shared_request_flag"]

# Combine
proc_features = feat_spatial + feat_temporal + feat_weather + feat_flags
target = "base_passenger_fare"

# --- 3. PREPARATION ---
# Columns handed to LightGBM as categoricals (order matches the original casts).
proc_categorical_cols = [
    "PULocationID",
    "DOLocationID",
    "pickup_borough",
    "dropoff_borough",
    "borough_flow_type",
    "trip_archetype",
    "cultural_day_type",
    "time_of_day_bin",
    "weather_state",
    "visibility_status",
]

# Select and filter on the LazyFrame, then materialise the valid rows.
proc_valid = (
    lf_proc.select(proc_features + [target])
    .filter(pl.col(target) > 0)
    .collect()
)

# DataFrame.sample without replacement raises when n exceeds the number of rows,
# so cap the request at what is actually available. When enough rows exist this
# draws exactly SAMPLE_SIZE rows with the same seed as before.
n_proc_rows = min(SAMPLE_SIZE, proc_valid.height)

df_proc = (
    proc_valid.sample(n=n_proc_rows, seed=SEED)
    .with_columns(
        # Cast Categoricals for LightGBM
        [pl.col(name).cast(pl.Categorical) for name in proc_categorical_cols]
        + [
            # Ensure Numerical Stability
            # Nulls become 0 km for distance and the sentinel -1 for bearing.
            pl.col("straight_line_dist_km").fill_null(0),
            pl.col("bearing_degrees").fill_null(-1),
        ]
    )
    .to_pandas()
)

# The full filtered frame is no longer needed once the sample is taken.
del proc_valid

# --- 4. TRAINING (PROCESSED) ---
print(f"🚀 Training Processed Model (Features: {len(proc_features)})...")

X_proc = df_proc.drop(columns=[target])
y_proc = df_proc[target]

# 80/20 hold-out split, seeded for reproducibility.
X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(X_proc, y_proc, test_size=0.2, random_state=SEED)

# Same Hyperparameters for Fair Comparison
model_proc = lgb.LGBMRegressor(
    n_estimators=500,
    learning_rate=0.1,
    num_leaves=31,
    random_state=SEED,
    n_jobs=-1,
    verbose=-1,
)

model_proc.fit(X_train_p, y_train_p)

# --- 5. EVALUATION ---
y_pred_p = model_proc.predict(X_test_p)

metrics_proc = {
    "MAE": mean_absolute_error(y_test_p, y_pred_p),
    # RMSE is derived from MSE by taking the square root.
    "RMSE": np.sqrt(mean_squared_error(y_test_p, y_pred_p)),
    "R2": r2_score(y_test_p, y_pred_p),
}

print("\n📊 PROCESSED MODEL RESULTS:")
print(f"   MAE:  ${metrics_proc['MAE']:.2f}")
print(f"   RMSE: ${metrics_proc['RMSE']:.2f}")
print(f"   R2:    {metrics_proc['R2']:.4f}")

# Store Results
results_store["Processed"] = metrics_proc
predictions_store["Processed"] = (y_test_p, y_pred_p)

# --- 6. IMPACT ANALYSIS ---
# Absolute change in R2 and relative (percentage) reduction in MAE vs the raw model.
delta_r2 = metrics_proc["R2"] - metrics_raw["R2"]
pct_imp_mae = ((metrics_raw["MAE"] - metrics_proc["MAE"]) / metrics_raw["MAE"]) * 100

print("\n📈 IMPACT SUMMARY:")
# The "+" format flag emits the correct sign; a literal "+" prefix would print
# "+-0.1234" whenever the processed model scores lower than the raw one.
print(f"   R2 Improvement: {delta_r2:+.4f} (Absolute)")
print(f"   Error Reduction (MAE): {pct_imp_mae:.1f}%")


📦 Loading Processed Data...
🚀 Training Processed Model (Features: 23)...

📊 PROCESSED MODEL RESULTS:
   MAE:  $5.78
   RMSE: $9.97
   R2:    0.7293

📈 IMPACT SUMMARY:
   R2 Improvement: +0.6781 (Absolute)
   Error Reduction (MAE): 54.2%


In [26]:
# --- PLOTTING ---
# 1. Metric Comparison
# Grouped bars of MAE and RMSE for the raw and processed models, shown inline and
# exported as PNG and HTML.
metrics_names = ["MAE", "RMSE"]
raw_vals = [metrics_raw[_metric] for _metric in metrics_names]
proc_vals = [metrics_proc[_metric] for _metric in metrics_names]

# One entry per model: legend label, bar heights, and trace-specific styling.
# Only the raw trace sets an explicit opacity.
_bar_specs = [
    ("Raw Model", raw_vals, {"marker_color": ub.UBER_BLACK, "opacity": 0.3}),
    ("Processed Model", proc_vals, {"marker_color": ub.UBER_GREEN}),
]

fig_metrics = go.Figure()
for _label, _values, _look in _bar_specs:
    _bar = go.Bar(
        x=metrics_names,
        y=_values,
        name=_label,
        text=_values,
        texttemplate="$%{text:.2f}",  # dollar-formatted value label on each bar
        textposition="auto",
        **_look,
    )
    fig_metrics.add_trace(_bar)

# Apply the shared title/subtitle/source styling from the local style module.
_branding = {
    "title": "<b>Model Accuracy: The Power of Engineering</b>",
    "subtitle": "Lower Error = Better Prediction of Fare",
    "source": "LightGBM Regression (2024 Sample)",
}
fig_metrics = ub.apply_uber_branding(fig_metrics, **_branding)
fig_metrics.update_layout(height=600)
fig_metrics.show()

# Static export settings for the PNG; the HTML export keeps the interactive figure.
_png_options = {"height": 600, "width": 1300, "scale": 8}
fig_metrics.write_image("modeling_raw_vs_processed_mae_rmse.png", **_png_options)
fig_metrics.write_html("modeling_raw_vs_processed_mae_rmse.html")

In [31]:
# 2. Actual vs Predicted (Subsampled for speed)
# Plot only the first `subset` test rows of each model to keep the scatter light.
subset = 10000
y_true_r, y_pred_r = predictions_store["Raw"]
y_true_p, y_pred_p = predictions_store["Processed"]

# Read R² from the stored metrics so the panel titles always match the models
# that were actually fitted, instead of hard-coding numbers from an earlier run.
r2_raw = results_store["Raw"]["R2"]
r2_proc = results_store["Processed"]["R2"]

fig_scatter = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=(
        f"<b>Raw Model (R²={r2_raw:.2f})</b>",
        f"<b>Processed Model (R²={r2_proc:.2f})</b>",
    ),
)

# Raw
fig_scatter.add_trace(
    go.Scatter(
        x=y_true_r[:subset],
        y=y_pred_r[:subset],
        mode="markers",
        marker=dict(color=ub.UBER_BLACK, size=4, opacity=0.3),
        name="Raw",
    ),
    row=1,
    col=1,
)

# Processed
fig_scatter.add_trace(
    go.Scatter(
        x=y_true_p[:subset],
        y=y_pred_p[:subset],
        mode="markers",
        marker=dict(color=ub.UBER_GREEN, size=4, opacity=0.5),
        name="Processed",
    ),
    row=1,
    col=2,
)

# Reference Lines (Perfect Prediction)
# Dashed y = x diagonal on each panel; it extends past the visible 0-100 axis range.
for col in [1, 2]:
    fig_scatter.add_shape(type="line", x0=0, y0=0, x1=150, y1=150, line=dict(color="red", dash="dash"), row=1, col=col)

fig_scatter.update_layout(height=500, showlegend=False, template="uber")
# Both panels share the same fixed axis window, so fares above $100 are not shown.
fig_scatter.update_xaxes(title_text="Actual Fare ($)", range=[0, 100])
fig_scatter.update_yaxes(title_text="Predicted Fare ($)", range=[0, 100])
fig_scatter = ub.apply_uber_branding(fig_scatter, title="<b>Prediction Fidelity</b>")
# Final on-screen size; this replaces the height set before branding was applied.
fig_scatter.update_layout(height=650, width=1400)
fig_scatter.show()
fig_scatter.write_image(
    "modeling_raw_vs_processed_actual_vs_predicted_r2.png",
    height=650,
    width=1400,
    scale=8,
)
fig_scatter.write_html("modeling_raw_vs_processed_actual_vs_predicted_r2.html")


In [None]:
# 3. Feature Importance (Processed Only)
# Horizontal bar chart of the ten largest LightGBM feature importances of the
# processed model, shown inline and exported as PNG and HTML.
_all_importances = pd.DataFrame(
    {
        "Feature": X_proc.columns,
        "Importance": model_proc.feature_importances_,
    }
)
# Ascending sort + last ten rows: the top ten, with the largest value in the last row.
importance = _all_importances.sort_values(by="Importance", ascending=True).iloc[-10:]

_importance_bar = go.Bar(
    x=importance["Importance"],
    y=importance["Feature"],
    orientation="h",
    marker_color=ub.GRAY_700,
)
fig_imp = go.Figure(data=[_importance_bar])

_imp_branding = {
    "title": "<b>Winning Engineered Features</b>",
    "subtitle": "Which signals drive the price? <i>(Only taking into account the engineered features)</i>",
}
fig_imp = ub.apply_uber_branding(fig_imp, **_imp_branding)
# Wider left margin to leave room for the feature-name tick labels.
fig_imp.update_layout(margin={"l": 160}, height=600)
fig_imp.show()

_imp_png_options = {"height": 600, "width": 1400, "scale": 8}
fig_imp.write_image("modeling_raw_vs_processed_engineered_features_importance.png", **_imp_png_options)
fig_imp.write_html("modeling_raw_vs_processed_engineered_features_importance.html")
