In [1]:
import sys
import os

# Add the parent directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

In [2]:
import pandas as pd
from src.config import TRANSFORMED_DATA_DIR

df = pd.read_parquet(TRANSFORMED_DATA_DIR / "transformed_features_and_target_top50.parquet")
df

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,temp_t-1,wind_speed_t-3,wind_speed_t-2,wind_speed_t-1,precipitation_t-3,precipitation_t-2,precipitation_t-1,target,start_station_id,hour
0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,3.0,6.0,...,,,,,,,,13,A32002,2024-10-28 22:00:00
1,0.0,0.0,3.0,1.0,0.0,0.0,0.0,3.0,6.0,8.0,...,,,,,,,,3,A32002,2024-10-28 23:00:00
2,0.0,3.0,1.0,0.0,0.0,0.0,3.0,6.0,8.0,6.0,...,,,,,,,,1,A32002,2024-10-29 00:00:00
3,3.0,1.0,0.0,0.0,0.0,3.0,6.0,8.0,6.0,8.0,...,,,,,,,,1,A32002,2024-10-29 01:00:00
4,1.0,0.0,0.0,0.0,3.0,6.0,8.0,6.0,8.0,4.0,...,,,,,,,,0,A32002,2024-10-29 02:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404579,16.0,9.0,9.0,3.0,3.0,0.0,0.0,2.0,0.0,0.0,...,,,,,,,,16,M32085,2025-09-30 19:00:00
404580,9.0,9.0,3.0,3.0,0.0,0.0,2.0,0.0,0.0,0.0,...,,,,,,,,11,M32085,2025-09-30 20:00:00
404581,9.0,3.0,3.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,...,,,,,,,,6,M32085,2025-09-30 21:00:00
404582,3.0,3.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,5.0,...,,,,,,,,9,M32085,2025-09-30 22:00:00


In [3]:
df["hour"].info()

<class 'pandas.core.series.Series'>
RangeIndex: 404584 entries, 0 to 404583
Series name: hour
Non-Null Count   Dtype         
--------------   -----         
404584 non-null  datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 3.1 MB


In [5]:
from datetime import datetime

from src.data_utils import split_time_series_data

X_train, y_train, X_test, y_test = split_time_series_data(
    df,
    cutoff_date=datetime(2025, 7, 1, 0, 0, 0),
    target_column="target"
)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(294163, 683)
(294163,)
(110421, 683)
(110421,)


In [6]:
def average_rides_last_4_weeks(X: pd.DataFrame) -> pd.DataFrame:
    last_4_weeks_columns = [
            f"rides_t-{7*6}",  # 1 week ago
            f"rides_t-{14*6}", # 2 weeks ago
            f"rides_t-{21*6}", # 3 weeks ago
            f"rides_t-{28*6}"  # 4 weeks ago
        ]

        # Ensure the required columns exist in the test DataFrame
    for col in last_4_weeks_columns:
        if col not in X.columns:
            raise ValueError(f"Missing required column: {col}")

    # Calculate the average of the last 4 weeks
    X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)

    return X

from sklearn.preprocessing import FunctionTransformer

add_feature_average_rides_last_4_weeks = FunctionTransformer(
    average_rides_last_4_weeks, validate=False
)

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin

class TemporalFeatureEngineer(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_ = X.copy()
        X_["hour_of_day"] = X_["hour"].dt.hour
        X_["day_of_week"] = X_["hour"].dt.dayofweek

        return X_.drop(columns=["hour", "start_station_id"])

add_temporal_features = TemporalFeatureEngineer()


In [9]:
!pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-macosx_12_0_arm64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-macosx_12_0_arm64.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lightgbm
Successfully installed lightgbm-4.6.0


In [10]:
import lightgbm as lgb

from sklearn.pipeline import make_pipeline

pipeline = make_pipeline(
    add_feature_average_rides_last_4_weeks,
    add_temporal_features,
    lgb.LGBMRegressor()
)


In [11]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_error

# Define the parameter grid for LGBMRegressor
param_distributions = {
    "lgbmregressor__num_leaves": [2, 50, 70, 256],
    "lgbmregressor__max_depth": [-1, 10, 20, 30],
    "lgbmregressor__learning_rate": [0.01, 0.05, 0.1, 0.2],
    "lgbmregressor__n_estimators": [100, 200, 500, 1000],
    "lgbmregressor__min_child_samples": [10, 20, 30, 50],
    "lgbmregressor__subsample": [0.6, 0.8, 1.0],
    "lgbmregressor__colsample_bytree": [0.6, 0.8, 1.0],
    "lgbmregressor__reg_alpha": [0, 0.1, 0.5, 1.0],
    "lgbmregressor__reg_lambda": [0, 0.1, 0.5, 1.0],
    "lgbmregressor__feature_fraction": [0.6, 0.7, 0.8, 0.9, 1.0], 
    "lgbmregressor__bagging_fraction": [0.6, 0.7, 0.8, 0.9, 1.0], 
    "lgbmregressor__bagging_freq": [1, 5, 10],
}

# Initialize the RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    n_iter=5,  # Number of parameter settings sampled
    scoring="neg_mean_absolute_error",  # Use MAE as the scoring metric
    cv=3,  # 3-fold cross-validation
    verbose=2,
    random_state=42,
)

# Fit the RandomizedSearchCV on the training data
random_search.fit(X_train, y_train)

# Get the best parameters and the best score
print("Best Parameters:", random_search.best_params_)
print("Best Score (Negative MAE):", random_search.best_score_)

# Evaluate the best model on the test set
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print("Test Set MAE:", mae)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022193 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 40674
[LightGBM] [Info] Number of data points in the train set: 196108, number of used features: 675
[LightGBM] [Info] Start training from score 3.044792
[CV] END lgbmregressor__bagging_fraction=0.8, lgbmregressor__bagging_freq=10, lgbmregressor__colsample_bytree=1.0, lgbmregressor__feature_fraction=0.6, lgbmregressor__learning_rate=0.1, lgbmregressor__max_depth=20, lgbmregressor__min_child_samples=50, lgbmregressor__n_estimators=1000, lgbmregressor__num_leaves=2, lgbmregressor__reg_alpha=0.5, lgbmregressor__reg_lambda=0.1, lgbmregressor__subsample=0.8; total time=  25.6s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021221 seconds.
Yo