In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd

from src.paths import TRANSFORMED_DATA_DIR

df = pd.read_parquet(TRANSFORMED_DATA_DIR/"tabular_data.parquet")
df

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id,target_rides_next_hour
0,11.0,15.0,26.0,8.0,9.0,7.0,3.0,1.0,0.0,3.0,...,11.0,7.0,4.0,3.0,4.0,9.0,19.0,2022-01-29,4,17.0
1,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,5.0,4.0,10.0,7.0,5.0,9.0,10.0,2022-01-30,4,9.0
2,0.0,1.0,0.0,0.0,1.0,1.0,1.0,3.0,2.0,3.0,...,8.0,7.0,8.0,5.0,5.0,10.0,0.0,2022-01-31,4,3.0
3,1.0,1.0,0.0,0.0,0.0,3.0,2.0,3.0,4.0,5.0,...,3.0,16.0,7.0,1.0,0.0,1.0,3.0,2022-02-01,4,3.0
4,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,1.0,2.0,...,3.0,8.0,3.0,0.0,4.0,4.0,3.0,2022-02-02,4,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88289,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-12-27,199,0.0
88290,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-12-28,199,0.0
88291,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-12-29,199,0.0
88292,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-12-30,199,0.0


In [3]:
from datetime import datetime
from src.data_split import train_test_split

X_train, y_train, X_test, y_test = train_test_split(
    df,
    cutoff_date= datetime(2022, 6, 1, 0, 0, 0),
    target_column_name= "target_rides_next_hour"
)

print(f"{X_train.shape=}")
print(f"{X_test.shape=}")
print(f"{y_train.shape=}")
print(f"{y_test.shape=}")

X_train.shape=(32226, 674)
X_test.shape=(56068, 674)
y_train.shape=(32226,)
y_test.shape=(56068,)


In [19]:
import numpy as np
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
import optuna

from src.model import get_pipeline

def objective(trial : optuna.trial.Trial) ->float :
    """"
    Given a set of hyperparameters, it trains a model and computes an average
    validation error based on a TimeSeriesSplit
    """

    # pick hyper-parameters

    hyperparams = {
        "metric" : "mae",
        "verbose" : -1,
        "num_leaves" : trial.suggest_int("num_leaves", 2, 256),
        "features_fraction" : trial.suggest_float("features_fraction", 0.2, 1.0),
        "bagging_fraction" : trial.suggest_float("bagging_fraction", 0.2, 1.0),
        "min_child_samples" : trial.suggest_int("min_child_samples", 3, 100)
    }


    tss=TimeSeriesSplit(n_splits=4)
    scores = []

    for train_index, val_index in tss.split(X_train):

        # split the data for training and validation
        X_train_, X_val_ = X_train.iloc[train_index,:], X_train.iloc[val_index, :]
        y_train_, y_val_ = y_train.iloc[train_index], y_train.iloc[val_index]

        # train the model
        pipeline = get_pipeline(**hyperparams)
        pipeline.fit(X_train_,y_train_)

        #evaluate the model
        y_pred = pipeline.predict(X_val_)
        mae=mean_absolute_error(y_val_,y_pred)

        scores.append(mae)

        return np.array(scores).mean()


In [20]:
study = optuna.create_study (direction="minimize")
study.optimize(objective, n_trials=5, n_jobs=-1)

[I 2024-05-06 18:36:07,656] A new study created in memory with name: no-name-68a5d449-2cc0-4e1a-a8d1-db75b72e98e4
[I 2024-05-06 18:36:24,578] Trial 1 finished with value: 5.311711197326946 and parameters: {'num_leaves': 53, 'features_fraction': 0.22410165390918682, 'bagging_fraction': 0.4024835862099092, 'min_child_samples': 73}. Best is trial 1 with value: 5.311711197326946.
[I 2024-05-06 18:36:28,698] Trial 4 finished with value: 5.466420291723877 and parameters: {'num_leaves': 71, 'features_fraction': 0.5344078729876447, 'bagging_fraction': 0.9099751757756438, 'min_child_samples': 43}. Best is trial 1 with value: 5.311711197326946.
[I 2024-05-06 18:36:30,635] Trial 3 finished with value: 5.431665163750779 and parameters: {'num_leaves': 84, 'features_fraction': 0.2298471978425467, 'bagging_fraction': 0.9035566518393721, 'min_child_samples': 19}. Best is trial 1 with value: 5.311711197326946.
[I 2024-05-06 18:36:42,760] Trial 0 finished with value: 5.383364909184525 and parameters: {'

In [22]:
best_params = study.best_trial.params
print(f"{best_params=}")

best_params={'num_leaves': 53, 'features_fraction': 0.22410165390918682, 'bagging_fraction': 0.4024835862099092, 'min_child_samples': 73}


In [23]:
pipeline = get_pipeline(**best_params)
pipeline.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.210226 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 154527
[LightGBM] [Info] Number of data points in the train set: 32226, number of used features: 675
[LightGBM] [Info] Start training from score 11.703562


In [24]:
predictions = pipeline.predict(X_test)
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae=:.4f}')

test_mae=2.5916


In [28]:
from src.plot import plot_one_sample

plot_one_sample(
    features=X_test,
    targets=y_test,
    example_id=2900,
    predictions=pd.Series(predictions)
)

SyntaxError: non-default argument follows default argument (plot.py, line 10)