In [7]:
import os, sys
os.getcwd() #current working directory, path
sys.path.append('c:\\Projects\\taxi_predict\\src')


In [None]:
'''
Hyper-parameters are parameters that we need to fix before we train our model.
We need to pass them. (if we dont pass them we are telling the model to use the default one).

So we split our data:
- trianing data and validation data
(
1. Training data:
We find the parameters
    model.fit(x_train, y_trian)

2. Validation data:
Then we get validation error. We predict.
    y_pred = model.predict(x_val)
    mae(y_val, y_pred)

)
- test data
'''

In [8]:
import warnings
warnings.filterwarnings("ignore")

In [9]:
import pandas as pd
from paths import TRANSFORMED_DATA_DIR

df = pd.read_parquet(TRANSFORMED_DATA_DIR / 'tabular_data.parquet')
df

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id,target_rides_next_hour
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2023-01-29,1,0.0
1,0.0,0.0,0.0,0.0,1.0,2.0,2.0,2.0,0.0,2.0,...,2.0,1.0,1.0,0.0,0.0,0.0,0.0,2023-01-30,1,0.0
2,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,2.0,0.0,1.0,0.0,0.0,0.0,2023-01-31,1,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2023-02-01,1,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,2023-02-02,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89300,1.0,0.0,2.0,0.0,0.0,3.0,3.0,0.0,3.0,2.0,...,2.0,2.0,1.0,2.0,3.0,1.0,3.0,2023-12-27,265,3.0
89301,5.0,7.0,2.0,1.0,0.0,1.0,2.0,2.0,3.0,3.0,...,6.0,4.0,2.0,4.0,10.0,3.0,3.0,2023-12-28,265,1.0
89302,5.0,3.0,2.0,3.0,1.0,3.0,1.0,5.0,3.0,1.0,...,3.0,1.0,8.0,5.0,1.0,0.0,6.0,2023-12-29,265,5.0
89303,3.0,4.0,9.0,4.0,1.0,2.0,0.0,0.0,0.0,2.0,...,6.0,3.0,2.0,2.0,5.0,1.0,5.0,2023-12-30,265,2.0


In [10]:
from datetime import datetime
from data_split import train_test_split

x_train, y_train, x_test, y_test = train_test_split(
    df,
    cutoff_date=datetime(2023, 6, 1, 0, 0, 0),
    target_column_name="target_rides_next_hour"
)

print(f'{x_train.shape=}')
print(f'{y_train.shape=}')
print(f'{x_test.shape=}')
print(f'{y_test.shape=}')

x_train.shape=(32595, 674)
y_train.shape=(32595,)
x_test.shape=(56710, 674)
y_test.shape=(56710,)


In [11]:
import numpy as np
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
import optuna #we use optuna for hyperparameters

from model import get_pipeline

def objective(trial: optuna.trial.Trial) -> float:
    """
    Given a set of hyper-parameters, it trains a model and computes an average
    validation error based on a TimeSeriesSplit
    """
    # pick hyper-parameters
    hyperparams = {
        "metric": 'mae',
        "verbose": -1,
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 3, 100),   
    }
       
    tss = TimeSeriesSplit(n_splits=4)
    scores = []
    
    # sort X_train by `pikup_hour` inplace
    # so the TimeSeriesSplit will split the data in a consistent way
    x_train.sort_values('pickup_hour', inplace=True)

    for train_index, val_index in tss.split(x_train):

        # split data for training and validation
        x_train_, x_val = x_train.iloc[train_index, :], x_train.iloc[val_index,:]
        y_train_, y_val_ = y_train.iloc[train_index], y_train.iloc[val_index]
        
        # train the model
        pipeline = get_pipeline(**hyperparams)
        pipeline.fit(x_train_, y_train_)
        
        # evaluate the model
        y_pred = pipeline.predict(x_val)
        mae = mean_absolute_error(y_val_, y_pred)

        scores.append(mae)
   
    # Return the mean score
    return np.array(scores).mean()

In [12]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=5)

[I 2024-03-14 14:30:31,823] A new study created in memory with name: no-name-a8117bf8-d586-4466-bfc9-6a376562f367
[I 2024-03-14 14:31:05,269] Trial 0 finished with value: 18.194413918447637 and parameters: {'num_leaves': 208, 'feature_fraction': 0.9831652423745589, 'bagging_fraction': 0.9297715613387425, 'min_child_samples': 96}. Best is trial 0 with value: 18.194413918447637.
[I 2024-03-14 14:31:13,524] Trial 1 finished with value: 17.74294833181788 and parameters: {'num_leaves': 38, 'feature_fraction': 0.33961353937511696, 'bagging_fraction': 0.9347451440755139, 'min_child_samples': 81}. Best is trial 1 with value: 17.74294833181788.
[I 2024-03-14 14:31:31,638] Trial 2 finished with value: 18.02705795394459 and parameters: {'num_leaves': 110, 'feature_fraction': 0.5368024839989147, 'bagging_fraction': 0.3001783813560399, 'min_child_samples': 94}. Best is trial 1 with value: 17.74294833181788.
[I 2024-03-14 14:31:52,401] Trial 3 finished with value: 17.907892475877222 and parameters: 

In [13]:
best_params = study.best_trial.params
print(f'{best_params=}')

best_params={'num_leaves': 38, 'feature_fraction': 0.33961353937511696, 'bagging_fraction': 0.9347451440755139, 'min_child_samples': 81}


In [14]:
pipeline = get_pipeline(**best_params)
pipeline.fit(x_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.076022 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 156147
[LightGBM] [Info] Number of data points in the train set: 32595, number of used features: 675
[LightGBM] [Info] Start training from score 11.289768


In [15]:
predictions = pipeline.predict(x_test)
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae=:.4f}')

test_mae=18.6810


In [16]:
from plot import plot_one_sample

plot_one_sample(
    example_id=2979,
    features=x_test,
    targets=y_test,
    predictions=pd.Series(predictions)
)

In [41]:
plot_one_sample(
    example_id=1300,
    features=x_test,
    targets=y_test,
    predictions=pd.Series(predictions)
)