In [24]:
%reload_ext autoreload
%autoreload 2

In [25]:
import sys
sys.path.append('/home/mcamara/taxi-demand-predictor/')

In [26]:

import warnings
warnings.filterwarnings("ignore")

In [27]:
import pandas as pd
from src.paths import TRANSFORMED_DATA_DIR

df = pd.read_parquet(TRANSFORMED_DATA_DIR / 'tabular_data.parquet')
df

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id,target_rides_next_hour
0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,...,2.0,0.0,1.0,0.0,0.0,0.0,0.0,2022-01-29,1,0.0
1,0.0,0.0,0.0,0.0,0.0,4.0,1.0,2.0,1.0,2.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2022-01-30,1,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,1.0,2.0,0.0,0.0,0.0,0.0,2022-01-31,1,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,2.0,1.0,0.0,1.0,1.0,0.0,0.0,2022-02-01,1,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-02-02,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81085,6.0,7.0,7.0,4.0,9.0,1.0,3.0,5.0,4.0,7.0,...,5.0,8.0,7.0,2.0,1.0,5.0,1.0,2022-11-26,265,2.0
81086,10.0,5.0,4.0,11.0,8.0,2.0,2.0,5.0,7.0,8.0,...,6.0,2.0,1.0,3.0,2.0,8.0,4.0,2022-11-27,265,3.0
81087,6.0,2.0,2.0,0.0,3.0,7.0,24.0,20.0,10.0,18.0,...,5.0,5.0,4.0,4.0,8.0,1.0,7.0,2022-11-28,265,10.0
81088,4.0,6.0,0.0,1.0,1.0,7.0,22.0,18.0,25.0,17.0,...,10.0,11.0,13.0,3.0,4.0,3.0,1.0,2022-11-29,265,3.0


In [28]:
from datetime import datetime
from src.data_split import train_test_split

X_train, y_train, X_test, y_test = train_test_split(
    df,
    cutoff_date=datetime(2022, 6, 1, 0, 0, 0),
    target_column_name='target_rides_next_hour'
)

print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')

X_train.shape=(32595, 674)
y_train.shape=(32595,)
X_test.shape=(48495, 674)
y_test.shape=(48495,)


In [29]:
import numpy as np
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
from  catboost import CatBoostRegressor
import optuna

from src.model import get_pipeline

def objective(trial: optuna.trial.Trial) -> float:
    """
    Given a set of hyper-parameters, it trains a model and computes an average
    validation error based on a TimeSeriesSplit
    """
    # pick hyper-parameters
    hyperparams = {
        'iterations': trial.suggest_int('iterations', 100, 1000, step=100),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-8, 10),
        'border_count': trial.suggest_int('border_count', 32, 255, step=32),
        'eval_metric': 'MAE',
        'random_seed': 42,
        'thread_count': -1,
        'verbose': True   
    }
       
    tss = TimeSeriesSplit(n_splits=2)
    scores = []
    for train_index, val_index in tss.split(X_train):

        # split data for training and validation
        X_train_, X_val_ = X_train.iloc[train_index, :], X_train.iloc[val_index,:]
        y_train_, y_val_ = y_train.iloc[train_index], y_train.iloc[val_index]
        
        # train the model
        #regressor = CatBoostRegressor()
        pipeline = get_pipeline(**hyperparams)
        pipeline.fit(X_train_, y_train_)
        
        # evaluate the model
        y_pred = pipeline.predict(X_val_)
        mae = mean_absolute_error(y_val_, y_pred)

        scores.append(mae)
   
    # Return the mean score
    return np.array(scores).mean()

In [30]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=5)


[32m[I 2023-03-16 22:22:47,529][0m A new study created in memory with name: no-name-742d8ee6-3100-4655-b8ad-93c11e26c66e[0m


0:	learn: 35.2063205	total: 1.23s	remaining: 8m 9s
1:	learn: 34.8201473	total: 2.32s	remaining: 7m 40s
2:	learn: 34.4404757	total: 3.34s	remaining: 7m 22s
3:	learn: 34.0833040	total: 4.48s	remaining: 7m 23s
4:	learn: 33.7316386	total: 5.53s	remaining: 7m 17s
5:	learn: 33.3625266	total: 6.67s	remaining: 7m 18s
6:	learn: 33.0043142	total: 7.81s	remaining: 7m 18s
7:	learn: 32.6492456	total: 9.03s	remaining: 7m 22s
8:	learn: 32.3039621	total: 10.1s	remaining: 7m 17s
9:	learn: 31.9750642	total: 11.2s	remaining: 7m 14s
10:	learn: 31.6363264	total: 12.2s	remaining: 7m 11s
11:	learn: 31.3047579	total: 13.1s	remaining: 7m 4s
12:	learn: 30.9520737	total: 14.2s	remaining: 7m 1s
13:	learn: 30.6082869	total: 15.1s	remaining: 6m 56s
14:	learn: 30.2732760	total: 16.3s	remaining: 6m 59s
15:	learn: 29.9331388	total: 17.6s	remaining: 7m 3s
16:	learn: 29.6039881	total: 18.7s	remaining: 7m
17:	learn: 29.3138029	total: 19.7s	remaining: 6m 58s
18:	learn: 29.0137290	total: 20.8s	remaining: 6m 56s
19:	learn: 

[32m[I 2023-03-16 22:38:50,092][0m Trial 0 finished with value: 4.019203425201351 and parameters: {'iterations': 400, 'learning_rate': 0.013265920088712154, 'depth': 10, 'l2_leaf_reg': 1.8856647286019512, 'border_count': 224}. Best is trial 0 with value: 4.019203425201351.[0m


0:	learn: 35.0391578	total: 600ms	remaining: 2m 59s
1:	learn: 34.5111429	total: 1.37s	remaining: 3m 24s
2:	learn: 33.9838868	total: 2.09s	remaining: 3m 26s
3:	learn: 33.4709744	total: 2.7s	remaining: 3m 19s
4:	learn: 32.9629383	total: 3.38s	remaining: 3m 19s
5:	learn: 32.4746244	total: 4.03s	remaining: 3m 17s
6:	learn: 31.9793919	total: 4.69s	remaining: 3m 16s
7:	learn: 31.5021940	total: 5.3s	remaining: 3m 13s
8:	learn: 31.0366540	total: 5.95s	remaining: 3m 12s
9:	learn: 30.5748951	total: 6.64s	remaining: 3m 12s
10:	learn: 30.1232540	total: 7.3s	remaining: 3m 11s
11:	learn: 29.6618467	total: 7.93s	remaining: 3m 10s
12:	learn: 29.2164437	total: 8.7s	remaining: 3m 12s
13:	learn: 28.7752204	total: 9.33s	remaining: 3m 10s
14:	learn: 28.3428030	total: 9.99s	remaining: 3m 9s
15:	learn: 27.9185151	total: 10.6s	remaining: 3m 8s
16:	learn: 27.4997885	total: 11.2s	remaining: 3m 6s
17:	learn: 27.0835598	total: 12s	remaining: 3m 8s
18:	learn: 26.6924447	total: 12.6s	remaining: 3m 6s
19:	learn: 26.

[32m[I 2023-03-16 22:45:53,176][0m Trial 1 finished with value: 3.8303472162208285 and parameters: {'iterations': 300, 'learning_rate': 0.01667787962236246, 'depth': 10, 'l2_leaf_reg': 0.22912520003994416, 'border_count': 128}. Best is trial 1 with value: 3.8303472162208285.[0m


0:	learn: 34.9035346	total: 904ms	remaining: 4m 30s
1:	learn: 34.2377617	total: 1.7s	remaining: 4m 14s
2:	learn: 33.5837319	total: 2.51s	remaining: 4m 8s
3:	learn: 32.9489182	total: 3.45s	remaining: 4m 15s
4:	learn: 32.3264787	total: 4.39s	remaining: 4m 19s
5:	learn: 31.7144309	total: 5.24s	remaining: 4m 16s
6:	learn: 31.1204277	total: 6.2s	remaining: 4m 19s
7:	learn: 30.5346396	total: 7.01s	remaining: 4m 15s
8:	learn: 29.9593947	total: 7.82s	remaining: 4m 12s
9:	learn: 29.3982665	total: 8.7s	remaining: 4m 12s
10:	learn: 28.8490716	total: 9.55s	remaining: 4m 10s
11:	learn: 28.3100992	total: 10.5s	remaining: 4m 12s
12:	learn: 27.7715509	total: 11.4s	remaining: 4m 11s
13:	learn: 27.2457120	total: 12.3s	remaining: 4m 10s
14:	learn: 26.7340417	total: 13.2s	remaining: 4m 11s
15:	learn: 26.2320520	total: 14.2s	remaining: 4m 11s
16:	learn: 25.7398409	total: 14.9s	remaining: 4m 8s
17:	learn: 25.2609247	total: 15.9s	remaining: 4m 9s
18:	learn: 24.7947516	total: 16.7s	remaining: 4m 6s
19:	learn:

[32m[I 2023-03-16 22:54:28,422][0m Trial 2 finished with value: 3.6954513313642 and parameters: {'iterations': 300, 'learning_rate': 0.019963881212373205, 'depth': 10, 'l2_leaf_reg': 0.00038974496598351716, 'border_count': 160}. Best is trial 2 with value: 3.6954513313642.[0m


0:	learn: 35.4832537	total: 66.5ms	remaining: 26.5s
1:	learn: 35.3902221	total: 99.4ms	remaining: 19.8s
2:	learn: 35.2991738	total: 126ms	remaining: 16.6s
3:	learn: 35.2068696	total: 147ms	remaining: 14.5s
4:	learn: 35.1138353	total: 166ms	remaining: 13.2s
5:	learn: 35.0192313	total: 184ms	remaining: 12.1s
6:	learn: 34.9266063	total: 203ms	remaining: 11.4s
7:	learn: 34.8337663	total: 222ms	remaining: 10.9s
8:	learn: 34.7417722	total: 242ms	remaining: 10.5s
9:	learn: 34.6520710	total: 262ms	remaining: 10.2s
10:	learn: 34.5616591	total: 285ms	remaining: 10.1s
11:	learn: 34.4713347	total: 311ms	remaining: 10.1s
12:	learn: 34.3811374	total: 331ms	remaining: 9.87s
13:	learn: 34.2917410	total: 351ms	remaining: 9.69s
14:	learn: 34.2032407	total: 372ms	remaining: 9.54s
15:	learn: 34.1118584	total: 392ms	remaining: 9.4s
16:	learn: 34.0232645	total: 411ms	remaining: 9.26s
17:	learn: 33.9325645	total: 430ms	remaining: 9.13s
18:	learn: 33.8434479	total: 449ms	remaining: 9s
19:	learn: 33.7553420	to

[32m[I 2023-03-16 22:54:51,058][0m Trial 3 finished with value: 7.114559002616745 and parameters: {'iterations': 400, 'learning_rate': 0.002884100703554064, 'depth': 5, 'l2_leaf_reg': 2.0598960982636958e-06, 'border_count': 64}. Best is trial 2 with value: 3.6954513313642.[0m


0:	learn: 35.2000599	total: 41.3ms	remaining: 4.09s
1:	learn: 34.8283880	total: 56.5ms	remaining: 2.77s
2:	learn: 34.4536898	total: 66.7ms	remaining: 2.15s
3:	learn: 34.0868784	total: 78.2ms	remaining: 1.88s
4:	learn: 33.7318460	total: 88.4ms	remaining: 1.68s
5:	learn: 33.3676587	total: 101ms	remaining: 1.58s
6:	learn: 33.0100131	total: 112ms	remaining: 1.48s
7:	learn: 32.6658575	total: 124ms	remaining: 1.42s
8:	learn: 32.3160788	total: 134ms	remaining: 1.36s
9:	learn: 31.9687657	total: 147ms	remaining: 1.32s
10:	learn: 31.6320587	total: 157ms	remaining: 1.27s
11:	learn: 31.3047042	total: 168ms	remaining: 1.23s
12:	learn: 30.9804336	total: 180ms	remaining: 1.2s
13:	learn: 30.6597297	total: 192ms	remaining: 1.18s
14:	learn: 30.3445555	total: 205ms	remaining: 1.16s
15:	learn: 30.0223753	total: 218ms	remaining: 1.14s
16:	learn: 29.7153138	total: 230ms	remaining: 1.12s
17:	learn: 29.4118375	total: 241ms	remaining: 1.1s
18:	learn: 29.1060469	total: 256ms	remaining: 1.09s
19:	learn: 28.79599

[32m[I 2023-03-16 22:54:56,770][0m Trial 4 finished with value: 7.0543373944777095 and parameters: {'iterations': 100, 'learning_rate': 0.011854475590386222, 'depth': 4, 'l2_leaf_reg': 0.0005092872325118907, 'border_count': 32}. Best is trial 2 with value: 3.6954513313642.[0m


In [31]:
best_params = study.best_trial.params
print(f'{best_params=}')

best_params={'iterations': 300, 'learning_rate': 0.019963881212373205, 'depth': 10, 'l2_leaf_reg': 0.00038974496598351716, 'border_count': 160}


In [32]:
pipeline = get_pipeline(**best_params)
pipeline.fit(X_train, y_train)

0:	learn: 35.3306989	total: 1.05s	remaining: 5m 13s
1:	learn: 34.6928889	total: 2.27s	remaining: 5m 38s
2:	learn: 34.0788307	total: 3.69s	remaining: 6m 5s
3:	learn: 33.4633638	total: 4.63s	remaining: 5m 42s
4:	learn: 32.8655630	total: 5.73s	remaining: 5m 37s
5:	learn: 32.2798261	total: 6.76s	remaining: 5m 31s
6:	learn: 31.7069918	total: 7.81s	remaining: 5m 26s
7:	learn: 31.1432469	total: 8.77s	remaining: 5m 20s
8:	learn: 30.5790851	total: 9.79s	remaining: 5m 16s
9:	learn: 30.0369323	total: 11.1s	remaining: 5m 22s
10:	learn: 29.5108862	total: 12.4s	remaining: 5m 25s
11:	learn: 28.9952090	total: 13.7s	remaining: 5m 29s
12:	learn: 28.4926487	total: 14.9s	remaining: 5m 28s
13:	learn: 27.9852878	total: 16.1s	remaining: 5m 28s
14:	learn: 27.4883735	total: 17.4s	remaining: 5m 30s
15:	learn: 27.0035527	total: 19.2s	remaining: 5m 40s
16:	learn: 26.5364565	total: 20.3s	remaining: 5m 37s
17:	learn: 26.0776303	total: 21.7s	remaining: 5m 39s
18:	learn: 25.6320510	total: 23.1s	remaining: 5m 41s
19:	

In [33]:
predictions = pipeline.predict(X_test)
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae=:.4f}')

test_mae=2.5717
