In [35]:
%reload_ext autoreload
%autoreload 2

In [36]:
import sys
sys.path.append('/home/mcamara/taxi-demand-predictor/')

In [37]:

import warnings
warnings.filterwarnings("ignore")

In [38]:
import pandas as pd
from src.paths import TRANSFORMED_DATA_DIR

df = pd.read_parquet(TRANSFORMED_DATA_DIR / 'tabular_data.parquet')
df

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id,target_rides_next_hour
0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,...,2.0,0.0,1.0,0.0,0.0,0.0,0.0,2022-01-29,1,0.0
1,0.0,0.0,0.0,0.0,0.0,4.0,1.0,2.0,1.0,2.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2022-01-30,1,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,1.0,2.0,0.0,0.0,0.0,0.0,2022-01-31,1,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,2.0,1.0,0.0,1.0,1.0,0.0,0.0,2022-02-01,1,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-02-02,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81085,6.0,7.0,7.0,4.0,9.0,1.0,3.0,5.0,4.0,7.0,...,5.0,8.0,7.0,2.0,1.0,5.0,1.0,2022-11-26,265,2.0
81086,10.0,5.0,4.0,11.0,8.0,2.0,2.0,5.0,7.0,8.0,...,6.0,2.0,1.0,3.0,2.0,8.0,4.0,2022-11-27,265,3.0
81087,6.0,2.0,2.0,0.0,3.0,7.0,24.0,20.0,10.0,18.0,...,5.0,5.0,4.0,4.0,8.0,1.0,7.0,2022-11-28,265,10.0
81088,4.0,6.0,0.0,1.0,1.0,7.0,22.0,18.0,25.0,17.0,...,10.0,11.0,13.0,3.0,4.0,3.0,1.0,2022-11-29,265,3.0


In [39]:
from datetime import datetime
from src.data_split import train_test_split

X_train, y_train, X_test, y_test = train_test_split(
    df,
    cutoff_date=datetime(2022, 6, 1, 0, 0, 0),
    target_column_name='target_rides_next_hour'
)

print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')

X_train.shape=(32595, 674)
y_train.shape=(32595,)
X_test.shape=(48495, 674)
y_test.shape=(48495,)


In [40]:
import numpy as np
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
from  catboost import CatBoostRegressor
import optuna

from src.model import get_pipeline

def objective(trial: optuna.trial.Trial) -> float:
    """
    Given a set of hyper-parameters, it trains a model and computes an average
    validation error based on a TimeSeriesSplit
    """
    # pick hyper-parameters
    hyperparams = {
        'iterations': trial.suggest_int('iterations', 100, 1000, step=100),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-8, 10),
        'border_count': trial.suggest_int('border_count', 32, 255, step=32),
        'eval_metric': 'MAE',
        'random_seed': 42,
        'thread_count': -1,
        'verbose': True   
    }
       
    tss = TimeSeriesSplit(n_splits=2)
    scores = []
    for train_index, val_index in tss.split(X_train):

        # split data for training and validation
        X_train_, X_val_ = X_train.iloc[train_index, :], X_train.iloc[val_index,:]
        y_train_, y_val_ = y_train.iloc[train_index], y_train.iloc[val_index]
        
        # train the model
        #regressor = CatBoostRegressor()
        pipeline = get_pipeline(**hyperparams)
        pipeline.fit(X_train_, y_train_)
        
        # evaluate the model
        y_pred = pipeline.predict(X_val_)
        mae = mean_absolute_error(y_val_, y_pred)

        scores.append(mae)
   
    # Return the mean score
    return np.array(scores).mean()

In [41]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=5)


[32m[I 2023-03-16 23:37:55,551][0m A new study created in memory with name: no-name-ba4b1983-4ca9-4b6d-bf3f-703a4c551e36[0m


0:	learn: 10.9993471	total: 102ms	remaining: 50.7s
1:	learn: 10.5667446	total: 183ms	remaining: 45.6s
2:	learn: 10.1251472	total: 213ms	remaining: 35.3s
3:	learn: 9.6985185	total: 268ms	remaining: 33.2s
4:	learn: 9.2770200	total: 310ms	remaining: 30.7s
5:	learn: 8.8915772	total: 354ms	remaining: 29.2s
6:	learn: 8.5255791	total: 392ms	remaining: 27.6s
7:	learn: 8.1891402	total: 427ms	remaining: 26.3s
8:	learn: 7.8556682	total: 468ms	remaining: 25.5s
9:	learn: 7.5400606	total: 501ms	remaining: 24.6s
10:	learn: 7.2439459	total: 546ms	remaining: 24.3s
11:	learn: 6.9598360	total: 597ms	remaining: 24.3s
12:	learn: 6.6831761	total: 630ms	remaining: 23.6s
13:	learn: 6.4256646	total: 657ms	remaining: 22.8s
14:	learn: 6.1830091	total: 685ms	remaining: 22.2s
15:	learn: 5.9296130	total: 736ms	remaining: 22.3s
16:	learn: 5.6896292	total: 768ms	remaining: 21.8s
17:	learn: 5.4780832	total: 805ms	remaining: 21.6s
18:	learn: 5.2814773	total: 843ms	remaining: 21.3s
19:	learn: 5.0809394	total: 872ms	rema

[32m[I 2023-03-16 23:38:33,387][0m Trial 0 finished with value: 3.367006407741927 and parameters: {'iterations': 500, 'learning_rate': 0.047662886865868066, 'depth': 6, 'l2_leaf_reg': 4.755672336791144e-08, 'border_count': 32}. Best is trial 0 with value: 3.367006407741927.[0m


0:	learn: 10.7377489	total: 85ms	remaining: 1m 7s
1:	learn: 10.0893702	total: 110ms	remaining: 44s
2:	learn: 9.4474331	total: 291ms	remaining: 1m 17s
3:	learn: 8.8938428	total: 486ms	remaining: 1m 36s
4:	learn: 8.3889010	total: 615ms	remaining: 1m 37s
5:	learn: 7.8691184	total: 663ms	remaining: 1m 27s
6:	learn: 7.3827863	total: 699ms	remaining: 1m 19s
7:	learn: 6.9784969	total: 732ms	remaining: 1m 12s
8:	learn: 6.5581198	total: 760ms	remaining: 1m 6s
9:	learn: 6.1726733	total: 785ms	remaining: 1m 1s
10:	learn: 5.8171262	total: 810ms	remaining: 58.1s
11:	learn: 5.4563682	total: 843ms	remaining: 55.4s
12:	learn: 5.1514517	total: 913ms	remaining: 55.2s
13:	learn: 4.8980391	total: 1.13s	remaining: 1m 3s
14:	learn: 4.6267573	total: 1.33s	remaining: 1m 9s
15:	learn: 4.3593779	total: 1.42s	remaining: 1m 9s
16:	learn: 4.1219149	total: 1.47s	remaining: 1m 7s
17:	learn: 3.9291752	total: 1.53s	remaining: 1m 6s
18:	learn: 3.7323125	total: 1.57s	remaining: 1m 4s
19:	learn: 3.5523762	total: 1.62s	re

[32m[I 2023-03-16 23:39:23,907][0m Trial 1 finished with value: 3.5615655750085087 and parameters: {'iterations': 800, 'learning_rate': 0.07490191547144244, 'depth': 5, 'l2_leaf_reg': 4.273633472888237e-07, 'border_count': 96}. Best is trial 0 with value: 3.367006407741927.[0m


0:	learn: 11.4756894	total: 61.3ms	remaining: 42.9s
1:	learn: 11.4606140	total: 77.8ms	remaining: 27.1s
2:	learn: 11.4455827	total: 92.2ms	remaining: 21.4s
3:	learn: 11.4310896	total: 107ms	remaining: 18.5s
4:	learn: 11.4158524	total: 120ms	remaining: 16.7s
5:	learn: 11.4004116	total: 151ms	remaining: 17.4s
6:	learn: 11.3857799	total: 183ms	remaining: 18.1s
7:	learn: 11.3708229	total: 205ms	remaining: 17.7s
8:	learn: 11.3561801	total: 222ms	remaining: 17.1s
9:	learn: 11.3416857	total: 236ms	remaining: 16.3s
10:	learn: 11.3263438	total: 250ms	remaining: 15.7s
11:	learn: 11.3111689	total: 267ms	remaining: 15.3s
12:	learn: 11.2968273	total: 285ms	remaining: 15.1s
13:	learn: 11.2819806	total: 311ms	remaining: 15.2s
14:	learn: 11.2673599	total: 338ms	remaining: 15.4s
15:	learn: 11.2529663	total: 365ms	remaining: 15.6s
16:	learn: 11.2382659	total: 382ms	remaining: 15.3s
17:	learn: 11.2240879	total: 398ms	remaining: 15.1s
18:	learn: 11.2102562	total: 415ms	remaining: 14.9s
19:	learn: 11.19574

[32m[I 2023-03-16 23:39:55,327][0m Trial 2 finished with value: 7.329846956305612 and parameters: {'iterations': 700, 'learning_rate': 0.0016172308944843409, 'depth': 4, 'l2_leaf_reg': 4.682457026598707e-08, 'border_count': 128}. Best is trial 0 with value: 3.367006407741927.[0m


699:	learn: 7.1926983	total: 16.3s	remaining: 0us
0:	learn: 11.2282772	total: 76ms	remaining: 53.1s
1:	learn: 10.9945131	total: 94.7ms	remaining: 33s
2:	learn: 10.7466666	total: 113ms	remaining: 26.4s
3:	learn: 10.5141492	total: 133ms	remaining: 23.1s
4:	learn: 10.2887981	total: 152ms	remaining: 21.1s
5:	learn: 10.0582513	total: 172ms	remaining: 19.9s
6:	learn: 9.8296877	total: 191ms	remaining: 18.9s
7:	learn: 9.6211368	total: 210ms	remaining: 18.1s
8:	learn: 9.4140559	total: 229ms	remaining: 17.6s
9:	learn: 9.1993985	total: 248ms	remaining: 17.1s
10:	learn: 8.9864737	total: 268ms	remaining: 16.8s
11:	learn: 8.8026218	total: 291ms	remaining: 16.7s
12:	learn: 8.6110637	total: 316ms	remaining: 16.7s
13:	learn: 8.4322134	total: 344ms	remaining: 16.9s
14:	learn: 8.2595397	total: 369ms	remaining: 16.8s
15:	learn: 8.0717290	total: 394ms	remaining: 16.8s
16:	learn: 7.9136247	total: 414ms	remaining: 16.6s
17:	learn: 7.7602714	total: 435ms	remaining: 16.5s
18:	learn: 7.5919685	total: 454ms	rema

[32m[I 2023-03-16 23:40:33,041][0m Trial 3 finished with value: 3.4022208895852692 and parameters: {'iterations': 700, 'learning_rate': 0.026175942286044605, 'depth': 5, 'l2_leaf_reg': 0.22632291875729513, 'border_count': 96}. Best is trial 0 with value: 3.367006407741927.[0m


0:	learn: 11.4787552	total: 182ms	remaining: 1m 49s
1:	learn: 11.4674860	total: 257ms	remaining: 1m 16s
2:	learn: 11.4563654	total: 352ms	remaining: 1m 10s
3:	learn: 11.4445770	total: 457ms	remaining: 1m 8s
4:	learn: 11.4337397	total: 556ms	remaining: 1m 6s
5:	learn: 11.4230145	total: 620ms	remaining: 1m 1s
6:	learn: 11.4119857	total: 680ms	remaining: 57.6s
7:	learn: 11.4016616	total: 755ms	remaining: 55.8s
8:	learn: 11.3900191	total: 816ms	remaining: 53.6s
9:	learn: 11.3784898	total: 883ms	remaining: 52.1s
10:	learn: 11.3678667	total: 973ms	remaining: 52.1s
11:	learn: 11.3563080	total: 1.04s	remaining: 51s
12:	learn: 11.3455278	total: 1.1s	remaining: 49.9s
13:	learn: 11.3343930	total: 1.19s	remaining: 49.8s
14:	learn: 11.3227469	total: 1.25s	remaining: 48.8s
15:	learn: 11.3111296	total: 1.31s	remaining: 48s
16:	learn: 11.3000404	total: 1.4s	remaining: 48s
17:	learn: 11.2890797	total: 1.47s	remaining: 47.4s
18:	learn: 11.2775704	total: 1.52s	remaining: 46.6s
19:	learn: 11.2671458	total

[32m[I 2023-03-16 23:41:58,455][0m Trial 4 finished with value: 10.095892154960072 and parameters: {'iterations': 600, 'learning_rate': 0.0011651548993416667, 'depth': 6, 'l2_leaf_reg': 1.5615371066082492e-05, 'border_count': 192}. Best is trial 0 with value: 3.367006407741927.[0m


In [42]:
best_params = study.best_trial.params
print(f'{best_params=}')

best_params={'iterations': 500, 'learning_rate': 0.047662886865868066, 'depth': 6, 'l2_leaf_reg': 4.755672336791144e-08, 'border_count': 32}


In [43]:
pipeline = get_pipeline(**best_params)
pipeline.fit(X_train, y_train)

0:	learn: 34.5102993	total: 150ms	remaining: 1m 14s
1:	learn: 33.0908148	total: 229ms	remaining: 57.1s
2:	learn: 31.7708295	total: 301ms	remaining: 49.8s
3:	learn: 30.4889485	total: 399ms	remaining: 49.5s
4:	learn: 29.2509518	total: 508ms	remaining: 50.3s
5:	learn: 28.0933925	total: 601ms	remaining: 49.5s
6:	learn: 26.9695247	total: 699ms	remaining: 49.2s
7:	learn: 25.9276455	total: 774ms	remaining: 47.6s
8:	learn: 24.9037738	total: 861ms	remaining: 47s
9:	learn: 23.9394942	total: 933ms	remaining: 45.7s
10:	learn: 23.0634586	total: 1s	remaining: 44.7s
11:	learn: 22.1964579	total: 1.1s	remaining: 44.7s
12:	learn: 21.3818937	total: 1.2s	remaining: 44.9s
13:	learn: 20.6042159	total: 1.29s	remaining: 45s
14:	learn: 19.8532680	total: 1.42s	remaining: 46s
15:	learn: 19.1222655	total: 1.49s	remaining: 45.1s
16:	learn: 18.4229184	total: 1.57s	remaining: 44.7s
17:	learn: 17.8000317	total: 1.63s	remaining: 43.7s
18:	learn: 17.1673166	total: 1.75s	remaining: 44.3s
19:	learn: 16.5782330	total: 1.8

In [44]:
predictions = pipeline.predict(X_test)
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae=:.4f}')

test_mae=2.5400


In [45]:
from src.plot import plot_one_sample

plot_one_sample(
    features=X_test,
    targets=y_test,
    example_id=2979,
    predictions=pd.Series(predictions)
)