In [1]:
import pandas as pd
import numpy as np
from paths import CLEANED_DATA_DIR
from datetime import datetime
from data_split import train_test_split
import xgboost as xgb
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
import optuna
import catboost
import xgboost as xgb

from model import *


# autoreload
%reload_ext autoreload
%autoreload 2

In [2]:
df = pd.read_parquet(CLEANED_DATA_DIR / 'tabular_data.parquet')
df = add_coordinates(df)

In [3]:
X_train, y_train, X_test, y_test = train_test_split(
    df,
    cutoff_date=datetime(2022, 6, 1, 0, 0, 0),
    target_column_name='target_rides_next_hour'
)

print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')

tss = TimeSeriesSplit(n_splits=2)
scores = []
for train_index, val_index in tss.split(X_train):

    # split data for training and validation
    X_train_, X_val_ = X_train.iloc[train_index, :], X_train.iloc[val_index,:]
    y_train_, y_val_ = y_train.iloc[train_index], y_train.iloc[val_index]

    
    print(f'{X_train_.shape=}')      
    print(f'{y_train_.shape=}')
    print(f'{X_val_.shape=}')
    print(f'{y_val_.shape=}')

X_train.shape=(32349, 676)
y_train.shape=(32349,)
X_test.shape=(56282, 676)
y_test.shape=(56282,)
X_train_.shape=(10783, 676)
y_train_.shape=(10783,)
X_val_.shape=(10783, 676)
y_val_.shape=(10783,)
X_train_.shape=(21566, 676)
y_train_.shape=(21566,)
X_val_.shape=(10783, 676)
y_val_.shape=(10783,)


In [12]:
def objective(trial: optuna.trial.Trial) -> float:
    """
    Given a set of hyper-parameters, it trains a model and computes an average
    validation error based on a TimeSeriesSplit
    """
    # pick hyper-parameters
    # hyperparams = {
    #     "metric": 'mae',
    #     "verbose": -1,
    #     "num_leaves": trial.suggest_int("num_leaves", 2, 256),
    #     "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0),
    #     "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0),
    #     "min_child_samples": trial.suggest_int("min_child_samples", 3, 100),   
    # }


    hyperparams = {
    "iterations": trial.suggest_int("iterations", 100, 1000),
    "learning_rate": trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
    "depth": trial.suggest_int("depth", 4, 10),
    "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-8, 100.0, log=True),
    "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian"]),
    "random_strength": trial.suggest_float("random_strength", 1e-8, 10.0, log=True),
    "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 10.0),
    "od_type": trial.suggest_categorical("od_type", ["IncToDec", "Iter"]),
    "od_wait": trial.suggest_int("od_wait", 10, 50),
    "verbose": False
    }

       
    tss = TimeSeriesSplit(n_splits=2)
    scores = []
    for train_index, val_index in tss.split(X_train):

        # split data for training and validation
        X_train_, X_val_ = X_train.iloc[train_index, :], X_train.iloc[val_index,:]
        y_train_, y_val_ = y_train.iloc[train_index], y_train.iloc[val_index]
        
        # train the model
        pipeline = get_pipeline(**hyperparams)
        pipeline.fit(X_train_, y_train_)
        
        # evaluate the model
        y_pred = pipeline.predict(X_val_)
        mae = mean_absolute_error(y_val_, y_pred)

        scores.append(mae)
   
    # Return the mean score
    return np.array(scores).mean()

In [13]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=5)

[I 2023-07-17 17:16:35,954] A new study created in memory with name: no-name-0b4e0401-25ac-444a-be79-265f39782d3b
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['average_rides_last_4_weeks'] = 0.25*(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['average_rides_last_4_weeks'] = 0.25*(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['averag

In [14]:
best_params = study.best_trial.params
print(f'{best_params=}')

best_params={'iterations': 963, 'learning_rate': 0.010697769843476264, 'depth': 7, 'l2_leaf_reg': 1.3640738607057397, 'bootstrap_type': 'Bayesian', 'random_strength': 2.822651978077099, 'bagging_temperature': 3.9267340275139717, 'od_type': 'IncToDec', 'od_wait': 34}


In [15]:
pipeline = get_pipeline(**best_params)
pipeline.fit(X_train, y_train)

0:	learn: 35.7773414	total: 36.6ms	remaining: 35.2s
1:	learn: 35.4613916	total: 66.7ms	remaining: 32s
2:	learn: 35.1355929	total: 94.4ms	remaining: 30.2s
3:	learn: 34.8138326	total: 123ms	remaining: 29.5s
4:	learn: 34.4981898	total: 152ms	remaining: 29.2s
5:	learn: 34.1842769	total: 181ms	remaining: 28.9s
6:	learn: 33.8718758	total: 209ms	remaining: 28.5s
7:	learn: 33.5629158	total: 238ms	remaining: 28.4s
8:	learn: 33.2597932	total: 268ms	remaining: 28.4s
9:	learn: 32.9690913	total: 297ms	remaining: 28.3s
10:	learn: 32.6711477	total: 325ms	remaining: 28.2s
11:	learn: 32.3851263	total: 355ms	remaining: 28.1s
12:	learn: 32.0870619	total: 385ms	remaining: 28.1s
13:	learn: 31.8037871	total: 414ms	remaining: 28.1s
14:	learn: 31.5208661	total: 444ms	remaining: 28s
15:	learn: 31.2426792	total: 475ms	remaining: 28.1s
16:	learn: 30.9655582	total: 505ms	remaining: 28.1s
17:	learn: 30.6986093	total: 534ms	remaining: 28s
18:	learn: 30.4211846	total: 564ms	remaining: 28s
19:	learn: 30.1654390	total

In [16]:
predictions = pipeline.predict(X_test)
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae=:.4f}')

test_mae=2.9082


In [21]:
from plot import plot_one_sample

plot_one_sample(
    example_id=8,
    features=X_test,
    targets=y_test,
    predictions=pd.Series(predictions)
)

