# LightGBM model with feature engineering

In [1]:
import pandas as pd
from src.paths import TRANSFORMED_DATA_DIR

df = pd.read_parquet(TRANSFORMED_DATA_DIR / 'tabular_data.parquet')
df.head()

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id,target_rides_next_hour
0,11.0,15.0,26.0,8.0,9.0,7.0,3.0,1.0,0.0,3.0,...,11.0,7.0,4.0,3.0,4.0,9.0,19.0,2022-01-29,4,17.0
1,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,5.0,4.0,10.0,7.0,5.0,9.0,10.0,2022-01-30,4,9.0
2,0.0,1.0,0.0,0.0,1.0,1.0,1.0,3.0,2.0,3.0,...,8.0,7.0,8.0,5.0,5.0,10.0,0.0,2022-01-31,4,3.0
3,1.0,1.0,0.0,0.0,0.0,3.0,2.0,3.0,4.0,5.0,...,3.0,16.0,7.0,1.0,0.0,1.0,3.0,2022-02-01,4,3.0
4,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,1.0,2.0,...,3.0,8.0,3.0,0.0,4.0,4.0,3.0,2022-02-02,4,1.0


In [3]:
from datetime import datetime
from src.data_split import train_test_split

X_train, y_train, X_test, y_test = train_test_split(
    df,
    cutoff_date=datetime(2022, 6, 1, 0 ,0 ,0),
    target_column_name='target_rides_next_hour'
)

# print the shapes of the train and test sets with names
print(f'X_train shape: {X_train.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'y_test shape: {y_test.shape}')

X_train shape: (32226, 674)
y_train shape: (32226,)
X_test shape: (56068, 674)
y_test shape: (56068,)


## Feature engineering

In [7]:
def average_rides_last_4_weeks(X: pd.DataFrame) -> pd.DataFrame:
    """
    Adds one column with the average rides from
    - 7 days ago
    - 14 days ago
    - 21 days ago
    - 28 days ago
    :param X: DataFrame with the features
    :return: DataFrame with the new feature
    """
    X['average_rides_last_4_weeks'] = (
        X[f'rides_previous_{7*24}_hour'] +
        X[f'rides_previous_{2*7*24}_hour'] +
        X[f'rides_previous_{3*7*24}_hour'] +
        X[f'rides_previous_{4*7*24}_hour']
    ) / 4
    return X

In [8]:
from sklearn.preprocessing import FunctionTransformer

add_feature_average_rides_last_4_weeks = FunctionTransformer(
    average_rides_last_4_weeks,
    validate=False
)

In [10]:
add_feature_average_rides_last_4_weeks.fit_transform(X_train).head()

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id,average_rides_last_4_weeks
0,11.0,15.0,26.0,8.0,9.0,7.0,3.0,1.0,0.0,3.0,...,11.0,7.0,4.0,3.0,4.0,9.0,19.0,2022-01-29,4,20.25
1,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,5.0,4.0,10.0,7.0,5.0,9.0,10.0,2022-01-30,4,17.5
2,0.0,1.0,0.0,0.0,1.0,1.0,1.0,3.0,2.0,3.0,...,8.0,7.0,8.0,5.0,5.0,10.0,0.0,2022-01-31,4,0.25
3,1.0,1.0,0.0,0.0,0.0,3.0,2.0,3.0,4.0,5.0,...,3.0,16.0,7.0,1.0,0.0,1.0,3.0,2022-02-01,4,0.75
4,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,1.0,2.0,...,3.0,8.0,3.0,0.0,4.0,4.0,3.0,2022-02-02,4,0.75


Add custom Scikit-Learn transformer to generate numeric columns from datetime.

If month is added, the model performs worse.

In [20]:
from sklearn.base import BaseEstimator, TransformerMixin

class TemporalFeatureEngineering(BaseEstimator, TransformerMixin):
    """Adds numeric columns from pickup_hour column."""
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X_ = X.copy()
        
        # Generate numeric columns from datetime
        X_['hour'] = X_['pickup_hour'].dt.hour
        X_['day_of_week'] = X_['pickup_hour'].dt.dayofweek
        #X_['month'] = X_['pickup_hour'].dt.month
        
        return X_.drop(columns=['pickup_hour'])

In [22]:
temporal_feature_engineering = TemporalFeatureEngineering()
temporal_feature_engineering.fit_transform(X_train).head()

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_location_id,average_rides_last_4_weeks,hour,day_of_week
0,11.0,15.0,26.0,8.0,9.0,7.0,3.0,1.0,0.0,3.0,...,7.0,4.0,3.0,4.0,9.0,19.0,4,20.25,0,5
1,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,4.0,10.0,7.0,5.0,9.0,10.0,4,17.5,0,6
2,0.0,1.0,0.0,0.0,1.0,1.0,1.0,3.0,2.0,3.0,...,7.0,8.0,5.0,5.0,10.0,0.0,4,0.25,0,0
3,1.0,1.0,0.0,0.0,0.0,3.0,2.0,3.0,4.0,5.0,...,16.0,7.0,1.0,0.0,1.0,3.0,4,0.75,0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,1.0,2.0,...,8.0,3.0,0.0,4.0,4.0,3.0,4,0.75,0,2


## Make a pipeline

In [13]:
import lightgbm as lgb
from sklearn.pipeline import make_pipeline

In [23]:
pipeline = make_pipeline(
    add_feature_average_rides_last_4_weeks,
    temporal_feature_engineering,
    lgb.LGBMRegressor()
)

pipeline.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.040385 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 154527
[LightGBM] [Info] Number of data points in the train set: 32226, number of used features: 675
[LightGBM] [Info] Start training from score 11.703562


In [24]:
from sklearn.metrics import mean_absolute_error

y_pred = pipeline.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f'MAE: {mae:.4f}')

MAE: 2.5975


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
