In [6]:
import pandas as pd
import numpy as np
from paths import CLEANED_DATA_DIR
from datetime import datetime
from data_split import train_test_split
import xgboost as xgb
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
import catboost

In [14]:
def get_coordinates(df: pd.DataFrame):
    df = df[['the_geom', 'zone', 'LocationID']]
    df = df.rename(columns={'LocationID': 'pickup_location_id'})
    lat_list = []
    long_list = []

    for i in range(0,263):
        l = df['the_geom'][i][16:].replace(',','').replace('(','').replace(')','').split()

        long = [float(l[j]) for j in range(len(l)) if j % 2 == 0]
        lat = [float(l[j]) for j in range(len(l)) if j % 2 != 0]
        mean_lat = np.mean([max(lat),min(lat)])
        mean_long = np.mean([max(long),min(long)])

        lat_list.append(mean_lat)
        long_list.append(mean_long)

    coords = pd.DataFrame(list(zip(lat_list, long_list)),columns=['Latitude', 'Longitude'])

    return pd.concat([df[['pickup_location_id']], coords],axis=1)



coord_df = pd.read_csv(filepath_or_buffer='../data/taxi_zones.csv')
coord_df = get_coordinates(coord_df)

df = pd.read_parquet(CLEANED_DATA_DIR / 'tabular_data.parquet')
df = df.merge(coord_df, on='pickup_location_id', how='left')
df.drop(columns=['pickup_location_id'],inplace=True)

In [15]:
X_train, y_train, X_test, y_test = train_test_split(
    df,
    cutoff_date=datetime(2022, 11, 1, 0, 0, 0),
    target_column_name='target_rides_next_hour'
)

print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')

X_train.shape=(72588, 675)
y_train.shape=(72588,)
X_test.shape=(16043, 675)
y_test.shape=(16043,)


In [16]:
df.head(1)

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,target_rides_next_hour,Latitude,Longitude
0,11.0,15.0,26.0,8.0,9.0,7.0,3.0,1.0,0.0,3.0,...,7.0,4.0,3.0,4.0,9.0,19.0,2022-01-29,17.0,40.724137,-73.977725


In [17]:
def average_rides_last_4_weeks(X: pd.DataFrame) -> pd.DataFrame:
    """
    Adds one column with the average rides from
    - 7 days ago
    - 14 days ago
    - 21 days ago
    - 28 days ago
    """
    X['average_rides_last_4_weeks'] = 0.25*(
        X[f'rides_previous_{7*24}_hour'] + \
        X[f'rides_previous_{2*7*24}_hour'] + \
        X[f'rides_previous_{3*7*24}_hour'] + \
        X[f'rides_previous_{4*7*24}_hour']
    )
    return X

add_feature_average_rides_last_4_weeks = FunctionTransformer(
    average_rides_last_4_weeks, validate=False)

In [18]:
add_feature_average_rides_last_4_weeks.fit_transform(X_train)

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,Latitude,Longitude,average_rides_last_4_weeks
0,11.0,15.0,26.0,8.0,9.0,7.0,3.0,1.0,0.0,3.0,...,7.0,4.0,3.0,4.0,9.0,19.0,2022-01-29,40.724137,-73.977725,20.25
1,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,4.0,10.0,7.0,5.0,9.0,10.0,2022-01-30,40.724137,-73.977725,17.50
2,0.0,1.0,0.0,0.0,1.0,1.0,1.0,3.0,2.0,3.0,...,7.0,8.0,5.0,5.0,10.0,0.0,2022-01-31,40.724137,-73.977725,0.25
3,1.0,1.0,0.0,0.0,0.0,3.0,2.0,3.0,4.0,5.0,...,16.0,7.0,1.0,0.0,1.0,3.0,2022-02-01,40.724137,-73.977725,0.75
4,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,1.0,2.0,...,8.0,3.0,0.0,4.0,4.0,3.0,2022-02-02,40.724137,-73.977725,0.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2022-10-27,40.792054,-73.881804,0.00
72584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2022-10-28,40.792054,-73.881804,0.00
72585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2022-10-29,40.792054,-73.881804,0.00
72586,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2022-10-30,40.792054,-73.881804,0.00


In [19]:
class TemporalFeaturesEngineer(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        
        X_ = X.copy()
        
        # Generate numeric columns from datetime
        X_["hour"] = X_['pickup_hour'].dt.hour
        X_["day_of_week"] = X_['pickup_hour'].dt.dayofweek
        
        return X_.drop(columns=['pickup_hour'])

In [20]:
add_temporal_features = TemporalFeaturesEngineer()
add_temporal_features.fit_transform(X_train)

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,Latitude,Longitude,average_rides_last_4_weeks,hour,day_of_week
0,11.0,15.0,26.0,8.0,9.0,7.0,3.0,1.0,0.0,3.0,...,4.0,3.0,4.0,9.0,19.0,40.724137,-73.977725,20.25,0,5
1,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,10.0,7.0,5.0,9.0,10.0,40.724137,-73.977725,17.50,0,6
2,0.0,1.0,0.0,0.0,1.0,1.0,1.0,3.0,2.0,3.0,...,8.0,5.0,5.0,10.0,0.0,40.724137,-73.977725,0.25,0,0
3,1.0,1.0,0.0,0.0,0.0,3.0,2.0,3.0,4.0,5.0,...,7.0,1.0,0.0,1.0,3.0,40.724137,-73.977725,0.75,0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,1.0,2.0,...,3.0,0.0,4.0,4.0,3.0,40.724137,-73.977725,0.75,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,40.792054,-73.881804,0.00,0,3
72584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,40.792054,-73.881804,0.00,0,4
72585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,40.792054,-73.881804,0.00,0,5
72586,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,40.792054,-73.881804,0.00,0,6


In [21]:
from sklearn.pipeline import make_pipeline

pipeline = make_pipeline(
    add_feature_average_rides_last_4_weeks,
    add_temporal_features,
    catboost.CatBoostRegressor()
)
pipeline.fit(X_train, y_train)

Learning rate set to 0.080574
0:	learn: 33.7896202	total: 86ms	remaining: 1m 25s
1:	learn: 31.4181365	total: 111ms	remaining: 55.5s
2:	learn: 29.2690876	total: 136ms	remaining: 45.3s
3:	learn: 27.2531920	total: 161ms	remaining: 40.1s
4:	learn: 25.4022307	total: 186ms	remaining: 37.1s
5:	learn: 23.7240873	total: 217ms	remaining: 36s
6:	learn: 22.1894173	total: 251ms	remaining: 35.7s
7:	learn: 20.7988963	total: 279ms	remaining: 34.6s
8:	learn: 19.5054972	total: 306ms	remaining: 33.7s
9:	learn: 18.3276165	total: 337ms	remaining: 33.4s
10:	learn: 17.2605732	total: 378ms	remaining: 33.9s
11:	learn: 16.2794569	total: 413ms	remaining: 34s
12:	learn: 15.3749828	total: 445ms	remaining: 33.8s
13:	learn: 14.5620027	total: 474ms	remaining: 33.4s
14:	learn: 13.7928635	total: 499ms	remaining: 32.8s
15:	learn: 13.1195461	total: 525ms	remaining: 32.3s
16:	learn: 12.4973997	total: 552ms	remaining: 31.9s
17:	learn: 11.9531766	total: 578ms	remaining: 31.6s
18:	learn: 11.4550482	total: 602ms	remaining: 31

In [22]:
predictions = pipeline.predict(X_test)

from sklearn.metrics import mean_absolute_error
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae=:.4f}')

test_mae=2.5591
