In [14]:
import pandas as pd
import numpy as np
import optuna
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import r2_score, mean_absolute_error

Here we use the same data that was used to train the LightGBM model.

In [5]:
# Read the processed dataset from parquet and preview its first five rows.
df = pd.read_parquet(path='../data/processed_data/yellow_23-24_data.parquet')
df.head(n=5)

Unnamed: 0,pickup_count,lon,lat,time_sin,time_cos,day_sin,day_cos
0,1.386294,-74.174002,40.69183,0.0,1.0,-0.974928,-0.222521
1,1.609438,-74.174002,40.69183,0.258819,0.965926,-0.974928,-0.222521
2,2.079442,-74.174002,40.69183,0.5,0.866025,-0.974928,-0.222521
3,2.197225,-74.174002,40.69183,0.707107,0.707107,-0.974928,-0.222521
4,2.772589,-74.174002,40.69183,0.866025,0.5,-0.974928,-0.222521


In [7]:
# Features are every column except the target; the target is `pickup_count`.
X = df.drop(columns='pickup_count')
y = df.loc[:, 'pickup_count']

# Hold out 20% of the rows for final evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
def objective(trial):
    """Optuna objective: cross-validated MSE of a random forest.

    Samples five random-forest hyperparameters from ``trial``, builds a
    ``RandomForestRegressor`` with ``random_state=42`` and scores it with
    5-fold shuffled cross-validation on the module-level ``X_train`` /
    ``y_train``.

    Parameters
    ----------
    trial
        Object exposing ``suggest_int`` and ``suggest_float`` (an Optuna trial).

    Returns
    -------
    The mean squared error averaged over the folds (lower is better).
    """
    # Keep the sampling order fixed: n_estimators, max_depth,
    # min_samples_split, min_samples_leaf, max_features.
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 2, 32, log=True)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 100)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    max_features = trial.suggest_float('max_features', 0.1, 1.0)

    forest = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42,
    )

    folds = KFold(n_splits=5, shuffle=True, random_state=42)
    neg_mse_per_fold = cross_val_score(
        forest,
        X_train,
        y_train,
        cv=folds,
        scoring='neg_mean_squared_error',
    )

    # The scorer reports negated MSE, so flip the sign to get plain MSE.
    return -np.mean(neg_mse_per_fold)

# Seed the sampler so the hyperparameter search is reproducible on a fresh
# kernel. Without a seed the sampler proposes different trials on every run,
# so `best_params` (and every downstream metric) would change between runs
# even though the forest and the CV folds are seeded.
# TPESampler is Optuna's default single-objective sampler, so only the seed changes.
study = optuna.create_study(
    direction='minimize',  # We want to minimize MSE
    sampler=optuna.samplers.TPESampler(seed=42),
)
# Trials run sequentially (default n_jobs=1), which the seeded sampler needs
# to give the same sequence of suggestions each time.
study.optimize(objective, n_trials=100)

best_params = study.best_params
print("Best parameters:", best_params)

# Refit one forest on the whole training split using the best hyperparameters found.
best_rf = RandomForestRegressor(**best_params, random_state=42)
best_rf.fit(X_train, y_train)

[I 2024-12-01 09:35:37,859] A new study created in memory with name: no-name-bfe66ce9-09ac-4281-bcda-8967be012e2f
[I 2024-12-01 09:35:52,159] Trial 0 finished with value: 0.3343541085825457 and parameters: {'n_estimators': 225, 'max_depth': 13, 'min_samples_split': 34, 'min_samples_leaf': 5, 'max_features': 0.5569736706034885}. Best is trial 0 with value: 0.3343541085825457.
[I 2024-12-01 09:36:08,059] Trial 1 finished with value: 2.9892037129962548 and parameters: {'n_estimators': 915, 'max_depth': 5, 'min_samples_split': 13, 'min_samples_leaf': 6, 'max_features': 0.2784405392295266}. Best is trial 0 with value: 0.3343541085825457.
[I 2024-12-01 09:36:28,935] Trial 2 finished with value: 0.8370329062175393 and parameters: {'n_estimators': 486, 'max_depth': 7, 'min_samples_split': 39, 'min_samples_leaf': 7, 'max_features': 0.6307665333555101}. Best is trial 0 with value: 0.3343541085825457.
[I 2024-12-01 09:37:18,194] Trial 3 finished with value: 0.33494497798054335 and parameters: {'n

Best parameters: {'n_estimators': 570, 'max_depth': 20, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 0.8028064306711238}


In [16]:
# Score the tuned forest on the held-out test split.
y_hat = best_rf.predict(X_test)

# NOTE(review): both metrics are in the units of `pickup_count` as stored in
# the dataset; if that column is a transformed count, so is the MAE. Confirm.
r2 = r2_score(y_true=y_test, y_pred=y_hat)
mae = mean_absolute_error(y_true=y_test, y_pred=y_hat)

print(f"R^2 Score: {r2:.4f}")
print(f"Mean Absolute Error: {mae:.4f}")

R^2 Score: 0.9782
Mean Absolute Error: 0.2650
