In [1]:
import joblib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV


In [2]:
# Load the cleaned CSV into a DataFrame; the path is relative to the
# notebook's working directory.
cleaned_csv_path = '../data/yellow/taxi_y_cleaned.csv'
taxi_data = pd.read_csv(cleaned_csv_path)

In [3]:
# Parse the pickup timestamp once, store it back on the frame, and derive
# the calendar features (hour, weekday, ISO week, weekend flag) from it.
pickup_ts = pd.to_datetime(taxi_data['tpep_pickup_datetime'])
taxi_data['tpep_pickup_datetime'] = pickup_ts
taxi_data['hour_of_day'] = pickup_ts.dt.hour
# pandas numbers weekdays Monday=0 ... Sunday=6
taxi_data['day_of_week'] = pickup_ts.dt.dayofweek
taxi_data['week_of_year'] = pickup_ts.dt.isocalendar().week
# Saturday (5) and Sunday (6) are flagged as 1, every other day as 0
weekend_mask = taxi_data['day_of_week'].isin([5, 6])
taxi_data['is_weekend'] = weekend_mask.astype(int)

In [4]:
# Count rides for every (day_of_week, hour_of_day) pair and flatten the
# result into a frame with one row per pair and a 'ride_count' column.
rides_by_day_and_hour = taxi_data.groupby(['day_of_week', 'hour_of_day'])
ride_totals = rides_by_day_and_hour.size()
hourly_weekly_ride_counts = ride_totals.reset_index(name='ride_count')

In [5]:
# Target (y) is the ride count; features (X) are all remaining columns.
target_column = 'ride_count'
y = hourly_weekly_ride_counts[target_column]
X = hourly_weekly_ride_counts.drop(columns=[target_column])

In [6]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
split_parts = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=33,
)
X_train, X_test, y_train, y_test = split_parts

In [7]:
# Tune a RandomForestRegressor on the training split, save the best model,
# and report its RMSE on the held-out test split.
random_forest = RandomForestRegressor(random_state=33)

# Candidate hyperparameter values. The grid holds 3 * 4 * 3 * 3 = 108
# combinations, which is exactly the n_iter requested below.
param_distributions = {
    'n_estimators': [100, 200, 500],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# 10-fold cross-validated search; n_jobs=-1 runs the fits in parallel.
random_search = RandomizedSearchCV(
    estimator=random_forest,
    param_distributions=param_distributions,
    n_iter=108,
    cv=10,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

# fit the search on the training data
random_search.fit(X_train, y_train)

# show the best parameters (print avoids the quoted repr that display()
# produces for plain strings)
print(f"Best parameters: {random_search.best_params_}")
best_rf_model = random_search.best_estimator_

# Save the best model from the search. The path is relative to the
# notebook's working directory.
joblib.dump(best_rf_model, '../models/random_forest_model_improved.pkl')

# Evaluate the best model on the test set. RMSE is computed as the square
# root of the MSE rather than via squared=False, which recent scikit-learn
# releases deprecate and later remove.
y_pred = best_rf_model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print(f"Test set RMSE: {rmse}")


Fitting 10 folds for each of 108 candidates, totalling 1080 fits


"Best parameters: {'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 30}"

'Test set RMSE: 47.55739591920914'

We will use the variance inflation factor (VIF) to identify whether there is multicollinearity among the numerical columns, as the categorical columns are so numerous.

total_amount and fare_amount have high VIFs, which indicates multicollinearity.

An RMSE of 1.71 is 'fair' in the context of tips here. An R² of 0.6 seems reasonable, showing that the model accounts for 60% of the variance in the tip amount.

As we might expect, there is high collinearity between fare_amount, tolls amount, and total_amount. For the LR model we will drop total_amount, since it is derived from these other columns.