In [1]:
# Importing necessary libraries
import pandas as pd

# Reading the CSV file
file_path = "../master_final.csv"
data = pd.read_csv(file_path)

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import numpy as np

# Extracting the hour of the day by splitting the arrival_time_new string
data['hour_of_day'] = data['arrival_time_new'].apply(lambda x: int(x.split()[2].split(':')[0])) 

# Define the features and target variables
selected_features = ['stop_sequence', 'day_of_the_week', 'time_category', 'dist_to_next_stop', 'hour_of_day']
selected_targets = ['next_stop_sequence', 'time_diff']

# Prepare the input and output arrays
X = data[selected_features]
y = data[selected_targets]

# One-hot encode the categorical variables
categorical_features = ['day_of_the_week', 'time_category']
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", 
                                  one_hot, 
                                  categorical_features)], 
                                remainder="passthrough")
X_transformed = transformer.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=42)

In [4]:
from sklearn.ensemble import RandomForestRegressor
# Creating the Random Forest regressor
rf_model = MultiOutputRegressor(RandomForestRegressor(random_state=42))

# Training the Random Forest model on the training data
rf_model.fit(X_train, y_train)

In [7]:
from sklearn.model_selection import GridSearchCV

# Defining the parameter grid for Random Forest
param_grid = {
    'estimator__n_estimators': [200, 250, 275, 300, 325, 350, 400, 500, 600],
    'estimator__max_depth': [None, 1, 2, 5, 10, 20, 30],
    'estimator__min_samples_split': [1, 2, 5, 10],
    'estimator__min_samples_leaf': [1, 2, 4]
}

# Creating the grid search with 5-fold cross-validation
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid,
                           scoring='neg_mean_squared_error', cv=5,
                           verbose=2, n_jobs=-1)

# Fitting the grid search to the data (this may take some time)
grid_search.fit(X_train, y_train)

# Retrieving the best hyperparameters
best_params = grid_search.best_params_
best_params

Fitting 5 folds for each of 756 candidates, totalling 3780 fits


{'estimator__max_depth': None,
 'estimator__min_samples_leaf': 1,
 'estimator__min_samples_split': 1,
 'estimator__n_estimators': 600}