In [17]:
# Importing necessary libraries
import pandas as pd

# Relative path of the CSV that backs the rest of the notebook
file_path = "../master_final.csv"
# Parse the CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer=file_path)

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import numpy as np

# Derive an integer `hour_of_day` column from `arrival_time_new`.
# The third whitespace-separated token of each value is taken as the clock
# time, and the text before its first ':' is converted to int.
# NOTE(review): presumably the values look like "0 days 08:15:00" -- confirm
# against the CSV.
# The vectorised `.str` accessor replaces a row-wise `apply`; a value that
# does not fit this layout becomes NaN along the chain and still fails loudly
# at `astype(int)` instead of being silently accepted.
data['hour_of_day'] = (
    data['arrival_time_new']
    .str.split()
    .str[2]
    .str.split(':')
    .str[0]
    .astype(int)
)

# Columns fed to the model and the two columns it has to predict
selected_features = [
    'stop_sequence',
    'day_of_the_week',
    'time_category',
    'dist_to_next_stop',
    'hour_of_day',
]
selected_targets = ['next_stop_sequence', 'time_diff']

# Slice the frame into the input matrix and the target matrix
X = data.loc[:, selected_features]
y = data.loc[:, selected_targets]

# The categorical columns are one-hot encoded; every remaining feature column
# is passed through unchanged
categorical_features = ['day_of_the_week', 'time_category']
one_hot = OneHotEncoder()
transformer = ColumnTransformer(
    transformers=[("one_hot", one_hot, categorical_features)],
    remainder="passthrough",
)
X_transformed = transformer.fit_transform(X)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed,
    y,
    test_size=0.2,
    random_state=42,
)

In [19]:
from sklearn.metrics import mean_absolute_error

# Wrap a plain linear regression so that one regressor is fitted per target column
model = MultiOutputRegressor(estimator=LinearRegression())

# Fit on the training split, then predict the held-out rows
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Mean absolute error per target: column 0 of y_pred is compared with
# 'next_stop_sequence' and column 1 with 'time_diff'
mae_next_stop_sequence, mae_time_diff = (
    mean_absolute_error(y_test[target], y_pred[:, column])
    for column, target in enumerate(['next_stop_sequence', 'time_diff'])
)

# Display the MAE pair as the cell output
mae_next_stop_sequence, mae_time_diff

(0.12731918017303442, 22.35253104323526)

In [20]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Root-mean-squared error per target: square root of the test-set MSE
rmse_next_stop_sequence, rmse_time_diff = (
    np.sqrt(mean_squared_error(y_test[target], y_pred[:, column]))
    for column, target in enumerate(['next_stop_sequence', 'time_diff'])
)

# R² score per target, using the same pairing of target and prediction column
r2_next_stop_sequence, r2_time_diff = (
    r2_score(y_test[target], y_pred[:, column])
    for column, target in enumerate(['next_stop_sequence', 'time_diff'])
)

# Display the RMSE pair as the cell output
rmse_next_stop_sequence, rmse_time_diff

(0.20898485368894096, 30.069734190382803)

In [21]:
# Display the R² scores computed in the previous cell, in the order
# (next_stop_sequence, time_diff).
# NOTE(review): `stop_sequence` is one of the input features, which presumably
# explains the near-perfect score for `next_stop_sequence` -- confirm that this
# target is meaningful to predict.
r2_next_stop_sequence, r2_time_diff

(0.9999013473948354, 0.5979735010519511)