In [1]:
# Importing necessary libraries
import pandas as pd

# Load the master dataset; the path is relative to the current working directory.
file_path = "../master_final.csv"
data = pd.read_csv(filepath_or_buffer=file_path)

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import numpy as np

# Derive the hour of day from `arrival_time_new` and store it as a new column.
# NOTE(review): this assumes every value is a string with at least three
# whitespace-separated tokens, the third starting with "<hour>:" (for example
# "0 days 08:15:00") -- confirm against the CSV; any other shape raises here.
data['hour_of_day'] = data['arrival_time_new'].apply(lambda x: int(x.split()[2].split(':')[0]))

# Model inputs and the two regression targets.
# The order of `selected_targets` fixes the column order of `y` and therefore
# of any multi-output prediction array produced from it.
selected_features = ['stop_sequence', 'day_of_the_week', 'time_category', 'dist_to_next_stop', 'hour_of_day']
selected_targets = ['next_stop_sequence', 'time_diff']

X = data[selected_features]
y = data[selected_targets]

# Split BEFORE fitting any preprocessing so the encoder only ever learns from
# training rows (fitting it on the full dataset leaks test-set information).
# The split depends only on the number of rows and `random_state`, so the same
# rows land in train/test as when splitting an already-transformed matrix.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# One-hot encode the categorical columns; every other selected feature is
# passed through unchanged.
# handle_unknown="ignore": a category that appears only in the test rows is
# encoded as all zeros instead of raising at transform time.
# NOTE(review): the passthrough numeric columns are not scaled; a
# distance-based model trained on this matrix may be dominated by the
# largest-magnitude column -- consider scaling them.
categorical_features = ['day_of_the_week', 'time_category']
one_hot = OneHotEncoder(handle_unknown="ignore")
transformer = ColumnTransformer(
    [("one_hot", one_hot, categorical_features)],
    remainder="passthrough",
)

# Fit on the training rows only, then apply the fitted encoder to the test rows.
X_train = transformer.fit_transform(X_train_raw)
X_test = transformer.transform(X_test_raw)

# Full encoded matrix, kept under its original name for any code that still
# refers to it; it is produced with the train-fitted encoder.
X_transformed = transformer.transform(X)

In [3]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

# Build the multi-output wrapper around a default k-NN regressor (k=5) and fit
# it on the training split in one step (`fit` returns the fitted estimator).
knn_model = MultiOutputRegressor(KNeighborsRegressor()).fit(X_train, y_train)

# Predictions for the held-out rows, one column per target.
knn_y_pred = knn_model.predict(X_test)


def _knn_scores(actual, predicted):
    """Return the (MAE, RMSE, R²) triple for a single target column."""
    abs_err = mean_absolute_error(actual, predicted)
    root_sq_err = np.sqrt(mean_squared_error(actual, predicted))
    fit_quality = r2_score(actual, predicted)
    return abs_err, root_sq_err, fit_quality


# Prediction column 0 is scored against `next_stop_sequence`.
(
    knn_mae_next_stop_sequence,
    knn_rmse_next_stop_sequence,
    knn_r2_next_stop_sequence,
) = _knn_scores(y_test['next_stop_sequence'], knn_y_pred[:, 0])

# Prediction column 1 is scored against `time_diff`.
(
    knn_mae_time_diff,
    knn_rmse_time_diff,
    knn_r2_time_diff,
) = _knn_scores(y_test['time_diff'], knn_y_pred[:, 1])

# Show both metric triples as the cell output.
(
    (knn_mae_next_stop_sequence, knn_rmse_next_stop_sequence, knn_r2_next_stop_sequence),
    (knn_mae_time_diff, knn_rmse_time_diff, knn_r2_time_diff),
)

((0.055214309821127225, 0.4490061023182213, 0.9995446100706669),
 (4.2589942625717185, 8.74415517628079, 0.966003734364628))