In [None]:
# data handling
import numpy as np
import pandas as pd

# data visualizations
import seaborn as sns
import matplotlib.pyplot as plt

# feature scaling
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# feature selection
from sklearn.feature_selection import RFE

# machine learning algorithms
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor, XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import naive_bayes
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neural_network import MLPClassifier

# Import label encoder
from sklearn.preprocessing import LabelEncoder

# dimensionality reduction with PCA
from sklearn.decomposition import PCA

# accuracy metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, accuracy_score

# date and time handling
import datetime

## Opening the Data

In [None]:
# Load the bus stop records from CSV into a DataFrame.
bus_stop_data = pd.read_csv("data/bus_stop_times_feature_added_all.csv")
# Display the frame to eyeball the loaded rows.
bus_stop_data

Unnamed: 0,trip_id,deviceid,direction,bus_stop,date,arrival_time,departure_time,dwell_time,dwell_time_in_seconds_old,day_of_week,...,hour_of_day,day,month,temp,precip,windspeed,conditions,rt(n-1),stop_type,dwell_time_in_seconds
0,1,262,1,101,2021-10-01,06:40:58,06:42:12,0:01:14,74.0,4,...,6,1,10,20.0,0.0,6.1,Partially cloudy,69.0,pro,74.0
1,1,262,1,102,2021-10-01,06:45:42,06:45:42,0:00:00,0.0,4,...,6,1,10,20.0,0.0,6.1,Partially cloudy,210.0,mod,0.0
2,1,262,1,103,2021-10-01,06:53:58,06:54:04,0:00:06,6.0,4,...,6,1,10,20.0,0.0,6.1,Partially cloudy,496.0,mod,6.0
3,1,262,1,104,2021-10-01,06:57:19,06:57:19,0:00:00,0.0,4,...,6,1,10,20.0,0.0,6.1,Partially cloudy,195.0,br,0.0
4,1,262,1,105,2021-10-01,06:58:56,07:02:27,0:03:31,211.0,4,...,6,1,10,20.0,0.0,6.1,Partially cloudy,97.0,pro,211.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91678,25370,121,1,110,2022-11-01,18:47:38,18:47:53,0:00:15,15.0,1,...,18,1,11,20.5,0.0,8.3,Overcast,134.0,mod,15.0
91679,25370,121,1,111,2022-11-01,18:50:08,18:50:15,0:00:07,7.0,1,...,18,1,11,20.5,0.0,8.3,Overcast,135.0,br,7.0
91680,25370,121,1,112,2022-11-01,18:52:39,18:52:50,0:00:11,11.0,1,...,18,1,11,20.5,0.0,8.3,Overcast,144.0,br,11.0
91681,25370,121,1,113,2022-11-01,18:55:01,18:55:14,0:00:13,13.0,1,...,18,1,11,20.5,0.0,8.3,Overcast,131.0,mod,13.0


In [None]:
# List the column names available in the loaded frame.
bus_stop_data.columns

Index(['trip_id', 'deviceid', 'direction', 'bus_stop', 'date', 'arrival_time',
       'departure_time', 'dwell_time', 'dwell_time_in_seconds_old',
       'day_of_week', 'time_of_day', 'Sunday/holiday', 'saturday',
       'weekday/end', 'week_no', 'dt(w-1)', 'dt(w-2)', 'dt(w-3)', 'dt(t-1)',
       'dt(t-2)', 'dt(n-1)', 'dt(n-2)', 'dt(n-3)', 'hour_of_day', 'day',
       'month', 'temp', 'precip', 'windspeed', 'conditions', 'rt(n-1)',
       'stop_type', 'dwell_time_in_seconds'],
      dtype='object')

## Handling Missing Values

In [None]:
# Count the missing values in every column.
bus_stop_data.isnull().sum()

trip_id                      0
deviceid                     0
direction                    0
bus_stop                     0
date                         0
arrival_time                 0
departure_time               0
dwell_time                   0
dwell_time_in_seconds_old    0
day_of_week                  0
time_of_day                  0
Sunday/holiday               0
saturday                     0
weekday/end                  0
week_no                      0
dt(w-1)                      0
dt(w-2)                      0
dt(w-3)                      0
dt(t-1)                      0
dt(t-2)                      0
dt(n-1)                      0
dt(n-2)                      0
dt(n-3)                      0
hour_of_day                  0
day                          0
month                        0
temp                         0
precip                       0
windspeed                    0
conditions                   0
rt(n-1)                      0
stop_type                    0
dwell_ti

## Understanding the Data

# Parse the date strings into datetime64 values.
# The previous snippet referenced `bus_run_data`, which is never defined in this
# notebook, and assigned through `.loc['date']`, which targets a row label
# rather than the `date` column.
bus_stop_data['date'] = pd.to_datetime(bus_stop_data.date)
bus_stop_data

In [None]:
# Mean dwell_time_in_seconds per bus stop (rows) and time_of_day slot (columns);
# NaN marks stop/slot combinations with no rows.
bus_stop_data.groupby(['bus_stop','time_of_day']).dwell_time_in_seconds.mean().unstack()

time_of_day,6.00,6.25,6.50,6.75,7.00,7.25,7.50,7.75,8.00,8.25,...,16.50,16.75,17.00,17.25,17.50,17.75,18.00,18.25,18.50,18.75
bus_stop,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
101,70.964912,62.709677,79.886256,84.243421,65.480519,70.386792,57.338983,62.912088,62.507692,52.460674,...,57.226994,48.573034,54.234899,58.137931,61.479339,59.27,57.430894,52.176471,61.136986,51.216867
102,19.0,25.422222,31.654545,43.086364,61.616216,42.392857,42.401786,25.913295,24.793103,14.431655,...,17.103448,18.766234,21.660194,15.0,12.322917,8.464912,7.154545,10.35,10.387755,7.538462
103,,28.206897,27.225225,28.833333,27.715026,29.655914,18.278481,19.566667,19.152174,16.663158,...,14.414141,22.030075,19.406015,23.219048,21.464,19.045455,19.411765,18.82243,15.351648,15.958333
104,,1.061224,2.521277,1.3,1.080925,0.963303,2.179856,1.660819,1.290909,4.683761,...,3.833333,4.103175,4.963964,3.617021,5.019231,5.63,7.736842,7.206612,7.773333,6.939759
105,,188.65,182.542373,176.3,129.159091,121.401826,108.89172,146.071429,132.676471,149.462963,...,79.890625,74.505155,69.042254,55.681416,60.143939,45.173469,41.556522,38.913043,24.481928,24.816092
106,,7.5,25.626866,23.218045,21.327044,22.910377,24.994413,22.898734,22.90625,27.735714,...,34.275591,39.889908,44.512397,44.740741,44.354545,53.95,39.557895,44.233871,39.48913,35.918367
107,,7.5,9.793651,14.322034,14.065089,16.07109,17.371429,15.431507,14.657343,14.561151,...,16.674419,16.752294,17.975806,16.84127,15.345455,13.95935,15.456522,14.18254,12.614583,10.478723
108,,0.0,17.964912,19.166667,18.520231,17.940594,16.171123,15.725926,15.044025,13.338583,...,15.555556,14.663551,15.607692,15.416,18.412844,18.460317,18.819277,21.395349,18.163462,16.06383
109,,0.0,57.150943,70.382353,59.729885,56.121053,48.678392,56.395683,46.067485,54.252033,...,46.193798,43.552381,46.619403,44.336066,36.701754,44.01626,32.908046,30.382114,28.40708,25.795455
110,,0.0,26.137931,27.777778,28.118056,30.061111,26.308824,26.025,28.606061,27.393701,...,43.008929,27.768595,22.121951,24.243697,25.257812,23.683761,20.893204,19.229508,18.946903,17.376471


In [None]:
# Summary statistics of the dwell time for each stop type.
# NOTE(review): this uses dwell_time_in_seconds_old while the models below use
# dwell_time_in_seconds — confirm which column is intended here.
bus_stop_data.groupby('stop_type').dwell_time_in_seconds_old.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
stop_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
br,38935.0,14.423064,18.176034,0.0,0.0,15.0,22.0,587.0
mod,32886.0,28.362677,40.922716,0.0,9.0,15.0,33.0,596.0
pro,19862.0,77.45791,79.735974,0.0,20.0,49.0,103.0,564.0


In [None]:
# Summary statistics, transposed so that each column of the frame becomes one row.
bus_stop_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
trip_id,91683.0,10913.213955,8623.14843,1.0,4022.5,7560.0,21861.0,25370.0
deviceid,91683.0,549.955619,511.6651,116.0,128.0,274.0,1143.0,1719.0
direction,91683.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
bus_stop,91683.0,107.45509,4.034722,101.0,104.0,107.0,111.0,114.0
dwell_time_in_seconds_old,91683.0,33.078826,51.968448,0.0,6.0,15.0,35.0,596.0
day_of_week,91683.0,2.824602,1.94996,0.0,1.0,3.0,4.0,6.0
time_of_day,91683.0,12.444927,3.532391,6.0,9.5,12.5,15.5,18.75
Sunday/holiday,91683.0,0.148381,0.355479,0.0,0.0,0.0,0.0,1.0
saturday,91683.0,0.142884,0.349956,0.0,0.0,0.0,0.0,1.0
weekday/end,91683.0,0.750608,0.432663,0.0,1.0,1.0,1.0,1.0


## Encoding Bus Stop Type

In [None]:
# Distinct stop_type value(s) recorded for each bus stop.
bus_stop_data.groupby('bus_stop').stop_type.unique()

bus_stop
101    [pro]
102    [mod]
103    [mod]
104     [br]
105    [pro]
106    [mod]
107     [br]
108     [br]
109    [pro]
110    [mod]
111     [br]
112     [br]
113    [mod]
114     [br]
Name: stop_type, dtype: object

In [None]:
# Encode the categorical stop_type labels as integers, overwriting the column in place.
# LabelEncoder numbers the labels in sorted order, which here gives br -> 0, mod -> 1, pro -> 2.
label_encoder = LabelEncoder()
bus_stop_data['stop_type'] = label_encoder.fit_transform(bus_stop_data['stop_type'])

In [None]:
# Repeat the per-stop check to inspect the encoded stop_type values.
bus_stop_data.groupby('bus_stop').stop_type.unique()

bus_stop
101    [2]
102    [1]
103    [1]
104    [0]
105    [2]
106    [1]
107    [0]
108    [0]
109    [2]
110    [1]
111    [0]
112    [0]
113    [1]
114    [0]
Name: stop_type, dtype: object

## Separating into Training and Test Data

In [None]:
# Chronological split: rows before the cut-off date form the training set,
# rows on or after it form the held-out test set.
# NOTE(review): the `date` column is compared directly against this string; that
# presumably relies on the column holding ISO (YYYY-MM-DD) strings — confirm.
date = '2022-10-15'
train = bus_stop_data.loc[bus_stop_data.date < date]
test = bus_stop_data.loc[bus_stop_data.date >= date]

In [None]:
# Number of rows in the training split.
np.shape(train)[0]

84395

In [None]:
# Number of rows in the test split.
np.shape(test)[0]

7288

In [None]:
# Share of all rows that landed in the training split, as a percentage rounded to 2 decimals.
print("Portion taken for training = ",round(np.shape(train)[0]/ np.shape(bus_stop_data)[0] * 100, 2),"%")

Portion taken for training =  92.05 %


In [None]:
# Inspect the held-out rows.
test

Unnamed: 0,trip_id,deviceid,direction,bus_stop,date,arrival_time,departure_time,dwell_time,dwell_time_in_seconds_old,day_of_week,...,hour_of_day,day,month,temp,precip,windspeed,conditions,rt(n-1),stop_type,dwell_time_in_seconds
84395,24261,505,1,101,2022-10-15,07:02:25,07:03:19,0:00:54,54.0,5,...,7,15,10,20.0,0.0,6.1,Overcast,96.0,2,54.0
84396,24261,505,1,102,2022-10-15,07:05:19,07:05:19,0:00:00,0.0,5,...,7,15,10,20.0,0.0,6.1,Overcast,120.0,1,0.0
84397,24261,505,1,103,2022-10-15,07:11:13,07:11:13,0:00:00,0.0,5,...,7,15,10,20.0,0.0,6.1,Overcast,354.0,1,0.0
84398,24261,505,1,104,2022-10-15,07:14:55,07:14:55,0:00:00,0.0,5,...,7,15,10,20.0,0.0,6.1,Overcast,222.0,0,0.0
84399,24261,505,1,105,2022-10-15,07:16:59,07:17:14,0:00:15,15.0,5,...,7,15,10,20.0,0.0,6.1,Overcast,124.0,2,15.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91678,25370,121,1,110,2022-11-01,18:47:38,18:47:53,0:00:15,15.0,1,...,18,1,11,20.5,0.0,8.3,Overcast,134.0,1,15.0
91679,25370,121,1,111,2022-11-01,18:50:08,18:50:15,0:00:07,7.0,1,...,18,1,11,20.5,0.0,8.3,Overcast,135.0,0,7.0
91680,25370,121,1,112,2022-11-01,18:52:39,18:52:50,0:00:11,11.0,1,...,18,1,11,20.5,0.0,8.3,Overcast,144.0,0,11.0
91681,25370,121,1,113,2022-11-01,18:55:01,18:55:14,0:00:13,13.0,1,...,18,1,11,20.5,0.0,8.3,Overcast,131.0,1,13.0


In [None]:
# Column names to choose the model features from.
bus_stop_data.columns

Index(['trip_id', 'deviceid', 'direction', 'bus_stop', 'date', 'arrival_time',
       'departure_time', 'dwell_time', 'dwell_time_in_seconds_old',
       'day_of_week', 'time_of_day', 'Sunday/holiday', 'saturday',
       'weekday/end', 'week_no', 'dt(w-1)', 'dt(w-2)', 'dt(w-3)', 'dt(t-1)',
       'dt(t-2)', 'dt(n-1)', 'dt(n-2)', 'dt(n-3)', 'hour_of_day', 'day',
       'month', 'temp', 'precip', 'windspeed', 'conditions', 'rt(n-1)',
       'stop_type', 'dwell_time_in_seconds'],
      dtype='object')

In [None]:
# list of columns used for variables
# Columns fed to both models (classifier and regressor) as input features.
list_of_col = ['deviceid','bus_stop','direction','stop_type',
               'day_of_week','month','day',
               'time_of_day',
               'dt(w-1)','dt(w-2)','dt(w-3)','dt(t-1)','dt(t-2)','dt(n-1)','dt(n-2)','dt(n-3)','rt(n-1)',
               'precip','windspeed']

In [None]:
# Number of selected feature columns.
len(list_of_col)

19

In [None]:
# Training Data
# Split data into features and target
# Training features: the selected columns of the training split.
train_X = train[list_of_col]
# Training target: the dwell time in seconds.
train_y = train['dwell_time_in_seconds']

In [None]:
# Testing Data
# Split data into features and target
# Test features: the selected columns of the held-out split.
test_X = test[list_of_col]
# Test target: the dwell time in seconds.
test_y = test['dwell_time_in_seconds']

## Evaluating Errors

In [None]:
def print_error_metrics_pred_vs_actual(pred_y, actual_y):
    """Print MAE, RMSE and MAPE of predictions against the observed values.

    Args:
        pred_y: Predicted values (array-like).
        actual_y: Observed values (array-like) of the same length as ``pred_y``.

    Notes:
        The scikit-learn metric functions take ``(y_true, y_pred)``. MAE and
        RMSE are symmetric in their arguments, but MAPE divides by ``y_true``,
        so the observed values must be passed first.
        MAPE is returned by scikit-learn as a fraction and is scaled by 100
        here to print a percentage. It becomes extremely large whenever
        observed values are zero (or close to zero).
    """
    print("MAE  : ", mean_absolute_error(actual_y, pred_y))
    print("RMSE : ", (mean_squared_error(actual_y, pred_y))**0.5)
    # fraction -> percent
    print("MAPE : ", mean_absolute_percentage_error(actual_y, pred_y)*100 ,"%")

## Designing the Model

In [None]:
# Inspect the training feature matrix.
train_X

Unnamed: 0,deviceid,bus_stop,direction,stop_type,day_of_week,month,day,time_of_day,dt(w-1),dt(w-2),dt(w-3),dt(t-1),dt(t-2),dt(n-1),dt(n-2),dt(n-3),rt(n-1),precip,windspeed
0,262,101,1,2,4,10,1,6.50,92.0,92.0,92.0,92.0,92.0,92.0,92.0,92.0,69.0,0.0,6.1
1,262,102,1,1,4,10,1,6.75,45.0,45.0,45.0,45.0,45.0,74.0,45.0,45.0,210.0,0.0,6.1
2,262,103,1,1,4,10,1,6.75,28.0,28.0,28.0,28.0,28.0,0.0,74.0,28.0,496.0,0.0,6.1
3,262,104,1,0,4,10,1,6.75,1.0,1.0,1.0,1.0,1.0,6.0,0.0,74.0,195.0,0.0,6.1
4,262,105,1,2,4,10,1,6.75,230.0,230.0,230.0,230.0,230.0,0.0,6.0,0.0,97.0,0.0,6.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84390,1358,111,1,0,4,10,14,18.75,9.0,9.0,9.0,9.0,15.0,15.0,15.0,15.0,135.0,0.0,7.9
84391,1358,112,1,0,4,10,14,18.75,6.0,6.0,6.0,6.0,23.0,0.0,15.0,15.0,168.0,0.0,7.9
84392,128,101,1,2,4,10,14,18.75,38.0,38.0,0.0,38.0,38.0,38.0,38.0,38.0,95.0,0.0,7.9
84393,128,102,1,1,4,10,14,18.75,4.0,4.0,4.0,4.0,15.0,30.0,4.0,4.0,194.0,0.0,7.9


### Classifier Model for Zero Dwell Times

In [None]:
# Binary classifier target for training: 1 where the dwell time is positive, 0 otherwise.
cl_train_y = np.where(train_y > 0 , 1, 0)
cl_train_y

array([1, 0, 1, ..., 1, 0, 1])

In [None]:
# Class balance: share of training rows with a positive dwell time, in percent.
print("Percentage of Dwell times > 0 in Training : ", round(cl_train_y.sum() / len(cl_train_y) * 100, 2),"%")

Percentage of Dwell times > 0 in Training :  77.38 %


In [None]:
# Binary classifier that decides whether a dwell time is positive (1) or zero (0).
# 'binary:logistic' is the XGBoost objective for two-class targets; the previously
# configured 'reg:squarederror' is a regression loss and does not produce
# class probabilities.
classifier = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    objective='binary:logistic',
    random_state=42)

# Fit on the training features against the 0/1 "dwell time > 0" target.
classifier.fit(train_X, np.where(train_y > 0 , 1, 0))

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=5, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              objective='reg:squarederror', predictor=None, ...)

### Training Error for Classifier Model

In [None]:
# Classifier predictions (0/1) for the training features.
cl_train_y_pred = classifier.predict(train_X)
cl_train_y_pred

array([1, 1, 1, ..., 1, 0, 1])

In [None]:
# One-column frame (column label 0) holding the predicted class per training row,
# aligned to the training index; rows holding 1 are overwritten further down
# with the regressor's output.
train_predictions = pd.DataFrame(cl_train_y_pred, index=train_X.index)

In [None]:
# Training accuracy of the zero / non-zero classifier, in percent.
accuracy_score(cl_train_y_pred, cl_train_y) *100

86.08685348658096

### Validation Error for Classifier Model

In [None]:
# Binary classifier target for the test split: 1 where the dwell time is positive, 0 otherwise.
cl_test_y = np.where(test_y > 0 , 1, 0)
cl_test_y

array([1, 0, 0, ..., 1, 1, 1])

In [None]:
# Classifier predictions (0/1) for the held-out features.
cl_test_y_pred = classifier.predict(test_X)
cl_test_y_pred

array([1, 0, 1, ..., 1, 1, 1])

In [None]:
# One-column frame (column label 0) holding the predicted class per test row,
# aligned to the test index; rows holding 1 are overwritten further down
# with the regressor's output.
test_predictions = pd.DataFrame(cl_test_y_pred, index=test_X.index)

In [None]:
# Held-out accuracy of the zero / non-zero classifier, in percent.
accuracy_score(cl_test_y_pred, cl_test_y) *100

85.78485181119649

### Regressor Model for Dwell Times predicted to be greater than 0

In [None]:
# Keep only the rows the classifier predicted as "dwell time > 0".
# The selection uses the classifier's predictions, not the true labels, so the
# targets may still contain zeros where the classifier was wrong.
train_X_reg = train_X.loc[cl_train_y_pred == 1]
train_y_reg = train_y.loc[cl_train_y_pred == 1]

# Same selection for the held-out split.
test_X_reg = test_X.loc[cl_test_y_pred == 1]
test_y_reg = test_y.loc[cl_test_y_pred == 1]

In [None]:
# Train a regressor to predict the dwell time (in seconds) for the rows
# the classifier predicted as having a positive dwell time
reg_model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    objective='reg:squarederror',
    random_state=42
)

reg_model.fit(train_X_reg, train_y_reg)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.1, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=5, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=42, ...)

### Training Error for Regressor

In [None]:
# Regressor predictions for the classifier-positive training rows.
train_pred_y_reg = reg_model.predict(train_X_reg)

In [None]:
# Overwrite the rows holding class 1 with the regressor's dwell-time estimates;
# rows predicted as class 0 keep the value 0.
# NOTE(review): the row selection reads the values being overwritten, so running
# this cell a second time would select different rows — confirm it is only run once.
train_predictions.loc[train_predictions[0] == 1] = train_pred_y_reg

In [None]:
# Regressor error on the classifier-positive training rows only.
print("Training Error")
print_error_metrics_pred_vs_actual(train_pred_y_reg, train_y_reg)

Training Error
MAE  :  22.662035058222145
RMSE :  40.2187260156503
MAPE :  5.795473039399349 %


### Validation Error

In [None]:
# Regressor predictions for the classifier-positive held-out rows.
test_pred_y_reg = reg_model.predict(test_X_reg)

In [None]:
# Overwrite the rows holding class 1 with the regressor's dwell-time estimates;
# rows predicted as class 0 keep the value 0.
# NOTE(review): the row selection reads the values being overwritten, so running
# this cell a second time would select different rows — confirm it is only run once.
test_predictions.loc[test_predictions[0] == 1]  = test_pred_y_reg

In [None]:
# Regressor error on the classifier-positive held-out rows only.
print("Validation Error")
print_error_metrics_pred_vs_actual(test_pred_y_reg, test_y_reg)

Validation Error
MAE  :  16.469821079355118
RMSE :  31.34340111824969
MAPE :  5.694334853552694 %


## Overall Error

In [None]:
# Error of the combined classifier + regressor predictions over every training row.
print("Overall Training Error")
print_error_metrics_pred_vs_actual(train_predictions[0], train_y)

Overall Training Error
MAE  :  19.84370593838034
RMSE :  37.35672750738981
MAPE :  3.1773070136058616e+16 %


In [None]:
# Error of the combined classifier + regressor predictions over every held-out row.
print("Overall Testing Error")
print_error_metrics_pred_vs_actual(test_predictions[0], test_y)

Overall Testing Error
MAE  :  14.50275362578234
RMSE :  29.010769503939564
MAPE :  3.791724384405237e+16 %


## Performing Multi-Step Predictions

### Helper functions

In [None]:
def searching_historical_avg_time(dataset, time_of_day, bus_stop):
    """Return the historical mean dwell time of a bus stop in one time-of-day slot.

    Args:
        dataset: DataFrame with ``time_of_day``, ``bus_stop`` and
            ``dwell_time_in_seconds`` columns.
        time_of_day: Slot value to look up; matched exactly against the
            ``time_of_day`` column.
        bus_stop: Bus stop identifier to look up.

    Returns:
        The mean of ``dwell_time_in_seconds`` over the matching rows, rounded
        to the nearest integer.

    Raises:
        KeyError: If ``dataset`` has no row for the ``(time_of_day, bus_stop)`` pair.
    """
    slot_means = dataset.groupby(['time_of_day', 'bus_stop']).dwell_time_in_seconds.mean()
    # Label-based scalar lookup on the (time_of_day, bus_stop) index. This avoids
    # positional `[0]` access on a label-indexed row, which pandas has deprecated.
    return round(slot_means.loc[(time_of_day, bus_stop)])

In [None]:
def searching_historical_weekly_avg_time(dataset, week_no, bus_stop, time_of_day):
    """Return the mean dwell time of a bus stop for one week and time-of-day slot.

    Args:
        dataset: DataFrame with ``week_no``, ``bus_stop``, ``time_of_day`` and
            ``dwell_time_in_seconds`` columns.
        week_no: Week number to look up.
        bus_stop: Bus stop identifier to look up.
        time_of_day: Slot value to look up; matched exactly.

    Returns:
        The rounded mean dwell time for the exact ``(week_no, bus_stop,
        time_of_day)`` combination. When that combination has no rows, the
        value found by ``search_nearby_historical`` is used instead; ``None``
        is returned if that fallback finds nothing either.
    """
    weekly_means = dataset.groupby(['week_no', 'bus_stop', 'time_of_day']).dwell_time_in_seconds.mean()
    try:
        # Label-based scalar lookup; positional `[0]` access on a label-indexed
        # row is deprecated in pandas.
        avg_time = weekly_means.loc[(week_no, bus_stop, time_of_day)]
    except KeyError:
        avg_time = search_nearby_historical(dataset, week_no, bus_stop, time_of_day)

    # The fallback signals "nothing found" with None; rounding None would raise TypeError.
    if avg_time is None:
        return None
    return round(avg_time)

In [None]:
def search_nearby_historical(dataset, week_no, bus_stop, time_of_day,
                             slot_width=0.25, max_offset=2):
    """Find the weekly mean dwell time in the closest time slot that has data.

    Used as a fallback when the exact ``(week_no, bus_stop, time_of_day)``
    combination has no rows. Neighbouring slots are probed in order of
    increasing distance from ``time_of_day`` (``-slot_width``, ``+slot_width``,
    ``-2*slot_width``, ...), the earlier slot first at each distance, up to
    ``max_offset`` away.

    The lookup is done directly on ``dataset`` rather than by calling
    ``searching_historical_weekly_avg_time``: that function falls back to this
    one, so calling it here could recurse without bound when a week/stop has
    no data at all.

    Args:
        dataset: DataFrame with ``week_no``, ``bus_stop``, ``time_of_day`` and
            ``dwell_time_in_seconds`` columns.
        week_no: Week number to search in.
        bus_stop: Bus stop identifier to search for.
        time_of_day: Slot around which to search.
        slot_width: Distance between consecutive slots (default 0.25).
        max_offset: Largest distance from ``time_of_day`` to probe (default 2).

    Returns:
        The rounded mean dwell time of the nearest slot with data, or ``None``
        when no slot within ``max_offset`` has data.
    """
    history = dataset.loc[(dataset['week_no'] == week_no) & (dataset['bus_stop'] == bus_stop)]
    if history.empty:
        return None

    # Mean dwell time per slot for this week and stop; slots whose mean is NaN are unusable.
    slot_means = history.groupby('time_of_day').dwell_time_in_seconds.mean().dropna()

    for step in range(1, int(max_offset / slot_width) + 1):
        offset = slot_width * step
        for candidate in (time_of_day - offset, time_of_day + offset):
            if candidate in slot_means.index:
                return round(slot_means.loc[candidate])

    return None

In [None]:
def search_stop_type(bus_stop):
    """Return the ``stop_type`` recorded for a bus stop in ``bus_stop_data``.

    Args:
        bus_stop: Bus stop identifier to look up.

    Returns:
        The ``stop_type`` value of the first row of the module-level
        ``bus_stop_data`` frame whose ``bus_stop`` matches, or ``None`` when
        the stop does not occur in the data.
    """
    # Filter the rows of this stop directly instead of grouping the whole frame
    # on every call; the first matching row carries the first unique value.
    stop_types = bus_stop_data.loc[bus_stop_data['bus_stop'] == bus_stop, 'stop_type']
    if stop_types.empty:
        return None
    # Explicit positional access; plain `[0]` on a label-indexed Series is deprecated in pandas.
    return stop_types.iloc[0]

In [None]:
# function to convert a time object into its quarter-hour slot, expressed in decimal hours
def time_to_hour(time_obj):
    """Map a time to the start of its 15-minute slot, in decimal hours.

    Args:
        time_obj: Object exposing ``hour`` and ``minute`` attributes.

    Returns:
        ``hour`` plus 0, 0.25, 0.5 or 0.75 depending on which quarter of the
        hour ``minute`` falls into (e.g. 06:40 -> 6.5).
    """
    # (exclusive upper bound on the minute, fraction of an hour to add)
    quarter_bounds = ((15, 0), (30, 0.25), (45, 0.5))
    for upper_minute, fraction in quarter_bounds:
        if time_obj.minute < upper_minute:
            return time_obj.hour + fraction
    # minute is 45 or later: last quarter of the hour
    return time_obj.hour + 0.75

In [None]:
# obtaining the last bus stop up to which predictions are calculated
def get_last_stop(bus_stop, last_stop=114):
    """Return the final stop to predict up to, starting from ``bus_stop``.

    Args:
        bus_stop: Identifier of the current bus stop.
        last_stop: Identifier of the final stop; defaults to 114, the value
            that was previously hard-coded.

    Returns:
        ``last_stop`` when ``bus_stop`` is at or before it, otherwise ``None``.
    """
    return last_stop if bus_stop <= last_stop else None

In [None]:
def apply_dwell_time_prediction(features):
    """Predict one dwell time with the two-stage classifier + regressor pipeline.

    Args:
        features: Feature input accepted by both ``classifier.predict`` and
            ``reg_model.predict``; only the first prediction of each is used.

    Returns:
        ``0`` when the classifier's first prediction is not 1, otherwise the
        regressor's first prediction.
    """
    dwell_expected = classifier.predict(features)[0] == 1
    if not dwell_expected:
        return 0
    # the regressor is only consulted for rows classified as "dwell time > 0"
    return reg_model.predict(features)[0]

In [None]:
# Function to predict dwell times for multiple subsequent bus_stops
def predict_multiple_stops(features):
    """Predict dwell times from the given stop through the last stop of the route.

    The first prediction uses ``features`` as given. For every following stop
    the features are rolled forward: the previous prediction becomes
    ``dt(n-1)``, the clock advances by the predicted dwell time, and the
    time-slot and weekly history features are looked up again in
    ``bus_stop_data``.

    Args:
        features: Mapping holding every column in ``list_of_col`` plus
            ``arrival_time`` and ``week_no`` for the starting stop. It is not
            modified.

    Returns:
        List of rounded, non-negative dwell-time predictions, one per stop
        from the starting stop to the last stop (inclusive). Only the starting
        stop is predicted when ``get_last_stop`` returns ``None``.

    Notes:
        ``rt(n-1)`` and the other features not listed above keep the value of
        the starting stop, and the clock is advanced by dwell time only.
    """
    # list to save the predicted dwell times
    predicted_times = []

    # values needed for the roll-forward that are not (all) model inputs
    max_stop = get_last_stop(features['bus_stop'])
    timeslot = features['time_of_day']
    curr_time = pd.to_datetime(features['arrival_time'])
    week_no = features['week_no']

    # keep only the model inputs; building a new dict also protects the caller's mapping
    features = {key: features[key] for key in list_of_col}

    # apply prediction for the starting stop and save to list
    predicted_time = max(0, apply_dwell_time_prediction(pd.DataFrame(features, index=[0])))
    predicted_times.append(round(predicted_time))

    # get_last_stop returns None for a stop beyond the last one; comparing
    # against None below would raise TypeError, and there is nothing left to predict
    if max_stop is None:
        return predicted_times

    def weekly_avg(week):
        # weekly mean dwell time of the current stop in the current timeslot
        return searching_historical_weekly_avg_time(bus_stop_data, week, features['bus_stop'], timeslot)

    while features['bus_stop'] < max_stop:

        # update the next stop number
        features['bus_stop'] += 1

        # shift the dwell times of the preceding stops by one position
        features['dt(n-3)'] = features['dt(n-2)']
        features['dt(n-2)'] = features['dt(n-1)']
        features['dt(n-1)'] = predicted_time

        # update current time
        curr_time = curr_time + datetime.timedelta(seconds=round(predicted_time))

        # move the timeslot forward until it catches up with the current time
        while time_to_hour(curr_time) > timeslot:
            timeslot += 0.25
            features['time_of_day'] = (features['time_of_day'] + 0.25) % 24

        # update dt(t-k) values
        features['dt(t-1)'] = searching_historical_avg_time(bus_stop_data, timeslot - 0.25, features['bus_stop'])
        features['dt(t-2)'] = searching_historical_avg_time(bus_stop_data, timeslot - 0.5, features['bus_stop'])

        # update dt(w-k) values; when fewer than three earlier weeks exist,
        # the oldest available value is repeated for the remaining lags
        if week_no > 3:
            features['dt(w-1)'] = weekly_avg(week_no - 1)
            features['dt(w-2)'] = weekly_avg(week_no - 2)
            features['dt(w-3)'] = weekly_avg(week_no - 3)
        elif week_no > 2:
            features['dt(w-1)'] = weekly_avg(week_no - 1)
            features['dt(w-2)'] = weekly_avg(week_no - 2)
            features['dt(w-3)'] = features['dt(w-2)']
        elif week_no > 1:
            features['dt(w-1)'] = weekly_avg(week_no - 1)
            features['dt(w-2)'] = features['dt(w-1)']
            features['dt(w-3)'] = features['dt(w-2)']
        else:
            # no earlier week available: fall back to the current week itself
            features['dt(w-1)'] = weekly_avg(week_no)
            features['dt(w-2)'] = features['dt(w-1)']
            features['dt(w-3)'] = features['dt(w-2)']

        # update the stop type of the new stop
        features['stop_type'] = search_stop_type(features['bus_stop'])

        # apply prediction for this stop and save to list
        predicted_time = max(0, apply_dwell_time_prediction(pd.DataFrame(features, index=[0])))
        predicted_times.append(round(predicted_time))

    return predicted_times

In [None]:
# Row of bus_stop_data at which the multi-step forecast starts
# (other rows tried earlier: 89222, 993).
start_index = 90004
curr_stop = bus_stop_data.loc[start_index, 'bus_stop']
max_stop = get_last_stop(curr_stop)
# assumes the rows following start_index are the consecutive stops up to max_stop — TODO confirm
end_index = start_index + max_stop - curr_stop

# predict_multiple_stops needs the model features plus arrival_time and week_no.
# Select them through a separate list so that list_of_col is never mutated:
# an error between a temporary append and its matching remove would otherwise
# leave the extra columns in the shared feature list.
initial_features = dict(bus_stop_data.loc[start_index, list_of_col + ['arrival_time', 'week_no']])

# Predict dwell times for the starting stop and every subsequent stop
predicted_segment_times = predict_multiple_stops(initial_features)
print("Predicted Dwell Times for Subsequent Stops:", predicted_segment_times)
print('')

actual_segment_times = list(bus_stop_data.loc[start_index:end_index,'dwell_time_in_seconds'])
print("Actual Dwell Times for Subsequent Stop:", actual_segment_times)
print('')

print_error_metrics_pred_vs_actual(predicted_segment_times, actual_segment_times)
print('')

Predicted Dwell Times for Subsequent Stops: [61, 0, 0, 0, 105, 31, 18, 13, 122, 25, 0, 0, 0, 0]

Actual Dwell Times for Subsequent Stop: [60.0, 165.0, 0.0, 0.0, 0.0, 15.0, 0.0, 0.0, 60.0, 0.0, 0.0, 0.0, 30.0, 0.0]

MAE  :  31.071428571428573
RMSE :  56.29577502543407
MAPE :  6.272870909551762e+17 %

