In [None]:
# data handling
import numpy as np
import pandas as pd

# data visualizations
import seaborn as sns
import matplotlib.pyplot as plt

# feature scaling
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# feature selection
from sklearn.feature_selection import RFE

# machine learning algorithms
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn import naive_bayes
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neural_network import MLPClassifier

# dimensionality reduction with PCA
from sklearn.decomposition import PCA

# accuracy metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

# date and time handling
import datetime

## Opening the Data

In [None]:
# Read the feature-augmented bus running-time records from CSV and display the frame.
bus_run_data = pd.read_csv("data/bus_running_times_feature_added_all.csv")
bus_run_data

Unnamed: 0,trip_id,deviceid,direction,segment,date,start_time,end_time,run_time,run_time_in_seconds,length,...,rt(n-2),rt(n-3),hour_of_day,day,month,temp,precip,windspeed,conditions,dt(n-1)
0,1.0,262.0,1.0,1.0,2021-10-01,06:39:49,06:40:58,0 days 00:01:09,69.0,0.6261,...,96.0,96.0,6.0,1.0,10.0,20.0,0.0,6.1,Partially cloudy,0.0
1,1.0,262.0,1.0,2.0,2021-10-01,06:42:12,06:45:42,0 days 00:03:30,210.0,1.2808,...,247.0,247.0,6.0,1.0,10.0,20.0,0.0,6.1,Partially cloudy,74.0
2,1.0,262.0,1.0,3.0,2021-10-01,06:45:42,06:53:58,0 days 00:08:16,496.0,2.1125,...,69.0,506.0,6.0,1.0,10.0,20.0,0.0,6.1,Partially cloudy,0.0
3,1.0,262.0,1.0,4.0,2021-10-01,06:54:04,06:57:19,0 days 00:03:15,195.0,1.5513,...,210.0,69.0,6.0,1.0,10.0,20.0,0.0,6.1,Partially cloudy,6.0
4,1.0,262.0,1.0,5.0,2021-10-01,06:57:19,06:58:56,0 days 00:01:37,97.0,0.8450,...,496.0,210.0,6.0,1.0,10.0,20.0,0.0,6.1,Partially cloudy,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203123,25367.0,1377.0,2.0,30.0,2022-11-01,18:17:36,18:24:21,0:06:45,405.0,2.5600,...,150.0,105.0,18.0,1.0,11.0,20.5,0.0,8.3,Overcast,0.0
203124,25367.0,1377.0,2.0,31.0,2022-11-01,18:24:21,18:25:36,0:01:15,75.0,0.4200,...,195.0,150.0,18.0,1.0,11.0,20.5,0.0,8.3,Overcast,0.0
203125,25367.0,1377.0,2.0,32.0,2022-11-01,18:25:49,18:28:10,0:02:21,141.0,1.3000,...,405.0,195.0,18.0,1.0,11.0,20.5,0.0,8.3,Overcast,13.0
203126,25367.0,1377.0,2.0,33.0,2022-11-01,18:28:10,18:31:25,0:03:15,195.0,1.2200,...,75.0,405.0,18.0,1.0,11.0,20.5,0.0,8.3,Overcast,0.0


In [None]:
# List the column labels of the loaded frame.
bus_run_data.columns

Index(['trip_id', 'deviceid', 'direction', 'segment', 'date', 'start_time',
       'end_time', 'run_time', 'run_time_in_seconds', 'length', 'day_of_week',
       'time_of_day', 'Sunday/holiday', 'saturday', 'weekday/end', 'week_no',
       'rt(w-1)', 'rt(w-2)', 'rt(w-3)', 'rt(t-1)', 'rt(t-2)', 'rt(n-1)',
       'rt(n-2)', 'rt(n-3)', 'hour_of_day', 'day', 'month', 'temp', 'precip',
       'windspeed', 'conditions', 'dt(n-1)'],
      dtype='object')

## Handling Missing Values

In [None]:
# Count the missing values in every column.
bus_run_data.isnull().sum()

trip_id                  64
deviceid                 64
direction                64
segment                  64
date                   2449
start_time             2449
end_time               2449
run_time               2449
run_time_in_seconds    2449
length                   64
day_of_week              64
time_of_day            2449
Sunday/holiday         2449
saturday                  0
weekday/end               0
week_no                  64
rt(w-1)                2449
rt(w-2)                2449
rt(w-3)                2449
rt(t-1)                2449
rt(t-2)                2449
rt(n-1)                1593
rt(n-2)                 835
rt(n-3)                 873
hour_of_day            2449
day                      64
month                    64
temp                   2449
precip                 2449
windspeed              2449
conditions             2449
dt(n-1)                   0
dtype: int64

In [None]:
# summary statistics of the rows whose target (run_time_in_seconds) is missing,
# transposed so that every data column becomes one row of the summary
bus_run_data.loc[bus_run_data.run_time_in_seconds.isnull()].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
trip_id,2385.0,4538.745493,2814.948077,4.0,2155.0,3953.0,6963.0,10203.0
deviceid,2385.0,364.02348,377.194561,116.0,123.0,264.0,264.0,1410.0
direction,2385.0,1.462474,0.498694,1.0,1.0,1.0,2.0,2.0
segment,2385.0,17.683857,9.991683,1.0,9.0,14.0,27.0,34.0
run_time_in_seconds,0.0,,,,,,,
length,2385.0,1.118573,0.49691,0.2572,0.845,1.1483,1.3111,2.5612
day_of_week,2385.0,2.787841,1.865962,0.0,1.0,3.0,4.0,6.0
time_of_day,0.0,,,,,,,
Sunday/holiday,0.0,,,,,,,
saturday,2449.0,0.112291,0.315788,0.0,0.0,0.0,0.0,1.0


In [None]:
# dropping the NA data: keep only the rows where the target run_time_in_seconds is present,
# then re-count the missing values left in each column
bus_run_data = bus_run_data.dropna(subset=['run_time_in_seconds'])
bus_run_data.isnull().sum()

trip_id                0
deviceid               0
direction              0
segment                0
date                   0
start_time             0
end_time               0
run_time               0
run_time_in_seconds    0
length                 0
day_of_week            0
time_of_day            0
Sunday/holiday         0
saturday               0
weekday/end            0
week_no                0
rt(w-1)                0
rt(w-2)                0
rt(w-3)                0
rt(t-1)                0
rt(t-2)                0
rt(n-1)                0
rt(n-2)                0
rt(n-3)                0
hour_of_day            0
day                    0
month                  0
temp                   0
precip                 0
windspeed              0
conditions             0
dt(n-1)                0
dtype: int64

## Understanding the Data

# Parse the `date` column into datetimes.
# `.loc['date'] = ...` would address a ROW labelled 'date'; the column has to be assigned by name.
bus_run_data = bus_run_data.assign(date=pd.to_datetime(bus_run_data['date']))
bus_run_data

In [None]:
# Mean run_time_in_seconds per segment (rows) and time_of_day slot (columns).
bus_run_data.groupby(['segment','time_of_day']).run_time_in_seconds.mean().unstack()

time_of_day,4.25,4.50,4.75,5.00,5.25,5.50,5.75,6.00,6.25,6.50,...,21.00,21.25,21.50,21.75,22.00,22.25,22.50,23.00,23.25,23.50
segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,,,110.0,,,,,92.54717,94.31,100.620853,...,72.5,81.333333,73.666667,,76.0,,,74.0,,
2.0,,,105.0,,,,,219.583333,228.28125,255.564767,...,176.933333,174.090909,162.666667,,135.0,,,150.0,,
3.0,,,255.0,,,,,504.5,417.37037,463.372727,...,326.3,346.1,344.6,,339.0,,,,336.0,
4.0,,,,854.0,,,,,204.836364,202.72381,...,173.59375,198.0,150.777778,180.5,150.0,,,,165.0,
5.0,,,,,104.0,,,,115.469388,117.914894,...,102.133333,94.636364,91.777778,105.0,,117.0,,,102.0,
6.0,,,,,155.0,,,,282.833333,280.848214,...,147.032258,144.25,149.111111,165.0,,142.0,,,121.0,
7.0,,,,,30.0,,,,44.0,44.215385,...,37.052632,39.714286,39.181818,54.666667,,45.0,,,45.0,
8.0,,,,,49.0,,,,62.0,78.31746,...,59.578947,58.428571,55.5,82.0,,45.0,,,,60.0
9.0,,,,,40.0,,,,15.0,31.589286,...,25.216216,24.529412,18.125,33.6,64.0,15.0,,,,15.0
10.0,,,,,166.0,,,,150.0,171.102041,...,115.527778,106.95,98.75,157.0,162.0,131.0,,,,90.0


In [None]:
# Summary statistics of the frame after the rows with a missing target were dropped.
bus_run_data.describe()

Unnamed: 0,trip_id,deviceid,direction,segment,run_time_in_seconds,length,day_of_week,time_of_day,Sunday/holiday,saturday,...,rt(n-1),rt(n-2),rt(n-3),hour_of_day,day,month,temp,precip,windspeed,dt(n-1)
count,200679.0,200679.0,200679.0,200679.0,200679.0,200679.0,200679.0,200679.0,200679.0,200679.0,...,200679.0,200679.0,200679.0,200679.0,200679.0,200679.0,200679.0,200679.0,200679.0,200679.0
mean,10641.386558,543.15747,1.482542,17.402005,188.129376,1.139525,2.837317,12.470151,0.15018,0.143777,...,190.763174,192.328091,201.815995,12.093846,16.012204,7.452633,24.186283,0.256384,8.525696,30.056261
std,8521.651258,510.331311,0.499696,10.626306,125.167081,0.532206,1.951688,3.777001,0.357249,0.350864,...,126.612051,125.433514,133.019926,3.786979,8.718276,4.288811,3.126348,2.021713,6.100737,49.570388
min,1.0,116.0,1.0,1.0,3.0,0.2572,0.0,4.25,0.0,0.0,...,4.0,4.0,3.0,4.0,1.0,1.0,14.0,0.0,0.0,0.0
25%,3923.0,128.0,1.0,8.0,105.0,0.84,1.0,9.25,0.0,0.0,...,107.0,111.0,112.0,9.0,9.0,2.0,22.3,0.0,3.9,0.0
50%,7439.0,274.0,1.0,15.0,162.0,1.15,3.0,12.5,0.0,0.0,...,162.0,163.0,170.0,12.0,16.0,10.0,24.4,0.0,7.9,15.0
75%,21478.0,1143.0,2.0,27.0,233.0,1.3115,5.0,15.5,0.0,0.0,...,233.0,240.0,253.0,15.0,24.0,11.0,26.3,0.0,11.9,32.0
max,25370.0,1719.0,2.0,34.0,1199.0,2.5612,6.0,23.5,1.0,1.0,...,1199.0,1199.0,1199.0,23.0,31.0,12.0,35.7,50.278,31.0,978.0


In [None]:
# Month distribution of the rows with index labels up to and including 74039.
# NOTE(review): 74039 looks like the last direction-1 row, because the direction == 2 rows
# displayed further down start at label 74040 — confirm, and consider filtering on `direction` instead.
bus_run_data.loc[:74039].month.value_counts()

1.0     17704
12.0    16576
2.0     15731
11.0    15320
10.0     7385
Name: month, dtype: int64

In [None]:
# Show the rows where direction == 2.
bus_run_data.loc[bus_run_data.direction == 2]

Unnamed: 0,trip_id,deviceid,direction,segment,date,start_time,end_time,run_time,run_time_in_seconds,length,...,rt(n-2),rt(n-3),hour_of_day,day,month,temp,precip,windspeed,conditions,dt(n-1)
74040,3.0,274.0,2.0,21.0,2021-10-01,06:58:03,07:03:16,0 days 00:05:13,313.0,1.9418,...,354.0,354.0,6.0,1.0,10.0,20.0,0.0,6.1,Partially cloudy,0.0
74041,3.0,274.0,2.0,22.0,2021-10-01,07:04:00,07:06:03,0 days 00:02:03,123.0,0.9166,...,116.0,116.0,7.0,1.0,10.0,20.7,0.0,6.8,Partially cloudy,44.0
74042,3.0,274.0,2.0,23.0,2021-10-01,07:08:34,07:12:28,0 days 00:03:54,234.0,1.1479,...,313.0,175.0,7.0,1.0,10.0,20.7,0.0,6.8,Partially cloudy,151.0
74043,3.0,274.0,2.0,24.0,2021-10-01,07:13:41,07:17:16,0 days 00:03:35,215.0,1.3115,...,123.0,313.0,7.0,1.0,10.0,20.7,0.0,6.8,Partially cloudy,73.0
74044,3.0,274.0,2.0,25.0,2021-10-01,07:17:16,07:20:37,0 days 00:03:21,201.0,1.2594,...,234.0,123.0,7.0,1.0,10.0,20.7,0.0,6.8,Partially cloudy,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203123,25367.0,1377.0,2.0,30.0,2022-11-01,18:17:36,18:24:21,0:06:45,405.0,2.5600,...,150.0,105.0,18.0,1.0,11.0,20.5,0.0,8.3,Overcast,0.0
203124,25367.0,1377.0,2.0,31.0,2022-11-01,18:24:21,18:25:36,0:01:15,75.0,0.4200,...,195.0,150.0,18.0,1.0,11.0,20.5,0.0,8.3,Overcast,0.0
203125,25367.0,1377.0,2.0,32.0,2022-11-01,18:25:49,18:28:10,0:02:21,141.0,1.3000,...,405.0,195.0,18.0,1.0,11.0,20.5,0.0,8.3,Overcast,13.0
203126,25367.0,1377.0,2.0,33.0,2022-11-01,18:28:10,18:31:25,0:03:15,195.0,1.2200,...,75.0,405.0,18.0,1.0,11.0,20.5,0.0,8.3,Overcast,0.0


## Separating into Training and Test Data

In [None]:
# Chronological split: rows dated before the cut-off are used for training, rows on or
# after it for testing.
# NOTE(review): if `date` holds ISO 'YYYY-MM-DD' strings the comparison is lexicographic,
# which coincides with chronological order for that format — confirm the column format.
date = '2022-10-15'
train = bus_run_data.loc[bus_run_data.date < date]
test = bus_run_data.loc[bus_run_data.date >= date]

In [None]:
# Number of rows in the training split.
np.shape(train)[0]

185500

In [None]:
# Number of rows in the test split.
np.shape(test)[0]

15179

In [None]:
# Share of all rows that went into the training split, expressed as a percentage.
training_share = len(train) / len(bus_run_data) * 100
print("Portion taken for training = ",round(training_share, 2),"%")

Portion taken for training =  92.44 %


In [None]:
# list of columns used as model input features, grouped per line as: vehicle and segment
# descriptors, calendar fields, time slot, lagged dwell/run times, weather.
# The target run_time_in_seconds is not among them.
list_of_col = ['deviceid','segment','length','direction',
 'month','day','day_of_week',
 'time_of_day',
 'dt(n-1)','rt(w-1)','rt(w-2)','rt(w-3)','rt(t-1)','rt(t-2)','rt(n-1)','rt(n-2)','rt(n-3)',
 'precip','windspeed']

In [None]:
# Number of feature columns.
len(list_of_col)

19

In [None]:
# Training Data
# Split data into features (the list_of_col columns) and target (run_time_in_seconds)
train_X = train[list_of_col]
train_y = train['run_time_in_seconds']

In [None]:
# Testing Data
# Split data into features (the list_of_col columns) and target (run_time_in_seconds)
test_X = test[list_of_col]
test_y = test['run_time_in_seconds']

## Designing the Model

In [None]:
# Inspect the training feature matrix.
train_X

Unnamed: 0,deviceid,segment,length,direction,month,day,day_of_week,time_of_day,dt(n-1),rt(w-1),rt(w-2),rt(w-3),rt(t-1),rt(t-2),rt(n-1),rt(n-2),rt(n-3),precip,windspeed
0,262.0,1.0,0.6261,1.0,10.0,1.0,4.0,6.50,0.0,96.0,96.0,96.0,96.0,96.0,96.0,96.0,96.0,0.0,6.1
1,262.0,2.0,1.2808,1.0,10.0,1.0,4.0,6.50,74.0,247.0,247.0,247.0,247.0,247.0,69.0,247.0,247.0,0.0,6.1
2,262.0,3.0,2.1125,1.0,10.0,1.0,4.0,6.75,0.0,506.0,506.0,506.0,506.0,506.0,210.0,69.0,506.0,0.0,6.1
3,262.0,4.0,1.5513,1.0,10.0,1.0,4.0,6.75,6.0,192.0,192.0,192.0,192.0,192.0,496.0,210.0,69.0,0.0,6.1
4,262.0,5.0,0.8450,1.0,10.0,1.0,4.0,6.75,0.0,114.0,114.0,114.0,114.0,114.0,195.0,496.0,210.0,0.0,6.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195743,262.0,30.0,2.5600,2.0,10.0,14.0,4.0,18.50,44.0,401.0,401.0,429.0,401.0,386.0,178.0,119.0,58.0,0.0,7.9
195744,262.0,31.0,0.4200,2.0,10.0,14.0,4.0,18.75,15.0,65.0,65.0,65.0,45.0,65.0,385.0,178.0,119.0,0.0,7.9
195745,262.0,32.0,1.3000,2.0,10.0,14.0,4.0,18.75,0.0,172.0,172.0,190.0,195.0,172.0,53.0,385.0,178.0,0.0,7.9
195746,262.0,33.0,1.2200,2.0,10.0,14.0,4.0,18.75,31.0,200.0,200.0,206.0,176.0,170.0,180.0,53.0,385.0,0.0,7.9


In [None]:
# Train a model to predict running times for the next segment
# XGBoost regressor with a squared-error objective: 100 trees, depth at most 5,
# learning rate 0.1; random_state is fixed at 42.
model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    objective='reg:squarederror',
    random_state=42
)

# fit on the training features against the observed run times
model.fit(train_X, train_y)

## Evaluating Errors

In [None]:
def print_error_metrics_pred_vs_actual(pred_y, actual_y):
    """Print MAE, RMSE and MAPE of predictions against observed values.

    Parameters
    ----------
    pred_y : array-like
        Predicted values.
    actual_y : array-like
        Observed values, same length as ``pred_y``.

    Notes
    -----
    The scikit-learn metric functions take ``(y_true, y_pred)``. MAE and RMSE are
    symmetric in their arguments, but MAPE divides by ``y_true``, so the observed
    values have to be passed first.
    """
    print("MAE  : ", mean_absolute_error(actual_y, pred_y))
    print("RMSE : ", (mean_squared_error(actual_y, pred_y))**0.5)
    # mean_absolute_percentage_error returns a fraction; scale by 100 to print a percentage
    print("MAPE : ", mean_absolute_percentage_error(actual_y, pred_y)*100 ,"%")

### Training Error

In [None]:
# Predictions on the same rows the model was fitted on (in-sample).
train_pred_y = model.predict(train_X)

In [None]:
# Report the error metrics of the in-sample predictions.
print("Training Error")
print_error_metrics_pred_vs_actual(train_pred_y, train_y)

Training Error
MAE  :  33.024672536428405
RMSE :  55.28113309369313
MAPE :  1.80620740789259 %


### Validation Error

In [None]:
# Predictions on the test split (rows not used for fitting).
test_pred_y = model.predict(test_X)

In [None]:
# Inspect the raw predictions.
test_pred_y

array([109.99014, 224.07153, 410.4219 , ..., 188.5586 , 202.58563,
        91.61186], dtype=float32)

In [None]:
# Report the error metrics of the predictions on the test split.
print("Validation Error")
print_error_metrics_pred_vs_actual(test_pred_y, test_y)

Validation Error
MAE  :  33.583362060382015
RMSE :  52.86708215775876
MAPE :  1.9110443236662153 %


## Performing Multi-Step Predictions

### Helper functions

In [None]:
def searching_historical(data, time_of_day, segment):
    """Return the rounded value stored for a (time_of_day, segment) pair.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame indexed by a (time_of_day, segment) MultiIndex; the value is read
        from its first column.
    time_of_day, segment
        Index labels to look up.

    Returns
    -------
    int or None
        The rounded value, or ``None`` when the pair is not present in the index.
    """
    try:
        # .iloc[0] is explicit positional access; a bare [0] on a label-indexed row
        # relies on the deprecated integer-key fallback
        return round(data.loc[(time_of_day, segment)].iloc[0])
    except KeyError:
        return None

In [None]:
def searching_historical_avg_time(dataset, time_of_day, segment):
    """Return the rounded historical mean run time of a segment in a time slot.

    Lookup order: the requested slot, then the slots 0.25 and 0.5 hours either side
    (earlier slot first), and finally the segment's mean over all time slots.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain ``time_of_day``, ``segment`` and ``run_time_in_seconds`` columns.
    time_of_day : float
        Time slot in decimal hours.
    segment
        Segment identifier.

    Returns
    -------
    int
        Rounded mean run time in seconds.

    Raises
    ------
    KeyError
        If ``segment`` does not occur in ``dataset`` at all.
    """
    slot_means = pd.DataFrame(dataset.groupby(['time_of_day','segment']).run_time_in_seconds.mean())

    candidate_slots = [time_of_day,
                       time_of_day - 0.25, time_of_day + 0.25,
                       time_of_day - 0.5, time_of_day + 0.5]
    for slot in candidate_slots:
        avg_time = searching_historical(slot_means, slot, segment)
        if avg_time is not None:
            return avg_time

    # Last resort: the segment's mean over every time slot. This must be read from the
    # per-segment table; indexing the (time_of_day, segment) table with a segment id
    # would match the id against the time_of_day level instead.
    segment_means = dataset.groupby(['segment']).run_time_in_seconds.mean()
    return round(segment_means.loc[segment])


In [None]:
def searching_historical_weekly_avg_time(dataset, week_no, segment, time_of_day):
    """Return the rounded mean run time of a segment for a given week and time slot.

    Falls back, in order, to a nearby time slot of the same week
    (``search_nearby_historical``) and then to the week-independent average
    (``searching_historical_avg_time``).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain ``week_no``, ``segment``, ``time_of_day`` and
        ``run_time_in_seconds`` columns.
    week_no, segment, time_of_day
        Lookup key.

    Returns
    -------
    int
        Rounded mean run time in seconds.
    """
    data = pd.DataFrame(dataset.groupby(['week_no','segment','time_of_day']).run_time_in_seconds.mean())
    try:
        # explicit positional access instead of the deprecated integer-key [0]
        avg_time = data.loc[(week_no, segment, time_of_day)].iloc[0]
    except KeyError:
        avg_time = search_nearby_historical(dataset, week_no, segment, time_of_day)

    if avg_time is not None:
        return round(avg_time)

    # fall back on the same dataset the caller supplied rather than on a global frame
    return searching_historical_avg_time(dataset, time_of_day, segment)

In [None]:
def searching_historical_weekly(dataset, week_no, segment, time_of_day):
    """Return the rounded mean run time for an exact (week_no, segment, time_of_day) key.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain ``week_no``, ``segment``, ``time_of_day`` and
        ``run_time_in_seconds`` columns.
    week_no, segment, time_of_day
        Lookup key.

    Returns
    -------
    int or None
        Rounded mean run time, or ``None`` when the key has no observations.
    """
    # NOTE: the grouped table is rebuilt on every call
    data = pd.DataFrame(dataset.groupby(['week_no','segment','time_of_day']).run_time_in_seconds.mean())
    try:
        # explicit positional access instead of the deprecated integer-key [0]
        avg_time = data.loc[(week_no, segment, time_of_day)].iloc[0]
    except KeyError:
        return None
    return round(avg_time)

In [None]:
def search_nearby_historical(dataset, week_no, segment, time_of_day):
    """Find a weekly average for the closest time slot around ``time_of_day``.

    Slots are tried in order of increasing distance from ``time_of_day``: 0.25 h
    earlier, 0.25 h later, 0.5 h earlier, 0.5 h later, and so on up to 2 h on
    either side. ``time_of_day`` itself is not tried.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Passed through to ``searching_historical_weekly``.
    week_no, segment
        Week and segment to look up.
    time_of_day : float
        Centre of the search window, in decimal hours.

    Returns
    -------
    int or None
        The first average found, or ``None`` if no slot in the window has data.
    """
    slot_width = 0.25   # width of one time slot, in hours
    max_offset = 2      # furthest distance searched on each side, in hours

    # Walk outwards symmetrically so the nearest slot wins on either side and no
    # slot is queried twice.
    for step in range(1, int(max_offset / slot_width) + 1):
        offset = step * slot_width
        for nearby_timeslot in (time_of_day - offset, time_of_day + offset):
            avg_time = searching_historical_weekly(dataset, week_no, segment, nearby_timeslot)
            if avg_time is not None:
                return avg_time

    return None

In [None]:
def search_segment_length(segment_id):
    """Return the length recorded for a segment in ``bus_run_data``.

    When a segment has more than one distinct length value, the first one
    encountered is used.

    Parameters
    ----------
    segment_id
        Segment identifier.

    Returns
    -------
    The segment's length, or ``None`` when the segment does not occur in
    ``bus_run_data``.
    """
    first_length_by_segment = bus_run_data.groupby('segment').length.unique().apply(lambda x: x[0])

    try:
        # label lookup on the per-segment Series; avoids the deprecated positional [0]
        return first_length_by_segment.loc[segment_id]
    except KeyError:
        return None

In [None]:
# function to convert a time object into its quarter-hour slot, in decimal hours
def time_to_hour(time_obj):
    """Return the start of the 15-minute slot containing ``time_obj``, in decimal hours.

    Parameters
    ----------
    time_obj
        Any object exposing ``hour`` and ``minute`` attributes.

    Returns
    -------
    int or float
        ``hour`` plus 0, 0.25, 0.5 or 0.75 depending on the quarter the minute
        falls in (a time in the first quarter yields the bare integer hour).
    """
    # offset contributed by each quarter of the hour
    quarter_offsets = (0, 0.25, 0.5, 0.75)
    quarter_index = min(time_obj.minute // 15, 3)
    return time_obj.hour + quarter_offsets[quarter_index]

In [None]:
# obtaining the last segment to calculate
def get_last_segment(segment):
    """Return the terminal segment of the stretch that ``segment`` falls in.

    Returns 15 for segments up to 15, 34 for segments above 15 and up to 34,
    and ``None`` for anything else.
    """
    for terminal_segment in (15, 34):
        if segment <= terminal_segment:
            return terminal_segment
    return None

In [None]:
# Function to predict running times for multiple subsequent segments
def predict_multiple_segments(model, features):
    """Predict run times from the given segment through the last segment of its stretch.

    The first prediction uses ``features`` as supplied. Every further prediction is
    made recursively: the previous prediction becomes ``rt(n-1)``, the clock is
    advanced by it, and the time slot, the historical averages and the segment
    length are refreshed for the next segment. Every other feature (for example
    ``dt(n-1)``, ``precip``, ``windspeed``) is carried over unchanged from the
    starting row.

    Parameters
    ----------
    model
        Fitted regressor whose ``predict`` accepts a DataFrame with the
        ``list_of_col`` columns.
    features : mapping
        Starting row. Must provide every key in ``list_of_col`` plus
        ``start_time`` and ``week_no``. It is not modified.

    Returns
    -------
    list of int
        Rounded predicted run times, starting with ``features['segment']`` and
        ending with the segment returned by ``get_last_segment``.

    Raises
    ------
    ValueError
        If ``get_last_segment`` has no terminal segment for ``features['segment']``.
    """
    # list to save the predicted running times
    predicted_times = []

    max_segment = get_last_segment(features['segment'])
    if max_segment is None:
        # fail with a clear message instead of a TypeError from `segment < None` below
        raise ValueError(f"segment {features['segment']!r} is outside the known segment range")

    timeslot = features['time_of_day']  # quarter-hour slot the current segment starts in
    curr_time = pd.to_datetime(features['start_time'])
    week_no = features['week_no']

    # forming the data to be sent for model prediction (a fresh dict, so the caller's
    # mapping is left untouched)
    features = {key: features[key] for key in list_of_col}

    def weekly_avg(weeks_back):
        # reads the *current* segment and timeslot each time it is called
        return searching_historical_weekly_avg_time(
            bus_run_data, week_no - weeks_back, features['segment'], timeslot)

    # apply prediction and save to list
    predicted_time = model.predict(pd.DataFrame([features]))[0]
    predicted_times.append(round(predicted_time))

    # applying for each remaining segment of the route
    while (features['segment'] < max_segment):

        # update the next segment number
        features['segment'] +=1

        # the newest prediction becomes the most recent lag; older lags move back one place
        features['rt(n-3)'] = features['rt(n-2)']
        features['rt(n-2)'] = features['rt(n-1)']
        features['rt(n-1)'] = predicted_time

        # advance the clock by the predicted run time
        curr_time = curr_time + datetime.timedelta(seconds = round(predicted_time))

        # move the time slot forward until it contains the new clock time
        while (time_to_hour(curr_time) > timeslot):
            timeslot += 0.25
            features['time_of_day'] = (features['time_of_day'] + 0.25) % 24  # Increment hour

        # update rt(t-k) values
        features['rt(t-1)'] = searching_historical_avg_time(bus_run_data, timeslot - 0.25, features['segment'])
        features['rt(t-2)'] = searching_historical_avg_time(bus_run_data, timeslot - 0.5, features['segment'])

        # update rt(w-k) values; when fewer than three earlier weeks exist, the oldest
        # available value is repeated for the missing lags
        if week_no > 3:
            features['rt(w-1)'] = weekly_avg(1)
            features['rt(w-2)'] = weekly_avg(2)
            features['rt(w-3)'] = weekly_avg(3)
        elif week_no > 2:
            features['rt(w-1)'] = weekly_avg(1)
            features['rt(w-2)'] = weekly_avg(2)
            features['rt(w-3)'] = features['rt(w-2)']
        elif week_no > 1:
            features['rt(w-1)'] = weekly_avg(1)
            features['rt(w-2)'] = features['rt(w-1)']
            features['rt(w-3)'] = features['rt(w-2)']
        else:
            # no earlier week: use the current week's own average
            features['rt(w-1)'] = weekly_avg(0)
            features['rt(w-2)'] = features['rt(w-1)']
            features['rt(w-3)'] = features['rt(w-2)']

        # update length of segment
        features['length'] = search_segment_length(features['segment'])

        # apply prediction and save to list
        predicted_time = model.predict(pd.DataFrame([features]))[0]
        predicted_times.append(round(predicted_time))

    return predicted_times


In [None]:
# Roll the model forward from one row and compare with what was observed.
start_index = 200023
curr_segment = bus_run_data.loc[start_index, 'segment']
max_segment = get_last_segment(curr_segment)
# cast to int so the end label is an integer rather than a float
end_index = int(start_index + max_segment - curr_segment)

# Starting row = model features plus the two bookkeeping columns the roll-out reads.
# A temporary list is used so that the shared list_of_col is never mutated.
seed_columns = list_of_col + ['start_time', 'week_no']
initial_features = dict(bus_run_data.loc[start_index, seed_columns])

# Predict running times for multiple segments
predicted_segment_times = predict_multiple_segments(model, initial_features)
print("Predicted Running Times for Subsequent Segments:", predicted_segment_times)
print('')

# NOTE(review): this label slice assumes the remaining segments of the trip occupy
# consecutive index labels — confirm no rows in between were dropped.
actual_segment_times = list(bus_run_data.loc[start_index:end_index,'run_time_in_seconds'])
print("Actual Running Times for Subsequent Segments:", actual_segment_times)
print('')

print_error_metrics_pred_vs_actual(predicted_segment_times, actual_segment_times)
print('')

# Predicted completion time of each segment, on a placeholder calendar date.
start_time = bus_run_data.loc[start_index, 'start_time']
start_time = pd.to_datetime(start_time)
curr_time = datetime.datetime(2023,8,23,start_time.hour, start_time.minute, start_time.second)

for seg_time in predicted_segment_times:
    segment_delta = datetime.timedelta(seconds=round(seg_time))
    curr_time = curr_time + segment_delta
    print(curr_segment,' - ' , curr_time)
    curr_segment+=1

Predicted Running Times for Subsequent Segments: [63, 138, 211, 411, 69, 303, 306, 133]

Actual Running Times for Subsequent Segments: [61.0, 175.0, 203.0, 271.0, 57.0, 725.0, 379.0, 153.0]

MAE  :  89.25
RMSE :  160.0757633122516
MAPE :  3.292499522353771 %

27.0  -  2023-08-23 13:23:46
28.0  -  2023-08-23 13:26:04
29.0  -  2023-08-23 13:29:35
30.0  -  2023-08-23 13:36:26
31.0  -  2023-08-23 13:37:35
32.0  -  2023-08-23 13:42:38
33.0  -  2023-08-23 13:47:44
34.0  -  2023-08-23 13:49:57


# Analyzing the Error for each Time Step

In [None]:
def update_time(row):
    """Return the row's start time advanced by its ``rt(n-1)`` value.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``start_time`` (anything ``pd.to_datetime`` accepts) and
        ``rt(n-1)`` (seconds). The row is not modified.

    Returns
    -------
    pandas.Timestamp
        ``start_time`` plus ``rt(n-1)`` rounded to whole seconds.
    """
    segment_start = pd.to_datetime(row['start_time'])
    return segment_start + datetime.timedelta(seconds = round(row['rt(n-1)']))

In [None]:
def _weekly_average_weeks_back(row, weeks_back):
    """Weekly average run time for the row's segment and time slot, ``weeks_back`` weeks earlier."""
    return searching_historical_weekly_avg_time(
        bus_run_data,
        row['week_no'] - weeks_back,
        row['segment'],
        row['time_of_day'],
    )

def update_week1_average_run_time(row):
    """Weekly average for the row's segment and time slot, one week before ``week_no``."""
    return _weekly_average_weeks_back(row, 1)

def update_week2_average_run_time(row):
    """Weekly average for the row's segment and time slot, two weeks before ``week_no``."""
    return _weekly_average_weeks_back(row, 2)

def update_week3_average_run_time(row):
    """Weekly average for the row's segment and time slot, three weeks before ``week_no``."""
    return _weekly_average_weeks_back(row, 3)

In [None]:
def _slot_average_hours_back(row, hours_back):
    """Historical average run time for the row's segment, ``hours_back`` hours before its time slot."""
    return searching_historical_avg_time(
        bus_run_data,
        row['time_of_day'] - hours_back,
        row['segment'],
    )

def update_time1_avg(row):
    """Historical average for the row's segment one slot (0.25 h) before its time slot."""
    return _slot_average_hours_back(row, 0.25)

def update_time2_avg(row):
    """Historical average for the row's segment two slots (0.5 h) before its time slot."""
    return _slot_average_hours_back(row, 0.5)

In [None]:
print("Error for 2nd Time Step")

test_copy = test.copy()
k = 2
for i in range(1, k):
    test11 = test_copy.copy()

    # Segment every row originally describes, read from the untouched `test` frame.
    base_segment = test.loc[test11.index, 'segment']

    # Ground truth: the run time observed i rows further on in `test`, accepted only
    # when that row belongs to the same trip and lies exactly i segments ahead
    # (a bare shift would silently pair rows across trips or across missing segments).
    is_same_trip = test['trip_id'].shift(-i) == test['trip_id']
    is_i_segments_ahead = test['segment'].shift(-i) == test['segment'] + i
    test11['next_run_time'] = test['run_time_in_seconds'].shift(-i).where(is_same_trip & is_i_segments_ahead)

    # keep rows whose target segment is still inside the 1-15 or 21-34 stretch and has
    # an observed run time; .copy() so the assignments below act on an independent frame
    on_route = ((base_segment > 20) & (base_segment <= 34-i)) | (base_segment <= 15-i)
    test11 = test11.loc[on_route & test11['next_run_time'].notna()].copy()
    base_segment = base_segment.loc[test11.index]

    # Predict the current segment BEFORE shifting the lags, so the model sees the
    # features it was given rather than half-shifted ones.
    step_pred = model.predict(test11[list_of_col])
    test11['rt(n-3)'] = test11['rt(n-2)']
    test11['rt(n-2)'] = test11['rt(n-1)']
    test11['rt(n-1)'] = step_pred

    # the next segment starts once the predicted run time has elapsed
    test11['start_time'] = test11.apply(update_time, axis=1)

    # move the feature row on to the segment that is being scored
    test11['segment'] = base_segment + i
    test11['time_of_day'] = test11['start_time'].apply(time_to_hour)
    test11['length'] = test11['segment'].apply(search_segment_length)

    test11['rt(w-3)'] = test11.apply(update_week3_average_run_time, axis=1)
    test11['rt(w-2)'] = test11.apply(update_week2_average_run_time, axis=1)
    test11['rt(w-1)'] = test11.apply(update_week1_average_run_time, axis=1)

    test11['rt(t-1)'] = test11.apply(update_time1_avg, axis=1)
    test11['rt(t-2)'] = test11.apply(update_time2_avg, axis=1)

    print_error_metrics_pred_vs_actual(model.predict(test11[list_of_col]), test11['next_run_time'])

    # carry the advanced feature rows forward for a further step
    test_copy = test11

Error for 2nd Time Step
MAE  :  118.2868419664765
RMSE :  152.51881898041896
MAPE :  8.258429418158878 %


In [None]:
print("Error for 3rd Time Step")

k1 = 3
for i in range(k, k1):
    test11 = test_copy.copy()

    # Segment every row originally describes, read from the untouched `test` frame so
    # the bookkeeping does not depend on what `test_copy` holds in its `segment` column.
    base_segment = test.loc[test11.index, 'segment']

    # Ground truth: the run time observed i rows further on in `test`, accepted only
    # when that row belongs to the same trip and lies exactly i segments ahead.
    # (Shifting the already-filtered frame by one row would return the previous
    # step's target instead.)
    is_same_trip = test['trip_id'].shift(-i) == test['trip_id']
    is_i_segments_ahead = test['segment'].shift(-i) == test['segment'] + i
    test11['next_run_time'] = test['run_time_in_seconds'].shift(-i).where(is_same_trip & is_i_segments_ahead)

    # keep rows whose target segment is still inside the 1-15 or 21-34 stretch and has
    # an observed run time; .copy() so the assignments below act on an independent frame
    on_route = ((base_segment > 20) & (base_segment <= 34-i)) | (base_segment <= 15-i)
    test11 = test11.loc[on_route & test11['next_run_time'].notna()].copy()
    base_segment = base_segment.loc[test11.index]

    # Predict from the carried-over features BEFORE shifting the lags, so the model
    # sees the features it was given rather than half-shifted ones.
    step_pred = model.predict(test11[list_of_col])
    test11['rt(n-3)'] = test11['rt(n-2)']
    test11['rt(n-2)'] = test11['rt(n-1)']
    test11['rt(n-1)'] = step_pred

    # the next segment starts once the predicted run time has elapsed
    test11['start_time'] = test11.apply(update_time, axis=1)

    # move the feature row on to the segment that is being scored
    test11['segment'] = base_segment + i
    test11['time_of_day'] = test11['start_time'].apply(time_to_hour)
    test11['length'] = test11['segment'].apply(search_segment_length)

    test11['rt(w-3)'] = test11.apply(update_week3_average_run_time, axis=1)
    test11['rt(w-2)'] = test11.apply(update_week2_average_run_time, axis=1)
    test11['rt(w-1)'] = test11.apply(update_week1_average_run_time, axis=1)

    test11['rt(t-1)'] = test11.apply(update_time1_avg, axis=1)
    test11['rt(t-2)'] = test11.apply(update_time2_avg, axis=1)

    print_error_metrics_pred_vs_actual(model.predict(test11[list_of_col]), test11['next_run_time'])

    # carry the advanced feature rows forward for a further step
    test_copy = test11

Error for 3rd Time Step
MAE  :  114.70011775714707
RMSE :  149.0048740690877
MAPE :  8.202222557846854 %
