In [69]:
import pandas as pd
from datetime import datetime
import numpy as np

# loading the static data
calendarDates = pd.read_csv('static/calendar_dates.csv')
routes = pd.read_csv('static/routes.csv')
shapes = pd.read_csv('static/shapes.csv')
stopTimes = pd.read_csv('static/stop_times.csv')
stops = pd.read_csv('static/stops.csv')
trips = pd.read_csv('static/trips.csv')

In [70]:
# converting the arrival and departure times to datetime timedeltas
stopTimes['arrival_time'] = pd.to_timedelta(stopTimes['arrival_time'])
stopTimes['departure_time'] = pd.to_timedelta(stopTimes['departure_time'])

# dwell time calculation
stopTimes['dwell_time'] = stopTimes['departure_time'] - stopTimes['arrival_time']
stopTimes['dwell_time'] = stopTimes['dwell_time'].dt.total_seconds()

stopTimes.head()

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,pickup_type,drop_off_type,shape_dist_traveled,dwell_time
0,6801326_20121,0 days 07:00:00,0 days 07:00:00,PF_A15_C,1,0,0,0.0,0.0
1,6801326_20121,0 days 07:04:00,0 days 07:04:00,PF_A14_C,2,0,0,2.671,0.0
2,6801326_20121,0 days 07:07:00,0 days 07:07:00,PF_A13_C,3,0,0,4.7758,0.0
3,6801326_20121,0 days 07:09:00,0 days 07:09:00,PF_A12_C,4,0,0,5.8746,0.0
4,6801326_20121,0 days 07:12:00,0 days 07:12:00,PF_A11_C,5,0,0,7.3282,0.0


In [71]:
# generating the link information and saving it to a csv file
links = []
for i in range(len(stopTimes) - 1):
    if stopTimes.loc[i, 'trip_id'] == stopTimes.loc[i + 1, 'trip_id']:
        start_stop = stopTimes.loc[i, 'stop_id']
        end_stop = stopTimes.loc[i + 1, 'stop_id']
        start_sequence = stopTimes.loc[i, 'stop_sequence']
        end_sequence = stopTimes.loc[i + 1, 'stop_sequence']
        length = stopTimes.loc[i + 1, 'shape_dist_traveled'] - stopTimes.loc[i, 'shape_dist_traveled']

        # Calculate time difference in minutes
        start_time = stopTimes.loc[i, 'departure_time']
        end_time = stopTimes.loc[i + 1, 'arrival_time']
        scheduled_time_taken = (end_time - start_time).total_seconds() / 60 

        links.append({
            'trip_id': stopTimes.loc[i, 'trip_id'],
            'start_stop': start_stop,
            'end_stop': end_stop,
            'start_sequence': start_sequence,
            'end_sequence': end_sequence,
            'length': length,
            'scheduled_time_taken [mins]': scheduled_time_taken,
            'expected_speed [km/h]': length / (scheduled_time_taken / 60),
            'start_time': start_time,
            'end_time': end_time
        })

linksDf = pd.DataFrame(links)
linksDf.to_csv('static/links.csv', index=False)
linksDf

Unnamed: 0,trip_id,start_stop,end_stop,start_sequence,end_sequence,length,scheduled_time_taken [mins],expected_speed [km/h],start_time,end_time
0,6801326_20121,PF_A15_C,PF_A14_C,1,2,2.6710,4.0,40.0650,0 days 07:00:00,0 days 07:04:00
1,6801326_20121,PF_A14_C,PF_A13_C,2,3,2.1048,3.0,42.0960,0 days 07:04:00,0 days 07:07:00
2,6801326_20121,PF_A13_C,PF_A12_C,3,4,1.0988,2.0,32.9640,0 days 07:07:00,0 days 07:09:00
3,6801326_20121,PF_A12_C,PF_A11_C,4,5,1.4536,3.0,29.0720,0 days 07:09:00,0 days 07:12:00
4,6801326_20121,PF_A11_C,PF_A10_C,5,6,2.1612,4.0,32.4180,0 days 07:12:00,0 days 07:16:00
...,...,...,...,...,...,...,...,...,...,...
184021,6556673_20121,PF_D08_C,PF_G01_C,29,30,2.5811,5.0,30.9732,1 days 00:19:00,1 days 00:24:00
184022,6556673_20121,PF_G01_C,PF_G02_C,30,31,1.3466,3.0,26.9320,1 days 00:24:00,1 days 00:27:00
184023,6556673_20121,PF_G02_C,PF_G03_C,31,32,0.9846,2.0,29.5380,1 days 00:27:00,1 days 00:29:00
184024,6556673_20121,PF_G03_C,PF_G04_C,32,33,1.7288,3.0,34.5760,1 days 00:29:00,1 days 00:32:00


In [72]:
# filtering the unique links
uniqueLinks = linksDf.drop_duplicates(subset=['start_stop', 'end_stop'])[['start_stop', 'end_stop','start_sequence', 'end_sequence', 'length']]
uniqueLinks.reset_index(drop=True, inplace=True)
uniqueLinks

Unnamed: 0,start_stop,end_stop,start_sequence,end_sequence,length
0,PF_A15_C,PF_A14_C,1,2,2.6710
1,PF_A14_C,PF_A13_C,2,3,2.1048
2,PF_A13_C,PF_A12_C,3,4,1.0988
3,PF_A12_C,PF_A11_C,4,5,1.4536
4,PF_A11_C,PF_A10_C,5,6,2.1612
...,...,...,...,...,...
197,PF_N11_C,PF_N10_C,2,3,2.8733
198,PF_N10_C,PF_N09_C,3,4,2.0799
199,PF_N09_C,PF_N08_C,4,5,1.7224
200,PF_N08_C,PF_N07_C,5,6,1.3477


In [73]:
# unique nodes and average dwell time
uniqueNodes = stopTimes.drop_duplicates(subset=['stop_id'])[['stop_id']]
uniqueNodes.reset_index(drop=True, inplace=True)
uniqueNodes['average_dwell_time [mins]'] = stopTimes.groupby('stop_id')['dwell_time'].mean() / 60
uniqueNodes

Unnamed: 0,stop_id,average_dwell_time [mins]
0,PF_A15_C,
1,PF_A14_C,
2,PF_A13_C,
3,PF_A12_C,
4,PF_A11_C,
...,...,...
120,PF_N08_C,
121,PF_N09_C,
122,PF_N10_C,
123,PF_N11_C,


In [74]:
# iterating across the unique links to calculate the averages of the link data

for index, row in uniqueLinks.iterrows():
    link = linksDf[(linksDf['start_stop'] == row['start_stop']) & (linksDf['end_stop'] == row['end_stop'])]

    mean_speed = link['expected_speed [km/h]'].mean()
    covariance_speed = link['expected_speed [km/h]'].std() / mean_speed
    buffer_speed = link['expected_speed [km/h]'].quantile(0.95) - link['expected_speed [km/h]'].quantile(0.5)

    mean_time = link['scheduled_time_taken [mins]'].mean()
    covariance_time = link['scheduled_time_taken [mins]'].std() / mean_time
    buffer_time = link['scheduled_time_taken [mins]'].quantile(0.95) - link['scheduled_time_taken [mins]'].quantile(0.5)

    uniqueLinks.loc[index, 'mean_speed [km/h]'] = mean_speed
    uniqueLinks.loc[index, 'covariance_speed [%]'] = covariance_speed
    uniqueLinks.loc[index, 'buffer_speed [km/h]'] = buffer_speed
    uniqueLinks.loc[index, 'mean_time [mins]'] = mean_time
    uniqueLinks.loc[index, 'covariance_time [%]'] = covariance_time
    uniqueLinks.loc[index, 'buffer_time [mins]'] = buffer_time

    # calculating the mean headway between trips
    headways = []
    trips = stopTimes[stopTimes['stop_id'] == row['start_stop']]
    trips = trips.drop_duplicates(subset=['trip_id'])
    trips = trips.sort_values(by='departure_time')
    trips = trips.drop_duplicates(subset=['departure_time'])
    for i in range(len(trips) - 1):
        start_time = trips.iloc[i]['departure_time']
        end_time = trips.iloc[i + 1]['departure_time']
        headways.append((end_time - start_time).total_seconds() / 60)

    if headways:
        mean_headway = pd.Series(headways).mean()
        covariance_headway = pd.Series(headways).std() / mean_headway
        buffer_headway = pd.Series(headways).quantile(0.95) - pd.Series(headways).quantile(0.5)
        uniqueLinks.loc[index, 'mean_headway [mins]'] = mean_headway
        uniqueLinks.loc[index, 'covariance_headway [%]'] = covariance_headway
        uniqueLinks.loc[index, 'buffer_headway [mins]'] = buffer_headway
    else:
        uniqueLinks.loc[index, 'mean_headway [mins]'] = None
        uniqueLinks.loc[index, 'covariance_headway [%]'] = None
        uniqueLinks.loc[index, 'buffer_headway [mins]'] = None


In [75]:
uniqueLinks.to_csv('static/unique_links.csv', index=False)
uniqueLinks

Unnamed: 0,start_stop,end_stop,start_sequence,end_sequence,length,mean_speed [km/h],covariance_speed [%],buffer_speed [km/h],mean_time [mins],covariance_time [%],buffer_time [mins],mean_headway [mins],covariance_headway [%],buffer_headway [mins]
0,PF_A15_C,PF_A14_C,1,2,2.6710,40.065000,1.774532e-16,0.0,4.0,0.0,0.0,2.171030,0.594617,3.00
1,PF_A14_C,PF_A13_C,2,3,2.1048,42.096000,1.688916e-16,0.0,3.0,0.0,0.0,2.684783,0.605075,4.00
2,PF_A13_C,PF_A12_C,3,4,1.0988,32.964000,0.000000e+00,0.0,2.0,0.0,0.0,2.654428,0.656903,4.00
3,PF_A12_C,PF_A11_C,4,5,1.4536,29.072000,0.000000e+00,0.0,3.0,0.0,0.0,2.328897,0.579458,2.00
4,PF_A11_C,PF_A10_C,5,6,2.1612,32.418000,2.193121e-16,0.0,4.0,0.0,0.0,2.317490,0.577380,2.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197,PF_N11_C,PF_N10_C,2,3,2.8733,34.479600,0.000000e+00,0.0,5.0,0.0,0.0,2.990544,0.902491,7.00
198,PF_N10_C,PF_N09_C,3,4,2.0799,31.198628,4.425913e-05,0.0,4.0,0.0,0.0,2.778022,0.795745,5.00
199,PF_N09_C,PF_N08_C,4,5,1.7224,34.447983,5.344577e-06,0.0,3.0,0.0,0.0,2.941452,0.909301,6.00
200,PF_N08_C,PF_N07_C,5,6,1.3477,26.954000,5.260984e-16,0.0,3.0,0.0,0.0,2.906977,0.817076,5.55


In [76]:
# # Doing the same as above, but only for a 15 minute window at a time such that all values can have a weighted and unweighted average calculated
# # This is done to ensure that the values are not skewed by the time of day

# # calculating the total time period of the data, then calculating the number of 15 minute windows
# start = stopTimes['departure_time'].min()
# finish = stopTimes['departure_time'].max()
# totalTime = (finish - start).total_seconds() / 60
# totalWindows = int(np.ceil(totalTime / 15))

# # speed arrays
# meanSpeedWindows = np.zeros((len(uniqueLinks), totalWindows))
# weightMeanSpeedWindows = np.zeros((len(uniqueLinks), totalWindows))

# covarianceSpeedWindows = np.zeros((len(uniqueLinks), totalWindows))
# weightCovarianceSpeedWindows = np.zeros((len(uniqueLinks), totalWindows))

# bufferSpeedWindows = np.zeros((len(uniqueLinks), totalWindows))
# weightBufferSpeedWindows = np.zeros((len(uniqueLinks), totalWindows))

# # time arrays
# meanTimeWindows = np.zeros((len(uniqueLinks), totalWindows))
# weightMeanTimeWindows = np.zeros((len(uniqueLinks), totalWindows))

# covarianceTimeWindows = np.zeros((len(uniqueLinks), totalWindows))
# weightCovarianceTimeWindows = np.zeros((len(uniqueLinks), totalWindows))

# bufferTimeWindows = np.zeros((len(uniqueLinks), totalWindows))
# weightBufferTimeWindows = np.zeros((len(uniqueLinks), totalWindows))

# # headway arrays
# meanHeadwayWindows = np.zeros((len(uniqueLinks), totalWindows))
# weightMeanHeadwayWindows = np.zeros((len(uniqueLinks), totalWindows))

# covarianceHeadwayWindows = np.zeros((len(uniqueLinks), totalWindows))
# weightCovarianceHeadwayWindows = np.zeros((len(uniqueLinks), totalWindows))

# bufferHeadwayWindows = np.zeros((len(uniqueLinks), totalWindows))
# weightBufferHeadwayWindows = np.zeros((len(uniqueLinks), totalWindows))

# for index, row in uniqueLinks.iterrows():
#     link = linksDf[(linksDf['start_stop'] == row['start_stop']) & (linksDf['end_stop'] == row['end_stop'])]

#     for i in range(totalWindows):
#         startTime = start + pd.Timedelta(minutes=i*15)
#         endTime = start + pd.Timedelta(minutes=(i+1)*15)
#         linkWindow = link[(link['start_time'] >= startTime) & (link['end_time'] <= endTime)]
#         print(linkWindow)
        
#         meanSpeedWindows[index, i] = linkWindow['expected_speed [km/h]'].mean()
#         weightMeanSpeedWindows[index, i] = (linkWindow['expected_speed [km/h]'].mean() * len(linkWindow))

#         covarianceSpeedWindows[index, i] = linkWindow['expected_speed [km/h]'].std() / meanSpeedWindows[index, i]
#         weightCovarianceSpeedWindows[index, i] = (linkWindow['expected_speed [km/h]'].std() / meanSpeedWindows[index, i]) * len(linkWindow)
        
#         bufferSpeedWindows[index, i] = linkWindow['expected_speed [km/h]'].quantile(0.95) - linkWindow['expected_speed [km/h]'].quantile(0.5)
#         weightBufferSpeedWindows[index, i] = (linkWindow['expected_speed [km/h]'].quantile(0.95) - linkWindow['expected_speed [km/h]'].quantile(0.5)) * len(linkWindow)

#         meanTimeWindows[index, i] = linkWindow['scheduled_time_taken [mins]'].mean()
#         weightMeanTimeWindows[index, i] = linkWindow['scheduled_time_taken [mins]'].mean() * len(linkWindow)

#         covarianceTimeWindows[index, i] = linkWindow['scheduled_time_taken [mins]'].std() / meanTimeWindows[index, i]
#         weightCovarianceTimeWindows[index, i] = (linkWindow['scheduled_time_taken [mins]'].std() / meanTimeWindows[index, i]) * len(linkWindow)
        
#         bufferTimeWindows[index, i] = linkWindow['scheduled_time_taken [mins]'].quantile(0.95) - linkWindow['scheduled_time_taken [mins]'].quantile(0.5)
#         weightBufferTimeWindows[index, i] = (linkWindow['scheduled_time_taken [mins]'].quantile(0.95) - linkWindow['scheduled_time_taken [mins]'].quantile(0.5)) * len(linkWindow)

#         headways = []
#         trips = stopTimes[stopTimes['stop_id'] == row['start_stop']]
#         trips = trips.drop_duplicates(subset=['trip_id'])
#         trips = trips.sort_values(by='departure_time')
#         trips = trips.drop_duplicates(subset=['departure_time'])
#         trips = trips[(trips['departure_time'] >= startTime) & (trips['departure_time'] <= endTime)]
#         for j in range(len(trips) - 1):
#             startTime = trips.iloc[j]['departure_time']
#             endTime = trips.iloc[j + 1]['departure_time']
#             headways.append((endTime - startTime).total_seconds() / 60)

#         if headways:
#             meanHeadwayWindows[index, i] = pd.Series(headways).mean()
#             weightMeanHeadwayWindows[index, i] = pd.Series(headways).mean() * len(headways)
            
#             covarianceHeadwayWindows[index, i] = pd.Series(headways).std() / meanHeadwayWindows[index, i]
#             weightCovarianceHeadwayWindows[index, i] = (pd.Series(headways).std() / meanHeadwayWindows[index, i]) * len(headways)
            
#             bufferHeadwayWindows[index, i] = pd.Series(headways).quantile(0.95) - pd.Series(headways).quantile(0.5)
#             weightBufferHeadwayWindows[index, i] = (pd.Series(headways).quantile(0.95) - pd.Series(headways).quantile(0.5)) * len(headways)
#         else:
#             meanHeadwayWindows[index, i] = None
#             weightMeanHeadwayWindows[index, i] = None
            
#             covarianceHeadwayWindows[index, i] = None
#             weightCovarianceHeadwayWindows[index, i] = None
            
#             bufferHeadwayWindows[index, i] = None
#             weightBufferHeadwayWindows[index, i] = None
