In [None]:
import pandas as pd
from datetime import datetime

# Read each static table (CSV files under 'static/') into its own DataFrame.
# The tuple of names on the left lines up one-to-one with the paths on the right.
calendarDates, routes, shapes, stopTimes, stops, trips = (
    pd.read_csv(csv_path)
    for csv_path in (
        'static/calendar_dates.csv',
        'static/routes.csv',
        'static/shapes.csv',
        'static/stop_times.csv',
        'static/stops.csv',
        'static/trips.csv',
    )
)

In [None]:
# function to convert time to 24 hour format
def convert_to_24_hour_format(time_str):
    """Wrap an ``H:MM:SS`` time string into the 00-23 hour range.

    Hours of 24 or more are reduced modulo 24 (``"25:10:00"`` becomes
    ``"01:10:00"``); minutes and seconds are kept as given. All three
    fields are zero-padded to two digits in the result.

    Note that wrapping discards the day offset: after conversion, a time
    past midnight compares as *earlier* than the times before midnight of
    the same trip.

    Parameters
    ----------
    time_str : str or missing value
        Colon-separated ``hours:minutes:seconds`` string. A missing value
        (``None`` / ``NaN``) is returned unchanged.

    Returns
    -------
    str or missing value
        The time formatted as ``HH:MM:SS``, or the input itself when it is
        missing.

    Raises
    ------
    ValueError
        If the string does not split into exactly three integer fields.
    """
    # Blank CSV cells are read by pandas as NaN (a float); calling .split on
    # one would raise AttributeError, so pass missing values straight through.
    if pd.isna(time_str):
        return time_str

    hours, minutes, seconds = map(int, time_str.split(':'))
    if hours >= 24:
        hours %= 24
    return f"{hours:02}:{minutes:02}:{seconds:02}"

# Rewrite both schedule time columns in place so every hour lies in 00-23.
for time_col in ('arrival_time', 'departure_time'):
    stopTimes[time_col] = stopTimes[time_col].apply(convert_to_24_hour_format)

stopTimes.head()

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,pickup_type,drop_off_type,shape_dist_traveled
0,6801326_20121,07:00:00,07:00:00,PF_A15_C,1,0,0,0.0
1,6801326_20121,07:04:00,07:04:00,PF_A14_C,2,0,0,2.671
2,6801326_20121,07:07:00,07:07:00,PF_A13_C,3,0,0,4.7758
3,6801326_20121,07:09:00,07:09:00,PF_A12_C,4,0,0,5.8746
4,6801326_20121,07:12:00,07:12:00,PF_A11_C,5,0,0,7.3282


In [None]:
# generating the link information and saving it to a csv file
MINUTES_PER_DAY = 24 * 60


def build_links(stop_times):
    """Build one record per pair of adjacent rows that share a trip_id.

    Each row of ``stop_times`` is paired with the row directly below it;
    pairs whose ``trip_id`` differs are dropped. Pairing is positional, so
    the frame's index labels do not matter.

    NOTE(review): this assumes the rows of each trip are contiguous and
    already ordered by ``stop_sequence`` -- confirm for the input file.

    Parameters
    ----------
    stop_times : pandas.DataFrame
        Must contain ``trip_id``, ``stop_id``, ``stop_sequence``,
        ``arrival_time``, ``departure_time`` and ``shape_dist_traveled``.
        The time columns may hold ``HH:MM:SS`` strings or timedeltas.

    Returns
    -------
    pandas.DataFrame
        Columns ``trip_id``, ``start_stop``, ``end_stop``,
        ``start_sequence``, ``end_sequence``, ``length`` (difference in
        ``shape_dist_traveled``) and ``time_taken [mins]``.
    """
    current = stop_times.reset_index(drop=True)
    # Row i of `following` holds the values of row i + 1 of `current`.
    following = current[
        ['trip_id', 'stop_id', 'stop_sequence', 'arrival_time', 'shape_dist_traveled']
    ].shift(-1)

    # The last row has no successor: its shifted trip_id is missing, so the
    # comparison is False and the row is dropped along with trip boundaries.
    same_trip = current['trip_id'] == following['trip_id']
    current, following = current[same_trip], following[same_trip]

    # pd.to_timedelta accepts both "HH:MM:SS" strings and timedelta values,
    # so this works whether or not the columns were already converted.
    departure = pd.to_timedelta(current['departure_time'])
    arrival = pd.to_timedelta(following['arrival_time'])
    minutes = (arrival - departure).dt.total_seconds() / 60
    # Hours were wrapped into 00-23 earlier, so a link that crosses midnight
    # (e.g. 23:58 -> 00:02) would come out as a large negative duration.
    # Taking the result modulo one day restores the real travel time.
    minutes = minutes % MINUTES_PER_DAY

    return pd.DataFrame({
        'trip_id': current['trip_id'],
        'start_stop': current['stop_id'],
        'end_stop': following['stop_id'],
        'start_sequence': current['stop_sequence'],
        # shift() upcasts integer columns to float; restore the original dtype.
        'end_sequence': following['stop_sequence'].astype(current['stop_sequence'].dtype),
        'length': following['shape_dist_traveled'] - current['shape_dist_traveled'],
        'time_taken [mins]': minutes,
    }).reset_index(drop=True)


linksDf = build_links(stopTimes)
# Keep the list-of-dicts name available in case another cell still reads it.
links = linksDf.to_dict('records')
linksDf.to_csv('static/links.csv', index=False)
linksDf.head()

Unnamed: 0,trip_id,start_stop,end_stop,start_sequence,end_sequence,length,time_taken
0,6801326_20121,PF_A15_C,PF_A14_C,1,2,2.6710,4.0
1,6801326_20121,PF_A14_C,PF_A13_C,2,3,2.1048,3.0
2,6801326_20121,PF_A13_C,PF_A12_C,3,4,1.0988,2.0
3,6801326_20121,PF_A12_C,PF_A11_C,4,5,1.4536,3.0
4,6801326_20121,PF_A11_C,PF_A10_C,5,6,2.1612,4.0
...,...,...,...,...,...,...,...
184021,6556673_20121,PF_D08_C,PF_G01_C,29,30,2.5811,5.0
184022,6556673_20121,PF_G01_C,PF_G02_C,30,31,1.3466,3.0
184023,6556673_20121,PF_G02_C,PF_G03_C,31,32,0.9846,2.0
184024,6556673_20121,PF_G03_C,PF_G04_C,32,33,1.7288,3.0


In [45]:
# Replace both time columns with timedelta values (offset from 00:00:00),
# one column at a time, then display the resulting frame.
for time_col in ('arrival_time', 'departure_time'):
    stopTimes[time_col] = pd.to_timedelta(stopTimes[time_col])
stopTimes

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,pickup_type,drop_off_type,shape_dist_traveled
0,6801326_20121,0 days 07:00:00,0 days 07:00:00,PF_A15_C,1,0,0,0.0000
1,6801326_20121,0 days 07:04:00,0 days 07:04:00,PF_A14_C,2,0,0,2.6710
2,6801326_20121,0 days 07:07:00,0 days 07:07:00,PF_A13_C,3,0,0,4.7758
3,6801326_20121,0 days 07:09:00,0 days 07:09:00,PF_A12_C,4,0,0,5.8746
4,6801326_20121,0 days 07:12:00,0 days 07:12:00,PF_A11_C,5,0,0,7.3282
...,...,...,...,...,...,...,...,...
192467,6556673_20121,0 days 00:24:00,0 days 00:24:00,PF_G01_C,30,0,0,38.3862
192468,6556673_20121,0 days 00:27:00,0 days 00:27:00,PF_G02_C,31,0,0,39.7328
192469,6556673_20121,0 days 00:29:00,0 days 00:29:00,PF_G03_C,32,0,0,40.7174
192470,6556673_20121,0 days 00:32:00,0 days 00:32:00,PF_G04_C,33,0,0,42.4462
