In [1]:
import pandas as pd
import numpy as np
import datetime as dt

In [2]:
# import data
stops_data = pd.read_csv('gtfs/stops.txt')
stop_times_data = pd.read_csv('gtfs/stop_times.txt')
routes_data = pd.read_csv('gtfs/routes.txt')

In [3]:
# get station sequences for each line
trip_stations = stop_times_data.merge(stops_data, how='left', on='stop_id')

In [4]:
# sort by lines
trip_stations = trip_stations.sort_values(by=['trip_id', 'stop_sequence']).reset_index()

In [5]:
# add next stop id -- make edges
trip_stations['target_stop_id'] = trip_stations['stop_id'].iloc[1:].reset_index(drop = True)

In [6]:
# now get the indices for last row of every group
out = trip_stations.groupby('trip_id').aggregate({"index": "last"})
out.sort_values(["index"])

Unnamed: 0_level_0,index
trip_id,Unnamed: 1_level_1
518.T0.11-WLB-j20-1.1.H,36
521.T0.11-WLB-j20-1.1.H,73
524.T0.11-WLB-j20-1.1.H,110
525.T0.11-WLB-j20-1.1.H,147
526.T0.11-WLB-j20-1.1.H,184
...,...
6419.T4.25-N91-j20-1.2.R,2238119
6420.T4.25-N91-j20-1.2.R,2238138
6421.T4.25-N91-j20-1.2.R,2238157
6422.T4.25-N91-j20-1.2.R,2238176


In [7]:
# make end stops not have continuation - special casing with NaN
trip_stations.loc[out["index"].values,'target_stop_id'] = np.NaN

In [8]:
#add route names

#add route ids to trip_stations
trip_stations['route_id'] = trip_stations['trip_id'].apply(lambda x: x.split('.'))
trip_stations['route_id'] = trip_stations['route_id'].apply(lambda x: x[2])

In [9]:
#merge on route_ids
trip_stations = trip_stations.merge(routes_data[['route_id', 'route_short_name']], how='left', on = 'route_id')

In [10]:
# convert times to floats seconds type for arrival times

trip_stations['arrival_time'] = trip_stations['arrival_time'].apply(lambda x: x.split(':'))
trip_stations['arrival_time'] = trip_stations['arrival_time'].apply(lambda x: int(x[0])*60**2 + int(x[1])*60 + int(x[2]))


In [11]:
# for departure times

trip_stations['departure_time'] = trip_stations['departure_time'].apply(lambda x: x.split(':'))
trip_stations['departure_time'] = trip_stations['departure_time'].apply(lambda x: int(x[0])*60**2 + int(x[1])*60 + int(x[2]))


In [28]:
#duration to the next stop
trip_stations.loc[:,'duration_to_next'] = trip_stations['arrival_time'].iloc[1:].reset_index(drop = True)

In [29]:
trip_stations.loc[:,'duration_to_next'] -= trip_stations['arrival_time']

In [30]:
trip_stations.loc[2238195, 'duration_to_next'] = 0

In [31]:
# trip_stations['duration_to_next'] = trip_stations['duration_to_next'].apply(dt.timedelta)

In [32]:
trip_stations.loc[out["index"].values,'duration_to_next']= np.NaN

In [33]:
trip_stations['duration_to_next']

0           60.0
1          180.0
2           60.0
3           60.0
4           60.0
           ...  
2238191     60.0
2238192    120.0
2238193     60.0
2238194    120.0
2238195      NaN
Name: duration_to_next, Length: 2238196, dtype: float64

In [34]:
trip_stations.columns

Index(['index', 'trip_id', 'arrival_time', 'departure_time', 'stop_id',
       'stop_sequence', 'pickup_type', 'drop_off_type', 'shape_dist_traveled',
       'stop_name', 'stop_lat', 'stop_lon', 'target_stop_id', 'route_id',
       'route_short_name', 'duration_to_next'],
      dtype='object')

In [35]:
edges = trip_stations.drop(out["index"].values)

In [36]:
edges = edges[[ 'stop_id', 'target_stop_id', 'trip_id', 'departure_time', 'route_short_name', 'duration_to_next']]
edges

Unnamed: 0,stop_id,target_stop_id,trip_id,departure_time,route_short_name,duration_to_next
0,at:43:3121:0:1,at:43:3134:0:2,1.T0.11-WLB-j20-1.14.R,86400,WLB,60.0
1,at:43:3134:0:2,at:43:3142:0:3,1.T0.11-WLB-j20-1.14.R,86460,WLB,180.0
2,at:43:3142:0:3,at:43:6055:0:1,1.T0.11-WLB-j20-1.14.R,86640,WLB,60.0
3,at:43:6055:0:1,at:43:5809:0:2,1.T0.11-WLB-j20-1.14.R,86700,WLB,60.0
4,at:43:5809:0:2,at:43:4536:0:2,1.T0.11-WLB-j20-1.14.R,86760,WLB,60.0
...,...,...,...,...,...,...
2238190,at:49:571:0:2,at:49:40:0:2,9999.TC.23-73A-j20-1.3.R,51060,73A,60.0
2238191,at:49:40:0:2,at:49:332:0:1,9999.TC.23-73A-j20-1.3.R,51120,73A,60.0
2238192,at:49:332:0:1,at:49:390:0:1,9999.TC.23-73A-j20-1.3.R,51180,73A,120.0
2238193,at:49:390:0:1,at:49:1756:0:1,9999.TC.23-73A-j20-1.3.R,51300,73A,60.0


In [37]:
# Write the edges to a CSV file
edges.to_csv("edges-new.csv", index=False)