In [65]:
import pandas as pd
import numpy as np
import datetime as dt

In [66]:
# Load the three GTFS tables used below: stops, stop times and routes.
stops_data, stop_times_data, routes_data = map(
    pd.read_csv,
    ('gtfs/stops.txt', 'gtfs/stop_times.txt', 'gtfs/routes.txt'),
)

#### merging stop_times data with stops

In [67]:
# Attach the stop attributes to every stop_times row. This is a left join on
# stop_id, so stop_times rows without a matching stop are kept.
trip_stations = stop_times_data.merge(
    right=stops_data,
    on='stop_id',
    how='left',
)

In [68]:
# Order the rows trip by trip and, inside each trip, by stop_sequence.
# reset_index(drop=False) renumbers the rows and keeps the previous row
# labels as a regular column.
trip_stations = (
    trip_stations
    .sort_values(by=['trip_id', 'stop_sequence'])
    .reset_index(drop=False)
)

In [69]:
# Add the id of the following stop so that each row describes an edge
# stop_id -> target_stop_id.
# The shift is done per trip_id: the last stop of a trip gets NaN instead of
# being paired with the first stop of whichever trip happens to come next, and
# the result no longer depends on the frame having a clean 0..n-1 index.
trip_stations['target_stop_id'] = (
    trip_stations.groupby('trip_id')['stop_id'].shift(-1)
)

#### adding short route names (e.g. U1)

In [70]:
# Add route names, step 1: derive route_id from trip_id.
# The route id is taken as the third '.'-separated field of the trip id.
trip_stations['route_id'] = trip_stations['trip_id'].apply(
    lambda trip_id: trip_id.split('.')[2]
)

In [71]:
# Look up route_short_name for every row with a left join on route_id;
# only the two needed columns of routes_data take part in the join.
trip_stations = trip_stations.merge(
    right=routes_data.loc[:, ['route_id', 'route_short_name']],
    on='route_id',
    how='left',
)

#### converting times to seconds

In [72]:
# Convert arrival_time from ':'-separated text (hours:minutes:seconds) into an
# integer number of seconds: hours * 3600 + minutes * 60 + seconds.
trip_stations['arrival_time'] = (
    trip_stations['arrival_time']
    .apply(lambda stamp: stamp.split(':'))
    .apply(lambda hms: 3600 * int(hms[0]) + 60 * int(hms[1]) + int(hms[2]))
)


In [73]:
# Same conversion for departure_time: ':'-separated text
# (hours:minutes:seconds) becomes an integer number of seconds.
trip_stations['departure_time'] = (
    trip_stations['departure_time']
    .apply(lambda stamp: stamp.split(':'))
    .apply(lambda hms: 3600 * int(hms[0]) + 60 * int(hms[1]) + int(hms[2]))
)


In [74]:
# Duration to the next stop, first half: store the arrival time at the
# following stop. At this point the column holds a time, not yet a duration.
# Shifting within each trip_id leaves NaN on a trip's final stop rather than
# borrowing the first arrival of whichever trip comes next in the frame.
trip_stations['duration_to_next'] = (
    trip_stations.groupby('trip_id')['arrival_time'].shift(-1)
)

In [75]:
# Turn the stored next-stop arrival time into a duration by subtracting this
# row's own arrival time. Running this cell a second time subtracts again.
trip_stations['duration_to_next'] = (
    trip_stations['duration_to_next'] - trip_stations['arrival_time']
)

In [76]:
# The very last row has no following row, so its duration is NaN after the
# shift; set it to 0.
# The row is addressed through the frame's last index label rather than a
# hard-coded label: a fixed number only fits one particular feed size, and
# .loc would append a brand-new row if that label did not exist.
# NOTE(review): assumes the former literal 2238195 was the last row of the
# data set this notebook was written against - confirm.
if len(trip_stations) > 0:
    trip_stations.loc[trip_stations.index[-1], 'duration_to_next'] = 0

#### special casing for final stops with nan

In [77]:
# Replace the stale 'index' column with a fresh one: drop it, then
# reset_index(drop=False) stores the current row labels in a new 'index' column.
trip_stations = trip_stations.drop(columns='index').reset_index(drop=False)
# For every trip_id keep the 'index' value of its last row, sorted ascending.
out = (
    trip_stations
    .groupby('trip_id')
    .agg({"index": "last"})
    .sort_values(["index"])
)

In [83]:
# Mark the outgoing edge of each trip's final stop as missing: there is no
# next stop, so both the duration and the target stop id become NaN.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
last_stop_rows = out["index"].values
trip_stations.loc[last_stop_rows, 'duration_to_next'] = np.nan
trip_stations.loc[last_stop_rows, 'target_stop_id'] = np.nan

#### export edges

In [85]:
# The edge list is every row except the last stop of each trip
# (the row labels collected in out['index']).
edges = trip_stations.drop(index=out["index"].values)

In [86]:
# Keep only the columns that describe an edge, in export order.
edges = edges.loc[
    :,
    [
        'stop_id',
        'target_stop_id',
        'trip_id',
        'departure_time',
        'route_short_name',
        'duration_to_next',
    ],
]

In [87]:
# Save the edge list as CSV, leaving the row index out of the file.
edges.to_csv(path_or_buf="edges-new.csv", index=False)