In [1]:
import matplotlib

import numpy as np
import pandas as pd
import geopandas as gpd

from zipfile import ZipFile

In [2]:
# required bbox including all of Massachusetts and RI as well as parts of NH, CT, NY
bbox = (-73.7207, 41.1198, -69.7876, 43.1161)
# path to the downloaded and cleaned gtfs - mbta recap file for fall 2018
#   this could also be a folder of gtfs folders (pre merge of multiple gtfs)

path_to_gtfs = r"J:\Shared drives\TMD_TSA\Model\networks\Transit\gtfs\bat_2019\gtfs_zip"
out_path = r"J:\Shared drives\TMD_TSA\Model\networks\Transit\gtfs\bat_2019\1_gtfs_r"

In [8]:
z = ZipFile(path_to_gtfs+r"\gtfs.zip", 'r')

timetables = pd.read_csv(z.open("timetables.txt"))
timetable_stop_order = pd.read_csv(z.open("timetable_stop_order.txt"))
routes = pd.read_csv(z.open("routes.txt"))
trips = pd.read_csv(z.open("trips.txt"))
shapes = pd.read_csv(z.open("shapes.txt"))
stops = pd.read_csv(z.open("stops.txt"))
stop_times = pd.read_csv(z.open("stop_times.txt"), low_memory=False)
calendar = pd.read_csv(z.open("calendar.txt"))
calendar_dates = pd.read_csv(z.open("calendar_dates.txt"))
calendar_attributes = pd.read_csv(z.open("calendar_attributes.txt"))
agency = pd.read_csv(z.open("agency.txt"))
directions = pd.read_csv(z.open("directions.txt"))
transfers = pd.read_csv(z.open("transfers.txt"))
z.close()

In [9]:
cal = calendar.query('(start_date < 20181024) & (end_date > 20181024) & (monday+tuesday+wednesday+thursday+friday > 3)')
cal_da = calendar_dates.query('service_id in @cal.service_id')
cal_att = calendar_attributes.query('service_id in @cal.service_id')

trip_filt = trips.query('service_id in @cal.service_id')

In [10]:
st_filt = stop_times.query('trip_id in @trip_filt.trip_id')
stops_filt = stops.query('stop_id in @st_filt.stop_id')
shapes_filt = shapes.query('shape_id in @trip_filt.shape_id')
routes_filt = routes.query('route_id in @trip_filt.route_id')

In [11]:
if 'route_pattern_id' not in trip_filt.columns:
    trip_filt.loc[:,'route_pattern_id'] = trip_filt['shape_id']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trip_filt.loc[:,'route_pattern_id'] = trip_filt['shape_id']


In [12]:
trip_filt = trip_filt.merge(routes_filt[['route_id','route_long_name']], how='left',on='route_id')
trip_filt['trip_headsign'] = np.where(trip_filt['trip_headsign'].isna(), trip_filt['route_long_name'], trip_filt['trip_headsign'])
trip_filt['direction_id'] = np.where(trip_filt['direction_id'].isna(), 2, trip_filt['direction_id'])
trip_filt = trip_filt[['trip_id','route_id','service_id','trip_short_name','trip_headsign','direction_id','block_id','shape_id','bikes_allowed','wheelchair_accessible','route_pattern_id']]

In [13]:
cal.to_csv(out_path+r"\calendar.txt", index=False)
cal_da.to_csv(out_path+r"\calendar_dates.txt", index=False)
cal_att.to_csv(out_path+r"\calendar_attributes.txt", index=False)
trip_filt.to_csv(out_path+r"\trips.txt", index=False)
st_filt.to_csv(out_path+r"\stop_times.txt", index=False)
stops_filt.to_csv(out_path+r"\stops.txt", index=False)
shapes_filt.to_csv(out_path+r"\shapes.txt", index=False)
routes_filt.to_csv(out_path+r"\routes.txt", index=False)
agency.to_csv(out_path+r"\agency.txt", index=False)
directions.to_csv(out_path+r"\directions.txt", index=False)
transfers.to_csv(out_path+r"\transfers.txt", index=False)

### Experiment

In [None]:
tts = timetables.query('(start_date < 20181024) & (end_date > 20181024) & (service_notes in @cal.service_name)')
ttso = timetable_stop_order.query('timetable_id in @tts.timetable_id')

In [None]:
st = stop_times.query('trip_id in @trip_filt.trip_id')

In [None]:
ttso_tts = ttso.merge(tts[['timetable_id','route_id','direction_id','service_notes','timetable_label']], how='left',on='timetable_id')
ttso_tts = ttso_tts.merge(
    cal[['service_id','service_name']], 
    how='left',
    left_on='service_notes',
    right_on='service_name')[['timetable_id','stop_id','stop_sequence','route_id','direction_id','timetable_label','service_id']]
ttso_tts.groupby('route_id')['timetable_id'].nunique()

In [None]:
stop_filt = stops.query('stop_id in @ttso.stop_id')

In [None]:
sts = stop_times.merge(trip_filt[['route_id','service_id','trip_id','trip_short_name','trip_headsign','direction_id','block_id','shape_id']],how='left', on='trip_id')
sts = sts[['trip_id', 'arrival_time', 'departure_time', 'stop_id', 'stop_sequence',
       'route_id', 'service_id',
       'trip_short_name', 'trip_headsign', 'direction_id', 'block_id',
       'shape_id']]
sts = sts.query('~route_id.isna()')

In [None]:
sts.query('stop_sequence == 1').groupby('route_id')['trip_id'].nunique()

In [None]:
combo = sts.merge(ttso_tts, how='outer', on = ['stop_id','stop_sequence','route_id','service_id','direction_id']).sort_values(by=['trip_id','stop_sequence'])
combo

In [None]:
nondisplay_trips = combo.query('timetable_id.isna()')['trip_id'].unique()
trip_filt.query('trip_id not in @nondisplay_trips')['shape_id'].unique()

In [None]:
trip_filt.query('trip_id not in @nondisplay_trips')['trip_id'].nunique()

In [None]:
combo.query('~timetable_id.isna()')['trip_id'].nunique()

In [None]:
trip_filt['trip_id'].nunique()

In [None]:
trips['trip_id'].nunique()

In [None]:
trip_filt['shape_id'].unique()

In [None]:
trips['shape_id'].nunique()