In [25]:
import time
import datetime

import matplotlib
matplotlib.use('agg')  # allows notebook to be tested in Travis

import numpy as np
import pandas as pd
import cartopy.crs as ccrs
import cartopy
import matplotlib.pyplot as plt
import pandana as pdna
import time

import urbanaccess as ua
from urbanaccess.config import settings
from urbanaccess.gtfsfeeds import feeds
from urbanaccess import gtfsfeeds
from urbanaccess.gtfs.gtfsfeeds_dataframe import gtfsfeeds_dfs
from urbanaccess.network import ua_network, load_network

from tqdm import tqdm

%matplotlib inline

In [26]:
# required bbox including all of Massachusetts and RI as well as parts of NH, CT, NY
bbox = (-73.7207, 41.1198, -69.7876, 43.1161)
# path to the downloaded and cleaned gtfs - mbta recap file for fall 2018
#   this could also be a folder of gtfs folders (pre merge of multiple gtfs)

path_to_gtfs = r"J:\Shared drives\TMD_TSA\Model\networks\Transit\gtfs\bnrd\1_gtfs_r"


In [27]:
loaded_feeds = ua.gtfs.load.gtfsfeed_to_df(gtfsfeed_path= path_to_gtfs,
                                           validation=True,
                                           verbose=True,
                                           bbox=bbox,
                                           remove_stops_outsidebbox=False,
                                           append_definitions=True)

Checking GTFS text file header whitespace... Reading files using encoding: utf-8 set in configuration.
GTFS text file header whitespace check completed. Took 0.46 seconds
--------------------------------
Processing GTFS feed: 1_gtfs_r
The unique agency id: mbta was generated using the name of the agency in the agency.txt file.
Unique agency id operation complete. Took 0.02 seconds
Unique GTFS feed id operation complete. Took 0.00 seconds
No GTFS feed stops were found to be outside the bounding box coordinates
1_gtfs_r GTFS feed stops: coordinates are in northwest hemisphere. Latitude = North (90); Longitude = West (-90).
Appended route type to stops
Appended route type to stop_times
--------------------------------
Added descriptive definitions to stops, routes, stop_times, and trips tables
Successfully converted ['departure_time'] to seconds past midnight and appended new columns to stop_times. Took 1.07 seconds
1 GTFS feed file(s) successfully read as dataframes:
     1_gtfs_r
     T

### Functions

In [28]:
def get_start_stop_times(stop_times):    
    '''for every trip, grab the start time and stop time of the trip
    
    Parameters
    -----------
    stop_times : df
        gtfs stop_times.txt in df format

    Returns
    --------
        flintstone : df
            df with start and stop times per trip

    '''
    chocula =0 
    for trip_id in stop_times['trip_id'].unique():
        max_row = stop_times.query('trip_id==@trip_id').query('stop_sequence == stop_sequence.max()')[['trip_id','arrival_time']]
        min_row = stop_times.query('trip_id==@trip_id').query('stop_sequence == stop_sequence.min()')[['trip_id','arrival_time']]
        r2 = min_row.merge(max_row, how='left', on='trip_id', suffixes = ('_start','_end'))
        if chocula == 0:
            flintstone = pd.DataFrame(r2)
        else:
            flintstone=pd.concat([flintstone,r2])
        chocula +=1
    return(flintstone)


In [29]:
def assign_tod(start_stop):
    '''calculate midpoint of trip, use midpoint to assign TOD
    
    Parameters
    -----------
    start_stop : df
        df with start and stop times per trip

    Returns
    --------
    start_stop :
        df with start time, stop time, midpoint time, and TOD per trip

    '''
    
    start_stop['at_end_dec'] = (
        (
            (start_stop['arrival_time_end'].str.split(":").str[0]).astype('int32')
            +
            ((start_stop['arrival_time_end'].str.split(":").str[1]
            ).astype('int32')/60)))
    start_stop['at_start_dec'] = (
        (
            (start_stop['arrival_time_start'].str.split(":").str[0]).astype('int32')
            +
            ((start_stop['arrival_time_start'].str.split(":").str[1]
            ).astype('int32')/60)))
    
    start_stop['midpoint'] = start_stop['at_start_dec'] + ((start_stop['at_end_dec']-start_stop['at_start_dec'])/2)
    start_stop['tod'] = np.where(start_stop['midpoint'].between(6.50,9.50),'AM', np.where(
        start_stop['midpoint'].between(9.50,15.00), 'MD', np.where(
            start_stop['midpoint'].between(15.00,19.00),'PM', 'NT' 
        )
            ) 
        )
    
    return start_stop


In [30]:
start_stop = get_start_stop_times(gtfsfeeds_dfs.stop_times)

In [31]:
start_stop_tod = assign_tod(start_stop) # smurf

### Create Summaries Step 1

In [32]:
gtfsfeeds_dfs.trips['route_pattern_id'] = np.where(gtfsfeeds_dfs.trips['route_pattern_id'].isna(),gtfsfeeds_dfs.trips['shape_id'],gtfsfeeds_dfs.trips['route_pattern_id'])
gtfsfeeds_dfs.trips['trip_headsign'] = np.where(gtfsfeeds_dfs.trips['trip_headsign'].isna(),gtfsfeeds_dfs.trips['direction_id'],gtfsfeeds_dfs.trips['trip_headsign'])

In [33]:
nobuscr_routes = gtfsfeeds_dfs.routes.query('(route_type not in [2,3])')['route_id']
sls = gtfsfeeds_dfs.routes.query('route_short_name in ["SL1","SL2","SL3","SL5"]')['route_id']

In [24]:
gtfsfeeds_dfs.trips.query('route_id in @selected_routes')['route_pattern_id'].unique()

array(['Blue-6-0', 'Blue-8-0', 'Blue-6-1', 'Mattapan-_-0', 'Mattapan-_-1',
       'Orange-3-0', 'Orange-5-0', 'Orange-3-1', 'Red-1-0', 'Red-3-0',
       'Red-1-1', 'Red-3-1', 'Green-B-812-0', 'Green-B-812-1',
       'Green-C-832-0', 'Green-C-832-1', 'Green-D-851-0', 'Green-D-855-0',
       'Green-D-851-1', 'Green-D-855-1', 'Green-E-885-0', 'Green-E-885-1',
       'Boat-F1-1-0', 'Boat-F1-2-0', 'Boat-F1-3-0', 'Boat-F1-6-0',
       'Boat-F1-9-0', 'Boat-F1-1-1', 'Boat-F1-2-1', 'Boat-F1-3-1',
       'Boat-F1-4-1', 'Boat-F1-6-1', 'Boat-F4-0-0', 'Boat-F4-0-1'],
      dtype=object)

In [10]:
selected_routes = sls

In [11]:
# REDO
patterns = []
# flag the first stop in every trip
gtfsfeeds_dfs.stop_times['first_stop'] =  0
gtfsfeeds_dfs.stop_times.loc[gtfsfeeds_dfs.stop_times.groupby('trip_id').stop_sequence.idxmin(),'first_stop'] = 1

#chocula = 0
rpid_stop_dict = {}
# for each route_pattern_id calculate the most common stop pattern
for rpid in tqdm(gtfsfeeds_dfs.trips.query('route_id in @selected_routes')['route_pattern_id'].unique()):
    #patterns = []
    rpid_trips = gtfsfeeds_dfs.trips.query('route_pattern_id == @rpid')['trip_id'].to_list()
    rpid_stops = gtfsfeeds_dfs.stop_times.query('@rpid_trips in trip_id').sort_values('stop_sequence')
    
    # for every trip, get stop pattern and save into a list of lists
    for tid in rpid_stops['trip_id']:
        patterns.append([tid,', '.join(rpid_stops.query('trip_id == @tid')['stop_id'].to_list())])
    
# make data frame of every trip and its pattern of stops
df = pd.DataFrame(patterns, columns = ['trip_id','stop_pattern'])

100%|██████████| 34/34 [01:01<00:00,  1.80s/it]


In [12]:
df2 = df.merge(
    gtfsfeeds_dfs.trips, 
    on='trip_id')[['trip_id','route_id', 'route_pattern_id', 'trip_short_name','trip_headsign','service_id', 'stop_pattern']]
df3 = df2.merge(start_stop_tod, on='trip_id')
df4 = df3.groupby(by=['route_pattern_id','stop_pattern', 'trip_id','tod']).first()

#df4.to_csv(r"C:\Users\matkinson.AD\Downloads\trip_id_to_stop_pattern_filtered_w_tod.csv")

### Create Summaries Step 2

In [13]:
#gtfsfeeds_dfs.stops.query('stop_id in ')
df4['stop_names_pattern'] = pd.NA
df4 = df4.reset_index()

stops = {}
for a in df4.stop_pattern:
    x = a.strip('').split(', ')
    q = []
    for y in x:
        z = gtfsfeeds_dfs.stops.query('stop_id == @y')['stop_name'].to_list()[0]
        q.append(z)
        stops[y] = z
    filt = (df4['stop_pattern'] == a)
    df4.loc[filt,'stop_names_pattern'] = str(q)
    

In [14]:
df4['stop_names_pattern'] = df4['stop_names_pattern'].str.strip("[']").str.split("', '").apply(set)
df4[['route_pattern_id','stop_pattern','stop_names_pattern']]

Unnamed: 0,route_pattern_id,stop_pattern,stop_names_pattern
0,Blue-6-0,"70059, 70057, 70055, 70053, 70051, 70049, 7004...","{Orient Heights, Bowdoin, Wood Island, Wonderl..."
1,Blue-6-0,"70059, 70057, 70055, 70053, 70051, 70049, 7004...","{Orient Heights, Bowdoin, Wood Island, Wonderl..."
2,Blue-6-0,"70059, 70057, 70055, 70053, 70051, 70049, 7004...","{Orient Heights, Bowdoin, Wood Island, Wonderl..."
3,Blue-6-0,"70059, 70057, 70055, 70053, 70051, 70049, 7004...","{Orient Heights, Bowdoin, Wood Island, Wonderl..."
4,Blue-6-0,"70059, 70057, 70055, 70053, 70051, 70049, 7004...","{Orient Heights, Bowdoin, Wood Island, Wonderl..."
...,...,...,...
2713,Red-3-1,"70105, 70104, 70102, 70100, 70098, 70096, 7008...","{Park Street, JFK/UMass, Andrew, Harvard, Davi..."
2714,Red-3-1,"70105, 70104, 70102, 70100, 70098, 70096, 7008...","{Park Street, JFK/UMass, Andrew, Harvard, Davi..."
2715,Red-3-1,"70105, 70104, 70102, 70100, 70098, 70096, 7008...","{Park Street, JFK/UMass, Andrew, Harvard, Davi..."
2716,Red-3-1,"70105, 70104, 70102, 70100, 70098, 70096, 7008...","{Park Street, JFK/UMass, Andrew, Harvard, Davi..."


In [15]:
fred = df4 #pd.read_csv(r"C:\Users\matkinson.AD\Downloads\trip_id_to_stop_pattern_filtered_w_tod.csv")

In [16]:
george = fred.set_index(['trip_id','route_pattern_id'])['stop_pattern'].str.split(',').apply(set)
harry = george.reset_index().groupby('route_pattern_id').apply(lambda x: set.union(*x.stop_pattern)).reset_index()

gg = fred.set_index(['trip_id','route_pattern_id'])['stop_names_pattern']
hh = gg.reset_index().groupby('route_pattern_id').apply(lambda x: set.union(*x.stop_names_pattern)).reset_index()
hh= hh.merge(harry, on='route_pattern_id', suffixes=['_names','_ids'])
harry

Unnamed: 0,route_pattern_id,0
0,Blue-6-0,"{ 70043, 70039, 70838, 70051, 70053, 7004..."
1,Blue-6-1,"{ 70044, 70056, 70040, 70046, 70050, 7005..."
2,Blue-8-0,"{ 70049, 70051, 70043, 70047, 70039, 70838..."
3,Boat-F1-1-0,"{ Boat-Hingham, Boat-Long, Boat-Hull}"
4,Boat-F1-1-1,"{Boat-Hingham, Boat-Long, Boat-Hull}"
5,Boat-F1-2-0,"{ Boat-Hingham, Boat-Rowes}"
6,Boat-F1-2-1,"{ Boat-Rowes, Boat-Hingham}"
7,Boat-F1-3-0,"{Boat-Long, Boat-Hull}"
8,Boat-F1-3-1,"{Boat-Hull, Boat-Long}"
9,Boat-F1-4-1,"{Boat-Hingham, Boat-Long, Boat-Logan}"


In [17]:
stops_dict = {}
for rpid in harry['route_pattern_id']:
    ron = list(harry.query('route_pattern_id == @rpid')[0].to_list()[0])
    ron = [item.strip() for item in ron]

    trips_tod = fred.query('route_pattern_id == @rpid')[['trip_id','tod']]

    hermy = pd.DataFrame(data = trips_tod, columns = ['trip_id','tod']+ron)
    hermy = hermy.set_index('trip_id')

    for trip in hermy.index:
        for sp in fred.query('trip_id == @trip').set_index('trip_id')['stop_pattern'].str.split(',').reset_index()['stop_pattern']:
            #print(sp)
            for stop in sp:
                stop = stop.strip()
                #print(stop)
                if stop in hermy.columns:
                    hermy.at[trip,stop] = 1

    hermy = hermy.reset_index().merge(fred[['trip_id','route_pattern_id','stop_pattern']].query('route_pattern_id == @rpid'), on='trip_id')
    stops_dict[rpid] = hermy


In [18]:
for key in stops_dict:
    a =stops_dict[key].groupby(by=['route_pattern_id','tod','stop_pattern']).count().reset_index()
    
    b = a.groupby(['stop_pattern','route_pattern_id']).first().drop(['tod','trip_id'], axis=1)
    c = b/b
    a = a.pivot(index=['route_pattern_id','stop_pattern'], columns=['tod'], values = 'trip_id').reset_index()

    tod_list = [x for x in ["AM","MD","PM","NT"] if x not in a.columns.to_list()]
    if len(tod_list) > 0:
        a[tod_list] = 0

    subaru = gtfsfeeds_dfs.stop_times.merge(stops_dict[key][['route_pattern_id','trip_id',]], on='trip_id')[['route_pattern_id','stop_id','stop_sequence']]
    sub_stops = subaru.groupby(by=['stop_id','stop_sequence']).count().reset_index().groupby(by=['stop_id']).agg({'stop_sequence':'max'}).sort_values(by = 'stop_sequence')

    stops_dict[key] = a.merge(c[sub_stops.index.to_list()].reset_index(), on=['route_pattern_id','stop_pattern'])

key_dict = {}
for key in stops_dict:
    key_parts = key.split('-')
    if len(key_parts) > 2:
        key_parts.pop(2)
        partial = '-'.join(key_parts)
    else:
        partial=key_parts[0]

    if partial in key_dict.keys():
        key_dict[partial].append(key)
    else:
        key_dict[partial] = [key]

In [19]:
final_tables_dict = {}
#summarized key in dictionary of keys
for key in key_dict:
    # if more than one real key associated with the summarized key
    if len(key_dict[key]) > 1:
        fred1 = []
        # for each real key, grab table, and append to list
        for x in range(len(key_dict[key])):
            heebee = stops_dict[key_dict[key][x]].loc[:,~stops_dict[key_dict[key][x]].columns.duplicated()]
            fred1.append(heebee)
        
        # sort tables by number of columns
        
        # concat all real key tables per summarized key
        gee = pd.concat(fred1, axis = 0)
        print(gee)
    else:
        gee = stops_dict[key_dict[key][0]]
    
    final_tables_dict[key] = gee

  route_pattern_id                                       stop_pattern  AM  MD  \
0         Blue-6-0  70059, 70057, 70055, 70053, 70051, 70049, 7004...  36  44   
0         Blue-6-1  70038, 70040, 70042, 70044, 70046, 70048, 7005...  36  44   

   NT  PM  70059  70057  70055  70053  ...  70042  70044  70046  70048  70050  \
0  53  47    1.0    1.0    1.0    1.0  ...    NaN    NaN    NaN    NaN    NaN   
0  52  48    NaN    NaN    NaN    NaN  ...    1.0    1.0    1.0    1.0    1.0   

   70052  70054  70056  70058  70060  
0    NaN    NaN    NaN    NaN    NaN  
0    1.0    1.0    1.0    1.0    1.0  

[2 rows x 30 columns]
  route_pattern_id                        stop_pattern  MD  NT  PM  AM  \
0      Boat-F1-1-0  Boat-Long, Boat-Hull, Boat-Hingham   2   1   4   0   
0      Boat-F1-2-0            Boat-Rowes, Boat-Hingham   1   2   6   4   
0      Boat-F1-3-0                Boat-Long, Boat-Hull   0   0   1   2   
0      Boat-F1-6-0             Boat-Long, Boat-Hingham   3   1   0   0   
0 

In [20]:
stops

{'70059': 'Wonderland',
 '70057': 'Revere Beach',
 '70055': 'Beachmont',
 '70053': 'Suffolk Downs',
 '70051': 'Orient Heights',
 '70049': 'Wood Island',
 '70047': 'Airport',
 '70045': 'Maverick',
 '70043': 'Aquarium',
 '70041': 'State',
 '70039': 'Government Center',
 '70838': 'Bowdoin',
 '70038': 'Bowdoin',
 '70040': 'Government Center',
 '70042': 'State',
 '70044': 'Aquarium',
 '70046': 'Maverick',
 '70048': 'Airport',
 '70050': 'Wood Island',
 '70052': 'Orient Heights',
 '70054': 'Suffolk Downs',
 '70056': 'Beachmont',
 '70058': 'Revere Beach',
 '70060': 'Wonderland',
 'Boat-Long': 'Long Wharf (North)',
 'Boat-Hull': 'Hull',
 'Boat-Hingham': 'Hingham',
 'Boat-Rowes': 'Rowes Wharf',
 'Boat-Logan': 'Logan Airport Ferry Terminal',
 'Boat-Long-South': 'Long Wharf (South)',
 'Boat-Charlestown': 'Charlestown',
 '70202': 'Government Center',
 '70196': 'Park Street',
 '70159': 'Boylston',
 '70157': 'Arlington',
 '70155': 'Copley',
 '70153': 'Hynes Convention Center',
 '71151': 'Kenmore',
 '

In [21]:
for key in final_tables_dict:
    for col in final_tables_dict[key]:
        if col in stops:
            final_tables_dict[key] = final_tables_dict[key].rename(columns = {col:stops[col]})

In [22]:
for key in final_tables_dict:
    final_tables_dict[key].to_csv(r"C:\Users\matkinson.AD\Downloads\\"+key+".csv")