In [1]:
import pandas as pd
import numpy as np

# format sig figs
pd.set_option('display.float_format', lambda x: '%.5f' % x)

In [80]:
OUTPUT_DIR = r'J:\Projects\FasTrips\obs\output\OBS_fasttrips_demand_v1.1_stochastic_iter2_nocap_30000'

# obs_links_dir = r'R:\FastTrips\FT Repo\All input & output files\OBS_FToutput.csv'
obs_links_dir = r'..\data\obs\obs_links.csv'

# Probability threshold for observed paths (are observed paths assigned above/below this value by fast-trips)
threshold = 0.3

# Compare observed path to pathset based on modes, agency, or route
# comparison_field = 'path_modes'
comparison_field = 'path_agencies'
# comparison_field = 'path_routes'

non_transit_modes = ['transfer','walk_access','walk_egress','bike_access','bike_egress',
                     'PNR_access','PNR_egress','KNR_access','KNR_egress']
transit_mode_list = ['local_bus','commuter_rail','express_bus','ferry','heavy_rail','light_rail','premium_bus']

In [81]:
def load_df(data, unique_fields, record_type=None):
    '''Load text data as df, create unique trip record ID, and tag as model/observed record'''
    df = pd.read_csv(data)
    if record_type != None:
        df['record_type'] = record_type    # tag as model/observed record

    # Convert all specified unique_fields to string and concatenate as new unique_id field 
#     df[unique_fields] = pd.DataFrame([df[col].astype('int').astype('str') for col in unique_fields]).T
#     df['unique_id'] = df[unique_fields].apply(lambda x: '_'.join(x), axis=1)
    df['unique_id'] = df['person_id']
    
    return df

In [82]:
def append(*args):
    '''Union dataframes with similar structures'''
    df = pd.DataFrame()
    for data in args:
        df = df.append(data)

    return df

In [83]:
def select_common_records(df1,df2,field):
    '''Return dataframe of matching, common records only.
       Example, person 1034 exists in df1, but not in df2, so new copy of df1 without 1034 is created
    '''
    df1 = df1[df1[field].isin(df2[field])]
    df2 = df2[df2[field].isin(df1[field])]

    return df1, df2

In [84]:
def add_transit_agency(df, routes):

    df = pd.merge(left=df,right=routes[['route_id','agency_id']],on='route_id',how='left')

    df['agency'] = df['agency_id']
    df.drop('agency_id',axis=1)
    df.fillna("",inplace=True)
    df.reset_index(inplace=True)

    return df

In [85]:
def produce_path_fields(df, group):
    '''
    Concatenate set of fields for pathset_links, e.g. ('bart caltrain') for 2-leg transit trip
    Produce concatenated fields for routes, modes, agencies, all components (stops, modes, & routes)
    '''
    # create "path_routes"

    for field in ['route_id','mode','agency','A_id','B_id']:
        df[field] = df[field].astype('str')
        df[field] = df[field].fillna("")
        df[field] = df[field].replace('nan',"")

    df['path_routes'] = df['route_id'].apply(lambda x: x.strip())
    path_routes = pd.DataFrame(df.groupby(group)['path_routes'].apply(lambda x: "%s" % ' '.join(x).strip()))
    
    result_df = pd.DataFrame(index=path_routes.index)
    result_df['path_routes'] = path_routes
    
    # create "path_modes"
    df['path_modes'] = df['mode'].apply(lambda x: x.strip())
    result_df['path_modes'] = pd.DataFrame(df.groupby(group)['mode'].apply(lambda x: "%s" % ' '.join(x).strip()))
    
    # create "path_agencies"
    df['path_agencies'] = df['agency'].apply(lambda x: x.strip())
    result_df['path_agencies'] = pd.DataFrame(df.groupby(group)['agency'].apply(lambda x: "%s" % ' '.join(x).strip()))

    # Create "path_components"
    df['path_components'] = df['A_id']+" "+df['mode']+" "+df['route_id'] +"_"+ df['B_id']
    df['path_components'] = df['path_components'].apply(lambda x: x.strip())
    result_df['path_components'] = pd.DataFrame(df.groupby(group)['path_components'].apply(lambda x: "%s" % ' '.join(x).strip()))
    
    # stop components
    df['A_id'] = df['A_id'].astype('str')
    df['B_id'] = df['B_id'].astype('str')
    df['path_stops'] = df['A_id'] +" "+df['B_id']
    df['path_stops'] = df['path_stops'].apply(lambda x: x.strip())
    result_df['path_stops'] = pd.DataFrame(df.groupby(group)['path_stops'].apply(lambda x: "%s" % ' '.join(x).strip()))
    # drop repeated records (A_id and B_id overlap as origin and destination nodes for subsequent trips)
    result_df['path_stops'] = result_df['path_stops'].apply(lambda row: np.unique(row.split(' ')))
    # write out as space separated field
    result_df['path_stops'] = result_df['path_stops'].apply(lambda x: "%s" % ' '.join(x).strip())
    
    # Return ID field from index
    result_df['unique_id'] = result_df.index.get_level_values(0).values

    return result_df

# Load Data

In [98]:
routes = pd.read_csv(r'../data/gtfs/agency_route_1.9.csv')

# Load observed and chosenpath_links; add new field designating 'model' or 'observed'
obs = load_df(data=obs_links_dir, unique_fields=['person_id','trip_list_id_num'], record_type='observed')

In [102]:
# Fill floats with integers where available, specifically for stop IDs
obs['A_id'] = obs['A_id'].fillna(0).astype('int')
obs['B_id'] = obs['B_id'].fillna(0).astype('int')

In [103]:
chosenpath_links = load_df(data=OUTPUT_DIR + r'\chosenpaths_links.csv', 
    unique_fields=['person_id','trip_list_id_num'], record_type='model', )

In [11]:
len(chosenpath_links)

12916658

In [104]:
# For some reason there are a lot of duplicates
# let's drop them for now
chosenpath_links = chosenpath_links.drop_duplicates()

# Get the last iteration only
chosenpath_links = chosenpath_links[chosenpath_links['iteration'] == chosenpath_links['iteration'].max()]

In [105]:
# Add transit agency field to chosenpath_links and pathset_links, based on route_id
chosenpath_links = add_transit_agency(df=chosenpath_links, routes=routes)

In [107]:
chosenpath_links['A_id'] = chosenpath_links['A_id'].fillna(0).astype('int')
chosenpath_links['B_id'] = chosenpath_links['B_id'].fillna(0).astype('int')

In [108]:
observed_path = produce_path_fields(obs, group=['unique_id'])
modeled_path = produce_path_fields(chosenpath_links, group=['unique_id'])

# Make sure we only evaluate the overlapping unique_id records
observed_path = observed_path[observed_path['unique_id'].isin(modeled_path['unique_id'].values)]
modeled_path = modeled_path[modeled_path['unique_id'].isin(observed_path['unique_id'].values)]

# Build comparison fields for pathset_links
- from the description column in pathset_paths

In [111]:
def mode_list(row):

    row_array = row.split(' ')
    
    mode_results = []
    
    for field in row_array:
        if field in transit_mode_list + ['transfer','walk_access','walk_egress']:
            mode_results.append(field)

    # convert from list into space-seperated string
    mode_results = ' '.join(mode_results).strip()

    return mode_results

In [112]:
def route_list(row):
    """
    if route_only=True, return only the route id from the trip id prepended with route info
    otherwise return full route_id_xyz, where xyz is the trip id
    """
    
    row_array = row.split(' ')
    
    route_results = []
    
    for i in xrange(len(row_array)):
        if row_array[i] in transit_mode_list:
            route_results.append("_".join(row_array[i+1].split('_')[:-1]))     # Drop last component of field (trip ID)
#                 route_results.append(row_array[i+1])
            
    route_results =  ' '.join(route_results).strip()
    
    return route_results

In [113]:
def stop_list(row):
    
    row_array = row.split(' ')
    
    # Get the list of transit
    trip_with_route_list = []
    stop_results = []
    
    for i in xrange(len(row_array)):
        if row_array[i] in transit_mode_list:
            # Save this as a field we don't want to include as a stop
            trip_with_route_list.append(row_array[i+1])
        if row_array[i] not in trip_with_route_list+transit_mode_list+['transfer','walk_access','walk_egress']:
            stop_results.append(row_array[i])
    
    stop_results = ' '.join(stop_results).strip()
    
    return stop_results

In [114]:
def agency_list(row):
    
    agency_results = []
    
    # Get list of agencies associated with each transit route
    row_array = row.split(' ')
    
    for route_id in row_array:
        agency_results.append(routes[routes['route_id'] == route_id]['agency_id'].values[0])
    
    agency_results = ' '.join(agency_results).strip()
    
    return agency_results

In [115]:
# Mode paths
pathset_paths['path_modes'] = pathset_paths['description'].apply(lambda row: mode_list(row))

In [116]:
# Route paths
pathset_paths['path_routes'] = pathset_paths['description'].apply(lambda row: route_list(row))

In [117]:
# Stops
pathset_paths['path_stops'] = pathset_paths['description'].apply(lambda row: stop_list(row))

In [118]:
# Agencies
pathset_paths['path_agencies'] = pathset_paths['path_routes'].apply(lambda row: agency_list(row))

In [119]:
# Create a stacked csv of observed trip links & model chosenpath_links; export for Tableau
chosenpath_links, obs = select_common_records(chosenpath_links, obs,'person_id')
chosenpaths_links_with_observed = append(chosenpath_links, obs)

In [120]:
# # add additional info about the transfer from and to route/agency for tableau maps
chosenpaths_links_with_observed['transfer_from_agency'] = chosenpaths_links_with_observed['agency'].shift(1)
chosenpaths_links_with_observed['transfer_to_agency'] = chosenpaths_links_with_observed['agency'].shift(-1)

chosenpaths_links_with_observed['transfer_from_route'] = chosenpaths_links_with_observed['route_id'].shift(1)
chosenpaths_links_with_observed['transfer_to_route'] = chosenpaths_links_with_observed['route_id'].shift(-1)

len(chosenpaths_links_with_observed)

80814

In [121]:
chosenpaths_links_with_observed[['A_id','B_id']] = chosenpaths_links_with_observed[['A_id','B_id']].replace('',0)

In [122]:
# Add geography too
# Get distance from TAZ to stop
walk_access_ft = pd.read_csv(r'R:\FastTrips\network_draft1.9\walk_access_ft.txt')[['taz','stop_id','dist']]

chosenpaths_links_with_observed[['A_id','B_id']] = chosenpaths_links_with_observed[['A_id','B_id']].astype('float')

chosenpaths_links_with_observed = pd.merge(chosenpaths_links_with_observed, walk_access_ft, 
                                           left_on=['A_id','B_id'],right_on=['taz','stop_id'],how='left')


chosenpaths_links_with_observed['walk_access_dist'] = chosenpaths_links_with_observed['dist']

# # Now join for distance from stop to TAZ (egress)
chosenpaths_links_with_observed.drop(['taz','stop_id','dist'],axis=1,inplace=True)
chosenpaths_links_with_observed = pd.merge(chosenpaths_links_with_observed, walk_access_ft, 
                                           left_on=['A_id','B_id'],right_on=['stop_id','taz'],how='left')
chosenpaths_links_with_observed['walk_egress_dist'] = chosenpaths_links_with_observed['dist']
chosenpaths_links_with_observed.drop(['taz','stop_id','dist'],axis=1,inplace=True)

In [123]:
len(chosenpaths_links_with_observed)

80814

In [124]:
# lat-long where it's missing for stops
stops = pd.read_csv(r'R:\FastTrips\network_draft1.9\stops.txt')

In [125]:
chosenpaths_links_with_observed = pd.merge(chosenpaths_links_with_observed,stops,left_on='A_id',right_on='stop_id',how='left')

In [126]:
chosenpaths_links_with_observed['stop_lat']

0         nan
1    37.56974
2    37.44334
3    37.44334
4    37.44442
5         nan
6    37.75156
7    37.61499
8         nan
9    37.77638
10   37.58103
11        nan
12   37.86529
13   37.78973
14        nan
...
80799        nan
80800   37.69265
80801   37.70127
80802        nan
80803   37.78472
80804   37.85413
80805        nan
80806   37.87262
80807   37.75156
80808        nan
80809   37.85413
80810   37.87174
80811        nan
80812   37.78893
80813   37.72115
Name: stop_lat, Length: 80814, dtype: float64

In [127]:
# Join again with TAZ since most A_id walk_access trips are TAZ
taz = pd.read_csv(r'R:\FastTrips\network_draft1.9\taz_coords.txt')

chosenpaths_links_with_observed = pd.merge(chosenpaths_links_with_observed,taz,left_on='A_id',right_on='taz',how='left')

In [128]:
# Fill the nan stop_id with lat and long fields from the TAZ merge
chosenpaths_links_with_observed.ix[pd.isnull(chosenpaths_links_with_observed['stop_id'])]['lat'] = \
    chosenpaths_links_with_observed.ix[pd.isnull(chosenpaths_links_with_observed['stop_id'])]['stop_lat']

In [129]:
chosenpaths_links_with_observed['stop_lat'].fillna(chosenpaths_links_with_observed['lat'],inplace=True)
chosenpaths_links_with_observed['stop_lon'].fillna(chosenpaths_links_with_observed['lon'],inplace=True)

In [130]:
chosenpaths_links_with_observed.to_csv(OUTPUT_DIR + '/' + 'chosenpaths_links_with_observed.csv',index=False)

# Analyze Data

In [131]:
# Combine the observed and modeled path files
df = pd.merge(observed_path, modeled_path, on='unique_id',suffixes=("_observed","_model"))

- compare routes, modes, and agencies

In [132]:
# Build list of routes used in observed and modeled trips
df['model_path_route_list'] = df['path_routes_model'].apply(lambda x: x.split(" "))
df['obs_path_route_list'] = df['path_routes_observed'].apply(lambda x: x.split(" "))

# Build list of modes used in observed and modeled trips
df['model_path_mode_list'] = df['path_modes_model'].apply(lambda x: x.split(" "))
df['obs_path_mode_list'] = df['path_modes_observed'].apply(lambda x: x.split(" "))

# Build list of transit agencies used in observed and modeled trips
df['model_path_agencies_list'] = df['path_agencies_model'].apply(lambda x: x.split(" "))
df['obs_path_agencies_list'] = df['path_agencies_observed'].apply(lambda x: x.split(" "))

# Build list of stops used in observed and modeled trips
df['model_path_stops_list'] = df['path_stops_model'].apply(lambda x: x.split(" "))
df['obs_path_stops_list'] = df['path_stops_observed'].apply(lambda x: x.split(" "))


In [133]:
obs[obs['unique_id'] == '10---Caltrain---2014']

Unnamed: 0,person_id,trip_list_id_num,linkmode,A_id,B_id,linknum,mode,route_id,agency,record_type,unique_id,path_routes,path_modes,path_agencies,path_components,path_stops
3,10---Caltrain---2014,2,access,1249,14661,0,KNR_access,,,observed,10---Caltrain---2014,,KNR_access,,1249 KNR_access _14661,1249 14661
4,10---Caltrain---2014,2,transit,14661,14673,1,commuter_rail,Caltrain_Millbrae Palo Alto,caltrain,observed,10---Caltrain---2014,Caltrain_Millbrae Palo Alto,commuter_rail,caltrain,14661 commuter_rail Caltrain_Millbrae Palo Alt...,14661 14673
5,10---Caltrain---2014,2,egress,14673,1356,2,walk_egress,,,observed,10---Caltrain---2014,,walk_egress,,14673 walk_egress _1356,14673 1356


In [134]:
chosenpath_links[chosenpath_links['unique_id'] == '10---Caltrain---2014'].columns

Index([u'index', u'person_id', u'person_trip_id', u'trip_list_id_num', u'pf_iteration', u'pathnum', u'linkmode', u'trip_id_num', u'A_id_num', u'B_id_num', u'A_seq', u'B_seq', u'pf_A_time', u'pf_B_time', u'pf_linktime min', u'pf_waittime min', u'linknum', u'A_id', u'B_id', u'A_lat', u'A_lon', u'B_lat', u'B_lon', u'trip_id', u'route_id', u'mode_num', u'mode', u'distance', u'chosen', u'bump_iter', u'bumpstop_boarded', u'alight_delay_min', u'new_A_time', u'new_B_time', u'new_linktime min', u'new_waittime min', u'missed_xfer', u'sim_cost', u'board_time', u'overcap', u'alight_time', u'iteration', u'record_type', u'unique_id', u'agency_id', u'agency', u'path_routes', u'path_modes', u'path_agencies', u'path_components', u'path_stops'], dtype='object')

In [135]:
chosenpath_links[chosenpath_links['unique_id'] == '10---Caltrain---2014'].groupby('record_type').count()

Unnamed: 0_level_0,index,person_id,person_trip_id,trip_list_id_num,pf_iteration,pathnum,linkmode,trip_id_num,A_id_num,B_id_num,...,alight_time,iteration,unique_id,agency_id,agency,path_routes,path_modes,path_agencies,path_components,path_stops
record_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
model,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3


In [136]:
 # Isolate transit modes only, because all trips should have walk & transfer components
    
df['model_transit_modes'] = df['model_path_mode_list'].apply(
    lambda row: [element for element in row if element not in non_transit_modes])
df['obs_transit_modes'] = df['obs_path_mode_list'].apply(
    lambda row: [element for element in row if element not in non_transit_modes])

In [137]:
df[['obs_transit_modes','model_transit_modes']].head()

Unnamed: 0,obs_transit_modes,model_transit_modes
0,[commuter_rail],[commuter_rail]
1,[commuter_rail],"[local_bus, local_bus]"
2,[heavy_rail],[heavy_rail]
3,"[local_bus, commuter_rail]",[commuter_rail]
4,[heavy_rail],[local_bus]


In [138]:
# Find the intersection between the chosen model/observed paths using different criteria
# Which are in common between model and observed?

# transit route IDs only
df.apply(lambda row: all(i in row['model_path_route_list'] for i in row['obs_path_route_list']), axis=1)
df['routes_intersection'] = [list(set(a).intersection(set(b))) for a, b in zip(df['model_path_route_list'], df['obs_path_route_list'])]

# stops only
df.apply(lambda row: all(i in row['model_path_stops_list'] for i in row['obs_path_stops_list']), axis=1)
df['stops_intersection'] = [list(set(a).intersection(set(b))) for a, b in zip(df['model_path_stops_list'], df['obs_path_stops_list'])]

# All Modes (including transfer, access/egress)
df.apply(lambda row: all(i in row['model_path_mode_list'] for i in row['obs_path_mode_list']), axis=1)
df['all_modes_intersection'] = [list(set(a).intersection(set(b))) for a, b in zip(df['model_path_mode_list'], df['obs_path_mode_list'])]

# Transit modes only (type of vehicle taken and number of boardings)
df.apply(lambda row: all(i in row['model_path_mode_list'] for i in row['obs_path_mode_list']), axis=1)
df['transit_modes_intersection'] = [list(set(a).intersection(set(b))) for a, b in zip(df['model_transit_modes'], df['obs_transit_modes'])]

# Agency Intersection
df.apply(lambda row: all(i in row['model_path_agencies_list'] for i in row['obs_path_agencies_list']), axis=1)
df['agency_intersection'] = \
    [list(set(a).intersection(set(b))) for a, b in zip(df['model_path_agencies_list'], 
        df['obs_path_agencies_list'])]

In [139]:
# Exact Match of path routes, modes, & components
# Isolate rows (trip legs) with matching path routes
complete_route_match = df[df['path_routes_observed'] == df['path_routes_model']]
complete_mode_match = df[df['path_modes_observed'] == df['path_modes_model']]
complete_agency_match = df[df['path_agencies_observed'] == df['path_agencies_model']]
complete_stop_match = df[df['path_stops_observed'] == df['path_stops_model']]

complete_route_match['complete_route_match'] = 1
complete_mode_match['complete_mode_match'] = 1
complete_agency_match['complete_agency_match'] = 1
complete_stop_match['complete_stop_match'] = 1

# Add new columns to the larger dataframe indicating if the row is a complete match
df = pd.merge(df, complete_mode_match[['unique_id','complete_mode_match']], how='left', on='unique_id')


df = pd.merge(df, complete_route_match[['unique_id','complete_route_match']], how='left', on='unique_id')
df = pd.merge(df, complete_agency_match[['unique_id','complete_agency_match']], how='left', on='unique_id')
df = pd.merge(df, complete_stop_match[['unique_id','complete_stop_match']], how='left', on='unique_id')

for field in ['mode','route','agency', 'stop']:
    df['complete_'+field+'_match']=  df['complete_'+field+'_match'].replace('nan',0)

In [140]:
# Now we find the percent of trips with matching routes or partial matching routes

# Join the filtered data to the original results
df['common_route_count'] = [len(row) for row in df['routes_intersection']]
df['common_mode_count'] = [len(row) for row in df['all_modes_intersection']]
df['common_transit_mode_count'] = [len(row) for row in df['transit_modes_intersection']]
df['common_agency_count'] = [len(row) for row in df['agency_intersection']]
df['common_stop_count'] = [len(row) for row in df['stops_intersection']]

# How many rows have at least one mode in common?
df['partial_mode_match'] = [1 if row > 0 else 0 for row in df['common_mode_count']]
df['partial_transit_mode_match'] = [1 if row > 0 else 0 for row in df['common_transit_mode_count']]
df['partial_route_match'] = [1 if row > 0 else 0 for row in df['common_route_count']]
df['partial_agency_match'] = [1 if row > 0 else 0 for row in df['common_agency_count']]
df['partial_stop_match'] = [1 if row > 0 else 0 for row in df['common_stop_count']]

# Export
df.to_csv(OUTPUT_DIR + r'/path_intersection.csv')

- check if observed path is in pathset

In [141]:
# pathset_links


In [142]:
## Add a field to the new_pathset that lists the pathnum
# pathset_links['pathnum'] = pathset_links.index.get_level_values(1)

# Join paths based on comparison_field, as defined in script header
# Resulting df is merge of all observed paths that have a corresponding path in the pathset for their unique_id
# observed paths with no corresponding path in pathset will have NaN for fields "_pathset" suffix
newdf = pd.merge(observed_path, pathset_paths, how='left',
          left_on=['unique_id',comparison_field],right_on=['unique_id',comparison_field], suffixes=['_obs','_pathset'])

# Join this data with the pathset path file to get pf_probability associated with the observed path
# newdf = pd.merge(df, pathset_paths[['unique_id','pathnum','pf_probability']], 
#                  left_on=['unique_id','pathnum'], right_on=['unique_id','pathnum'],
#                  how='left')

# Fields with NaN marked as no match since no matching path was found in pathset
newdf['probability'] = newdf['pf_probability'].fillna('no_match')

# Grab the highest and lowest probabilities from pathset paths
# want to test that the observed path has a reasonably high probability
max_prob = newdf.groupby('unique_id').max()['probability']
min_prob = newdf.groupby('unique_id').min()['probability']

# Reshape those results and export to dataframe
prob_export = pd.DataFrame([max_prob,min_prob]).T
prob_export.columns = ['max_prob','min_prob']


# Reformat at binary to indicate whether a path was found in the pathset
prob_export['path_exists'] = prob_export['max_prob'].apply(lambda row_value: 0 if row_value == 'no_match' else 1)
prob_export.to_csv('temp_prob_export.csv')

# Create a variabale to indicate if the max probability of the observed path
# is over a given threshold, as defined at top of script
try:
    prob_export.ix[prob_export['max_prob'] >= threshold, 'above_threshold'] = 1
    prob_export.ix[prob_export['max_prob'] < threshold, 'above_threshold'] = 0
    prob_export.ix[prob_export['max_prob'] == 'no_match', 'above_threshold'] = -1
except:
    pass

# export the results probabilities, path_existence, threshold data,
# also add the modeled and observed (chosen) path characteristics
prob_export['unique_id'] = prob_export.index

tempdf = pd.merge(observed_path,modeled_path,on='unique_id',suffixes=['_obs','_model'],how='left')
export_df = pd.merge(prob_export,tempdf,on='unique_id',how='left')

export_df['person_id'] = export_df['unique_id'].apply(lambda row: row.split("_")[0])
export_df['trip_list_id_num'] = export_df['unique_id'].apply(lambda row: row.split("_")[-1])

export_df.to_csv(OUTPUT_DIR + '\path_comparison.csv', index=False)

In [143]:
export_df['path_exists'].mean()

0.33905065815715996

In [144]:
export_df[export_df['max_prob'] != 'no_match'].mean()

path_exists       1.00000
above_threshold   0.49794
dtype: float64

In [145]:
df['complete_route_match'].mean()

0.010271240526525728

In [148]:
df['complete_stop_match'].mean()

0.19315915436777023

In [147]:
df['complete_agency_match'].mean()

0.2847028320702034