In [1]:
import pandas as pd
import numpy as np

# format sig figs
pd.set_option('display.float_format', lambda x: '%.5f' % x)

# Combine observed & modeled trip links 
----

In [124]:
def prep_df(data, record_type, unique_fields ,colname='record_type'):
    '''Load text data as df, create unique trip record ID, and tag as model/observed record'''
    df = pd.read_csv(data)
    df[colname] = record_type    # tag as model/observed record

    # Convert all specified unique_fields to string and concatenate as new unique_id field 
    df[unique_fields] = pd.DataFrame([df[col].astype('int').astype('str') for col in unique_fields]).T
    df['unique_id'] = df[unique_fields].apply(lambda x: '_'.join(x), axis=1)

    return df

In [120]:
def append(*args):
    '''Union dataframes with similar structures'''
    df = pd.DataFrame()
    for data in args:
        df = df.append(data)

    return df

In [121]:
def select_common_records(df1,df2,field):
    '''Return dataframe of matching, common records only.
       Example, person 1034 exists in df1, but not in df2, so new copy of df1 without 1034 is created
    '''
    df1 = df1[df1[field].isin(df2[field])]
    df2 = df2[df2[field].isin(df1[field])]

    return df1, df2

## Load modeled and observed data
Create unique ID based on person-id and trip-list-id-num fields, concatentated with "_"

In [134]:
# Load the modeled and observed datasets
model_results_dir = r'J:\Projects\FasTrips\obs\output\OBS_fasttrips_demand_v1.0_stochastic_iter2_cap\chosenpaths_links.csv'
observed_results_di = r'..\data\obs\obs_chosenpaths_links.csv'

- load data from CSV to dataframe
- add model/observed text field 
- add unique ID

In [135]:
model = prep_df(data=model_results_dir, record_type='model', unique_fields=['person_id','trip_list_id_num'])
observed = prep_df(data=observed_results_di, record_type='observed', unique_fields=['person_id','trip_list_id_num'])

In [136]:
print len(observed.groupby(['person_id','trip_list_id_num']).count()['mode'])
print len(model.groupby(['person_id','trip_list_id_num']).count()['mode'])

18131
21948


## Select unique IDs common to each dataset

In [137]:
model, observed = select_common_records(model,observed,'person_id')

In [138]:
print len(observed.groupby('unique_id').count()['person_id'])
print len(model.groupby('unique_id').count()['person_id'])

18131
18160


## Append observed data rows to modeled and export to file


In [140]:
df = append(model, observed)
df.to_csv('chosenpaths_links_with_observed.csv')

# Compare paths

In [173]:
obs = pd.read_csv(r'..\data\obs\obs_chosenpaths_links.csv')



# Fast Trips output: link-level results for the chosen path only (chosenpaths_links)
model = pd.read_csv(r'J:\Projects\FasTrips\obs\output\OBS_fasttrips_demand_v1.0_stochastic_iter2_cap\chosenpaths_links.csv')
model['route_id'].fillna("",inplace=True)
# NOTE: assume we look at only the last iteration of path sets ?
model = model[model['iteration'] == model['iteration'].max()]
model['person_id'] = model['person_id'].astype('int').astype('str')


# Fast Trips output: link-level results for ALL paths created by Fast Trips (pathset_links)
pathset_links = pd.read_csv(r'J:\Projects\FasTrips\obs\output\OBS_fasttrips_demand_v1.0_stochastic_iter2_cap\pathset_links.csv')
pathset_links['route_id'].fillna("",inplace=True)
pathset_links = pathset_links[pathset_links['iteration'] == pathset_links['iteration'].max()]

# Fast Trips output: paths-level results for all paths created by Fast Trips (pathset_paths)
pathset_paths = pd.read_csv(r'J:\Projects\FasTrips\obs\output\OBS_fasttrips_demand_v1.0_stochastic_iter2_cap\pathset_paths.csv')

In [141]:
# obs

## Reset Indeces

In [157]:
obs.reset_index(inplace=True)
model.reset_index(inplace=True)
pathset_links.reset_index(inplace=True)
pathset_paths.reset_index(inplace=True)

## Create Unique ID

In [158]:
# obs['unique_id'] = obs['person_id'].astype('str') +"_"+obs['trip_list_id_num'].astype('str')
model['unique_id'] = model['person_id'].astype('str') +"_"+model['trip_list_id_num'].astype('str')
pathset_links['unique_id'] = pathset_links['person_id'].astype('int').astype('str') +"_"+pathset_links['trip_list_id_num'].astype('str')
pathset_paths['person_id'] = pathset_paths['person_id'].astype('int').astype('str')
pathset_paths['unique_id'] = pathset_paths['person_id'].astype('str') +"_"+pathset_paths['trip_list_id_num'].astype('str')

** each unique ID should exist within the observed and the model results **

In [159]:
# Compare the unique_id fields between each data set
print len(obs.groupby('unique_id').count().index)
print len(model.groupby('unique_id').count().index)
print len(list(set((obs['unique_id'].values)).intersection((model['unique_id'].values))))

18131
20836
18131


Some observed trips were filtered out, but the number of modeled trips should be the same as the 
number of common unique_ids. For some reason it's less

In [160]:
# Use the common unique_ids to filter out the data
model = model[model['unique_id'].isin(list(set((obs['unique_id'].values)).intersection((model['unique_id'].values))))]
obs = obs[obs['unique_id'].isin(list(set((obs['unique_id'].values)).intersection((model['unique_id'].values))))]

In [164]:
print len(obs.groupby('unique_id').count().index)
print len(model.groupby('unique_id').count().index)
print len(list(set((obs['unique_id'].values)).intersection((model['unique_id'].values))))

18131
18131
18131


**Compare against the pathset links and paths files**

In [165]:
print len(obs.groupby('unique_id').count().index)
print len(pathset_links.groupby('unique_id').count().index)
print len(list(set((obs['unique_id'].values)).intersection((pathset_links['unique_id'].values))))
obs = obs[obs['unique_id'].isin(list(set((obs['unique_id'].values)).intersection((model['unique_id'].values))))]
pathset_links = pathset_links[pathset_links['unique_id'].isin(list(set((obs['unique_id'].values)).intersection((model['unique_id'].values))))]
print len(obs.groupby('unique_id').count().index)
print len(pathset_links.groupby('unique_id').count().index)
print len(list(set((obs['unique_id'].values)).intersection((pathset_links['unique_id'].values))))

18131
18131
18131


In [167]:
print len(obs.groupby('unique_id').count().index)
print len(pathset_paths.groupby('unique_id').count().index)
print len(list(set((obs['unique_id'].values)).intersection((pathset_paths['unique_id'].values))))
obs = obs[obs['unique_id'].isin(list(set((obs['unique_id'].values)).intersection((pathset_paths['unique_id'].values))))]
pathset_paths = pathset_paths[pathset_paths['unique_id'].isin(list(set((obs['unique_id'].values)).intersection((pathset_paths['unique_id'].values))))]
print len(obs.groupby('unique_id').count().index)
print len(pathset_paths.groupby('unique_id').count().index)
print len(list(set((obs['unique_id'].values)).intersection((pathset_paths['unique_id'].values))))

18131
18131
18131
18131
18131
18131


# Join correct transit route IDs to OBS data

In [168]:
obs_to_gtfs = pd.read_csv('../data/obs/obs_to_gtfs_v1.8.csv')
agency_lookup = pd.read_csv('../data/obs/obs_agency_lookup.csv')

In [172]:
# obs

In [169]:
obs = pd.merge(obs,obs_to_gtfs,left_on='route_id',right_on='OBS_route_id',how='left')

# drop the obs prefix
obs['route_id'] = obs['GTFS1.8_route_id']
obs['agency'] = obs['GTFS1.8_agency']
obs.drop(['GTFS1.8_agency','OBS_route_id','OBS_agency','Unnamed: 4'], axis=1, inplace=True)

obs.fillna("",inplace=True)

KeyError: 'GTFS1.8_route_id'

In [None]:
# df = pd.merge(model, obs_to_gtfs, left_on='route_id',right_on='GTFS1.8_route_id')
print len(pathset_paths.groupby('unique_id').count().index)

In [None]:
# routes.head()

In [None]:
# obs_to_gtfs

# First get the long route_id from the routes.txt file
routes = pd.read_csv(r'../data/gtfs/routes.txt')
model = pd.merge(model,routes,left_on='route_id',right_on='route_id',how='left')
model['agency'] = model['agency_id']
model.drop('agency_id',axis=1,inplace=True)
model['agency'].fillna('',inplace=True)

In [170]:
# pathset_links

In [20]:
pathset_links = pd.merge(pathset_links,routes,left_on='route_id',right_on='route_id',how='left')
pathset_links['agency'] = pathset_links['agency_id']
# model.drop('agency_id_x',axis=1,inplace=True)
pathset_links['agency'].fillna("",inplace=True)

In [22]:
# pathset_links

In [23]:
# Add the route ID for the pathset_paths too

In [24]:
# obs_to_gtfs

In [None]:
## Attach an agency ID to the modeled results too


In [None]:
len(model)

In [25]:
def produce_path_fields(df, group):
    '''
    '''
    # create "path_routes"
    df['path_routes'] = df['route_id'].apply(lambda x: x.strip())
    path_routes = pd.DataFrame(df.groupby(group)['path_routes'].apply(lambda x: "%s" % ' '.join(x).strip()))
    
    result_df = pd.DataFrame(index=path_routes.index)
    result_df['path_routes'] = path_routes
    
    # create "path_modes"
    df['path_modes'] = df['mode'].apply(lambda x: x.strip())
    result_df['path_modes'] = pd.DataFrame(df.groupby(group)['mode'].apply(lambda x: "%s" % ' '.join(x).strip()))
    
    # create "path_agencies"
    df['path_agencies'] = df['agency'].apply(lambda x: x.strip())
    result_df['path_agencies'] = pd.DataFrame(df.groupby(group)['agency'].apply(lambda x: "%s" % ' '.join(x).strip()))

    # Create "path_components"
    df['path_components'] = df['A_id'].astype('str')+" "+df['mode']+" "+df['route_id'] +"_"+ df['B_id'].astype('str')
    df['path_components'] = df['path_components'].apply(lambda x: x.strip())
    result_df['path_components'] = pd.DataFrame(df.groupby(group)['path_components'].apply(lambda x: "%s" % ' '.join(x).strip()))
    
    # Return ID field from index
    result_df['unique_id'] = result_df.index.get_level_values(0).values
    

    return result_df

In [26]:
model[model['person_id'] == "3"]

Unnamed: 0,index,person_id,trip_list_id_num,pf_iteration,pathnum,linkmode,trip_id_num,A_id_num,B_id_num,A_seq,...,board_time,overcap,overcap_frac,alight_time,iteration,unique_id,route_short_name,route_long_name,route_type,agency
0,76967,3,1,1,4,access,,105000,7309,-1,...,,,,,2,3_1,,,,
1,76968,3,1,1,4,transit,8279.0,7309,7134,38,...,07:51:46,-63.0,-63.0,08:25:20,2,3_1,391.0,391_SB,3.0,samtrans
2,76969,3,1,1,4,egress,,7134,105279,-1,...,,,,,2,3_1,,,,


In [27]:
obs[obs['person_id'] == '3']

Unnamed: 0,index,person_id,linkmode,A_id_num,B_id_num,A_id,B_id,linknum,mode,route_id,trip_list_id_num,unique_id,GTFS1.8_route_id,agency
10660,13894,3,access,1280,,1280.0,,0,walk_access,,1,3_1,,
10661,13895,3,transit,0,,,,1,commuter_rail,,1,3_1,,caltrain
10662,13896,3,egress,0,1315.0,,1315.0,2,walk_egress,,1,3_1,,


# Produce joined fields from pathset link files 

In [28]:
# Create caluclated joined fields for the observed, modeled, and pathset links file

# Concatenate modes, route IDs, etc to produce unique trip identities
# Do this for each set of trips in the oberved data, as well as for the modeled, since we don't have the field produced by FT
observed_path = produce_path_fields(obs, group=['unique_id'])

In [32]:
modeled_path = produce_path_fields(model, group=['unique_id'])

In [33]:
# also process the detailed pathset_links files, so each path in the pathset has a unique trip identity
new_pathset = produce_path_fields(pathset_links, group=['unique_id','pathnum'])

### Weirdness with unique_id mismatch between model/obs and the pathset_links file

In [34]:


################################
# For some reason there is not a smooth overlap between the modeled pathset_link unique_ids and the model/observed data
# there are unique IDs missing from each, so I guess we just take an intersection of what's available for all of them?
print len(new_pathset)
print len(model)
print len(obs)

305995
94335
74701


In [35]:
# pathset_links['unique_id'] == ''

In [36]:
obs = obs[obs['unique_id'].isin(new_pathset['unique_id'].values)]
model = model[model['unique_id'].isin(new_pathset['unique_id'].values)]
new_pathset = new_pathset[new_pathset['unique_id'].isin(obs['unique_id'].values)]
new_pathset = new_pathset[new_pathset['unique_id'].isin(model['unique_id'].values)]

In [37]:
print len(new_pathset)
print len(model)
print len(obs)

305995
94335
74701


## Compare if modeled/observed trips match, completed or partially

In [40]:
# df.columns

In [94]:
# Join the observed and modeled fields
df = pd.merge(observed_path, modeled_path, on='unique_id',suffixes=("_observed","_model"))

In [95]:
# Find rows with matching path routes
complete_route_match = df[df['path_routes_observed'] == df['path_routes_model']]
complete_mode_match = df[df['path_modes_observed'] == df['path_modes_model']]
# Add complete_agency_match when available
complete_agency_match = df[df['path_agencies_observed'] == df['path_agencies_model']]

In [96]:
## Extract order of transit routes taken
df['model_path_route_list'] = df['path_routes_model'].apply(lambda x: x.split(" "))
df['obs_path_route_list'] = df['path_routes_observed'].apply(lambda x: x.split(" "))

df['model_path_mode_list'] = df['path_modes_model'].apply(lambda x: x.split(" "))
df['obs_path_mode_list'] = df['path_modes_observed'].apply(lambda x: x.split(" "))

df['model_path_agencies_list'] = df['path_agencies_model'].apply(lambda x: x.split(" "))
df['obs_path_agencies_list'] = df['path_agencies_observed'].apply(lambda x: x.split(" "))

In [97]:
# Isolate transit modes only, because almost all trips have walk & transfer components
non_transit_modes = ['transfer','walk_access','walk_egress','bike_access','bike_egress',
                     'PNR_access','PNR_egress','KNR_access','KNR_egress']
df['model_transit_modes'] = df['model_path_mode_list'].apply(
    lambda row: [element for element in row if element not in non_transit_modes])
df['obs_transit_modes'] = df['obs_path_mode_list'].apply(
    lambda row: [element for element in row if element not in non_transit_modes])

In [98]:

# Find the intersection between the chosen model/observed paths using different criteria

# transit route IDs only
df.apply(lambda row: all(i in row['model_path_route_list'] for i in row['obs_path_route_list']), axis=1)
df['routes_intersection'] = [list(set(a).intersection(set(b))) for a, b in zip(df['model_path_route_list'], df['obs_path_route_list'])]

# All Modes (including transfer, access/egress)
df.apply(lambda row: all(i in row['model_path_mode_list'] for i in row['obs_path_mode_list']), axis=1)
df['all_modes_intersection'] = [list(set(a).intersection(set(b))) for a, b in zip(df['model_path_mode_list'], df['obs_path_mode_list'])]

# Transit modes only (type of vehicle taken and number of boardings)
df.apply(lambda row: all(i in row['model_path_mode_list'] for i in row['obs_path_mode_list']), axis=1)
df['transit_modes_intersection'] = [list(set(a).intersection(set(b))) for a, b in zip(df['model_path_mode_list'], df['obs_path_mode_list'])]

# Next steps: 
# Agency Intersection
df.apply(lambda row: all(i in row['model_path_agencies_list'] for i in row['obs_path_agencies_list']), axis=1)
df['transit_agencies_intersection'] = [list(set(a).intersection(set(b))) for a, b in zip(df['model_path_agencies_list'], df['obs_path_agencies_list'])]

# All path components

# Number of boardings?

In [99]:
# df[['path_agencies_observed','path_agencies_model']]

# Exact Match of Path Routes, Modes, Components

In [100]:
# Exact match of path modes
complete_mode_match['complete_mode_match'] = 1
# complete_agency_match['complete_agency_match'] = 1
df = pd.merge(df, complete_mode_match[['unique_id','complete_mode_match']], how='left', on='unique_id')
df['complete_mode_match'].fillna(0,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app


In [101]:
complete_route_match['complete_route_match'] = 1
df = pd.merge(df, complete_route_match[['unique_id','complete_route_match']], how='left', on='unique_id')

In [114]:
complete_agency_match['complete_agency_match'] = 1
df = pd.merge(df, complete_agency_match[['unique_id','complete_agency_match']], how='left', on='unique_id')

In [116]:
df['complete_route_match'] = df['complete_route_match'].replace('nan',0)
df['complete_agency_match'] = df['complete_agency_match'].replace('nan',0)

In [107]:
df.columns

Index([u'path_routes_observed', u'path_modes_observed', u'path_agencies_observed', u'path_components_observed', u'unique_id', u'path_routes_model', u'path_modes_model', u'path_agencies_model', u'path_components_model', u'model_path_route_list', u'obs_path_route_list', u'model_path_mode_list', u'obs_path_mode_list', u'model_path_agencies_list', u'obs_path_agencies_list', u'model_transit_modes', u'obs_transit_modes', u'routes_intersection', u'all_modes_intersection', u'transit_modes_intersection', u'transit_agencies_intersection', u'complete_mode_match', u'complete_route_match'], dtype='object')

In [108]:
df['complete_route_match'].mean()

0.0

In [109]:
df['complete_mode_match'].mean()

0.19684518228448514

In [118]:
df['complete_agency_match'].mean()

0.20346368098836248

## Export fields to CSV for Tableau:
- 'complete_route_match' 0/1
- 'complete_mode_match' 0/1
- 'complete_agency_match' 0/1

In [119]:
len(df)

18131

# % trips with matching or partial matching routes

In [40]:
# Now we find the percent of trips with matching routes or partial matching routes
# df.mean()[['complete_match','partial_match']]
# Note, shuold be 100% when using the same data set, what's up??

#############
# Join the filtered data to the original results
df['common_mode_count'] = [len(row) for row in df['all_modes_intersection']]
df['common_transit_mode_count'] = [len(row) for row in df['transit_modes_intersection']]

# How many rows have at least one mode in common?
df['partial_mode_match'] = [1 if row > 0 else 0 for row in df['common_mode_count']]
df['partial_transit_mode_match'] = [1 if row > 0 else 0 for row in df['common_transit_mode_count']]

In [41]:

df['partial_mode_match'].mean()

0.9986763002592245

In [42]:
df['partial_transit_mode_match'].mean()

0.9986763002592245

## Export fields to CSV for Tableau:
- 'partial_mode_match' 0/1
- 'complete_mode_match' 0/1

# Compare probability of observed path to pathset
# & Check if path is in pathset


In [43]:
observed_path.head(3)

Unnamed: 0_level_0,path_routes,path_modes,path_components,unique_id
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100066_15335,Sonoma County_62,walk_access local_bus walk_egress,2361.0 walk_access _ local_bus Sonoma County_6...,100066_15335
1000_431,MUNI Caltrain_San Francisco Burlingame Tri-...,walk_access local_bus transfer commuter_rail w...,930.0 walk_access _24893.0 24893.0 local_bus M...,1000_431
1000_432,MUNI Caltrain_San Francisco Burlingame Tri-...,walk_access local_bus transfer commuter_rail w...,930.0 walk_access _24893.0 24893.0 local_bus M...,1000_432


In [44]:
new_pathset.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,path_routes,path_modes,path_components,unique_id
unique_id,pathnum,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
100066_15335,0,76_14MNO,walk_access local_bus walk_egress,2361 walk_access _9376 9376 local_bus 76_14MNO...,100066_15335
100066_15335,1,74_44NBL,walk_access local_bus walk_egress,2361 walk_access _9376 9376 local_bus 74_44NBL...,100066_15335
100066_15335,2,74_62NBL,walk_access local_bus walk_egress,2361 walk_access _9376 9376 local_bus 74_62NBL...,100066_15335


In [45]:
## Add a field to the new_pathset that lists the pathnum
new_pathset['pathnum'] = new_pathset.index.get_level_values(1)

In [89]:
len(new_pathset)

305995

In [90]:
len(observed_path)

18131

In [93]:
len(df)

46866

In [109]:
# Do this with a merge?
# for path modes only
df = pd.merge(observed_path, new_pathset, how='left',
              left_on=['unique_id','path_modes'],right_on=['unique_id','path_modes'], suffixes=['_obs','_pathset'])

df['path_modes_obs'] = df['path_modes']
df.drop('path_modes',axis=1, inplace=True)
len(df)

46866

In [114]:
df = (pd.merge(df, modeled_path[['unique_id','path_modes']], how='left'))
df['path_modes_pathset'] = df['path_modes']
df.drop('path_modes',axis=1,inplace=True)

In [116]:
print len(df)
print len(observed_path)
print len(new_pathset)

46866
18131
305995


In [117]:
# How many unique ID's don't have a pathset attacheds
len(df[df['pathnum'].isnull()])/float(len(df))

0.28141936585157684

In [118]:
df.head(3)

Unnamed: 0,path_routes_obs,path_components_obs,unique_id,path_routes_pathset,path_components_pathset,pathnum,path_modes_obs,path_modes_pathset
0,Sonoma County_62,2361.0 walk_access _ local_bus Sonoma County_6...,100066_15335,76_14MNO,2361 walk_access _9376 9376 local_bus 76_14MNO...,0.0,walk_access local_bus walk_egress,walk_access premium_bus walk_egress
1,Sonoma County_62,2361.0 walk_access _ local_bus Sonoma County_6...,100066_15335,74_44NBL,2361 walk_access _9376 9376 local_bus 74_44NBL...,1.0,walk_access local_bus walk_egress,walk_access premium_bus walk_egress
2,Sonoma County_62,2361.0 walk_access _ local_bus Sonoma County_6...,100066_15335,74_62NBL,2361 walk_access _9376 9376 local_bus 74_62NBL...,2.0,walk_access local_bus walk_egress,walk_access premium_bus walk_egress


In [119]:
df['pathnum'] = df['pathnum'].fillna(0)
df['pathnum'] = df['pathnum'].astype('int')

In [120]:
# Do we already have a unique ID? Try it anyway
pathset_paths['unique_id'] = pathset_paths['person_id'].astype('int').astype('str')+"_"+pathset_paths['trip_list_id_num'].astype('int').astype('str')

In [121]:
# Now look up the probability of each path between new_pathset and pathset_paths
newdf = pd.merge(df,pathset_paths,left_on=['unique_id','pathnum'], right_on=['unique_id','pathnum'])

In [122]:
newdf['probability'] = newdf['probability'].fillna('no_match')

In [123]:
max_prob = newdf.groupby('unique_id').max()['probability']

# to take the mean, need to do some filtering
# mean_prob = newdf.groupby('unique_id').()['probability']
min_prob = newdf.groupby('unique_id').min()['probability']

In [124]:
#Create indicator for paths that exist
# New dataframe that has prob matching record for each unique ID
prob_export = pd.DataFrame([max_prob,min_prob]).T
prob_export.columns = ['max_prob','min_prob']

In [125]:
# Pull binary data for each person
prob_export['path_exists'] = prob_export['max_prob'].apply(lambda row_value: 0 if row_value == 'no_match' else 1)

In [126]:
prob_export['path_exists'].mean()

0.7417682422370525

In [127]:
# Path exists in the pathset, based on mode strings, for 25% of all trips

In [128]:
# Is the max probability above a defined threshold?
threshold = 0.3
# Mark no_match_records
prob_export.ix[prob_export['max_prob'] >= threshold, 'above_threshold'] = 1
prob_export.ix[prob_export['max_prob'] < threshold, 'above_threshold'] = 0
prob_export.ix[prob_export['max_prob'] == 'no_match', 'above_threshold'] = 'no_match'

In [129]:
# Percent of trips above a threshold
prob_export[prob_export['above_threshold'] != "no_match"].mean()

max_prob          0.48796
min_prob          0.29901
path_exists       1.00000
above_threshold   0.63953
dtype: float64

In [130]:
###
# 63% of observed paths have a max pathest probability greather than a threshold of 0.3


## Export the trip-based (unique_id) records as a new input for Tableau

# Join all relevant columns & export to csv

In [131]:
prob_export['unique_id'] = prob_export.index

In [132]:
export_df = pd.merge(df, prob_export, on='unique_id')

In [133]:
len(export_df) == len(df) == len(prob_export)

False

In [134]:
# We can consider dropping columns if needed
export_df.columns

Index([u'path_routes_obs', u'path_components_obs', u'unique_id', u'path_routes_pathset', u'path_components_pathset', u'pathnum', u'path_modes_obs', u'path_modes_pathset', u'max_prob', u'min_prob', u'path_exists', u'above_threshold'], dtype='object')

In [135]:
export_df['person_id'] = export_df['unique_id'].apply(lambda row: row.split("_")[0])
export_df['trip_list_id_num'] = export_df['unique_id'].apply(lambda row: row.split("_")[-1])

In [136]:
export_df.to_csv('path_comparison.csv', index=False)