# Part 03: Find the discontinuities in Seattle's streets
michael babb  
2024 11 24

In [1]:
# standard
import os

In [2]:
# external
import geopandas as gpd
import matplotlib as mpl
from matplotlib.gridspec import GridSpec
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [3]:
# custom
import run_constants as rc
from utils import *

# load the working seattle street network data

In [4]:
fpn = os.path.join(rc.OUTPUT_FILE_PATH, rc.S03_SND_WORKING_IN_FILE_NAME)

In [5]:
gdf = gpd.read_file(filename = fpn)

In [6]:
gdf.shape

(25773, 38)

In [7]:
gdf['ord_stname_concat'].unique().shape

(2455,)

## load the manually created street group data
These data were created in Step 6, but we'll bring them into the Step 3 workflow.

In [8]:
fpn = os.path.join(rc.INPUT_FILE_PATH, rc.S03_STREET_GROUP_IN_FILE_NAME)

In [9]:
# Read the manually curated street-group workbook. The first four spreadsheet
# columns are loaded as a MultiIndex and then immediately restored as ordinary
# columns.
# NOTE(review): index_cols is not passed to read_excel; it presumably documents
# the names of positional columns 0-3 -- confirm against the workbook.
index_cols = ['sort_order', 'ord_street_name', 'ord_street_type', 'city_portion']
sg_df = pd.read_excel(io=fpn, index_col=[0, 1, 2, 3]).reset_index()

In [10]:
# bookkeeping columns from the workbook that are not needed for the join
drop_cols = ['sort_order', 'progress', 'count']
sg_df = sg_df.drop(columns=drop_cols)


In [11]:
# no `on=` is given, so pandas joins on the columns the two frames have in common;
# indicator=True adds a `_merge` column recording which side each row came from.
# the value counts below are the sanity check: there should be no right_only rows.
test_join = pd.merge(left = gdf, right = sg_df, how = 'outer', indicator=True)
test_join['_merge'].value_counts()

_merge
both          19034
left_only      6739
right_only        0
Name: count, dtype: int64

In [12]:
gdf = test_join.copy()

In [13]:
# rows without a matching street-group record have no group_id after the
# outer merge; assign them to group 1 and store the ids as integers
gdf['group_id'] = gdf['group_id'].fillna(1).astype(int)
gdf.shape

(25773, 40)

# create an id across street groups

In [14]:
gdf.head()

Unnamed: 0,f_intr_id,t_intr_id,snd_id,snd_feacode,citycode,stname_id,st_code,arterial_code,segment_type,agency_code,...,r_zip,sndseg_update,compkey,comptype,unitid,unitid2,city_portion,geometry,group_id,_merge
0,3836,3893,4787,5,1,1955,0,1,1,1,...,98177,2011-09-09 00:00:00+00:00,18112,68,14050,40,NW,"LINESTRING (-122.36206 47.7035, -122.36342 47....",1,both
1,3893,3906,4802,5,1,1955,0,1,1,1,...,98177,2011-09-09 00:00:00+00:00,18113,68,14050,43,NW,"LINESTRING (-122.36342 47.70275, -122.36386 47...",1,both
2,3906,3973,4864,5,1,1955,0,1,1,1,...,98177,2019-08-13 14:00:50+00:00,18114,68,14050,70,NW,"LINESTRING (-122.36386 47.70251, -122.36535 47...",1,both
3,48444,48443,49871,1,0,3767,0,0,1,1,...,98146,2022-07-19 16:58:07+00:00,0,0,0,0,SW,"LINESTRING (-122.3403 47.51334, -122.34036 47....",2,both
4,4014,4015,4911,1,1,1465,0,0,1,1,...,98133,2005-05-04 00:00:00+00:00,14968,68,11695,110,N,"LINESTRING (-122.34467 47.70144, -122.34197 47...",1,both


In [15]:
gdf.columns

Index(['f_intr_id', 't_intr_id', 'snd_id', 'snd_feacode', 'citycode',
       'stname_id', 'st_code', 'arterial_code', 'segment_type', 'agency_code',
       'access_code', 'divided_code', 'structure_type', 'legalloc_code',
       'vehicle_use_code', 'gis_seg_length', 'l_adrs_from', 'l_adrs_to',
       'r_adrs_from', 'r_adrs_to', 'ord_pre_dir', 'ord_street_name',
       'ord_street_type', 'ord_suf_dir', 'ord_stname_concat', 'l_city',
       'l_state', 'l_zip', 'r_city', 'r_state', 'r_zip', 'sndseg_update',
       'compkey', 'comptype', 'unitid', 'unitid2', 'city_portion', 'geometry',
       'group_id', '_merge'],
      dtype='object')

In [16]:
# one row per distinct combination of street name, street type, full street
# name, city portion and group id
col_names = [
    'ord_street_name',
    'ord_street_type',
    'ord_stname_concat',
    'city_portion',
    'group_id',
]
id_df = gdf.loc[:, col_names].drop_duplicates()

In [17]:
id_df['ord_stname_type'] = id_df['ord_street_name'] + ' ' + id_df['ord_street_type']

In [18]:
id_df['sort_order'] = id_df['ord_street_name'].map(get_sort_order)

In [19]:
id_df = id_df.sort_values(by = ['sort_order', 'ord_street_type', 'city_portion'])

In [20]:
id_df.head()

Unnamed: 0,ord_street_name,ord_street_type,ord_stname_concat,city_portion,group_id,ord_stname_type,sort_order
3387,1ST,AVE,1ST AVE,CNTR,2,1ST AVE,1
3412,1ST,AVE,1ST AVE N,N,2,1ST AVE,1
3449,1ST,AVE,1ST AVE NE,NE,2,1ST AVE,1
3504,1ST,AVE,1ST AVE NW,NW,1,1ST AVE,1
3578,1ST,AVE,1ST AVE S,S,2,1ST AVE,1


In [21]:
id_df['ord_stname_concat'].unique().shape

(2455,)

In [22]:
# let's get the groups! can we aggregate this?

In [23]:
col_names = ['sort_order', 'ord_street_name', 'ord_street_type', 'ord_stname_type', 'group_id', 'city_portion']
# group on everything except the trailing 'city_portion' column
group_keys = col_names[:-1]


def _join_city_portions(portions):
    """Return the distinct city portions, sorted and joined with '_'."""
    return '_'.join(sorted(set(portions)))


def _count_city_portions(portions):
    """Return the number of distinct city portions."""
    return len(set(portions))


# city_portion_group: e.g. 'NW_SW_W'; n_groups: how many city portions it contains
test_agg = (
    id_df[col_names]
    .groupby(group_keys)
    .agg(city_portion_group=('city_portion', _join_city_portions),
         n_groups=('city_portion', _count_city_portions))
    .reset_index()
)

In [24]:
test_agg.head()

Unnamed: 0,sort_order,ord_street_name,ord_street_type,ord_stname_type,group_id,city_portion_group,n_groups
0,1,1ST,AVE,1ST AVE,1,NW_SW_W,3
1,1,1ST,AVE,1ST AVE,2,CNTR_N_NE_S,4
2,1,1ST,PL,1ST PL,1,NE,1
3,2,2ND,AVE,2ND AVE,1,NW_SW_W,3
4,2,2ND,AVE,2ND AVE,2,CNTR_N_NE_S,4


In [25]:
test_agg['city_portion_group'].unique().shape

(33,)

In [26]:
test_agg.head()

Unnamed: 0,sort_order,ord_street_name,ord_street_type,ord_stname_type,group_id,city_portion_group,n_groups
0,1,1ST,AVE,1ST AVE,1,NW_SW_W,3
1,1,1ST,AVE,1ST AVE,2,CNTR_N_NE_S,4
2,1,1ST,PL,1ST PL,1,NE,1
3,2,2ND,AVE,2ND AVE,1,NW_SW_W,3
4,2,2ND,AVE,2ND AVE,2,CNTR_N_NE_S,4


In [28]:
# spot check: rows of a single street type within a single city portion group
test_cpg = 'CNTR_W'
test_ost = 'BR'
in_test_group = test_agg['city_portion_group'] == test_cpg
is_test_type = test_agg['ord_street_type'] == test_ost
check_df = test_agg.loc[in_test_group & is_test_type, :]
print(check_df.shape)
check_df.head()

(1, 7)


Unnamed: 0,sort_order,ord_street_name,ord_street_type,ord_stname_type,group_id,city_portion_group,n_groups
1251,MAGNOLIA,MAGNOLIA,BR,MAGNOLIA BR,1,CNTR_W,2


In [28]:
# build the table to export to excel: the distinct combinations of
# city portion group, street type and number of city portions.
# (the workbook itself is written by a later cell)
city_portion_check = test_agg[['city_portion_group', 'ord_street_type', 'n_groups']].drop_duplicates().reset_index(drop = True)

In [None]:
city_portion_check.head()

In [None]:
city_portion_check.shape

In [31]:
city_portion_check = city_portion_check.sort_values(by = ['city_portion_group', 'ord_street_type'])

In [32]:
# NOTE(review): the file name has no directory component, so the workbook is
# written relative to the current working directory rather than to one of the
# rc.* output paths used elsewhere in this notebook -- confirm this is intended.
city_portion_check.to_excel(excel_writer='city_portion_check.xlsx', index = False)

In [None]:
test_agg.head()

In [None]:
test_agg.shape

In [35]:
# join back to the working gdf

In [None]:
gdf.columns

In [37]:
gdf = gdf.drop(labels = ['_merge'], axis = 1)

In [38]:
test_join = pd.merge(left = gdf, right = test_agg, how = 'outer', indicator=True)

In [None]:
test_join.shape

In [None]:
test_join.columns

In [None]:
test_join['_merge'].value_counts()

In [42]:
gdf = test_join.copy()

# MAKE A PLOT OF THE DIFFERENT CITY GROUPS BY STREET TYPE

In [43]:
# use the same colors for each city portion:
# pair every city portion with one entry of matplotlib's "Dark2" palette
dir_list = ['CNTR', 'E', 'N', 'NE', 'NW', 'S', 'SW', 'W']
dark2_colors = mpl.colormaps["Dark2"].colors
cdm = {portion: dark2_colors[position] for position, portion in enumerate(dir_list)}

my_cmap = mpl.colors.ListedColormap([cdm[portion] for portion in dir_list])

In [44]:
wgdf = gdf.loc[gdf['n_groups'] > 1, :].copy()

In [None]:
wgdf.shape

In [46]:
col_names = ['city_portion_group', 'ord_street_type']
control_df = wgdf[col_names].drop_duplicates()

In [None]:
control_df.shape

In [None]:
control_df.head()

In [49]:
# plot bounds to use across each plot
# order is [x_min, y_min, x_max, y_max]: the plotting cell passes items 0 and 2
# to set_xlim and items 1 and 3 to set_ylim
bounds = [-122.4197794277490061,47.4803548409661005, -122.2200188105690017,47.7341482423694004]

In [50]:
# load the boundaries of the city sections
ifpn = os.path.join(rc.OUTPUT_FILE_PATH, rc.S02_CITY_SECTIONS_OUT_FILE_NAME)
cs_gdf = gpd.read_file(filename=ifpn)

In [None]:
control_df.head()

In [None]:
# Draw one map per (street type, city portion group) combination in control_df
# and save each one as a PNG. Set make_plots to False to skip the rendering.
make_plots = True
if make_plots:
    # savefig does not create missing directories, so make sure the target exists
    os.makedirs(rc.S06_PLOT_OUTPUT_FILE_PATH_CITY_PORTION_GROUPS, exist_ok=True)

    for cr, crow in control_df.iterrows():
        cpg = crow['city_portion_group']
        ost = crow['ord_street_type']

        output_file_name = f"{ost}_{cpg}.png"
        ofpn = os.path.join(rc.S06_PLOT_OUTPUT_FILE_PATH_CITY_PORTION_GROUPS, output_file_name)
        print(ofpn)

        # now, let's make a map...
        # select the streets of this type that belong to this city portion group
        pgdf = gdf.loc[(gdf['city_portion_group'] == cpg) &
                       (gdf['ord_street_type'] == ost), :]
        # merge the segments into a single geometry per city portion
        to_draw = pgdf[['city_portion', 'geometry']].dissolve(by = 'city_portion', as_index = False)

        # Build the colormap from the portions actually drawn. A categorical plot
        # spreads the categories that are present across the whole colormap, so
        # passing the shared 8-color my_cmap would not keep one fixed color per
        # city portion when only a subset of portions appears in a plot.
        portions_drawn = sorted(to_draw['city_portion'].unique())
        plot_cmap = mpl.colors.ListedColormap([cdm.get(portion, 'grey') for portion in portions_drawn])

        fig = plt.figure(layout = 'constrained', figsize = (5, 10))
        gs = GridSpec(1, 1, figure = fig, height_ratios = [1])
        ax1 = fig.add_subplot(gs[0, 0])
        # identical extent for every map so the images are directly comparable
        ax1.set_xlim(bounds[0], bounds[2])
        ax1.set_ylim(bounds[1], bounds[3])
        # faint city sections as the backdrop, streets on top
        cs_gdf.plot(ax = ax1, column = 'city_portion', alpha = .2)
        to_draw.plot(ax = ax1, column = 'city_portion', cmap = plot_cmap, linewidth = 5, legend = True)

        ax1.set_axis_off()

        my_title = f"Street Type: {ost} | City Portion Group {cpg}"
        ax1.set_title(label = my_title)

        fig.savefig(fname = ofpn)
        # close this specific figure so figures do not accumulate across iterations
        plt.close(fig)

# load the node data

In [12]:
# NOTE(review): this name is not referenced anywhere else in the visible notebook;
# the next cell builds the node file path from rc.S03_NODE_IN_FILE_NAME instead.
input_file_name = 'Street_Network_Nodes.gpkg'

In [13]:
fpn = os.path.join(rc.OUTPUT_FILE_PATH, rc.S03_NODE_IN_FILE_NAME)

In [14]:
node_gdf = gpd.read_file(filename = fpn)

In [None]:
node_gdf.head()

In [16]:
node_gdf['node_id'] = node_gdf['node_id'].astype(int)

In [17]:
node_gdf['coords'] = node_gdf['geometry'].map(lambda x: x.coords[0])

In [None]:
node_gdf.head()

In [None]:
node_gdf.dtypes

In [20]:
# zap this into a dictionary: node_id -> first coordinate tuple of the node geometry
node_dict = dict(zip(node_gdf['node_id'], node_gdf['coords']))

# find missing segments

In [21]:
# create the ord_street_name and ord_street_type variable
gdf['ord_street_name_type'] = gdf['ord_street_name'] + ' ' + gdf['ord_street_type']

In [22]:
# create a weight variable from the gis_segment_length variable
gdf['weight'] = gdf['gis_seg_length']

In [23]:
# only select columns names used in subsequent steps
col_names = [
    # graph structure: from node, to node, segment id
    'f_intr_id', 't_intr_id', 'snd_id',
    # segment attributes
    'snd_feacode', 'gis_seg_length',
    # street naming
    'ord_street_name', 'ord_street_type', 'ord_street_name_type', 'ord_stname_concat',
    # location, geometry and derived fields
    'city_portion', 'geometry', 'weight', 'group_id',
]

In [24]:
gdf = gdf[col_names].copy()

In [None]:
# what's the current max snd ID?
gdf['snd_id'].max()

In [None]:
# what's 2^16? start adding IDs at this value?
2**16

In [None]:
gdf.columns

In [None]:
# here is where the discontinuities are identified and the "missing" segments are created.
# a full run, without saving intermediate data, takes less than a minute.
# saving intermediate data takes an additional ~5 minutes.
#
# NOTE(review): nx, LineString, points2distance, write_gdf and subset_node_gdf are not
# imported by name in this notebook; presumably they arrive via `from utils import *` -- confirm.

write_intermediate = False
use_v1 = False

# street_status
# 0: continuous street
# 1: discontinuous
# 2: added street
# 3: added street whose two end nodes do not share exactly one ord_stname_concat
#    (the connection spans city portions)

# hold the output
output_gdf_list = []
shortest_added_path = []
gdf['snd_group'] = int(0)
snd_group_count = -1
# use this value to start the ID number for the added segments.
temp_snd_id = 2**16
# get the list of unique names
snt_list = gdf['ord_street_name_type'].unique().tolist()

# uncomment to test specific street
#snt_list = ['7TH AVE N', 'W GALER ST']
#snt_list = ['GALER ST']
#snt_list = ['15TH AVE W']
#snt_list = ['WOODLAWN AVE N']

# the trouble_list, for lack of a better name, holds streets that need a manual look:
# streets with parallel edges, and streets where no segment could be added.
trouble_list = []

# replace None with a small integer to process only the first few streets
for snt in snt_list[:None]:
    print(snt)

    # subset the gdf by ord_street_name_type
    sub_gdf = gdf.loc[gdf['ord_street_name_type'] == snt, :].copy().reset_index(drop = True)
    unique_id_list = sub_gdf['group_id'].unique().tolist()

    for gi in unique_id_list:

        temp_gdf = sub_gdf.loc[sub_gdf['group_id'] == gi, :].copy()

        # create the graph for a single street.
        # For example, all streets named W GALER ST are a single graph.
        # this is an undirected graph.
        fg = nx.from_pandas_edgelist(df = temp_gdf, source = 'f_intr_id', target = 't_intr_id', edge_attr=True)

        if temp_gdf.shape[0] != fg.number_of_edges():
            trouble_list.append(snt)

            # subsequent steps need a single-edge graph.
            # Parallel edges are not allowed.
            # so, let's keep only the segments whose snd_id survived in the graph.
            mod_edge_list = [ed[2]['snd_id'] for ed in fg.edges.data()]
            # .copy() so the snd_group assignment below writes to an independent
            # frame rather than to a slice of the previous temp_gdf
            temp_gdf = temp_gdf.loc[temp_gdf['snd_id'].isin(mod_edge_list), :].copy()

        # a list of the snd groups - this is necessary to match street segments to nodes
        # nx.connected_components() identifies disconnected graphs
        for cc in nx.connected_components(G = fg):
            # cc is the set of nodes in one component; build its sub graph
            sub_graph = fg.subgraph(cc).copy()
            # the snd_ids of the street segment(s) in this sub graph
            snd_id_list = [edl[2]['snd_id'] for edl in sub_graph.edges.data()]
            snd_group_count += 1
            # this will update the gdf per street name with the groups of streets
            temp_gdf.loc[temp_gdf['snd_id'].isin(snd_id_list), 'snd_group'] = snd_group_count

        # the unique snd_groups: one per connected piece of the street
        snd_group_id_list = temp_gdf['snd_group'].unique()

        # number of connected pieces; more than one means the street is discontinuous
        n_discontinuities = len(snd_group_id_list)

        if n_discontinuities > 1:
            # process for the sub graphs
            node_snd_group_dict = {}
            # a node - an intersection - can have more than one street.
            # record, for every node, the snd_group of a segment it sits on
            for ir, row in temp_gdf.iterrows():
                snd_group_id = row['snd_group']
                node_snd_group_dict[row['f_intr_id']] = snd_group_id
                node_snd_group_dict[row['t_intr_id']] = snd_group_id

            # create a list of available edges - these are the missing segments
            # these are formed from the non-edges of the graph.
            avail_edges = []

            # compute the degree centrality
            node_centrality_dict = nx.degree_centrality(G = fg)

            # dictionary to hold the straight-line distance of each available edge
            # let's only make the distance calculation once, yeah?
            # it's a simple calculation, but even more simple to store it.
            node_dist_dict = {}
            # the total number of non-edges
            non_edge_count = 0
            # the number of non-edges that connect disconnected components
            diff_non_edge_count = 0
            # enumerate the non-edges
            for ne in nx.non_edges(graph = fg):
                non_edge_count += 1
                # ne is a tuple of from / to nodes.
                # create available edges if the nodes are not on the same segment
                # this will decrease the potential solution space
                if node_snd_group_dict[ne[0]] != node_snd_group_dict[ne[1]]:
                    diff_non_edge_count += 1
                    # calculate the straight-line distance between two nodes.
                    # convert to feet to match the existing distance / weight variable
                    weight = points2distance(node_dict[ne[0]], node_dict[ne[1]], unit = 'miles') * 5280

                    # build the output tuple
                    if use_v1:
                        # this does not use node centrality
                        output = (ne[0], ne[1], {'weight':weight})
                    else:
                        # scale the distance by the product of the two nodes' degree
                        # centralities, so candidate edges between low-degree nodes
                        # receive smaller weights
                        ne_0_nc = node_centrality_dict[ne[0]]
                        ne_1_nc = node_centrality_dict[ne[1]]
                        edge_weight = weight * (ne_0_nc * ne_1_nc)
                        output = (ne[0], ne[1], {'weight':edge_weight})
                    # add the unscaled distance to the distance dict, in both directions
                    node_dist_dict[(ne[0], ne[1])] = weight
                    node_dist_dict[(ne[1], ne[0])] = weight
                    avail_edges.append(output)

            # we just calculated all of the potential segments that span discontinuities
            # nx.k_edge_augmentation() creates the potential edges in a graph by adding as
            # few edges as possible. In a street network, with multiple disconnected
            # components, there is really one way to minimally connect the disparate components
            # to create full connectivity. And it's the shortest geographic segment in this case!
            # weighted graph traversal works by accumulating as little weight as possible.

            # this will store the output
            data_list = []
            line_list = []

            # the street name and type are the same for every segment in temp_gdf, so
            # read them once, before the loop. osnc gets a default so that the summary
            # record built after the loop never sees an undefined or stale value.
            osn = temp_gdf['ord_street_name'].unique().tolist()[0]
            ost = temp_gdf['ord_street_type'].unique().tolist()[0]
            osnc = ''

            # once nx.k_edge_augmentation() finishes, it returns a generator with the added edges that
            # ensure complete connectivity between all nodes.
            augmented_edges = nx.k_edge_augmentation(G = fg, k = 1, avail = avail_edges, weight = 'weight')
            # enumerate the augmented edges
            for i_ae, ae in enumerate(augmented_edges):
                # unpack
                fn, tn = ae
                # get the straight-line distance of the added edge
                weight = node_dist_dict[(fn, tn)]

                # attribute dictionaries of the existing edges touching each end node
                fn_edge_attrs = [fg.edges[curr_edge] for curr_edge in fg.edges(fn)]
                tn_edge_attrs = [fg.edges[curr_edge] for curr_edge in fg.edges(tn)]

                # ord_stname_concat: determine this by computing the set intersection
                fn_osnc = {attrs['ord_stname_concat'] for attrs in fn_edge_attrs}
                tn_osnc = {attrs['ord_stname_concat'] for attrs in tn_edge_attrs}
                osnc_set = fn_osnc.intersection(tn_osnc)
                if len(osnc_set) == 1:
                    osnc = osnc_set.pop()
                else:
                    osnc = ''

                # now, get the city portion: every portion seen at either end node
                fn_city_portion = {attrs['city_portion'] for attrs in fn_edge_attrs}
                tn_city_portion = {attrs['city_portion'] for attrs in tn_edge_attrs}
                cp = '_'.join(sorted(tn_city_portion.union(fn_city_portion)))

                temp_snd_id += 1
                # this is the output record:
                # new id, street name, street type, name + type, full name, index of the
                # added edge, from node, to node, distance, city portion(s), group id
                temp_data_list = [temp_snd_id, osn, ost, snt, osnc, i_ae, fn, tn, weight, cp, gi]
                # now, let's create some geometry
                temp_line = LineString([node_dict[fn], node_dict[tn]])
                line_list.append(temp_line)
                data_list.append(temp_data_list)

            # compare the list of available edges to the list of chosen edges
            # this isn't necessary to complete the edge augmentation, but it's
            # nice to know how often added edges are always the shortest edges
            av_df = pd.DataFrame(data = avail_edges, columns = ['sn_id', 'en_id', 'weight_dict'])

            # use the unscaled straight-line distance, not the (possibly scaled) edge weight
            av_df['dist'] = av_df.apply(func=lambda x: node_dist_dict[(x['sn_id'], x['en_id'])], axis = 1)
            av_df = av_df.drop(labels = 'weight_dict', axis = 1)
            # rank distance - the lower the rank, the shorter the segment.
            av_df['dist_rank'] = av_df['dist'].rank(method = 'dense')

            # build a gpd.GeoDataFrame - these are the "missing" segments"
            col_names = ['snd_id', 'ord_street_name', 'ord_street_type',
                         'ord_street_name_type', 'ord_stname_concat',
                         'snd_group', 'sn_id', 'en_id', 'dist', 'city_portion',
                         'group_id']
            ms_gdf = gpd.GeoDataFrame(data = data_list,
                                      columns = col_names,
                                      geometry = line_list, crs = 'epsg:4326')
            # these are added segments
            ms_gdf['street_status'] = int(2)
            # update where the connection spans city portions
            ms_gdf.loc[ms_gdf['ord_stname_concat'] == '', 'street_status'] = 3

            # no segment was added even though more than one component was found:
            # record the street so it can be examined by hand.
            if ms_gdf.empty:
                trouble_list.append(snt)

            # select where the rank is LTE than the number of records in the missing segment gdf
            av_df = av_df.loc[av_df['dist_rank'] <= ms_gdf.shape[0], :]

            # if the sum of the distance of the added segments is the same, then only the
            # shortest segments were added. If not, then other, longer, segments were added
            # but that makes for less total distance traversed in the graph.
            # So, how often does that happen?
            # np.isclose rather than ==: the two sums may add the same floats in a
            # different order, which can differ in the last bits.
            shortest_tot_path = int(np.isclose(av_df['dist'].sum(), ms_gdf['dist'].sum()))
            # the last field is the number of gaps: connected pieces minus one
            shortest_added_path.append([osn, ost, snt, osnc,  shortest_tot_path, n_discontinuities - 1])

            # add the known, disconnected segments. These are the existing streets.
            col_names = ['snd_id','ord_street_name', 'ord_street_type', 'ord_street_name_type',
                         'ord_stname_concat', 'snd_group',  'f_intr_id', 't_intr_id',
                         'gis_seg_length', 'city_portion', 'group_id', 'geometry']
            ks_gdf = temp_gdf[col_names].copy()
            ks_gdf['street_status'] = int(1)
            ks_gdf = ks_gdf.rename(columns = {'f_intr_id':'sn_id', 't_intr_id':'en_id', 'gis_seg_length':'dist'})

            # stack the geodataframes
            output_gdf = pd.concat([ms_gdf, ks_gdf])

            # write intermediate
            if write_intermediate:
                # intermediate streets - existing streets
                is_output_file_path = os.path.join(rc.OUTPUT_FILE_PATH, 'individual_streets')
                os.makedirs(is_output_file_path, exist_ok = True)
                output_file_name = '_'.join(snt.split()) + '.gpkg'
                write_gdf(gdf = temp_gdf, output_file_path = is_output_file_path, output_file_name = output_file_name)

                # intermediate nodes
                curr_node_df = pd.DataFrame(data = {'node_id':fg.nodes()})
                curr_node_df['node_centrality'] = curr_node_df['node_id'].map(node_centrality_dict)
                node_subset_gdf = subset_node_gdf(node_gdf = node_gdf, other_node_df = curr_node_df)
                output_file_name = '_'.join(snt.split()) + '_full_nodes.gpkg'
                write_gdf(gdf = node_subset_gdf, output_file_path = is_output_file_path, output_file_name = output_file_name)

                # the added and existing segments for a street
                output_file_name = '_'.join(snt.split()) + '_missing_segments.gpkg'
                write_gdf(gdf = output_gdf, output_file_path = is_output_file_path, output_file_name = output_file_name)

        else:
            # gather the streets with no missing segments.
            col_names = ['snd_id', 'ord_street_name', 'ord_street_type',
                         'ord_street_name_type', 'ord_stname_concat',
                         'snd_group', 'f_intr_id', 't_intr_id', 'gis_seg_length',
                         'city_portion', 'group_id', 'geometry']
            output_gdf = temp_gdf[col_names].copy()
            output_gdf['street_status'] = int(0)
            output_gdf = output_gdf.rename(columns = {'f_intr_id':'sn_id', 't_intr_id':'en_id', 'gis_seg_length':'dist'})

        # add to the output list
        output_gdf_list.append(output_gdf)

# combine data for output

In [29]:
ms_gdf = pd.concat(objs = output_gdf_list)

In [30]:
ms_gdf['dist_miles'] = ms_gdf['dist'] / 5280

In [31]:
# join in other street identification data

In [None]:
ms_gdf.shape

In [None]:
ms_gdf['street_status'].value_counts()

In [None]:
ms_gdf.columns

In [None]:
# check for missing values: for every column, print which isna() outcomes occur
# (a True in the printed array means the column has at least one missing value)
for cn, na_flags in ms_gdf.isna().items():
    print(cn, na_flags.unique())

In [None]:
# what streets have parallel edges?
# these can be verified / examined in qGIS
trouble_list

# compute if the added segments were indeed the shortest.

In [38]:
# tabulate how often the added segments were also the shortest candidate segments.
# one record was appended per street group that needed added segments:
#   shortest_added_path: 1 when the summed distance of the added segments matched
#                        the summed distance of the shortest candidates, else 0
#   n_discontinuities:   stored as the number of connected pieces minus one
col_names = ['ord_street_name', 'ord_street_type', 'ord_street_name_type',
             'ord_stname_concat', 'shortest_added_path', 'n_discontinuities']                                      
sap_df = pd.DataFrame(data = shortest_added_path, columns = col_names)

In [39]:
sap_df['n_streets'] = int(1)

In [None]:
sap_df.head()

In [41]:
# readable labels for the 0 / 1 shortest_added_path flag
recode_dict = {
    0: 'shortest_is_false',
    1: 'shortest_is_true',
}

In [42]:
sap_df['shortest_added_path'] = sap_df['shortest_added_path'].map(recode_dict)

In [43]:
sap_df = sap_df.sort_values(by = ['ord_street_name_type', 'ord_stname_concat'])

In [44]:
sap_df['connection_status'] = "intraconnection"
sap_df.loc[sap_df['ord_stname_concat'] == '', 'connection_status'] = 'interconnection'

In [None]:
sap_df.head()

In [46]:
# cross-tabulate the street counts: connection status and number of
# discontinuities on the rows, the shortest-path label on the columns.
# margins = True adds the 'All' totals row and column; combinations that
# never occur are filled with 0.
sap_df_ct_count = pd.pivot_table(data = sap_df, values = 'n_streets',
                                 index = ['connection_status', 'n_discontinuities'], columns = 'shortest_added_path',
                                 aggfunc='sum', fill_value=0, margins = True).reset_index(drop=False)

In [None]:
sap_df_ct_count.columns

In [None]:
sap_df_ct_count.head()

In [49]:
# make sure both outcome columns exist, even when one outcome never occurred
for recode_value in recode_dict.values():
    if recode_value in sap_df_ct_count.columns:
        continue
    sap_df_ct_count[recode_value] = [0] * len(sap_df_ct_count)

In [None]:
sap_df_ct_count.head()

In [51]:
# the 'All' margin holds the total number of streets per row; rename it and
# put the columns in their reporting order
col_names = ['connection_status', 'n_discontinuities', 'shortest_is_false', 'shortest_is_true', 'n_streets']
sap_df_ct_count = (
    sap_df_ct_count
    .rename(columns={'All': 'n_streets'})
    .loc[:, col_names]
)

In [52]:
# add a '<column>_per' column holding each outcome's share of the row's streets
n_streets = sap_df_ct_count['n_streets']
for cn in ('shortest_is_false', 'shortest_is_true'):
    sap_df_ct_count[f"{cn}_per"] = sap_df_ct_count[cn].div(n_streets)

In [None]:
sap_df_ct_count.head()

In [None]:
# export this to excel
# exist_ok=True creates the folder when needed and avoids a separate
# check-then-create step
os.makedirs(rc.ANALYSIS_OUTPUT_FILE_PATH, exist_ok=True)
ofpn = os.path.join(rc.ANALYSIS_OUTPUT_FILE_PATH, rc.S03_SS_COUNT_OUT_FILE_NAME)
print(ofpn)

sap_df_ct_count.to_excel(excel_writer=ofpn, sheet_name='shortest_segment_count', index = False)

# add ranking by street type to help with visualization

In [55]:
st_type_df = ms_gdf.loc[ms_gdf['street_status'].isin((2,3)), 'ord_street_type'].value_counts().to_frame(name = 'n_segments').reset_index()

In [None]:
st_type_df.head()

In [57]:
# rank the street types by their number of added segments (fewest = rank 1) and
# zero-pad to two characters so the ranks sort correctly as text.
# NOTE(review): rank() defaults to method='average', so street types with tied
# counts get fractional ranks that astype(int) truncates -- confirm that tied
# types sharing a truncated rank is acceptable for the visualization.
st_type_df['segment_rank'] = st_type_df['n_segments'].rank(ascending = True).astype(int).astype(str).str.zfill(2)

In [None]:
st_type_df.head(n=20)

In [59]:
st_type_df['ord_street_type_rank'] = st_type_df['segment_rank'] + '_' + st_type_df['ord_street_type']

In [60]:
# lookup: street type -> '<rank>_<street type>' label
st_rank_dict = dict(zip(st_type_df['ord_street_type'], st_type_df['ord_street_type_rank']))

In [61]:
ms_gdf['ord_street_type_rank'] = ms_gdf['ord_street_type'].map(st_rank_dict)

In [62]:
ms_gdf.loc[ms_gdf['ord_street_type_rank'].isna(), 'ord_street_type_rank'] = ""

In [None]:
ms_gdf.columns

In [65]:
# reorder columns
col_names = [
    # identification and naming
    'snd_id', 'ord_street_name', 'ord_stname_concat', 'ord_street_name_type',
    'ord_street_type', 'city_portion', 'ord_street_type_rank',
    # grouping and status
    'snd_group', 'street_status', 'group_id',
    # end nodes, length and geometry
    'sn_id', 'en_id', 'dist', 'dist_miles', 'geometry',
]

In [66]:
ms_gdf = ms_gdf[col_names]

# save the geodataframe with the continuous, discontinuous, and added streets

In [67]:
write_gdf(gdf = ms_gdf, output_file_path= rc.OUTPUT_FILE_PATH,
          output_file_name = rc.S03_MISSING_OUT_FILE_NAME)