# Part 03: Find the discontinuities in Seattle's streets
michael babb  
2024 11 24

In [1]:
# standard
import os

In [2]:
# external
import geopandas as gpd
import networkx as nx
import numpy as np
import pandas as pd
from shapely.geometry import LineString, Point

In [3]:
# custom
import run_constants as rc
from utils import *

# load the working seattle street network data

In [4]:
fpn = os.path.join(rc.OUTPUT_FILE_PATH, rc.S03_SND_WORKING_IN_FILE_NAME)

In [5]:
gdf = gpd.read_file(filename = fpn)

In [6]:
gdf.shape

(25773, 38)

In [7]:
# NOTE(review): this hard-coded v2 file replaces the gdf loaded above from
# rc.S03_SND_WORKING_IN_FILE_NAME, so the earlier load is discarded.
# Confirm which input is intended and consider moving this name into run_constants.
fpn = os.path.join(rc.OUTPUT_FILE_PATH, 'Street_Network_Database_Seattle_working_v2.gpkg')

In [8]:
gdf = gpd.read_file(filename = fpn)

In [9]:
gdf.shape

(25773, 40)

# load the node data

In [10]:
# NOTE(review): this name is not read by any cell visible in this notebook; the
# node file path is built from rc.S03_NODE_IN_FILE_NAME instead - confirm and remove.
input_file_name = 'Street_Network_Nodes.gpkg'

In [11]:
fpn = os.path.join(rc.OUTPUT_FILE_PATH, rc.S03_NODE_IN_FILE_NAME)

In [12]:
node_gdf = gpd.read_file(filename = fpn)

In [13]:
node_gdf.head()

Unnamed: 0,node_id,geometry
0,17740,POINT (-122.32287 47.52982)
1,11391,POINT (-122.32402 47.61849)
2,982,POINT (-122.29193 47.7649)
3,11695,POINT (-122.30782 47.61411)
4,6257,POINT (-122.38214 47.67456)


In [14]:
node_gdf['node_id'] = node_gdf['node_id'].astype(int)

In [15]:
node_gdf['coords'] = node_gdf['geometry'].map(lambda x: x.coords[0])

In [16]:
node_gdf.head()

Unnamed: 0,node_id,geometry,coords
0,17740,POINT (-122.32287 47.52982),"(-122.32287444824671, 47.529819957875)"
1,11391,POINT (-122.32402 47.61849),"(-122.32401964037119, 47.61848906651073)"
2,982,POINT (-122.29193 47.7649),"(-122.29192836110747, 47.764904310419276)"
3,11695,POINT (-122.30782 47.61411),"(-122.30781531879083, 47.61410595110372)"
4,6257,POINT (-122.38214 47.67456),"(-122.38214200036303, 47.67455801597557)"


In [17]:
node_gdf.dtypes

node_id        int32
geometry    geometry
coords        object
dtype: object

In [18]:
# zap this into a dictionary: node_id -> first coordinate tuple of the node's point geometry.
# zipping the two columns builds the same mapping as looping over iterrows(),
# without constructing a Series for every row.
node_dict = dict(zip(node_gdf['node_id'], node_gdf['coords']))

# find missing segments

In [19]:
# create the ord_street_name and ord_street_type variable
gdf['ord_street_name_type'] = gdf['ord_street_name'] + ' ' + gdf['ord_street_type']

In [20]:
# create a weight variable from the gis_segment_length variable
gdf['weight'] = gdf['gis_seg_length']

In [21]:
# only select columns names used in subsequent steps
col_names = ['f_intr_id',
't_intr_id',
'snd_id',
'snd_feacode',
'gis_seg_length',
'ord_street_name',
'ord_street_type',
'ord_street_name_type',
'ord_stname_concat',
'city_portion',
'geometry',
'weight',
'group_id']

In [22]:
gdf = gdf[col_names].copy()

In [23]:
# what's the current max snd ID?
gdf['snd_id'].max()

50338

In [24]:
# what's 2^16? start adding IDs at this value?
2**16

65536

In [25]:
gdf.columns

Index(['f_intr_id', 't_intr_id', 'snd_id', 'snd_feacode', 'gis_seg_length',
       'ord_street_name', 'ord_street_type', 'ord_street_name_type',
       'ord_stname_concat', 'city_portion', 'geometry', 'weight', 'group_id'],
      dtype='object')

In [90]:
# here is where the discontinuities are identified and the "missing" segments are created.
# a full run, without saving intermediate data, takes less than a minute.
# saving intermediate data takes an additional ~5 minutes.

# save intermediate data for checking?
write_intermediate = False
# use_v1 = True: candidate segments are weighted by straight-line distance only.
# use_v1 = False: the distance is additionally scaled by the degree centrality of both end nodes.
use_v1 = False

# street_status
# 0: continuous street
# 1: discontinuous
# 2: added street
# 3: added street whose two end nodes do not share exactly one ord_stname_concat

# hold the output
output_gdf_list = []
shortest_added_path = []
gdf['snd_group'] = int(0)
snd_group_count = -1
# use this value to start the ID number for the added segments.
temp_snd_id = 2**16
# get the list of unique names
snt_list = gdf['ord_street_name_type'].unique().tolist()

# uncomment to test specific street
#snt_list = ['7TH AVE N', 'W GALER ST']
#snt_list = ['GALER ST']
#snt_list = ['15TH AVE W']
#snt_list = ['WOODLAWN AVE N']

# the trouble_list, for lack of a better name, are streets with parallel edges
# we can record the names of these streets (each name is recorded once).
trouble_list = []

for snt in snt_list[:None]:
    print(snt)

    # subset the gdf by ord_street_name_type
    sub_gdf = gdf.loc[gdf['ord_street_name_type'] == snt, :].copy().reset_index(drop = True)
    unique_id_list = sub_gdf['group_id'].unique().tolist()

    for gi in unique_id_list:

        temp_gdf = sub_gdf.loc[sub_gdf['group_id'] == gi, :].copy()

        # create the graph for a single street name / group_id combination.
        # this is an undirected graph.
        fg = nx.from_pandas_edgelist(df = temp_gdf, source = 'f_intr_id', target = 't_intr_id', edge_attr=True)

        if temp_gdf.shape[0] != fg.number_of_edges():
            if snt not in trouble_list:
                trouble_list.append(snt)

            # subsequent steps need a single-edge graph.
            # Parallel edges are not allowed.
            # so, let's keep only the segments whose snd_id survived in the graph.
            mod_edge_list = [edge_data['snd_id'] for _, _, edge_data in fg.edges.data()]
            # .copy() so the snd_group assignment below writes to an independent frame
            temp_gdf = temp_gdf.loc[temp_gdf['snd_id'].isin(mod_edge_list), :].copy()

        # label every connected component with its own snd_group.
        # nx.connected_components() identifies disconnected graphs
        for cc in nx.connected_components(G = fg):
            # cc is the set of nodes in one component; only its edge data is read here
            sub_graph = fg.subgraph(cc)
            # this marks the street(s) in each sub graph
            snd_id_list = [edge_data['snd_id'] for _, _, edge_data in sub_graph.edges.data()]
            snd_group_count += 1
            # this will update the gdf per street name with the groups of streets
            temp_gdf.loc[temp_gdf['snd_id'].isin(snd_id_list), 'snd_group'] = snd_group_count

        # the unique snd_groups, i.e. the connected pieces of this street.
        # more than one piece means the street is discontinuous;
        # the number of segments needed to connect them is one less than this count.
        snd_group_id_list = temp_gdf['snd_group'].unique()

        n_discontinuities = len(snd_group_id_list)

        if n_discontinuities > 1:
            # process for the sub graphs
            # map each node (intersection) to the snd_group of a segment it is on
            node_snd_group_dict = {}
            for seg_fn, seg_tn, snd_group_id in zip(temp_gdf['f_intr_id'],
                                                    temp_gdf['t_intr_id'],
                                                    temp_gdf['snd_group']):
                node_snd_group_dict[seg_fn] = snd_group_id
                node_snd_group_dict[seg_tn] = snd_group_id

            # create a list of available edges - these are the missing segments
            # these are formed from the non-edges of the graph.
            avail_edges = []

            # compute the degree centrality
            node_centrality_dict = nx.degree_centrality(G = fg)

            # dictionary to hold the distance of the available edges
            # let's only make the distance calculation once, yeah?
            # it's a simple calculation, but even more simple to store it.
            node_dist_dict = {}
            # enumerate the non-edges
            for ne in nx.non_edges(graph = fg):
                ne_0, ne_1 = ne
                # ne is a tuple of from / to nodes.
                # create available edges only if the nodes are in different snd_groups
                # this will decrease the potential solution space
                if node_snd_group_dict[ne_0] != node_snd_group_dict[ne_1]:
                    # calculate the straight-line distance between two nodes.
                    # convert to feet to match the existing distance / weight variable
                    weight = points2distance(node_dict[ne_0], node_dict[ne_1], unit = 'miles') * 5280

                    # build the output tuple
                    if use_v1:
                        # this does not use node centrality
                        output = (ne_0, ne_1, {'weight':weight})
                    else:
                        # scale the distance by the product of the two nodes' degree
                        # centralities: the lower the degree of the end nodes, the
                        # lower the weight of the candidate segment.
                        ne_0_nc = node_centrality_dict[ne_0]
                        ne_1_nc = node_centrality_dict[ne_1]
                        edge_weight = weight * (ne_0_nc * ne_1_nc)
                        output = (ne_0, ne_1, {'weight':edge_weight})
                    # add the unscaled distance to the distance dict, in both directions
                    node_dist_dict[(ne_0, ne_1)] = weight
                    node_dist_dict[(ne_1, ne_0)] = weight
                    avail_edges.append(output)

            # we just calculated all of the potential segments that span discontinuities
            # nx.k_edge_augmentation() creates the potential edges in a graph by adding as
            # few edges as possible. In a street network, with multiple disconnected
            # components, there is really one way to minimally connect the disparate components
            # to create full connectivity. And it's the shortest geographic segment in this case!
            # weighted graph traversal works by accumulating as little weight as possible.

            # this will store the output
            data_list = []
            line_list = []

            # ord_street_name and ord_street_type do not depend on the added edge,
            # so look them up once. osnc gets a default so that it is always bound
            # when the summary row is built after the loop.
            osn = temp_gdf['ord_street_name'].unique().tolist()[0]
            ost = temp_gdf['ord_street_type'].unique().tolist()[0]
            osnc = ''

            # once nx.k_edge_augmentation() finishes, it returns a generator with the added edges that
            # ensure complete connectivity between all nodes.
            augmented_edges = nx.k_edge_augmentation(G = fg, k = 1, avail = avail_edges, weight = 'weight')
            # enumerate the augmented edges
            for i_ae, ae in enumerate(augmented_edges):
                # unpack
                fn, tn = ae
                # get the (unscaled) distance of the added edge
                weight = node_dist_dict[(fn, tn)]

                # attributes of the existing edges touching the from node and the to node
                fn_df = pd.DataFrame(data = [fg.edges[curr_edge] for curr_edge in fg.edges(fn)])
                fn_osnc = fn_df['ord_stname_concat']

                tn_df = pd.DataFrame(data = [fg.edges[curr_edge] for curr_edge in fg.edges(tn)])
                tn_osnc = tn_df['ord_stname_concat']

                # ord_stname_concat: determine this by computing the set intersection.
                # anything other than exactly one shared name is recorded as ''
                osnc_set = set(fn_osnc).intersection(tn_osnc)
                if len(osnc_set) == 1:
                    osnc = osnc_set.pop()
                else:
                    osnc = ''

                # now, get the city portion: every city portion seen at either end node
                tn_city_portion = set(tn_df['city_portion'])
                fn_city_portion = set(fn_df['city_portion'])
                cp = '_'.join(sorted(tn_city_portion.union(fn_city_portion)))

                temp_snd_id += 1
                # snd_id, street name, street type, name + type, concat name,
                # index of the added edge, from node, to node, distance, city portion
                temp_data_list = [temp_snd_id, osn, ost, snt, osnc, i_ae, fn, tn, weight, cp]
                # now, let's create some geometry
                temp_line = LineString([node_dict[fn], node_dict[tn]])
                line_list.append(temp_line)
                data_list.append(temp_data_list)

            # compare the list of available edges to the list of chosen edges
            # this isn't necessary to complete the edge augmentation, but it's
            # nice to know how often added edges are always the shortest edges
            av_df = pd.DataFrame(data = [(sn, en, node_dist_dict[(sn, en)]) for sn, en, _ in avail_edges],
                                 columns = ['sn_id', 'en_id', 'dist'])
            # rank distance - the lower the rank, the shorter the segment.
            av_df['dist_rank'] = av_df['dist'].rank(method = 'dense')

            # build a gpd.GeoDataFrame - these are the "missing" segments
            col_names = ['snd_id', 'ord_street_name', 'ord_street_type',
                         'ord_street_name_type', 'ord_stname_concat',
                         'snd_group', 'sn_id', 'en_id', 'dist', 'city_portion']
            ms_gdf = gpd.GeoDataFrame(data = data_list,
                                            columns = col_names,
                                            geometry = line_list, crs = 'epsg:4326')
            # these edges are not on the same street group
            ms_gdf['street_status'] = int(2)
            # update where the end nodes do not share a single ord_stname_concat
            ms_gdf.loc[ms_gdf['ord_stname_concat'] == '', 'street_status'] = 3

            # flag streets for which no connecting segment was produced so they can be inspected
            if ms_gdf.empty and snt not in trouble_list:
                trouble_list.append(snt)

            # select where the rank is LTE than the number of records in the missing segment gdf
            av_df = av_df.loc[av_df['dist_rank'] <= ms_gdf.shape[0], :]

            # if the sum of the distance of the added segments is the same, then only the
            # shortest segments were added. If not, then other, longer, segments were added
            # but that makes for less total distance traversed in the graph.
            # So, how often does that happen?
            # the two sums add the same floats in a different order, so compare with
            # a tolerance instead of exact equality.
            shortest_tot_path = int(np.isclose(av_df['dist'].sum(), ms_gdf['dist'].sum(),
                                               rtol = 1e-9, atol = 1e-9))
            shortest_added_path.append([osn, ost, snt, osnc,  shortest_tot_path, n_discontinuities - 1])

            # add the known, disconnected segments. These are the existing streets.
            col_names = ['snd_id','ord_street_name', 'ord_street_type', 'ord_street_name_type',
                         'ord_stname_concat', 'snd_group',  'f_intr_id', 't_intr_id',
                         'gis_seg_length', 'city_portion', 'geometry']
            ks_gdf = temp_gdf[col_names].copy()
            ks_gdf['street_status'] = int(1)
            ks_gdf = ks_gdf.rename(columns = {'f_intr_id':'sn_id', 't_intr_id':'en_id', 'gis_seg_length':'dist'})

            # stack the geodataframes
            output_gdf = pd.concat([ms_gdf, ks_gdf])

            # write intermediate
            if write_intermediate:
                # intermediate streets - existing streets
                is_output_file_path = os.path.join(rc.OUTPUT_FILE_PATH, 'individual_streets')
                os.makedirs(is_output_file_path, exist_ok = True)
                output_file_name = '_'.join(snt.split()) + '.gpkg'
                write_gdf(gdf = temp_gdf, output_file_path = is_output_file_path, output_file_name = output_file_name)

                # intermediate nodes
                curr_node_df = pd.DataFrame(data = {'node_id':fg.nodes()})
                curr_node_df['node_centrality'] = curr_node_df['node_id'].map(node_centrality_dict)
                node_subset_gdf = subset_node_gdf(node_gdf = node_gdf, other_node_df = curr_node_df)
                output_file_name = '_'.join(snt.split()) + '_full_nodes.gpkg'
                write_gdf(gdf = node_subset_gdf, output_file_path = is_output_file_path, output_file_name = output_file_name)

                # the added and existing segments for a street
                output_file_name = '_'.join(snt.split()) + '_missing_segments.gpkg'
                write_gdf(gdf = output_gdf, output_file_path = is_output_file_path, output_file_name = output_file_name)

        else:
            # gather the streets with no missing segments.
            col_names = ['snd_id', 'ord_street_name', 'ord_street_type',
                         'ord_street_name_type', 'ord_stname_concat',
                         'snd_group', 'f_intr_id', 't_intr_id', 'gis_seg_length',
                         'city_portion', 'geometry']
            output_gdf = temp_gdf[col_names].copy()
            output_gdf['street_status'] = int(0)
            output_gdf = output_gdf.rename(columns = {'f_intr_id':'sn_id', 't_intr_id':'en_id', 'gis_seg_length':'dist'})

        # add to the output list
        output_gdf_list.append(output_gdf)

8TH AVE
BELMONT AVE
63RD ST
MCCLINTOCK AVE
NORTH PARK AVE
37TH AVE
25TH AVE
WAITE ST
107TH ST
42ND ST
LINDEN AVE
40TH ST
FAUNTLEROY PL
GALER ST
WALKER ST
20TH AVE
BYRON ST
FOREST ST
VICTORY LN
18TH AVE
26TH AVE
29TH AVE
27TH AVE
17TH AVE
KIRKWOOD PL
7TH AVE
ROWAN RD
28TH AVE
130TH ST
38TH AVE
MARION ST
HOLLY ST
95TH ST
OLIVE ST
LOUISA ST
REPUBLICAN ST
15TH AVE
5TH AVE
45TH AVE
RAYE ST
DAKOTA ST
91ST ST
70TH ST
48TH AVE
102ND ST
FULTON ST
68TH ST
RUFFNER ST
35TH AVE
46TH AVE
CHERRY ST
21ST AVE
ALKI AVE
DAYTON AVE
90TH ST
ANDOVER ST
FREMONT AVE
THOMAS ST
WESTERN AVE
STONE AVE
94TH ST
123RD ST
24TH AVE
130TH PL
62ND ST
3RD AVE
WESTMONT WAY
FAIRVIEW AVE
55TH ST
105TH ST
47TH AVE
120TH ST
12TH AVE
42ND AVE
COLUMBIA ST
WOODLAWN AVE
CAMPUS PKWY
ASHWORTH AVE
MASSACHUSETTS ST
CRESCENT DR
ELMWOOD PL
MCGRAW ST
CHARLESTOWN ST
PERKINS LN
SUMMIT AVE
ANN ARBOR AVE
51ST AVE
22ND AVE
33RD AVE
MYERS WAY
LANGSTON RD
LAKE RIDGE DR
KENYON ST
64TH ST
67TH ST
EDMUNDS ST
98TH ST
6TH AVE
10TH AVE
JANSEN CT
CAL

# combine data for output

In [91]:
ms_gdf = pd.concat(objs = output_gdf_list)

In [92]:
ms_gdf['dist_miles'] = ms_gdf['dist'] / 5280

In [93]:
# join in other street identification data

In [94]:
ms_gdf.shape

(29785, 13)

In [95]:
ms_gdf['street_status'].value_counts()

street_status
1    21940
0     3818
2     3615
3      412
Name: count, dtype: int64

In [96]:
ms_gdf.head()

Unnamed: 0,snd_id,ord_street_name,ord_street_type,ord_street_name_type,ord_stname_concat,snd_group,sn_id,en_id,dist,city_portion,geometry,street_status,dist_miles
0,65537,8TH,AVE,8TH AVE,8TH AVE S,0,18097,18114,204.196924,S,"LINESTRING (-122.32234 47.52483, -122.32313 47...",2,0.038674
1,65538,8TH,AVE,8TH AVE,8TH AVE S,1,17487,17351,643.858375,S,"LINESTRING (-122.32281 47.53338, -122.32258 47...",2,0.121943
2,65539,8TH,AVE,8TH AVE,8TH AVE,2,12225,12261,313.330763,CNTR,"LINESTRING (-122.32587 47.6051, -122.32533 47....",2,0.059343
3,65540,8TH,AVE,8TH AVE,,3,11344,11253,431.899865,CNTR_N,"LINESTRING (-122.34139 47.61855, -122.3411 47....",3,0.081799
4,65541,8TH,AVE,8TH AVE,8TH AVE NE,4,5669,5498,622.915283,NE,"LINESTRING (-122.31972 47.68148, -122.32021 47...",2,0.117976


In [97]:
gdf.columns

Index(['f_intr_id', 't_intr_id', 'snd_id', 'snd_feacode', 'gis_seg_length',
       'ord_street_name', 'ord_street_type', 'ord_street_name_type',
       'ord_stname_concat', 'city_portion', 'geometry', 'weight', 'group_id',
       'snd_group'],
      dtype='object')

In [98]:
ofpn = os.path.join(rc.OUTPUT_FILE_PATH, 'test_connection.gpkg')
ms_gdf.to_file(filename = ofpn, driver = 'GPKG', index = False)

In [99]:
ms_gdf.shape

(29785, 13)

In [100]:
ms_gdf.head()

Unnamed: 0,snd_id,ord_street_name,ord_street_type,ord_street_name_type,ord_stname_concat,snd_group,sn_id,en_id,dist,city_portion,geometry,street_status,dist_miles
0,65537,8TH,AVE,8TH AVE,8TH AVE S,0,18097,18114,204.196924,S,"LINESTRING (-122.32234 47.52483, -122.32313 47...",2,0.038674
1,65538,8TH,AVE,8TH AVE,8TH AVE S,1,17487,17351,643.858375,S,"LINESTRING (-122.32281 47.53338, -122.32258 47...",2,0.121943
2,65539,8TH,AVE,8TH AVE,8TH AVE,2,12225,12261,313.330763,CNTR,"LINESTRING (-122.32587 47.6051, -122.32533 47....",2,0.059343
3,65540,8TH,AVE,8TH AVE,,3,11344,11253,431.899865,CNTR_N,"LINESTRING (-122.34139 47.61855, -122.3411 47....",3,0.081799
4,65541,8TH,AVE,8TH AVE,8TH AVE NE,4,5669,5498,622.915283,NE,"LINESTRING (-122.31972 47.68148, -122.32021 47...",2,0.117976


In [101]:
ms_gdf['street_status'].value_counts()

street_status
1    21940
0     3818
2     3615
3      412
Name: count, dtype: int64

In [102]:
# check for missing values
for cn in ms_gdf.columns:
    print(cn, ms_gdf[cn].isna().unique())    

snd_id [False]
ord_street_name [False]
ord_street_type [False]
ord_street_name_type [False]
ord_stname_concat [False]
snd_group [False]
sn_id [False]
en_id [False]
dist [False]
city_portion [False]
geometry [False]
street_status [False]
dist_miles [False]


In [103]:
# what streets have parallel edges?
# these can be verified / examined in qGIS
trouble_list

['25TH AVE',
 '40TH ST',
 'FOREST ST',
 '7TH AVE',
 'SUMMIT AVE',
 '1ST AVE',
 '43RD ST',
 'MERRILL LN',
 'LAWTON CIR',
 'ROYAL BROUGHAM WAY',
 'MAGNOLIA BR',
 'RADFORD DR',
 'SEWARD PARK RD']

# compute if the added segments were indeed the shortest.

In [109]:
# the number of times the added segments were also the shortest available segments
col_names = ['ord_street_name', 'ord_street_type', 'ord_street_name_type',
             'ord_stname_concat', 'shortest_added_path', 'n_discontinuities']                                      
sap_df = pd.DataFrame(data = shortest_added_path, columns = col_names)

In [110]:
sap_df['n_streets'] = int(1)

In [111]:
sap_df.head()

Unnamed: 0,ord_street_name,ord_street_type,ord_street_name_type,ord_stname_concat,shortest_added_path,n_discontinuities,n_streets
0,8TH,AVE,8TH AVE,8TH AVE S,0,23,1
1,8TH,AVE,8TH AVE,,0,13,1
2,BELMONT,AVE,BELMONT AVE,,1,1,1
3,63RD,ST,63RD ST,NE 63RD ST,0,6,1
4,NORTH PARK,AVE,NORTH PARK AVE,NORTH PARK AVE N,0,4,1


In [112]:
recode_dict = {0:'shortest_is_false', 1:'shortest_is_true', }

In [113]:
sap_df['shortest_added_path'] = sap_df['shortest_added_path'].map(recode_dict)

In [114]:
sap_df = sap_df.sort_values(by = ['ord_street_name_type', 'ord_stname_concat'])

In [115]:
sap_df['connection_status'] = "intraconnection"
sap_df.loc[sap_df['ord_stname_concat'] == '', 'connection_status'] = 'interconnection'

In [117]:
sap_df.head()

Unnamed: 0,ord_street_name,ord_street_type,ord_street_name_type,ord_stname_concat,shortest_added_path,n_discontinuities,n_streets,connection_status
400,100TH,ST,100TH ST,NE 100TH ST,shortest_is_false,8,1,intraconnection
399,100TH,ST,100TH ST,SW 100TH ST,shortest_is_false,5,1,intraconnection
401,101ST,ST,101ST ST,,shortest_is_false,2,1,interconnection
402,101ST,ST,101ST ST,NW 101ST ST,shortest_is_true,1,1,intraconnection
57,102ND,ST,102ND ST,NE 102ND ST,shortest_is_false,4,1,intraconnection


In [118]:
sap_df_ct_count = pd.pivot_table(data = sap_df, values = 'n_streets',
                                 index = ['connection_status', 'n_discontinuities'], columns = 'shortest_added_path',
                                 aggfunc='sum', fill_value=0, margins = True).reset_index(drop=False)

In [119]:
sap_df_ct_count.columns

Index(['connection_status', 'n_discontinuities', 'shortest_is_false',
       'shortest_is_true', 'All'],
      dtype='object', name='shortest_added_path')

In [120]:
sap_df_ct_count.head()

shortest_added_path,connection_status,n_discontinuities,shortest_is_false,shortest_is_true,All
0,interconnection,1,0,12,12
1,interconnection,2,9,2,11
2,interconnection,3,3,0,3
3,interconnection,4,8,0,8
4,interconnection,5,5,0,5


In [121]:
# make sure both recoded outcome columns exist, even when one outcome never occurred
for outcome_name in recode_dict.values():
    if outcome_name in sap_df_ct_count.columns:
        continue
    sap_df_ct_count[outcome_name] = 0

In [122]:
sap_df_ct_count.head()

shortest_added_path,connection_status,n_discontinuities,shortest_is_false,shortest_is_true,All
0,interconnection,1,0,12,12
1,interconnection,2,9,2,11
2,interconnection,3,3,0,3
3,interconnection,4,8,0,8
4,interconnection,5,5,0,5


In [123]:
sap_df_ct_count = sap_df_ct_count.rename(columns = {'All':'n_streets'})
col_names = ['connection_status', 'n_discontinuities', 'shortest_is_false', 'shortest_is_true', 'n_streets']
sap_df_ct_count = sap_df_ct_count[col_names]

In [124]:
# share of each row's n_streets that falls in each outcome column
for outcome_col in ('shortest_is_false', 'shortest_is_true'):
    sap_df_ct_count[f'{outcome_col}_per'] = sap_df_ct_count[outcome_col].div(sap_df_ct_count['n_streets'])

In [125]:
sap_df_ct_count.head()

shortest_added_path,connection_status,n_discontinuities,shortest_is_false,shortest_is_true,n_streets,shortest_is_false_per,shortest_is_true_per
0,interconnection,1,0,12,12,0.0,1.0
1,interconnection,2,9,2,11,0.818182,0.181818
2,interconnection,3,3,0,3,1.0,0.0
3,interconnection,4,8,0,8,1.0,0.0
4,interconnection,5,5,0,5,1.0,0.0


In [127]:
# export this to excel
# exist_ok=True creates the output folder only when it is missing, without a
# separate existence check beforehand.
os.makedirs(rc.ANALYSIS_OUTPUT_FILE_PATH, exist_ok = True)
ofpn = os.path.join(rc.ANALYSIS_OUTPUT_FILE_PATH, rc.S03_SS_COUNT_OUT_FILE_NAME)
print(ofpn)

sap_df_ct_count.to_excel(excel_writer=ofpn, sheet_name='shortest_segment_count', index = False)

../../../project/seattle_streets/data\analysis\shortest_segment_count.xlsx


# add ranking by street type to help with visualization

In [129]:
st_type_df = ms_gdf.loc[ms_gdf['street_status'].isin((2,3)), 'ord_street_type'].value_counts().to_frame(name = 'n_segments').reset_index()

In [130]:
st_type_df.head()

Unnamed: 0,ord_street_type,n_segments
0,ST,1960
1,AVE,1921
2,PL,82
3,WAY,20
4,DR,16


In [131]:
# rank street types by their number of added segments (1 = fewest) and zero-pad
# the rank to at least two characters.
# method='min' keeps ranks whole numbers when counts tie; the default 'average'
# yields fractional ranks that astype(int) would silently truncate.
st_type_df['segment_rank'] = st_type_df['n_segments'].rank(method = 'min', ascending = True).astype(int).astype(str).str.zfill(2)

In [132]:
st_type_df.head(n=20)

Unnamed: 0,ord_street_type,n_segments,segment_rank
0,ST,1960,10
1,AVE,1921,9
2,PL,82,8
3,WAY,20,7
4,DR,16,6
5,BLVD,10,5
6,LN,9,4
7,RD,6,3
8,CT,2,2
9,PKWY,1,1


In [133]:
st_type_df['ord_street_type_rank'] = st_type_df['segment_rank'] + '_' + st_type_df['ord_street_type']

In [134]:
# lookup: ord_street_type -> ord_street_type_rank
st_rank_dict = dict(zip(st_type_df['ord_street_type'], st_type_df['ord_street_type_rank']))

In [135]:
ms_gdf['ord_street_type_rank'] = ms_gdf['ord_street_type'].map(st_rank_dict)

In [136]:
# street types without a rank get an empty string instead of a missing value
ms_gdf['ord_street_type_rank'] = ms_gdf['ord_street_type_rank'].fillna("")

In [137]:
ms_gdf.columns

Index(['snd_id', 'ord_street_name', 'ord_street_type', 'ord_street_name_type',
       'ord_stname_concat', 'snd_group', 'sn_id', 'en_id', 'dist',
       'city_portion', 'geometry', 'street_status', 'dist_miles',
       'ord_street_type_rank'],
      dtype='object')

In [138]:
# reorder columns
col_names = ['snd_id', 'ord_street_name','ord_stname_concat','ord_street_name_type',
             'ord_street_type', 'city_portion', 'ord_street_type_rank', 'snd_group',
               'street_status','sn_id','en_id','dist','dist_miles','geometry']

In [139]:
ms_gdf = ms_gdf[col_names]

# save the geodataframe with the continuous, discontinuous, and added streets

In [140]:
write_gdf(gdf = ms_gdf, output_file_path= rc.OUTPUT_FILE_PATH,
          output_file_name = rc.S03_MISSING_OUT_FILE_NAME)