# Part 04: Find the discontinuities in Seattle's streets
michael babb  
2025 02 18

In [1]:
# standard
import os

In [2]:
# external
import geopandas as gpd
import networkx as nx
import numpy as np
import pandas as pd
from shapely.geometry import LineString, Point

In [3]:
# custom
import run_constants as rc
from utils import *
from geo_data_io.df_operations import get_a_set, hey_what_is_na, split_col_values
from geo_data_io.fc_df_spatial import write_gdf
from geo_data_io.geo_operations import points2distance


# load the working seattle street network data

In [4]:
fpn = os.path.join(rc.OUTPUT_FILE_PATH, rc.S04_SND_WORKING_IN_FILE_NAME)

In [5]:
gdf = gpd.read_file(filename = fpn)

In [6]:
gdf.shape

(25771, 44)

# load the node data

In [7]:
fpn = os.path.join(rc.OUTPUT_FILE_PATH, rc.S04_NODE_IN_FILE_NAME)

In [8]:
node_gdf = gpd.read_file(filename = fpn)

In [9]:
node_gdf.head()

Unnamed: 0,node_id,geometry
0,17740,POINT (-122.32287 47.52982)
1,11391,POINT (-122.32402 47.61849)
2,982,POINT (-122.29193 47.7649)
3,11695,POINT (-122.30782 47.61411)
4,6257,POINT (-122.38214 47.67456)


In [10]:
node_gdf['node_id'] = node_gdf['node_id'].astype(int)

In [11]:
node_gdf['coords'] = node_gdf['geometry'].map(lambda x: x.coords[0])

In [12]:
node_gdf.head()

Unnamed: 0,node_id,geometry,coords
0,17740,POINT (-122.32287 47.52982),"(-122.32287444824671, 47.529819957875)"
1,11391,POINT (-122.32402 47.61849),"(-122.32401964037119, 47.61848906651073)"
2,982,POINT (-122.29193 47.7649),"(-122.29192836110747, 47.764904310419276)"
3,11695,POINT (-122.30782 47.61411),"(-122.30781531879083, 47.61410595110372)"
4,6257,POINT (-122.38214 47.67456),"(-122.38214200036303, 47.67455801597557)"


In [13]:
node_gdf.dtypes

node_id        int64
geometry    geometry
coords        object
dtype: object

In [14]:
# zap the node id and the coords into a dictionary: {node_id: (x, y)}
# build it straight from the two columns rather than iterating row by row;
# iterrows() constructs a Series per row only to read two values from it.
node_dict = dict(zip(node_gdf['node_id'], node_gdf['coords']))

# find missing segments

In [15]:
gdf.columns

Index(['f_intr_id', 't_intr_id', 'snd_id', 'snd_feacode', 'citycode',
       'stname_id', 'st_code', 'arterial_code', 'segment_type', 'agency_code',
       'access_code', 'divided_code', 'structure_type', 'legalloc_code',
       'vehicle_use_code', 'gis_seg_length', 'l_adrs_from', 'l_adrs_to',
       'r_adrs_from', 'r_adrs_to', 'ord_pre_dir', 'ord_street_name',
       'ord_street_type', 'ord_suf_dir', 'ord_stname_concat', 'l_city',
       'l_state', 'l_zip', 'r_city', 'r_state', 'r_zip', 'sndseg_update',
       'compkey', 'comptype', 'unitid', 'unitid2', 'city_sector',
       'ord_stname_unique', 'group_id', 'sort_order', 'ord_stname_type',
       'city_sector_group', 'ord_stname_type_group', 'geometry'],
      dtype='object')

In [16]:
# create a weight variable from the gis_segment_length variable
gdf['weight'] = gdf['gis_seg_length']

In [17]:
# only select column names used in subsequent steps
col_names = ['f_intr_id',
't_intr_id',
'snd_id',
'snd_feacode',
'gis_seg_length',
'ord_street_name',
'ord_street_type',
'ord_stname_type',
'ord_stname_concat',
'ord_stname_unique', 
'ord_stname_type_group',
'city_sector',
'city_sector_group',
'weight',
'group_id',
'sort_order',
'geometry']

In [18]:
gdf = gdf[col_names].copy()

In [19]:
# what's the current max snd ID?
gdf['snd_id'].max()

50338

In [20]:
# what's 2^16? start adding IDs at this value?
2**16

65536

In [21]:
gdf.columns

Index(['f_intr_id', 't_intr_id', 'snd_id', 'snd_feacode', 'gis_seg_length',
       'ord_street_name', 'ord_street_type', 'ord_stname_type',
       'ord_stname_concat', 'ord_stname_unique', 'ord_stname_type_group',
       'city_sector', 'city_sector_group', 'weight', 'group_id', 'sort_order',
       'geometry'],
      dtype='object')

In [22]:
testo = gdf.loc[gdf['ord_street_name'] == 'NEBRASKA', :]

In [23]:
testo.head()

Unnamed: 0,f_intr_id,t_intr_id,snd_id,snd_feacode,gis_seg_length,ord_street_name,ord_street_type,ord_stname_type,ord_stname_concat,ord_stname_unique,ord_stname_type_group,city_sector,city_sector_group,weight,group_id,sort_order,geometry
21590,7667,7707,15592,1,150.029545,NEBRASKA,ST,NEBRASKA ST,NEBRASKA ST,NEBRASKA ST W,NEBRASKA ST W_S,W,W_S,150.029545,1,NEBRASKA,"LINESTRING (-122.41747 47.66069, -122.41745 47..."
21591,7707,7745,15602,1,292.030442,NEBRASKA,ST,NEBRASKA ST,NEBRASKA ST,NEBRASKA ST W,NEBRASKA ST W_S,W,W_S,292.030442,1,NEBRASKA,"LINESTRING (-122.41721 47.66034, -122.4161 47...."
21592,7777,7745,15617,1,236.185868,NEBRASKA,ST,NEBRASKA ST,NEBRASKA ST,NEBRASKA ST W,NEBRASKA ST W_S,W,W_S,236.185868,1,NEBRASKA,"LINESTRING (-122.4152 47.65985, -122.4161 47.6..."
21593,16353,16302,32147,1,973.211903,NEBRASKA,ST,NEBRASKA ST,S NEBRASKA ST,S NEBRASKA ST,NEBRASKA ST W_S,S,W_S,973.211903,1,NEBRASKA,"LINESTRING (-122.32104 47.5489, -122.31899 47...."
21594,7707,7706,45368,0,16.710732,NEBRASKA,ST,NEBRASKA ST,NEBRASKA ST,NEBRASKA ST W,NEBRASKA ST W_S,W,W_S,16.710732,1,NEBRASKA,"LINESTRING (-122.41721 47.66034, -122.41715 47..."


In [24]:
'' in gdf['ord_stname_type_group'].unique().tolist()

False

In [25]:
testo = gdf.loc[gdf['ord_stname_type_group'].str.find('HOLMAN') > -1, :]

In [26]:
testo['ord_stname_type_group'].unique()

array(['HOLMAN RD N_NW'], dtype=object)

In [27]:
49727 in testo['snd_id'].tolist()

True

In [28]:
# here is where the discontinuities are identified and the "missing" segments are created.
# a full run, without saving intermediate data, takes less than a minute.
# saving intermediate data takes an additional ~5 minutes.
#
# outline, per street-name group (ord_stname_type_group) and per group_id:
#   1. build an undirected graph from the street's segments
#   2. label every connected component with a running snd_group id
#   3. if there is more than one component, let nx.k_edge_augmentation() pick
#      the straight-line segments that join the components together
#   4. stack the added segments with the existing segments for output

write_intermediate = False
use_v1 = False

# street_status
# 0: continuous street
# 1: discontinuous
# 2: added street: intra-connection
# 3: added street: inter-connection

# hold the output
output_gdf_list = []
shortest_added_path = []
gdf['snd_group'] = int(0)
snd_group_count = -1
# use this value to start the ID number for the added segments.
temp_snd_id = 2**16
# get the list of unique names
osntg_list = gdf['ord_stname_type_group'].unique().tolist()

# uncomment to test specific streets
#osntg_list = ['7TH AVE N', 'W GALER ST']
#osntg_list = ['GALER ST']
#osntg_list = ['15TH AVE W']
#osntg_list = ['WOODLAWN AVE N']
#osntg_list = ['92ND ST NW_N_NE']

#osntg_list = ['HOLMAN RD N_NW']

# the trouble_list, for lack of a better name, are streets with parallel edges
# we can record the names of these streets.
trouble_list = []

# column layout of the added ("missing") segments; geometry is supplied separately
added_col_names = ['snd_id', 'ord_street_name', 'ord_street_type',
                   'ord_stname_type', 'ord_stname_concat',
                   'ord_stname_unique', 'ord_stname_type_group',
                   'snd_group', 'sn_id', 'en_id', 'dist', 'city_sector',
                   'city_sector_group', 'group_id']

# columns kept from the existing segments, and how they are renamed so that
# they line up with the added segments
existing_col_names = ['snd_id', 'ord_street_name', 'ord_street_type',
                      'ord_stname_type', 'ord_stname_concat', 'ord_stname_unique',
                      'ord_stname_type_group', 'snd_group', 'f_intr_id', 't_intr_id',
                      'gis_seg_length', 'city_sector', 'city_sector_group', 'group_id', 'geometry']
existing_rename_dict = {'f_intr_id':'sn_id', 't_intr_id':'en_id', 'gis_seg_length':'dist'}

# the [:None] slice is a no-op; replace None with an integer to test on the first few streets
for osntg in osntg_list[:None]:
    print(osntg)

    # subset the gdf by ord_stname_type_group
    sub_gdf = gdf.loc[gdf['ord_stname_type_group'] == osntg, :].copy().reset_index(drop = True)
    unique_id_list = sub_gdf['group_id'].unique().tolist()

    for gi in unique_id_list:

        temp_gdf = sub_gdf.loc[sub_gdf['group_id'] == gi, :].copy()

        # create the graph for a single street.
        # For example, all streets named W GALER ST are a single graph.
        # this is an undirected graph.
        fg = nx.from_pandas_edgelist(df = temp_gdf, source = 'f_intr_id', target = 't_intr_id', edge_attr=True)

        if temp_gdf.shape[0] != fg.number_of_edges():
            trouble_list.append(osntg)

            # subsequent steps need a single-edge graph.
            # Parallel edges are not allowed.
            # so, let's keep only the segments that survived as edges in the graph.
            mod_edge_list = [ed[2]['snd_id'] for ed in fg.edges.data()]
            # .copy() so that the snd_group assignment below writes to an
            # independent frame and not to a slice of the previous temp_gdf
            temp_gdf = temp_gdf.loc[temp_gdf['snd_id'].isin(mod_edge_list), :].copy()

        # label the segments of every connected component with its own snd_group.
        # nx.connected_components() identifies the disconnected pieces of the graph
        for cc in nx.connected_components(G = fg):
            # the snd_ids of the street segments in this component
            snd_id_list = [edl[2]['snd_id'] for edl in fg.subgraph(cc).edges.data()]
            snd_group_count += 1
            # this will update the gdf per street name with the groups of streets
            temp_gdf.loc[temp_gdf['snd_id'].isin(snd_id_list), 'snd_group'] = snd_group_count

        # this is the number of unique snd_groups, i.e. the number of
        # connected pieces of the street. A street in k pieces needs k - 1
        # added segments, which is what gets recorded in shortest_added_path.
        snd_group_id_list = temp_gdf['snd_group'].unique()

        n_discontinuities = len(snd_group_id_list)
        print(n_discontinuities)

        if n_discontinuities > 1:

            # map every node (intersection) to the snd_group of a segment it is on
            node_snd_group_dict = {}
            for fn, tn, snd_group_id in zip(temp_gdf['f_intr_id'], temp_gdf['t_intr_id'], temp_gdf['snd_group']):
                node_snd_group_dict[fn] = snd_group_id
                node_snd_group_dict[tn] = snd_group_id

            # create a list of available edges - these are the candidate missing segments
            # these are formed from the non-edges of the graph.
            avail_edges = []

            # compute the degree centrality
            node_centrality_dict = nx.degree_centrality(G = fg)

            # dictionary to hold the straight-line distance of the available edges
            # let's only make the distance calculation once, yeah?
            # it's a simple calculation, but even more simple to store it.
            node_dist_dict = {}
            # the total number of non-edges
            non_edge_count = 0
            # the number of non-edges that connect disconnected components
            diff_non_edge_count = 0
            # enumerate the non-edges
            for ne in nx.non_edges(graph = fg):
                non_edge_count += 1
                # ne is a tuple of from / to nodes.
                # create available edges only if the nodes are in different components
                # this will decrease the potential solution space
                if node_snd_group_dict[ne[0]] != node_snd_group_dict[ne[1]]:
                    diff_non_edge_count += 1
                    # calculate the straight-line distance between two nodes.
                    # convert to feet to match the existing distance / weight variable
                    weight = points2distance(node_dict[ne[0]], node_dict[ne[1]], unit = 'miles') * 5280

                    # build the output tuple
                    if use_v1:
                        # this does not use node centrality
                        output = (ne[0], ne[1], {'weight':weight})
                    else:
                        # scale the distance by the product of the two nodes'
                        # degree centralities, so candidate edges between
                        # low-degree nodes receive a smaller weight.
                        # NOTE(review): the earlier comment here said the
                        # *inverse* of the centrality is used, but the code
                        # multiplies by the centrality itself - confirm intent.
                        ne_0_nc = node_centrality_dict[ne[0]]
                        ne_1_nc = node_centrality_dict[ne[1]]
                        edge_weight = weight * (ne_0_nc * ne_1_nc)
                        output = (ne[0], ne[1], {'weight':edge_weight})
                    # add to the distance dict, in both directions, because the
                    # augmentation may hand the edge back in either orientation
                    node_dist_dict[(ne[0], ne[1])] = weight
                    node_dist_dict[(ne[1], ne[0])] = weight
                    avail_edges.append(output)

            # we just calculated all of the potential segments that span discontinuities
            # nx.k_edge_augmentation() picks, from the available edges, a low-weight set of
            # edges that makes the graph connected (k = 1).

            # this will store the output
            data_list = []
            line_list = []

            # these attributes are constant for the street, so look them up once.
            # ord_street_name
            osn = temp_gdf['ord_street_name'].iloc[0]
            # ord_street_type
            ost = temp_gdf['ord_street_type'].iloc[0]
            # ord_stname_type
            osnt = temp_gdf['ord_stname_type'].iloc[0]
            # city_sector group
            cpg = temp_gdf['city_sector_group'].iloc[0]
            # defaults, so that the summary record below never picks up values
            # left over from a previously processed street
            osnc = ''
            osnu = ''

            # once nx.k_edge_augmentation() finishes, it returns a generator with the added edges that
            # ensure complete connectivity between all nodes.
            augmented_edges = nx.k_edge_augmentation(G = fg, k = 1, avail = avail_edges, weight = 'weight')
            # enumerate the augmented edges
            for i_ae, ae in enumerate(augmented_edges):
                # unpack
                # fn: from node, tn: to node
                fn, tn = ae
                # get the straight-line distance of the added edge
                weight = node_dist_dict[(fn, tn)]

                # attributes of the existing segments that touch each end node
                fn_edge_attrs = [fg.edges[curr_edge] for curr_edge in fg.edges(fn)]
                tn_edge_attrs = [fg.edges[curr_edge] for curr_edge in fg.edges(tn)]

                # names / sectors the two end nodes have in common
                osnu_set = {ea['ord_stname_unique'] for ea in fn_edge_attrs} & {ea['ord_stname_unique'] for ea in tn_edge_attrs}
                osnc_set = {ea['ord_stname_concat'] for ea in fn_edge_attrs} & {ea['ord_stname_concat'] for ea in tn_edge_attrs}
                cp_set = {ea['city_sector'] for ea in fn_edge_attrs} & {ea['city_sector'] for ea in tn_edge_attrs}

                if len(osnu_set) == 1:
                    # this means that the segments are on the same street
                    osnu = osnu_set.pop()
                    osnc = osnc_set.pop()
                    cp = cp_set.pop()
                else:
                    # the end nodes do not share a single unique street name:
                    # leave the names blank and list every city sector of the street
                    osnu = ''
                    osnc = ''
                    cp = '_'.join(sorted(temp_gdf['city_sector'].unique().tolist()))

                temp_snd_id += 1

                # same order as added_col_names; the snd_group slot holds the
                # running index of the added edge
                temp_data_list = [temp_snd_id, osn, ost, osnt, osnc, osnu, osntg, i_ae, fn, tn, weight, cp, cpg, gi]
                # now, let's create some geometry
                temp_line = LineString([node_dict[fn], node_dict[tn]])
                line_list.append(temp_line)
                data_list.append(temp_data_list)

            # compare the list of available edges to the list of chosen edges
            # this isn't necessary to complete the edge augmentation, but it's
            # nice to know how often added edges are always the shortest edges
            av_df = pd.DataFrame(data = [(sn, en, node_dist_dict[(sn, en)]) for sn, en, _ in avail_edges],
                                 columns = ['sn_id', 'en_id', 'dist'])
            # rank distance - the lower the rank, the shorter the segment.
            av_df['dist_rank'] = av_df['dist'].rank(method = 'dense')

            # build a gpd.GeoDataFrame - these are the "missing" segments
            ms_gdf = gpd.GeoDataFrame(data = data_list,
                                      columns = added_col_names,
                                      geometry = line_list, crs = 'epsg:4326')

            # default: the added segment joins pieces of the same street
            ms_gdf['street_status'] = int(2)

            # a blank unique name marks a connection between differently named pieces
            ms_gdf.loc[ms_gdf['ord_stname_unique'] == '', 'street_status'] = 3

            # but sometimes, there will be instances where it's the same street, but
            # only by direction prefix / suffix
            ms_cp_set = get_a_set(cn = ms_gdf['city_sector'])
            ms_cpg_set = split_col_values(cn = ms_gdf['city_sector_group'])

            if len(ms_cp_set) == 1 and len(ms_cpg_set) == 1:
                ms_gdf.loc[ms_gdf['ord_stname_unique'] == '', 'street_status'] = 2

            # record streets for which no segment was added despite being disconnected
            if ms_gdf.empty:
                trouble_list.append(osntg)

            # select where the rank is LTE than the number of records in the missing segment gdf
            av_df = av_df.loc[av_df['dist_rank'] <= ms_gdf.shape[0], :]

            # if the sum of the distance of the added segments is the same, then only the
            # shortest segments were added. If not, then other, longer, segments were added
            # but that makes for less total distance traversed in the graph.
            # So, how often does that happen?
            # the two sums add the same floats in a different order, so compare
            # with a tolerance rather than with ==
            shortest_tot_path = int(np.isclose(av_df['dist'].sum(), ms_gdf['dist'].sum(),
                                               rtol = 1e-9, atol = 1e-6))
            shortest_added_path.append([osn, ost, osnt, osnc, osnu, osntg, shortest_tot_path, n_discontinuities - 1])

            # now, let's fill ord_stname_concat and ord_stname_unique with the
            # ord_stname_type_group value wherever they were left blank
            for cn in ['ord_stname_concat', 'ord_stname_unique']:
                ms_gdf.loc[ms_gdf[cn] == '', cn] = ms_gdf.loc[ms_gdf[cn] == '', 'ord_stname_type_group']

            # add the known, disconnected segments. These are the existing streets.
            ks_gdf = temp_gdf[existing_col_names].copy()
            ks_gdf['street_status'] = int(1)
            ks_gdf = ks_gdf.rename(columns = existing_rename_dict)

            # stack the geodataframes
            output_gdf = pd.concat([ms_gdf, ks_gdf])

            # write intermediate
            if write_intermediate:
                # intermediate streets - existing streets
                is_output_file_path = os.path.join(rc.OUTPUT_FILE_PATH, 'individual_streets')
                os.makedirs(is_output_file_path, exist_ok = True)
                output_file_name = '_'.join(osntg.split()) + '.gpkg'
                write_gdf(gdf = temp_gdf, output_file_path = is_output_file_path, output_file_name = output_file_name)

                # intermediate nodes
                curr_node_df = pd.DataFrame(data = {'node_id':fg.nodes()})
                curr_node_df['node_centrality'] = curr_node_df['node_id'].map(node_centrality_dict)
                node_subset_gdf = subset_node_gdf(node_gdf = node_gdf, other_node_df = curr_node_df)
                output_file_name = '_'.join(osntg.split()) + '_full_nodes.gpkg'
                write_gdf(gdf = node_subset_gdf, output_file_path = is_output_file_path, output_file_name = output_file_name)

                # the added and existing segments for a street
                output_file_name = '_'.join(osntg.split()) + '_missing_segments.gpkg'
                write_gdf(gdf = output_gdf, output_file_path = is_output_file_path, output_file_name = output_file_name)

        else:
            # gather the streets with no missing segments.
            output_gdf = temp_gdf[existing_col_names].copy()
            output_gdf['street_status'] = int(0)
            output_gdf = output_gdf.rename(columns = existing_rename_dict)

        # add to the output list
        output_gdf_list.append(output_gdf)

100TH PL NW
1
100TH PL SW
1
100TH ST NW_N_NE
9
100TH ST SW_S
6
101ST PL SW
1
101ST ST NW_N
2
101ST ST SW_S
3
102ND LN SW
1
102ND ST N_NE
5
102ND ST SW_S
7
103RD PL NE
1
103RD PL SW
1
103RD ST NW_N_NE
12
103RD ST SW_S
6
104TH LN S
1
104TH PL NE
1
104TH ST NW_N_NE
10
104TH ST SW_S
10
104TH WAY NE
1
105TH PL NE
2
105TH PL SW
1
105TH ST NW_N_NE
12
105TH ST SW_S
5
106TH PL NE
1
106TH ST NW_N_NE
10
106TH ST SW_S
8
107TH PL SW
1
107TH ST NW_N_NE
10
107TH ST SW_S
7
107TH WAY SW
1
108TH PL NE
1
108TH PL S
1
108TH PL SW
1
108TH ST NW_NE
3
108TH ST SW_S
7
109TH ST N_NE
3
109TH ST SW
5
10TH AVE NW_W_SW
11
10TH AVE NE_E_CNTR_S
16
10TH PL NE
3
10TH PL SW
1
10TH PL W
1
110TH CT S
1
110TH LN SW
2
110TH PL S
1
110TH PL SW
1
110TH ST NW_N_NE
3
110TH ST SW_S
9
111TH LN SW
1
111TH PL SW
2
111TH ST SW_S
4
112TH LN SW
1
112TH PL NW
1
112TH PL S
1
112TH PL SW
1
112TH ST NW_N_NE
7
112TH ST SW_S
7
113TH LN SW
1
113TH PL N
1
113TH PL NW
1
113TH ST N_NE
10
113TH ST SW_S
4
114TH PL NW
1
114TH ST NW_N_NE
5
114TH S

# combine data for output

In [29]:
ms_gdf = pd.concat(objs = output_gdf_list)

In [30]:
ms_gdf['dist_miles'] = ms_gdf['dist'] / 5280

In [31]:
# join in other street identification data

In [32]:
ms_gdf.shape

(29780, 17)

In [33]:
ms_gdf['street_status'].value_counts()

street_status
1    21931
0     3825
2     3614
3      410
Name: count, dtype: int64

In [34]:
ms_gdf.columns

Index(['snd_id', 'ord_street_name', 'ord_street_type', 'ord_stname_type',
       'ord_stname_concat', 'ord_stname_unique', 'ord_stname_type_group',
       'snd_group', 'sn_id', 'en_id', 'dist', 'city_sector',
       'city_sector_group', 'group_id', 'geometry', 'street_status',
       'dist_miles'],
      dtype='object')

In [35]:
# check for missing values
hey_what_is_na(ms_gdf)

In [36]:
ms_gdf['ord_stname_concat'].value_counts()

ord_stname_concat
M L KING JR WAY S      149
BEACON AVE S           138
RAINIER AVE S          130
AURORA AVE N           102
GREENWOOD AVE N         89
                      ... 
ALOHA ST W_CNTR_E        1
ALDERBROOK PL NW         1
N MENFORD PL             1
ALDER ST CNTR_E          1
CALIFORNIA AVE W_SW      1
Name: count, Length: 2768, dtype: int64

In [37]:
'' in ms_gdf['ord_stname_type_group'].unique().tolist()

False

In [38]:
write_gdf(gdf = ms_gdf, output_file_path=rc.OUTPUT_FILE_PATH, output_file_name='test_street.gpkg')

In [39]:
testo.shape

(16, 17)

In [40]:
ms_gdf.shape

(29780, 17)

In [41]:
# what streets have parallel edges?
# these can be verified / examined in qGIS
trouble_list

['1ST AVE NE_N_CNTR_S',
 '25TH AVE NW_W_SW',
 '40TH ST NW_N_NE',
 '43RD ST NW_N_NE',
 '7TH AVE NE_N_CNTR_S',
 'FOREST ST SW_S',
 'LAWTON CIR W',
 'MAGNOLIA BR W',
 'MERRILL LN NW',
 'RADFORD DR NE',
 'ROYAL BROUGHAM WAY S',
 'SEWARD PARK RD S',
 'SUMMIT AVE E_CNTR']

In [42]:
# is everything lining up?
testo = ms_gdf.loc[ms_gdf['ord_stname_unique'] != ms_gdf['ord_stname_concat'], :]

In [43]:
testo.head()

Unnamed: 0,snd_id,ord_street_name,ord_street_type,ord_stname_type,ord_stname_concat,ord_stname_unique,ord_stname_type_group,snd_group,sn_id,en_id,dist,city_sector,city_sector_group,group_id,geometry,street_status,dist_miles
5,65673,10TH,AVE,10TH AVE,10TH AVE,10TH AVE CNTR,10TH AVE NE_E_CNTR_S,5,11829,12162,2024.641807,CNTR,NE_E_CNTR_S,2,"LINESTRING (-122.31936 47.61176, -122.31943 47...",2,0.383455
6,65674,10TH,AVE,10TH AVE,10TH AVE,10TH AVE CNTR,10TH AVE NE_E_CNTR_S,6,12319,12359,152.428743,CNTR,NE_E_CNTR_S,2,"LINESTRING (-122.31942 47.60352, -122.31942 47...",2,0.028869
25,20435,10TH,AVE,10TH AVE,10TH AVE,10TH AVE CNTR,10TH AVE NE_E_CNTR_S,174,11683,11604,423.143814,CNTR,NE_E_CNTR_S,2,"LINESTRING (-122.31946 47.61409, -122.31948 47...",1,0.080141
26,20522,10TH,AVE,10TH AVE,10TH AVE,10TH AVE CNTR,10TH AVE NE_E_CNTR_S,174,11752,11683,417.719952,CNTR,NE_E_CNTR_S,2,"LINESTRING (-122.31945 47.61295, -122.31946 47...",1,0.079114
27,20601,10TH,AVE,10TH AVE,10TH AVE,10TH AVE CNTR,10TH AVE NE_E_CNTR_S,174,11809,11752,278.657501,CNTR,NE_E_CNTR_S,2,"LINESTRING (-122.31938 47.61218, -122.31939 47...",1,0.052776


In [44]:
testo.shape

(1830, 17)

In [45]:
testo['street_status'].value_counts()

street_status
1    1358
0     364
2     108
Name: count, dtype: int64

In [46]:
testo['ord_stname_concat'].value_counts()

ord_stname_concat
DENNY WAY                  30
4TH AVE                    29
MERCER ST                  26
2ND AVE                    26
5TH AVE                    25
                           ..
QUEEN ANNE AVE              1
RAVENNA BV WB ACCESS RD     1
BAY TERRACE RD              1
SMITH PL                    1
DEXTER AVE                  1
Name: count, Length: 203, dtype: int64

# compute if the added segments were indeed the shortest.

In [47]:
# ord_street_name: osn
# ord_street_type: ost
# ord_stname_type: osnt
# ord_stname_concat: osnc
# ord_stname_unique: osnu
# ord_stname_type_group: osntg
# weight                
# city_sector
# city_sector_group

In [48]:
# one row per street that needed added segments. 'shortest_added_path' is the
# 0/1 flag recorded in the loop above (1: the added segments sum to the same
# distance as the shortest available segments) and 'n_discontinuities' is the
# value recorded alongside it.
col_names = ['ord_street_name', 'ord_street_type', 'ord_stname_type', 'ord_stname_concat',
             'ord_stname_unique', 'ord_stname_type_group', 'shortest_added_path',
             'n_discontinuities']                                      
sap_df = pd.DataFrame(data = shortest_added_path, columns = col_names)

In [49]:
sap_df['n_streets'] = int(1)

In [50]:
sap_df.head()

Unnamed: 0,ord_street_name,ord_street_type,ord_stname_type,ord_stname_concat,ord_stname_unique,ord_stname_type_group,shortest_added_path,n_discontinuities,n_streets
0,100TH,ST,100TH ST,NE 100TH ST,NE 100TH ST,100TH ST NW_N_NE,0,8,1
1,100TH,ST,100TH ST,S 100TH ST,S 100TH ST,100TH ST SW_S,0,5,1
2,101ST,ST,101ST ST,NW 101ST ST,NW 101ST ST,101ST ST NW_N,1,1,1
3,101ST,ST,101ST ST,S 101ST ST,S 101ST ST,101ST ST SW_S,0,2,1
4,102ND,ST,102ND ST,NE 102ND ST,NE 102ND ST,102ND ST N_NE,0,4,1


In [51]:
sap_df['ord_stname_type_group'].unique().shape

(699,)

In [52]:
recode_dict = {0:'shortest_is_false', 1:'shortest_is_true', }

In [53]:
sap_df['shortest_added_path'] = sap_df['shortest_added_path'].map(recode_dict)

In [54]:
sap_df = sap_df.sort_values(by = ['ord_stname_type', 'ord_stname_concat'])

In [55]:
# a blank ord_stname_concat marks an interconnection; everything else is an intraconnection
has_blank_name = sap_df['ord_stname_concat'] == ''
sap_df['connection_status'] = has_blank_name.map({True: 'interconnection', False: "intraconnection"})

In [56]:
sap_df.head()

Unnamed: 0,ord_street_name,ord_street_type,ord_stname_type,ord_stname_concat,ord_stname_unique,ord_stname_type_group,shortest_added_path,n_discontinuities,n_streets,connection_status
0,100TH,ST,100TH ST,NE 100TH ST,NE 100TH ST,100TH ST NW_N_NE,shortest_is_false,8,1,intraconnection
1,100TH,ST,100TH ST,S 100TH ST,S 100TH ST,100TH ST SW_S,shortest_is_false,5,1,intraconnection
2,101ST,ST,101ST ST,NW 101ST ST,NW 101ST ST,101ST ST NW_N,shortest_is_true,1,1,intraconnection
3,101ST,ST,101ST ST,S 101ST ST,S 101ST ST,101ST ST SW_S,shortest_is_false,2,1,intraconnection
4,102ND,ST,102ND ST,NE 102ND ST,NE 102ND ST,102ND ST N_NE,shortest_is_false,4,1,intraconnection


In [57]:
sap_df_ct_count = pd.pivot_table(data = sap_df, values = 'n_streets',
                                 index = ['connection_status', 'n_discontinuities'], columns = 'shortest_added_path',
                                 aggfunc='sum', fill_value=0, margins = True).reset_index(drop=False)

In [58]:
sap_df_ct_count.columns

Index(['connection_status', 'n_discontinuities', 'shortest_is_false',
       'shortest_is_true', 'All'],
      dtype='object', name='shortest_added_path')

In [59]:
sap_df_ct_count.head()

shortest_added_path,connection_status,n_discontinuities,shortest_is_false,shortest_is_true,All
0,interconnection,1,0,11,11
1,interconnection,2,8,1,9
2,interconnection,3,3,0,3
3,interconnection,4,4,0,4
4,interconnection,5,1,0,1


In [60]:
# make sure both outcome columns exist, even if one outcome never occurred
missing_outcomes = [label for label in recode_dict.values() if label not in sap_df_ct_count.columns]
for label in missing_outcomes:
    sap_df_ct_count[label] = 0

In [61]:
sap_df_ct_count.head()

shortest_added_path,connection_status,n_discontinuities,shortest_is_false,shortest_is_true,All
0,interconnection,1,0,11,11
1,interconnection,2,8,1,9
2,interconnection,3,3,0,3
3,interconnection,4,4,0,4
4,interconnection,5,1,0,1


In [62]:
sap_df_ct_count = sap_df_ct_count.rename(columns = {'All':'n_streets'})
col_names = ['connection_status', 'n_discontinuities', 'shortest_is_false', 'shortest_is_true', 'n_streets']
sap_df_ct_count = sap_df_ct_count[col_names]

In [63]:
# share of streets in each row that fall into each outcome
street_totals = sap_df_ct_count['n_streets']
for outcome in ('shortest_is_false', 'shortest_is_true'):
    sap_df_ct_count[outcome + '_per'] = sap_df_ct_count[outcome].div(street_totals)

In [64]:
sap_df_ct_count.head()

shortest_added_path,connection_status,n_discontinuities,shortest_is_false,shortest_is_true,n_streets,shortest_is_false_per,shortest_is_true_per
0,interconnection,1,0,11,11,0.0,1.0
1,interconnection,2,8,1,9,0.888889,0.111111
2,interconnection,3,3,0,3,1.0,0.0
3,interconnection,4,4,0,4,1.0,0.0
4,interconnection,5,1,0,1,1.0,0.0


In [65]:
# export this to excel
# exist_ok avoids the separate exists() check and the gap between checking
# for the directory and creating it
os.makedirs(rc.ANALYSIS_OUTPUT_FILE_PATH, exist_ok = True)
ofpn = os.path.join(rc.ANALYSIS_OUTPUT_FILE_PATH, rc.S04_SS_COUNT_OUT_FILE_NAME)
print(ofpn)

sap_df_ct_count.to_excel(excel_writer=ofpn, sheet_name='shortest_segment_count', index = False)

../../../project/seattle_streets/data\analysis\shortest_segment_count.xlsx


# add ranking by street type to help with visualization

In [66]:
st_type_df = ms_gdf.loc[ms_gdf['street_status'].isin((2,3)), 'ord_street_type'].value_counts().to_frame(name = 'n_segments').reset_index()

In [67]:
st_type_df.head()

Unnamed: 0,ord_street_type,n_segments
0,ST,1960
1,AVE,1919
2,PL,81
3,WAY,20
4,DR,16


In [68]:
# rank the street types by their number of added segments (ascending, so the
# type with the most added segments gets the highest rank) and store the rank
# as a zero-padded, two-character string.
# NOTE(review): rank() defaults to method='average', so street types with tied
# counts get fractional ranks that astype(int) truncates - confirm this is the
# intended tie handling.
st_type_df['segment_rank'] = st_type_df['n_segments'].rank(ascending = True).astype(int).astype(str).str.zfill(2)

In [69]:
st_type_df.head(n=20)

Unnamed: 0,ord_street_type,n_segments,segment_rank
0,ST,1960,10
1,AVE,1919,9
2,PL,81,8
3,WAY,20,7
4,DR,16,6
5,BLVD,10,5
6,LN,9,4
7,RD,6,3
8,CT,2,2
9,PKWY,1,1


In [70]:
st_type_df['ord_street_type_rank'] = st_type_df['segment_rank'] + '_' + st_type_df['ord_street_type']

In [71]:
# lookup: street type -> rank-prefixed street type label
st_rank_dict = dict(zip(st_type_df['ord_street_type'], st_type_df['ord_street_type_rank']))

In [72]:
ms_gdf['ord_street_type_rank'] = ms_gdf['ord_street_type'].map(st_rank_dict)

In [73]:
ms_gdf.loc[ms_gdf['ord_street_type_rank'].isna(), 'ord_street_type_rank'] = ""

In [74]:
ms_gdf.columns

Index(['snd_id', 'ord_street_name', 'ord_street_type', 'ord_stname_type',
       'ord_stname_concat', 'ord_stname_unique', 'ord_stname_type_group',
       'snd_group', 'sn_id', 'en_id', 'dist', 'city_sector',
       'city_sector_group', 'group_id', 'geometry', 'street_status',
       'dist_miles', 'ord_street_type_rank'],
      dtype='object')

In [75]:
# ord_street_name: osn
# ord_street_type: ost
# ord_stname_type: osnt
# ord_stname_concat: osnc
# ord_stname_unique: osnu
# ord_stname_type_group: osntg
# weight                
# city_sector
# city_sector_group

In [76]:
# reorder columns
col_names = ['snd_id', 'ord_street_name', 'ord_street_type','ord_stname_type',
             'ord_stname_concat', 'ord_stname_unique', 'ord_stname_type_group',
             'ord_street_type_rank','snd_group', 'group_id', 'city_sector',
             'city_sector_group', 'street_status', 'sn_id','en_id',
             'dist','dist_miles','geometry']

In [77]:
ms_gdf = ms_gdf[col_names]

# save the geodataframe with the continuous, discontinuous, and added streets

In [78]:
write_gdf(gdf = ms_gdf, output_file_path= rc.OUTPUT_FILE_PATH,
          output_file_name = rc.S04_MISSING_OUT_FILE_NAME)

In [79]:
'' in ms_gdf['ord_stname_type_group'].unique().tolist()

False