In [1]:
# mike babb
# 2024 06 28
# what streets start and stop?

In [2]:
# standard
import os

In [3]:
# external
from itertools import combinations, product
import geopandas as gpd
import networkx as nx
import numpy as np
import pandas as pd
from shapely.geometry import LineString, Point
from shapely import line_merge

In [4]:
# custom
from geodataio.geo_operations import points2distance, calculate_initial_compass_bearing
from utils import *

# load the working seattle street network data

In [5]:
# file path
input_file_path = 'H:/project/seattle_streets/data/' 
output_file_path = 'H:/project/seattle_streets/data/individual_streets'

In [6]:
file_name = 'Street_Network_Database_Seattle_working.gpkg'

In [7]:
fpn = os.path.join(input_file_path, file_name)

In [8]:
gdf = gpd.read_file(filename = fpn)

In [9]:
col_names = ['f_intr_id', 't_intr_id', 'snd_id', 'snd_feacode']
for cn in col_names:
    gdf[cn] = gdf[cn].astype(np.int64)

In [10]:
gdf.dtypes

f_intr_id                          int64
t_intr_id                          int64
snd_id                             int64
snd_feacode                        int64
citycode                           int64
stname_id                          int64
st_code                            int64
arterial_code                      int64
segment_type                       int64
agency_code                        int64
access_code                        int64
divided_code                       int64
structure_type                     int64
legalloc_code                      int64
vehicle_use_code                   int64
gis_seg_length                   float64
l_adrs_from                        int64
l_adrs_to                          int64
r_adrs_from                        int64
r_adrs_to                          int64
ord_pre_dir                       object
ord_street_name                   object
ord_street_type                   object
ord_suf_dir                       object
ord_stname_conca

# load the node data

In [11]:
input_file_name = 'Street_Network_Nodes.gpkg'

In [12]:
fpn = os.path.join(input_file_path, input_file_name)

In [13]:
node_gdf = gpd.read_file(filename = fpn)

In [14]:
node_gdf.head()

Unnamed: 0,node_id,geometry
0,17740,POINT (-122.32287 47.52982)
1,11391,POINT (-122.32402 47.61849)
2,982,POINT (-122.29193 47.7649)
3,11695,POINT (-122.30782 47.61411)
4,6257,POINT (-122.38214 47.67456)


In [15]:
node_gdf['node_id'] = node_gdf['node_id'].astype(int)

In [16]:
node_gdf['coords'] = node_gdf['geometry'].map(lambda x: x.coords[0])

In [17]:
node_gdf.head()

Unnamed: 0,node_id,geometry,coords
0,17740,POINT (-122.32287 47.52982),"(-122.32287444824671, 47.529819957875)"
1,11391,POINT (-122.32402 47.61849),"(-122.32401964037119, 47.61848906651073)"
2,982,POINT (-122.29193 47.7649),"(-122.29192836110747, 47.764904310419276)"
3,11695,POINT (-122.30782 47.61411),"(-122.30781531879083, 47.61410595110372)"
4,6257,POINT (-122.38214 47.67456),"(-122.38214200036303, 47.67455801597557)"


In [18]:
node_gdf.dtypes

node_id        int32
geometry    geometry
coords        object
dtype: object

In [19]:
# zap this into a dictionary
node_dict = {}
for i, row in node_gdf.iterrows():
    node_dict[row['node_id']] = row['coords']

# FIND MISSING SEGMENTS

In [20]:
# create a weight variable from the gis_segment_length variable
gdf['weight'] = gdf['gis_seg_length']

In [21]:
# ONLY SELECT COLUMNS NAMES USED IN SUBSEQUENT STEPS

In [22]:
col_names = ['f_intr_id',
't_intr_id',
'snd_id',
'snd_feacode',
'gis_seg_length',
'ord_street_name',
'ord_street_type',
'ord_stname_concat',
'city_portion',
'geometry',
'weight']

In [23]:
gdf = gdf[col_names].copy()

In [32]:
# save intermediate data for checking?
write_intermediate = False

# street_status
# 0: complete street
# 1: disconnected
# 2: missing street

# hold the output
output_gdf_list = []
shortest_added_path = []
gdf['snd_group'] = int(0)
snd_group_count = -1
temp_snd_id = 2**16
# get the list of unique names
s_name = gdf['ord_stname_concat'].unique().tolist()
#s_name = ['7TH AVE N', 'W GALER ST']
#s_name = ['W GALER ST']
#s_name = ['N 43RD ST']
trouble_list = []

for sn in s_name[:None]:
    print(sn)
    
    # subset the gdf by streetname
    temp_gdf = gdf.loc[gdf['ord_stname_concat'] == sn, :].copy().reset_index(drop = True)
    
    # create the graph for a single street. For example, all streets W GALER ST are a single graph
    fg = nx.from_pandas_edgelist(df = temp_gdf, source = 'f_intr_id', target = 't_intr_id', edge_attr=True)
    
    if temp_gdf.shape[0] != fg.number_of_edges():    
        print(sn, temp_gdf.shape[0], fg.number_of_edges())
        # this is a single-edge graph. Parallel edges are not allowed
        # so, let's return the 
        mod_edge_list = []
        for ed in fg.edges.data():
            mod_edge_list.append(ed[2]['snd_id'])
        temp_gdf = temp_gdf.loc[temp_gdf['snd_id'].isin(mod_edge_list), :]
        
    
    node_list = list(fg.nodes)
    
    # a list of the snd groups - this is necessary to match street segments to nodes
    # nx.connected_components() identifies disconnected graphs
    for cc in nx.connected_components(G = fg):
        # cc is the sub graph
        # this is the edge data
        #sub_graph = G.subgraph(c).copy() for c in nx.connected_components(G)]
        sub_graph = fg.subgraph(cc).copy()
        edge_data_list = sub_graph.edges.data()        
        # this marks the street(s) in each sub graph
        snd_id_list = []
        for edl in edge_data_list:                        
            snd_id_list.append(edl[2]['snd_id'])
        
        snd_group_count += 1
        # this will update the gdf per street name with the groups of streets
        temp_gdf.loc[temp_gdf['snd_id'].isin(snd_id_list), 'snd_group'] = snd_group_count     
    
    snd_group_id_list = temp_gdf['snd_group'].unique()    
    #print(snd_group_id_list)
    n_discontinuities = len(snd_group_id_list)
    
    
    if n_discontinuities > 1:
        # process for separate sub graphs
        node_snd_group_dict = {}
        # a node - an intersection - can have more than one street.
        # we need to update the node snd group dict for every street
        for ir, row in temp_gdf.iterrows():       
            fn = row['f_intr_id']
            tn = row['t_intr_id']
            snd_group_id = row['snd_group']
            node_snd_group_dict[fn] = snd_group_id
            node_snd_group_dict[tn] = snd_group_id                                             
        
        # create a list of available edges - these are the missing segments
        # these are formed from the non-edges of the graph.        
        avail_edges = []
        # dictionary to hold available edges
        # let's only make the distance calculation once, yeah?
        # it's a simple calculation, but even more simple to store it. 
        node_dist_dict = {}
        # enumerate the non-edges
        non_edge_count = 0
        diff_non_edge_count = 0
        for ne in nx.non_edges(graph = fg):    
            non_edge_count += 1
            # ne is a tuple of from / to nodes.
            # create available edges if the nodes are not on the same segment
            # this will decrease the potential solution space
            if node_snd_group_dict[ne[0]] != node_snd_group_dict[ne[1]]:
                diff_non_edge_count += 1
                # calculate the straight-line distance between two nodes. 
                # convert to feet to match the existing distance / weight
                weight = points2distance(node_dict[ne[0]], node_dict[ne[1]], unit = 'miles') * 5280
                # build the output tuple
                output = (ne[0], ne[1], {'weight':weight})
                # add to the distance dict
                node_dist_dict[(ne[0], ne[1])] = weight 
                node_dist_dict[(ne[1], ne[0])] = weight 
                avail_edges.append(output)

        # print the number of nodes, edges, possible edges, and optimal edges
        #print(fg.number_of_nodes(),fg.number_of_edges(), non_edge_count, diff_non_edge_count)
        # these are missing segments.
        # nx.k_edge_augmentation creates the missing edges in a graph by adding as
        # few edges as possible. In a street network, with multiple disconnected
        # components, there is really one way to minimally connect the disparate components
        # to create full connectivity. And it's the shortest geographic segment in this case!
        # weighted graph traversal works by accumulating as little weight as possible. 
        
        data_list = []
        line_list = []
        # once nx.k_edge_augmentation finishes, it returns a generator with the added edges that
        # ensure complete connectivity between all nodes.
        augmented_edges = nx.k_edge_augmentation(G = fg, k = 1, avail = avail_edges, weight = 'weight')    
        # enumerate
        for i_ae, ae in enumerate(augmented_edges):
            # unpack
            fn, tn = ae            
            # get the weight / distance of the added edge
            weight = node_dist_dict[(fn, tn)] 
            # this is the output dictionary
            # street name, integer indicating the snd_group, from node, to node, distance of the edge
            temp_snd_id += 1
            temp_data_list = [temp_snd_id, sn, i_ae, fn, tn, weight]
            # now, let's create some geometry
            temp_line = LineString([node_dict[fn], node_dict[tn]])
            line_list.append(temp_line)
            data_list.append(temp_data_list)

        # compare the list of available edges to the list of chosen edges
        av_df = pd.DataFrame(data = avail_edges, columns = ['sn_id', 'en_id', 'weight_dict'])
        av_df['dist'] = av_df['weight_dict'].map(lambda x: x['weight'])
        av_df = av_df.drop(labels = 'weight_dict', axis = 1)
        av_df['dist_rank'] = av_df['dist'].rank(method = 'dense')
        
        # build a gpd.GeoDataFrame - these are the "missing" segments"
        ms_gdf = gpd.GeoDataFrame(data = data_list,
                                         columns = ['snd_id', 'ord_stname_concat', 'snd_group', 'sn_id', 'en_id', 'dist'],
                                         geometry = line_list, crs = 'epsg:4326')
        # these edges are not on the same street group
        ms_gdf['street_status'] = int(2)
        if ms_gdf.empty:
            trouble_list.append(sn)
            print(sn)
        
        # create graphs to ensure same edge connectivity
        av_df = av_df.loc[av_df['dist_rank'] <= ms_gdf.shape[0], :]
        #print(av_df.shape, ms_gdf.shape)
        #print(av_df.head())
        #print(ms_gdf.head())
        shortest_tot_path = av_df['dist'].sum() == ms_gdf['dist'].sum()
        shortest_added_path.append([sn, shortest_tot_path, n_discontinuities - 1])
        
        #g_av = nx.from_pandas_edgelist(av_df
        
        # now, we need to get the known segments
        col_names = ['snd_id', 'ord_stname_concat', 'snd_group',  'f_intr_id', 't_intr_id', 'gis_seg_length', 'geometry']    
        ks_gdf = temp_gdf[col_names].copy()
        ks_gdf['street_status'] = int(1)
        ks_gdf = ks_gdf.rename(columns = {'f_intr_id':'sn_id', 't_intr_id':'en_id', 'gis_seg_length':'dist'})

        # stack the geodataframes
        output_gdf = pd.concat([ms_gdf, ks_gdf])
    
        # write intermediate
        if write_intermediate:
            # intermediate streets
            output_file_name = '_'.join(sn.split()) + '.gpkg'
            write_gdf(gdf = temp_gdf, output_file_path = output_file_path, output_file_name = output_file_name)
        
            # intermediate nodes
            curr_node_df = pd.DataFrame(data = {'node_id':fg.nodes()})
            curr_node_list = curr_node_df['node_id'].tolist()
            node_subset_gdf = subset_node_gdf(node_gdf = node_gdf, other_node_df = curr_node_df)
            output_file_name = 'full_nodes_' + '_'.join(sn.split()) + '.gpkg'
            write_gdf(gdf = node_subset_gdf, output_file_path = output_file_path, output_file_name = output_file_name)

            # the missing and known segments for a street
            output_file_name = 'missing_segments_' + '_'.join(sn.split()) + '.gpkg'
            ofpn = os.path.join(output_file_path, output_file_name)                
            output_gdf.to_file(filename = ofpn, driver = 'GPKG', index = False)       

    else:
        # gather the streets with no missing segments.         
        col_names = ['snd_id', 'ord_stname_concat', 'snd_group', 'f_intr_id', 't_intr_id', 'gis_seg_length', 'geometry']    
        output_gdf = temp_gdf[col_names].copy()
        output_gdf['street_status'] = int(0)
        output_gdf = output_gdf.rename(columns = {'f_intr_id':'sn_id', 't_intr_id':'en_id', 'gis_seg_length':'dist'})

    # add to the output list
    output_gdf_list.append(output_gdf)



8TH AVE S
BELMONT AVE E
E PIKE ST
NW 63RD ST
MCCLINTOCK AVE S
NORTH PARK AVE N
8TH AVE
37TH AVE S
25TH AVE NE
S WAITE ST
NE 107TH ST
NW 42ND ST
LINDEN AVE N
NE 40TH ST
FAUNTLEROY PL SW
W GALER ST
SW WALKER ST
20TH AVE NW
S BYRON ST
S FOREST ST
S FOREST ST 26 25
VICTORY LN NE
18TH AVE SW
26TH AVE NW
29TH AVE S
27TH AVE NE
17TH AVE NW
KIRKWOOD PL N
7TH AVE NW
NE 45TH ST
ROWAN RD S
28TH AVE SW
S 130TH ST
38TH AVE E
E MARION ST
S HOLLY ST
N 95TH ST
E OLIVE ST
E LOUISA ST
E REPUBLICAN ST
15TH AVE SW
CEDAR ST
5TH AVE W
45TH AVE NE
W RAYE ST
S DAKOTA ST
NE 91ST ST
NW 70TH ST
18TH AVE NE
48TH AVE SW
S 102ND ST
W FULTON ST
NE 68TH ST
W RUFFNER ST
35TH AVE
46TH AVE SW
8TH AVE NE
CHERRY ST
21ST AVE S
37TH AVE W
ALKI AVE SW
DAYTON AVE N
SW 102ND ST
N 90TH ST
SW ANDOVER ST
FREMONT AVE N
THOMAS ST
27TH AVE W
WESTERN AVE
SW DAKOTA ST
STONE AVE N
NW 94TH ST
NE 123RD ST
24TH AVE W
NE 130TH PL
NW 62ND ST
48TH AVE S
3RD AVE NE
WESTMONT WAY W
37TH AVE SW
FAIRVIEW AVE E
NE 55TH ST
46TH AVE S
NW 105TH ST
47

In [33]:
# the number of times when the shortest segments
sap_df = pd.DataFrame(data = shortest_added_path, columns = ['ord_stname_concat', 'shortest_added_path', 'n_discontinuities'])

In [34]:
sap_df['n_streets'] = int(1)

In [40]:
sap_df_pv = pd.pivot_table(data = sap_df, values = 'n_streets', index = 'n_discontinuities', columns = 'shortest_added_path', aggfunc = 'sum', fill_value = 0).reset_index(drop = False)

In [48]:
sap_df_ct = pd.crosstab(index = sap_df['n_discontinuities'], columns = sap_df['shortest_added_path'], values = sap_df['n_streets'], aggfunc = 'sum', margins = True, normalize = 'index')

In [49]:
sap_df_ct.head(n=20)

shortest_added_path,False,True
n_discontinuities,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.0,1.0
2,0.795349,0.204651
3,0.942446,0.057554
4,0.964286,0.035714
5,0.987179,0.012821
6,1.0,0.0
7,1.0,0.0
8,1.0,0.0
9,1.0,0.0
10,1.0,0.0


In [None]:
shortest_street_seg_count

In [None]:
shortest_street_seg_count / sap_df.shape[0]

In [None]:
# stack dataframes

In [None]:
ms_gdf = pd.concat(objs = output_gdf_list)

In [None]:
ms_gdf['dist_miles'] = ms_gdf['dist'] / 5280

In [None]:
# join in other street indentification data

In [None]:
ms_gdf = pd.merge(left = ms_gdf, right = gdf[['ord_stname_concat', 'ord_street_type', 'ord_street_name']].drop_duplicates())

In [None]:
ms_gdf.shape

In [None]:
# check for missing values
for cn in ms_gdf.columns:
    print(cn, ms_gdf[cn].isna().unique())    

# add ranking by street type to help with visualization

In [None]:
st_type_df = ms_gdf.loc[ms_gdf['street_status'] == 2, 'ord_street_type'].value_counts().to_frame(name = 'n_segments').reset_index()

In [None]:
st_type_df.head()

In [None]:
st_type_df['segment_rank'] = st_type_df['n_segments'].rank(ascending = True).astype(int).astype(str).str.zfill(2)

In [None]:
st_type_df.head(n=20)

In [None]:
st_type_df['ord_street_type_rank'] = st_type_df['segment_rank'] + '_' + st_type_df['ord_street_type']

In [None]:
st_rank_dict = {ost:ostr for ost, ostr in zip(st_type_df['ord_street_type'], st_type_df['ord_street_type_rank'])}

In [None]:
ms_gdf['ord_street_type_rank'] = ms_gdf['ord_street_type'].map(st_rank_dict)

In [None]:
ms_gdf.loc[ms_gdf['ord_street_type_rank'].isna(), 'ord_street_type_rank'] = ""

In [None]:
# reorder columns
col_names = ['snd_id', 'ord_street_name','ord_stname_concat','ord_street_type', 'ord_street_type_rank', 'snd_group',
             'street_status','sn_id','en_id','dist','dist_miles','geometry']

In [None]:
ms_gdf = ms_gdf[col_names]

# write it out!

In [None]:
output_file_path = 'H:/project/seattle_streets/data'
output_file_name = 'missing_segments.gpkg'
ofpn = os.path.join(output_file_path, output_file_name)    

ms_gdf.to_file(filename = ofpn, driver = 'GPKG', index = False)
