In [1]:
# mike babb
# 2024 06 28
# what streets start and stop?

In [2]:
# standard
import os

In [3]:
# external
from itertools import combinations, product
import geopandas as gpd
import networkx as nx
import numpy as np
import pandas as pd
from shapely.geometry import LineString, Point
from shapely import line_merge

In [4]:
# custom
from geodataio.geo_operations import points2distance, calculate_initial_compass_bearing
from utils import *

# load the working seattle street network data

In [5]:
# file path
input_file_path = 'H:/project/seattle_streets/data/' 
output_file_path = 'H:/project/seattle_streets/data/individual_streets'

In [6]:
file_name = 'Street_Network_Database_Seattle_working.gpkg'

In [7]:
fpn = os.path.join(input_file_path, file_name)

In [8]:
gdf = gpd.read_file(filename = fpn)

# load the node data

In [9]:
input_file_name = 'Street_Network_Nodes.gpkg'

In [10]:
fpn = os.path.join(input_file_path, input_file_name)

In [11]:
node_gdf = gpd.read_file(filename = fpn)

In [12]:
node_gdf.head()

Unnamed: 0,node_id,geometry
0,17740,POINT (-122.32287 47.52982)
1,11391,POINT (-122.32402 47.61849)
2,982,POINT (-122.29193 47.7649)
3,11695,POINT (-122.30782 47.61411)
4,6257,POINT (-122.38214 47.67456)


In [13]:
node_gdf['coords'] = node_gdf['geometry'].map(lambda x: x.coords[0])

In [14]:
node_gdf.head()

Unnamed: 0,node_id,geometry,coords
0,17740,POINT (-122.32287 47.52982),"(-122.32287444824671, 47.529819957875)"
1,11391,POINT (-122.32402 47.61849),"(-122.32401964037119, 47.61848906651073)"
2,982,POINT (-122.29193 47.7649),"(-122.29192836110747, 47.764904310419276)"
3,11695,POINT (-122.30782 47.61411),"(-122.30781531879083, 47.61410595110372)"
4,6257,POINT (-122.38214 47.67456),"(-122.38214200036303, 47.67455801597557)"


In [15]:
# zap this into a dictionary
node_dict = {}
for i, row in node_gdf.iterrows():
    node_dict[row['node_id']] = row['coords']

# FIND MISSING SEGMENTS

In [16]:
# create a weight variable from the gis_segment_length variable
gdf['weight'] = gdf['gis_seg_length']

In [17]:
# ONLY SELECT COLUMNS NAMES USED IN SUBSEQUENT STEPS

In [18]:
col_names = ['f_intr_id',
't_intr_id',
'snd_id',
'snd_feacode',
'gis_seg_length',
'ord_street_name',
'ord_street_type',
'ord_stname_concat',
'city_portion',
'geometry',
'weight']

In [19]:
gdf = gdf[col_names].copy()

In [37]:
# save intermediate data for checking?
write_intermediate = False

# hold the output
output_gdf_list = []
gdf['snd_group'] = int(0)
snd_group_count = -1
# get the list of unique names
s_name = gdf['ord_stname_concat'].unique().tolist()
#s_name = ['W LAWTON ST']
trouble_list = []
for sn in s_name[:None]:
    #print(sn)
    # subset the gdf by streetname
    temp_gdf = gdf.loc[gdf['ord_stname_concat'] == sn, :].copy()
    
    # create the graph for a single street. For example, all streets W GALER ST are a single graph
    fg = nx.from_pandas_edgelist(df = temp_gdf, source = 'f_intr_id', target = 't_intr_id', edge_attr=True)
    node_list = list(fg.nodes)
    
    # a list of the snd groups - this is necessary to match street segments to nodes
    # nx.connected_components() identifies disconnected graphs
    for cc in nx.connected_components(G = fg):
        # cc is the sub graph
        # this is the edge data
        edge_data_list = fg.subgraph(cc).edges.data()
        # this marks the street(s) in each sub graph
        snd_id_list = []
        for edl in edge_data_list:                        
            snd_id_list.append(edl[2]['snd_id'])
        snd_group_count += 1
        # this will update the gdf per street name with the groups of streets
        temp_gdf.loc[temp_gdf['snd_id'].isin(snd_id_list), 'snd_group'] = snd_group_count     
    
    snd_group_id_list = temp_gdf['snd_group'].unique()        
    
    if len(snd_group_id_list) > 1:
        # process for separate sub graphs
        node_snd_group_dict = {}
        # a node - an intersection - can have more than one street.
        # we need to update the node snd group dict for every street
        for ir, row in temp_gdf.iterrows():       
            fn = row['f_intr_id']
            tn = row['t_intr_id']
            snd_group_id = row['snd_group']
            node_snd_group_dict[fn] = snd_group_id
            node_snd_group_dict[tn] = snd_group_id                                             
        
        # create a list of available edges - these are the missing segments
        # these are formed from the non-edges of the graph.        
        avail_edges = []
        # dictionary to hold available edges
        # let's only make the distance calculation once, yeah?
        # it's a simple calculation, but even more simple to store it. 
        node_dist_dict = {}
        # enumerate the non-edges
        for ne in nx.non_edges(graph = fg):    
            # ne is a tuple of from / to nodes.
            # create available edges if the nodes are not on the same segment
            # this will decrease the potential solution space
            if node_snd_group_dict[ne[0]] != node_snd_group_dict[ne[1]]:
                # calculate the straight-line distance between two nodes. 
                # convert to feet to match the existing distance / weight
                weight = points2distance(node_dict[ne[0]], node_dict[ne[1]], unit = 'miles') * 5280
                # build the output tuple
                output = (ne[0], ne[1], {'weight':weight})
                # add to the distance dict
                node_dist_dict[(ne[0], ne[1])] = weight 
                node_dist_dict[(ne[1], ne[0])] = weight 
                avail_edges.append(output)

        # these are missing segments.
        # nx.k_edge_augmentation creates the missing edges in a graph by adding as
        # few edges as possible. In a street network, with multiple disconnected
        # components, there is really one way to minimally connect the disparate components
        # to create full connectivity. And it's the shortest geographic segment in this case!
        # weighted graph traversal works by accumulating as little weight as possible. 
        
        data_list = []
        line_list = []
        # once nx.k_edge_augmentation finishes, it returns a generator with the added edges that
        # ensure complete connectivity between all nodes.
        augmented_edges = nx.k_edge_augmentation(G = fg, k = 1, avail = avail_edges, weight = 'weight')    
        # enumberat
        for i_ae, ae in enumerate(augmented_edges):
            # unpack
            fn, tn = ae            
            # get the weight / distance of the added edge
            weight = node_dist_dict[(fn, tn)] 
            # this is the output dictionary
            # street name, integer indicating the snd_group, from node, to node, distance of the edge
            temp_data_list = [sn, i_ae, fn, tn, weight]
            # now, let's create some geometry
            temp_line = LineString([node_dict[fn], node_dict[tn]])
            line_list.append(temp_line)
            data_list.append(temp_data_list)

        # build a gpd.GeoDataFrame - these are the "missing" segments"
        ms_gdf = gpd.GeoDataFrame(data = data_list,
                                         columns = ['ord_stname_concat', 'snd_group', 'sn_id', 'en_id', 'dist'],
                                         geometry = line_list, crs = 'epsg:4326')
        # these edges are not on the same street group
        ms_gdf['same_snd_group'] = int(0)
        if ms_gdf.empty:
            trouble_list.append(sn)
            print(sn)
        
        # now, we need to get the known segments
        col_names = ['ord_stname_concat', 'snd_group',  'f_intr_id', 't_intr_id', 'gis_seg_length', 'geometry']    
        ks_gdf = temp_gdf[col_names].copy()
        ks_gdf['same_snd_group'] = int(1)
        ks_gdf = ks_gdf.rename(columns = {'f_intr_id':'sn_id', 't_intr_id':'en_id', 'gis_seg_length':'dist'})

        # stack the geodataframes
        output_gdf = pd.concat([ms_gdf, ks_gdf])
    
        # write intermediate
        if write_intermediate:
            # intermediate streets
            output_file_name = '_'.join(sn.split()) + '.gpkg'
            write_gdf(gdf = temp_gdf, output_file_path = output_file_path, output_file_name = output_file_name)
        
            # intermediate nodes
            curr_node_df = pd.DataFrame(data = {'node_id':fg.nodes()})
            curr_node_list = curr_node_df['node_id'].tolist()
            node_subset_gdf = subset_node_gdf(node_gdf = node_gdf, other_node_df = curr_node_df)
            output_file_name = 'full_nodes_' + '_'.join(sn.split()) + '.gpkg'
            write_gdf(gdf = node_subset_gdf, output_file_path = output_file_path, output_file_name = output_file_name)

            # the missing and known segments for a street
            output_file_name = 'missing_segments_' + '_'.join(sn.split()) + '.gpkg'
            ofpn = os.path.join(output_file_path, output_file_name)                
            output_gdf.to_file(filename = ofpn, driver = 'GPKG', index = False)       

    else:
        # gather the streets with no missing segments.         
        col_names = ['ord_stname_concat', 'snd_group', 'f_intr_id', 't_intr_id', 'gis_seg_length', 'geometry']    
        output_gdf = temp_gdf[col_names].copy()
        output_gdf['same_snd_group'] = int(-1)
        output_gdf = output_gdf.rename(columns = {'f_intr_id':'sn_id', 't_intr_id':'en_id', 'gis_seg_length':'dist'})

    # add to the output list
    output_gdf_list.append(output_gdf)

MERRILL LN NW


  output_gdf = pd.concat([ms_gdf, ks_gdf])


W LAWTON CIR


  output_gdf = pd.concat([ms_gdf, ks_gdf])


7TH AVE N


  output_gdf = pd.concat([ms_gdf, ks_gdf])


SEWARD PARK RD
NE RADFORD DR


  output_gdf = pd.concat([ms_gdf, ks_gdf])
  output_gdf = pd.concat([ms_gdf, ks_gdf])


NE WAHKIAKUM LN


  output_gdf = pd.concat([ms_gdf, ks_gdf])


In [38]:
# stack dataframes

In [39]:
ms_gdf = pd.concat(objs = output_gdf_list)

In [40]:
ms_gdf['dist_miles'] = ms_gdf['dist'] / 5280

In [41]:
# join in other street indentification data

In [42]:
ms_gdf = pd.merge(left = ms_gdf, right = gdf[['ord_stname_concat', 'ord_street_type', 'ord_street_name']].drop_duplicates())

In [43]:
ms_gdf.shape

(30524, 10)

In [44]:
for cn in ms_gdf.columns:
    print(cn, ms_gdf[cn].isna().unique())
    

ord_stname_concat [False]
snd_group [False]
sn_id [False]
en_id [False]
dist [False]
geometry [False]
same_snd_group [False]
dist_miles [False]
ord_street_type [False]
ord_street_name [False]


In [45]:
# reorder columns
col_names = ['ord_street_name','ord_stname_concat','ord_street_type','snd_group',
             'same_snd_group','sn_id','en_id','dist','dist_miles','geometry']

In [46]:
ms_gdf = ms_gdf[col_names]

In [47]:
# write it out!

In [48]:
output_file_path = 'H:/project/seattle_streets/data'
output_file_name = 'missing_segments.gpkg'
ofpn = os.path.join(output_file_path, output_file_name)    

ms_gdf.to_file(filename = ofpn, driver = 'GPKG', index = False)


In [49]:
# what's causing me grief?

In [50]:
trouble_list

['MERRILL LN NW',
 'W LAWTON CIR',
 '7TH AVE N',
 'SEWARD PARK RD',
 'NE RADFORD DR',
 'NE WAHKIAKUM LN']