In [1]:
# mike babb
# 2024 06 28
# what streets start and stop?

In [2]:
# standard
import os

In [3]:
# external
from itertools import combinations, product
import geopandas as gpd
import networkx as nx
import numpy as np
import pandas as pd
from shapely.geometry import LineString, Point
from shapely import line_merge
# from skspatial.objects import Point as sks_point

In [4]:
# custom
from geodataio.geo_operations import points2distance, calculate_initial_compass_bearing

# load the street network

In [5]:
# file path
input_file_path = 'H:/project/seattle_streets/data/' 
output_file_path = 'H:/project/seattle_streets/data/individual_streets'

In [6]:
file_name = 'Street_Network_Database.gpkg'

In [7]:
fpn = os.path.join(input_file_path, file_name)

In [8]:
gdf = gpd.read_file(filename = fpn)

In [9]:
gdf.columns

Index(['f_intr_id', 't_intr_id', 'snd_id', 'snd_feacod', 'citycode',
       'stname_id', 'st_code', 'arterial_c', 'segment_ty', 'agency_cod',
       'access_cod', 'divided_co', 'structure_', 'legalloc_c', 'vehicle_us',
       'gis_seg_length', 'l_adrs_from', 'l_adrs_to', 'r_adrs_from',
       'r_adrs_to', 'ord_pre_dir', 'ord_street', 'ord_street_type',
       'ord_suf_dir', 'ord_stname', 'l_city', 'l_state', 'l_zip', 'r_city',
       'r_state', 'r_zip', 'sndseg_upd', 'compkey', 'comptype', 'unitid',
       'unitid2', 'city_portion', 'ord_st_type', 'ord_street_st_type',
       'snd_group', 'geometry'],
      dtype='object')

In [10]:
# load the node data
input_file_name = 'Street_Network_Nodes.gpkg'

In [11]:
fpn = os.path.join(input_file_path, input_file_name)

In [12]:
node_gdf = gpd.read_file(filename = fpn)

In [13]:
node_gdf.head()

Unnamed: 0,node_id,geometry
0,13827,POINT (-122.40808 47.57771)
1,8721,POINT (-122.33029 47.65195)
2,3606,POINT (-122.28808 47.70834)
3,3784,POINT (-122.27712 47.70534)
4,14373,POINT (-122.40387 47.57132)


In [14]:
node_gdf['coords'] = node_gdf['geometry'].map(lambda x: x.coords[0])

In [15]:
node_gdf.head()

Unnamed: 0,node_id,geometry,coords
0,13827,POINT (-122.40808 47.57771),"(-122.40807905399998, 47.577706970000065)"
1,8721,POINT (-122.33029 47.65195),"(-122.33029050099998, 47.651945700000056)"
2,3606,POINT (-122.28808 47.70834),"(-122.28808025199999, 47.70834198600005)"
3,3784,POINT (-122.27712 47.70534),"(-122.27712366699996, 47.70533646400003)"
4,14373,POINT (-122.40387 47.57132),"(-122.40386783599996, 47.57132209200006)"


In [16]:
# zap this into a dictionary
node_dict = {}
for my_row in node_gdf.itertuples():
    node_dict[my_row.node_id] = my_row.coords

In [17]:
node_dict[10272]

(-122.39426507399997, 47.63229296900005)

# PASS 1: FOCUS ON JUST THE STREETS, REGARDLESS OF DIRECTION AND TYPE

In [18]:
gdf_agg = gdf[['ord_stname', 'snd_group']].drop_duplicates().groupby(['ord_stname']).agg( snd_group_count = ('snd_group', 'size')).reset_index()

In [19]:
gdf_agg.head()

Unnamed: 0,ord_stname,snd_group_count
0,10TH AVE,4
1,10TH AVE E,4
2,10TH AVE NE,2
3,10TH AVE NW,5
4,10TH AVE S,8


In [20]:
gdf_agg['snd_group_count'].describe()

count    2510.000000
mean        2.443028
std         2.353508
min         1.000000
25%         1.000000
50%         1.000000
75%         3.000000
max        15.000000
Name: snd_group_count, dtype: float64

In [21]:
# what are the streets with the most breaks?
gdf_agg.loc[gdf_agg['snd_group_count'] == gdf_agg['snd_group_count'].max(), :]

Unnamed: 0,ord_stname,snd_group_count
92,1ST AVE NW,15
203,30TH AVE S,15
253,35TH AVE S,15
2408,W RAYE ST,15


In [22]:
# number of streets with at least two segments
gdf_agg.loc[gdf_agg['snd_group_count'] > 1, :].shape

(1131, 2)

In [23]:
1131 / gdf_agg.shape[0]

0.450597609561753

In [24]:
gdf_agg.loc[gdf_agg['snd_group_count'] == 1, :].shape

(1379, 2)

In [25]:
1379 / gdf_agg.shape[0]

0.549402390438247

In [26]:
# 55 percent of streets are single segment
# 45 percent of streets are multi-segments.
# TODO: figure out how much of this is road miles.

In [27]:
# get start and end points of each line
#gdf['s_coord'] = gdf['geometry'].map(lambda x: x.coords[0])
#gdf['e_coord'] = gdf['geometry'].map(lambda x: x.coords[-1])

# BUILD THE ANTI-GRAPH OF DISCONNECTED SEGMENTS

In [28]:
def write_gdf(gdf: gpd.GeoDataFrame, output_file_path:str, output_file_name:str):
    
    ofpn = os.path.join(output_file_path, output_file_name)

    if 'coords' in gdf.columns:
        output_gdf = gdf.drop(labels = ['coords'], axis = 1)
        output_gdf.to_file(filename = ofpn, driver = 'GPKG', index = False)
    else:
        gdf.to_file(filename = ofpn, driver = 'GPKG', index = False)

    return None

In [29]:
def subset_node_gdf(node_gdf:gpd.GeoDataFrame, other_node_df:pd.DataFrame):
    node_subset_gdf = pd.merge(left = node_gdf, right = other_node_df)
    return node_subset_gdf

In [100]:
def create_graph_count_edges(gdf:gpd.GeoDataFrame, source_node:str, end_node:str):

    # create the graph for a single segment
    g = nx.from_pandas_edgelist(df = gdf, source = 'f_intr_id', target = 't_intr_id', edge_attr=True)
    node_list = list(g.nodes)

    # node distance, total hops, and number of edges per node
    node_dist_dict = {nn:0 for nn in node_list}
    node_hop_dict = {nn:0 for nn in node_list}
    # this will get the number of connected edges
    nn_edge_count_dict = {}
    
    # the snd of each node
    node_snd_group_dict = {}    
    snd_group_list = gdf['snd_group'].unique().tolist()
    for snd_group in snd_group_list:
        # use graph travesal to accumulate geographic distance
        curr_node_df = gdf.loc[gdf['snd_group'] == snd_group, ['f_intr_id', 't_intr_id']]
        curr_node_list = set(curr_node_df['f_intr_id'].tolist()).union(curr_node_df['t_intr_id'])                
        
        curr_node_snd_group_dict = {nn:snd_group for nn in curr_node_list}
        
        # combine the two dictionaries
        node_snd_group_dict = node_snd_group_dict | curr_node_snd_group_dict        
        
        # this will get the geographic distance and number of hops
        for sn, en in combinations(curr_node_list, r = 2):
            #print(sn, en)
            sp = nx.shortest_path(G = g, source = sn, target = en, weight = 'gis_seg_length')
            curr_dist = 0
            curr_hop_dist = 0        
            for i_nn, nn in enumerate(sp[:-1]):
                curr_dist += g[nn][sp[i_nn + 1]]['gis_seg_length']
                curr_hop_dist += 1            
            node_dist_dict[sn] += curr_dist
            node_dist_dict[en] += curr_dist
    
            node_hop_dict[sn] += curr_hop_dist
            node_hop_dict[en] += curr_hop_dist                
        
    for nn in node_list:
        nn_edge_count_dict[nn] = len(list(nx.dfs_edges(G = g, source=nn, depth_limit= 1)))
        
        
    # this gets the total geographic distance and total hops
    node_dist_df = pd.DataFrame.from_dict(data = node_dist_dict, orient = 'index', columns = ['tot_dist']).reset_index(names = ['node_id'])
    node_dist_df['tot_hop_dist'] = node_dist_df['node_id'].map(node_hop_dict)
    # number of edges
    node_dist_df['n_edges'] = node_dist_df['node_id'].map(nn_edge_count_dict)
    # add the snd_group
    node_dist_df['snd_group'] = node_dist_df['node_id'].map(node_snd_group_dict)        

    node_dist_df['avg_dist_per_hop'] = node_dist_df['tot_dist'] / node_dist_df['tot_hop_dist']
    # rank it!
    # greater distance indicates nodes further away from others
    node_dist_df['tot_dist_rank'] = node_dist_df.groupby(['snd_group'])['tot_dist'].rank(method = 'dense', ascending = False) 
    node_dist_df['tot_hop_dist_rank'] = node_dist_df.groupby(['snd_group'])['tot_hop_dist'].rank(method = 'dense', ascending = False)    
    # fewer edges indicates start and stop points
    node_dist_df['n_edges_rank'] = node_dist_df.groupby(['snd_group'])['n_edges'].rank(method = 'dense', ascending = True)
    # the rank per average distance - this also indicates greater distance
    node_dist_df['avg_dist_per_hop_rank'] = node_dist_df.groupby(['snd_group'])['avg_dist_per_hop'].rank(method = 'dense', ascending = True)
        
    # the nodes with the lowest combined ranks are the "start" and "end" points of the lines
    node_dist_df['combined_rank'] = node_dist_df['tot_dist_rank'] + node_dist_df['tot_hop_dist_rank'] + node_dist_df['n_edges_rank'] + node_dist_df['avg_dist_per_hop_rank']
    
    node_dist_df['final_rank'] = node_dist_df.groupby(['snd_group'])['combined_rank'].rank(method = 'dense', ascending = True)
    node_dist_df = node_dist_df.sort_values(by = ['snd_group', 'final_rank'], ascending = True)

    # build a dataframe that counts the number of edges per node:
    n_edge_count_df = node_dist_df.loc[node_dist_df['final_rank'] <= 2, :].copy()
    #print(n_edge_count_df.shape)

    node_snd_group_id_df = pd.DataFrame.from_dict(data = node_snd_group_dict, orient = 'index', columns = ['snd_group']).reset_index(names = ['node_id'])            
        
    #print(edge_count_df.head())
    #print(edge_count_df['snd_group'].value_counts())
    # update the dataframes of interest    
    #n_edge_count_df = n_edge_count_df.loc[-n_edge_count_df['node_id'].isin(drop_node_list), :].copy()
    
    snd_group_node_dict = {}
    snd_group_id_list = n_edge_count_df['snd_group'].unique().tolist()
    for snd_group in snd_group_id_list:        
        snd_group_node_dict[snd_group] = n_edge_count_df.loc[n_edge_count_df['snd_group'] == snd_group, 'node_id'].tolist()        

    
    return g, n_edge_count_df, node_dist_df, node_snd_group_id_df, node_snd_group_dict, snd_group_node_dict

In [110]:
sn = '10TH AVE E'

In [111]:
temp_gdf = gdf.loc[gdf['ord_stname'] == sn, :].copy()        

In [112]:
g, n_edge_count_df, node_dist_df, node_snd_group_id_df, node_snd_group_dict, snd_group_node_dict = create_graph_count_edges(gdf = temp_gdf, 
                                                                                                                  source_node = 'f_intr_id', 
                                                                                                                  end_node = 't_intr_id')


In [114]:
n_edge_count_df['snd_group'].value_counts()

snd_group
785    4
788    3
786    2
787    2
Name: count, dtype: int64

In [113]:
n_edge_count_df.head(n=10)

Unnamed: 0,node_id,tot_dist,tot_hop_dist,n_edges,snd_group,avg_dist_per_hop,tot_dist_rank,tot_hop_dist_rank,n_edges_rank,avg_dist_per_hop_rank,combined_rank,final_rank
0,9391,2713.773548,6,1,785,452.295591,2.0,1.0,1.0,2.0,6.0,1.0
5,9083,2720.725284,6,1,785,453.454214,1.0,1.0,1.0,3.0,6.0,1.0
1,9251,1770.910394,4,2,785,442.727598,3.0,2.0,2.0,1.0,8.0,2.0
2,9170,1770.910394,4,2,785,442.727598,3.0,2.0,2.0,1.0,8.0,2.0
3,11355,1278.209282,3,1,786,426.069761,2.0,1.0,1.0,1.0,5.0,1.0
10,11179,1278.318643,3,1,786,426.106214,1.0,1.0,1.0,3.0,6.0,2.0
8,11178,3798.011625,10,1,787,379.801163,2.0,1.0,1.0,1.0,5.0,1.0
11,10869,4015.3573,10,1,787,401.53573,1.0,1.0,1.0,3.0,6.0,2.0
14,9748,28365.374111,58,2,788,489.058174,6.0,3.0,2.0,1.0,12.0,1.0
12,9573,34300.508855,67,2,788,511.947893,4.0,2.0,2.0,6.0,14.0,2.0


In [None]:
# nope, we can use k-edge augmentation with the computed distances to figure this out
# https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.connectivity.edge_augmentation.k_edge_augmentation.html#networkx.algorithms.connectivity.edge_augmentation.k_edge_augmentation

In [108]:
s_names = gdf_agg.loc[gdf_agg['snd_group_count'] > 1, 'ord_stname'].unique().tolist()
#s_names = ['13TH AVE W']

In [109]:
did_it_work = False
output_gdf_list = []
# extra explode gdf
ex_gdf_list = []
ex_ex_gdf_list = []
for sn in s_names[:200]:
    # get a street by name
    print(sn)
    # subset the gdf
    temp_gdf = gdf.loc[gdf['ord_stname'] == sn, :].copy()        
    
    # build a graph to find (dis)connected components: this is an easy way to find portions of a street that are not connected to each other
    # do I need this graph?
    
    # we can do products of products to find which streets should be compared.
    # the first product to compare is the street group ids
        
    snd_group_id_list = temp_gdf['snd_group'].unique()
    #print(snd_group_id_list)
    output_file_name = '_'.join(sn.split()) + '.gpkg'
    write_gdf(gdf = temp_gdf, output_file_path = output_file_path, output_file_name = output_file_name)
    
    # hmmm, let's compute the combinations
    # figure out which nodes in a graph only have one successor
    g, n_edge_count_df, node_dist_df, node_snd_group_id_df, node_snd_group_dict, snd_group_node_dict = create_graph_count_edges(gdf = temp_gdf, 
                                                                                                                  source_node = 'f_intr_id', 
                                                                                                                  end_node = 't_intr_id')


    #n_edge_count_df = n_edge_count_df.loc[n_edge_count_df['tot_dist_rank'] <= 2, :]
    #print(

    # subset nodes
    node_subset_gdf = subset_node_gdf(node_gdf = node_gdf, other_node_df = n_edge_count_df)
    output_file_name = 'subset_nodes_' + '_'.join(sn.split()) + '.gpkg'
    write_gdf(gdf = node_subset_gdf, output_file_path = output_file_path, output_file_name = output_file_name)

    # full nodes
    curr_node_df = pd.DataFrame(data = {'node_id':g.nodes()})
    node_subset_gdf = subset_node_gdf(node_gdf = node_gdf, other_node_df = curr_node_df)
    output_file_name = 'full_nodes_' + '_'.join(sn.split()) + '.gpkg'
    write_gdf(gdf = node_subset_gdf, output_file_path = output_file_path, output_file_name = output_file_name)
    
         
    edge_list = []
    for snd_group in n_edge_count_df['snd_group'].unique().tolist():
        nodes = n_edge_count_df.loc[n_edge_count_df['snd_group']==snd_group, 'node_id'].tolist()
        weight = points2distance(node_dict[nodes[0]], node_dict[nodes[1]], unit = 'miles')
        nodes.append(weight)
        edge_list.append(nodes)
        
    
    edge_df = pd.DataFrame(data = edge_list, columns = ['sn_id', 'en_id', 'weight'])
    
    sg = nx.from_pandas_edgelist(df = edge_df, source = 'sn_id', target = 'en_id', edge_attr=True)
    
    avail_edges = []
    for ne in nx.non_edges(sg):    
        weight = points2distance(node_dict[ne[0]], node_dict[ne[1]], unit = 'miles')
        output = (ne[0], ne[1], {'weight':weight})
        avail_edges.append(output)
        
    data_list = []
    line_list = []
    augmented_edges = nx.k_edge_augmentation(G = sg, k = 1, avail = avail_edges, weight = 'weight')    
    for i_ne, ne in enumerate(augmented_edges):
        temp_data_list = [sn, i_ne, ne[0], ne[1]]
        temp_line = LineString([node_dict[ne[0]], node_dict[ne[1]]])
        line_list.append(temp_line)
        data_list.append(temp_data_list)

    my_output_gdf = gpd.GeoDataFrame(data = data_list,
                                     columns = ['ord_stname', 'as_id', 'sn_id', 'en_id'],
                                     geometry = line_list, crs = 'epsg:4326')
    
    my_output_gdf['same_snd_group'] = int(0)
    #my_output_gdf.loc[my_output_gdf['sn_snd_group_id'] == my_output_gdf['en_snd_group_id'], 'same_snd_group'] = 1 
    #print(my_output_gdf.head())
    
    
    output_file_name = 'missing_segments_' + '_'.join(sn.split()) + '.gpkg'
    ofpn = os.path.join(output_file_path, output_file_name)    
    
    my_output_gdf.to_file(filename = ofpn, driver = 'GPKG', index = False)

    output_gdf_list.append(my_output_gdf)
    did_it_work = True

10TH AVE
10TH AVE E


ValueError: 3 columns passed, passed data had 5 columns

In [None]:
ms_gdf = pd.concat(objs = output_gdf_list)

In [None]:
output_file_name = 'missing_segments.gpkg'
ofpn = os.path.join(output_file_path, output_file_name)    

ms_gdf.to_file(filename = ofpn, driver = 'GPKG', index = False)

In [None]:
did_it_work

In [None]:
node_snd_group_id_df.shape

In [None]:
node_snd_group_id_df = node_snd_group_id_df.sort_values(by = ['snd_group', 'node_id'])

In [None]:
node_snd_group_id_df.head(n=20)

In [None]:
np.argmax(a=[1,2])

In [None]:
node_dist_df.head()

In [None]:
sn_id_agg_df = node_dist_df[['sn_id', 'dist_miles']].groupby(['sn_id']).agg(tot_dist = ('dist_miles', 'sum'))

sn_id_agg_df['tot_dist'].max()

en_id_agg_df = node_dist_df[['en_id', 'dist_miles']].groupby(['en_id']).agg(tot_dist = ('dist_miles', 'sum'))

en_id_agg_df['tot_dist'].max()

In [None]:
# TODO: need a portion to flatten each line string, but keep the known vertices