In [1]:
# mike babb
# 2024 06 28
# what streets start and stop?

In [2]:
# standard
import os

In [3]:
# external
from itertools import combinations, product
import geopandas as gpd
import networkx as nx
import numpy as np
import pandas as pd
from shapely.geometry import LineString, Point
from shapely import line_merge
# from skspatial.objects import Point as sks_point

In [4]:
# custom
from geodataio.geo_operations import points2distance, calculate_initial_compass_bearing

# load the street network

In [5]:
# directories: the source data, and the folder the per-street files are written to
input_file_path = 'H:/project/seattle_streets/data/'
output_file_path = os.path.join(input_file_path, 'individual_streets')

In [6]:
file_name = 'Street_Network_Database.gpkg'

In [7]:
fpn = os.path.join(input_file_path, file_name)

In [8]:
gdf = gpd.read_file(filename = fpn)

In [9]:
gdf.columns

Index(['f_intr_id', 't_intr_id', 'snd_id', 'snd_feacod', 'citycode',
       'stname_id', 'st_code', 'arterial_c', 'segment_ty', 'agency_cod',
       'access_cod', 'divided_co', 'structure_', 'legalloc_c', 'vehicle_us',
       'gis_seg_length', 'l_adrs_from', 'l_adrs_to', 'r_adrs_from',
       'r_adrs_to', 'ord_pre_dir', 'ord_street', 'ord_street_type',
       'ord_suf_dir', 'ord_stname', 'l_city', 'l_state', 'l_zip', 'r_city',
       'r_state', 'r_zip', 'sndseg_upd', 'compkey', 'comptype', 'unitid',
       'unitid2', 'city_portion', 'ord_st_type', 'ord_street_st_type',
       'snd_group', 'geometry'],
      dtype='object')

In [10]:
# load the node data
input_file_name = 'Street_Network_Nodes.gpkg'

In [11]:
fpn = os.path.join(input_file_path, input_file_name)

In [12]:
node_gdf = gpd.read_file(filename = fpn)

In [13]:
node_gdf.head()

Unnamed: 0,node_id,geometry
0,13827,POINT (-122.40808 47.57771)
1,8721,POINT (-122.33029 47.65195)
2,3606,POINT (-122.28808 47.70834)
3,3784,POINT (-122.27712 47.70534)
4,14373,POINT (-122.40387 47.57132)


In [14]:
# store the first (x, y) coordinate tuple of each node geometry in its own column
node_gdf['coords'] = [geom.coords[0] for geom in node_gdf['geometry']]

In [15]:
node_gdf.head()

Unnamed: 0,node_id,geometry,coords
0,13827,POINT (-122.40808 47.57771),"(-122.40807905399998, 47.577706970000065)"
1,8721,POINT (-122.33029 47.65195),"(-122.33029050099998, 47.651945700000056)"
2,3606,POINT (-122.28808 47.70834),"(-122.28808025199999, 47.70834198600005)"
3,3784,POINT (-122.27712 47.70534),"(-122.27712366699996, 47.70533646400003)"
4,14373,POINT (-122.40387 47.57132),"(-122.40386783599996, 47.57132209200006)"


In [16]:
# lookup table: node_id -> coordinate tuple
# (if a node_id repeats, the last row wins, same as filling the dict row by row)
node_dict = dict(zip(node_gdf['node_id'], node_gdf['coords']))

In [17]:
node_dict[10272]

(-122.39426507399997, 47.63229296900005)

# PASS 1: FOCUS ON JUST THE STREETS, REGARDLESS OF DIRECTION AND TYPE

In [18]:
# one row per street name, with the number of distinct snd_group values on it
gdf_agg = (
    gdf[['ord_stname', 'snd_group']]
    .drop_duplicates()
    .groupby(['ord_stname'])
    .agg(snd_group_count = ('snd_group', 'size'))
    .reset_index()
)

In [19]:
gdf_agg.head()

Unnamed: 0,ord_stname,snd_group_count
0,10TH AVE,4
1,10TH AVE E,4
2,10TH AVE NE,2
3,10TH AVE NW,5
4,10TH AVE S,8


In [20]:
gdf_agg['snd_group_count'].describe()

count    2510.000000
mean        2.443028
std         2.353508
min         1.000000
25%         1.000000
50%         1.000000
75%         3.000000
max        15.000000
Name: snd_group_count, dtype: float64

In [21]:
# what are the streets with the most breaks?
gdf_agg.loc[gdf_agg['snd_group_count'] == gdf_agg['snd_group_count'].max(), :]

Unnamed: 0,ord_stname,snd_group_count
92,1ST AVE NW,15
203,30TH AVE S,15
253,35TH AVE S,15
2408,W RAYE ST,15


In [22]:
# number of streets with at least two segments
gdf_agg.loc[gdf_agg['snd_group_count'] > 1, :].shape

(1131, 2)

In [23]:
# share of street names with more than one snd_group; the numerator is computed
# from gdf_agg instead of being copied from a previous cell's output, so it
# cannot go stale when the input data changes
gdf_agg.loc[gdf_agg['snd_group_count'] > 1, :].shape[0] / gdf_agg.shape[0]

0.450597609561753

In [24]:
gdf_agg.loc[gdf_agg['snd_group_count'] == 1, :].shape

(1379, 2)

In [25]:
# share of street names with exactly one snd_group; the numerator is computed
# from gdf_agg instead of being copied from a previous cell's output, so it
# cannot go stale when the input data changes
gdf_agg.loc[gdf_agg['snd_group_count'] == 1, :].shape[0] / gdf_agg.shape[0]

0.549402390438247

In [26]:
# 55 percent of streets are single segment
# 45 percent of streets are multi-segments.
# TODO: figure out how much of this is road miles.

In [27]:
gdf.head()

Unnamed: 0,f_intr_id,t_intr_id,snd_id,snd_feacod,citycode,stname_id,st_code,arterial_c,segment_ty,agency_cod,...,sndseg_upd,compkey,comptype,unitid,unitid2,city_portion,ord_st_type,ord_street_st_type,snd_group,geometry
0,13827,13823,26941,1,1,2734,0,0,1,1,...,2005-05-04,22863,68,18205,590,SW,ST,STEVENS ST,0,"LINESTRING (-122.40808 47.57771, -122.40824 47..."
1,8721,8722,11377,1,1,1605,0,0,1,1,...,2005-05-04,15349,68,12005,240,N,ST,37TH ST,7,"LINESTRING (-122.33029 47.65195, -122.32977 47..."
2,3606,3608,6656,5,1,1702,0,1,1,1,...,2005-05-04,16291,68,12865,380,NE,ST,110TH ST,8,"LINESTRING (-122.28808 47.70834, -122.28674 47..."
3,3784,3767,6770,1,1,1697,0,0,1,1,...,2005-05-04,16261,68,12840,480,NE,ST,106TH ST,9,"LINESTRING (-122.27712 47.70534, -122.27604 47..."
4,14373,14456,27361,1,1,459,0,0,1,1,...,2007-04-22,7763,68,4260,370,SW,AVE,56TH AVE,16,"LINESTRING (-122.40387 47.57132, -122.40387 47..."


In [28]:
# get start and end points of each line
#gdf['s_coord'] = gdf['geometry'].map(lambda x: x.coords[0])
#gdf['e_coord'] = gdf['geometry'].map(lambda x: x.coords[-1])

In [29]:
gdf.head()

Unnamed: 0,f_intr_id,t_intr_id,snd_id,snd_feacod,citycode,stname_id,st_code,arterial_c,segment_ty,agency_cod,...,sndseg_upd,compkey,comptype,unitid,unitid2,city_portion,ord_st_type,ord_street_st_type,snd_group,geometry
0,13827,13823,26941,1,1,2734,0,0,1,1,...,2005-05-04,22863,68,18205,590,SW,ST,STEVENS ST,0,"LINESTRING (-122.40808 47.57771, -122.40824 47..."
1,8721,8722,11377,1,1,1605,0,0,1,1,...,2005-05-04,15349,68,12005,240,N,ST,37TH ST,7,"LINESTRING (-122.33029 47.65195, -122.32977 47..."
2,3606,3608,6656,5,1,1702,0,1,1,1,...,2005-05-04,16291,68,12865,380,NE,ST,110TH ST,8,"LINESTRING (-122.28808 47.70834, -122.28674 47..."
3,3784,3767,6770,1,1,1697,0,0,1,1,...,2005-05-04,16261,68,12840,480,NE,ST,106TH ST,9,"LINESTRING (-122.27712 47.70534, -122.27604 47..."
4,14373,14456,27361,1,1,459,0,0,1,1,...,2007-04-22,7763,68,4260,370,SW,AVE,56TH AVE,16,"LINESTRING (-122.40387 47.57132, -122.40387 47..."


# BUILD THE ANTI-GRAPH OF DISCONNECTED SEGMENTS

In [30]:
# names of the streets that have more than one snd_group
multi_group_mask = gdf_agg['snd_group_count'] > 1
s_names = gdf_agg.loc[multi_group_mask, 'ord_stname'].unique().tolist()

In [37]:
# debugging override: replaces the list of multi-group streets built above
# with a single street, so the loop below processes only this one
s_names = ['W GALER ST']

In [43]:
# For every street name in s_names:
#   1. write the street's segments to their own GeoPackage,
#   2. find the dead-end nodes (nodes with exactly one neighbor in the street's graph),
#   3. compute the distance between every pair of dead-end nodes (the "anti-graph"),
#   4. pick the dead-end node with the largest total distance to all the others.
# The data frames built in the last iteration stay available after the loop
# (node_snd_group_id_df, node_dist_df, sn_id_agg_df, en_id_agg_df, ...).

# result containers; nothing is appended to them yet
output_gdf_list = []
# extra explode gdf
ex_gdf_list = []
ex_ex_gdf_list = []
for sn in s_names:
    # get a street by name
    print(sn)
    # subset the gdf
    temp_gdf = gdf.loc[gdf['ord_stname'] == sn, :].copy()

    # the snd_group ids found on this street
    snd_group_id_list = temp_gdf['snd_group'].unique()
    print(snd_group_id_list)

    # write this street's segments to their own file
    output_file_name = '_'.join(sn.split()) + '.gpkg'
    ofpn = os.path.join(output_file_path, output_file_name)
    temp_gdf.to_file(filename = ofpn, driver = 'GPKG', index = False)

    # build a graph of the street and count the neighbors of every node.
    # a node with a single neighbor is where a piece of the street starts or stops.
    g = nx.from_pandas_edgelist(df = temp_gdf, source = 'f_intr_id', target = 't_intr_id', edge_attr=True)
    n_edge_list = [[nn, len(g[nn])] for nn in g.nodes()]

    n_edge_count_df = pd.DataFrame(data = n_edge_list, columns = ['node_id', 'n_edges'])
    n_edge_count_df = n_edge_count_df.loc[n_edge_count_df['n_edges'] == 1, :]

    # which snd_group is each dead-end node on?
    # a dead-end node can be the from-node or the to-node of its segment, so look in both columns.
    sn_snd_group_id_df = temp_gdf.loc[temp_gdf['f_intr_id'].isin(n_edge_count_df['node_id']), ['f_intr_id', 'snd_group']]
    sn_snd_group_id_df.columns = ['node_id', 'snd_group']
    en_snd_group_id_df = temp_gdf.loc[temp_gdf['t_intr_id'].isin(n_edge_count_df['node_id']), ['t_intr_id', 'snd_group']]
    en_snd_group_id_df.columns = ['node_id', 'snd_group']

    node_snd_group_id_df = pd.concat(objs = [sn_snd_group_id_df, en_snd_group_id_df])

    # this df is also a graph, it's the start and end points of each node.
    # can we do something to make this a little more easy to query?

    # these are the nodes that are either the start of the line or the end of the line
    se_nodes = n_edge_count_df['node_id'].tolist()
    if len(se_nodes) < 2:
        # no pair of dead-end nodes to measure, so there is no farthest node to pick
        print('fewer than two dead-end nodes, skipping:', sn)
        continue

    # now, we compute some distances: one row per unordered pair of dead-end nodes
    node_dist_list = []
    for sn_id, en_id in combinations(se_nodes, 2):
        node_dist = points2distance(node_dict[sn_id], node_dict[en_id], unit = 'miles')
        node_dist_list.append([sn_id, en_id, node_dist])

    node_dist_df = pd.DataFrame(data = node_dist_list,
                                columns = ['sn_id', 'en_id', 'dist_miles'])

    # so, this edge list is the anti-graph.
    # it contains the missing links and then some. What I need to do is find the shortest path that connects everything.
    # one-sided sums: a node's distances only for the pairs where it is listed
    # first (sn_id) or second (en_id). kept as module-level names, but NOT used
    # to pick the farthest node because each one is only a partial total.
    sn_id_agg_df = node_dist_df[['sn_id', 'dist_miles']].groupby(['sn_id']).agg(tot_dist = ('dist_miles', 'sum')).reset_index()
    en_id_agg_df = node_dist_df[['en_id', 'dist_miles']].groupby(['en_id']).agg(tot_dist = ('dist_miles', 'sum')).reset_index()

    # let's first start with the node that is the farthest away from everything.
    # combinations() emits each unordered pair once, so a node shows up in sn_id
    # for some pairs and in en_id for others. both columns have to be stacked
    # before summing, otherwise most nodes only get part of their total.
    node_tot_dist_df = pd.concat(objs = [
        node_dist_df[['sn_id', 'dist_miles']].rename(columns = {'sn_id': 'node_id'}),
        node_dist_df[['en_id', 'dist_miles']].rename(columns = {'en_id': 'node_id'})])
    node_tot_dist_df = node_tot_dist_df.groupby(['node_id']).agg(tot_dist = ('dist_miles', 'sum')).reset_index()

    max_dist = node_tot_dist_df['tot_dist'].max()
    fp_df = node_tot_dist_df.loc[node_tot_dist_df['tot_dist'] == max_dist, :]
    # on ties, take the first node
    fp = fp_df['node_id'].iloc[0]

    # TODO: which snd_group is fp on? node_snd_group_id_df maps dead-end nodes to their snd_group.
    # TODO: start the traversal at fp, only traversing nodes in the current segment,
    #       e.g. nx.dfs_successors(G = g, source = fp)
    # TODO: alternatively, compare the start and end nodes of every pair of
    #       snd_group ids: combinations(snd_group_id_list, 2)

W GALER ST
[162 163 164 165 166]


In [45]:
node_snd_group_id_df.shape

(10, 2)

In [48]:
node_snd_group_id_df = node_snd_group_id_df.sort_values(by = ['snd_group', 'node_id'])

In [49]:
node_snd_group_id_df.head(n=20)

Unnamed: 0,node_id,snd_group
22453,10290,162
26341,10324,162
14288,10284,163
25652,10351,163
15064,10276,164
23542,10277,164
20106,10239,165
20106,10270,165
20519,10361,166
24895,10366,166


In [None]:
np.argmax(a=[1,2])

In [None]:
node_dist_df.head()

In [None]:
# one-sided distance totals per node: sn_id_agg_df sums the pairs where the node
# is listed first, en_id_agg_df the pairs where it is listed second
sn_id_agg_df = node_dist_df[['sn_id', 'dist_miles']].groupby(['sn_id']).agg(tot_dist = ('dist_miles', 'sum'))

en_id_agg_df = node_dist_df[['en_id', 'dist_miles']].groupby(['en_id']).agg(tot_dist = ('dist_miles', 'sum'))

# show both maxima together: a bare expression in the middle of a cell is
# evaluated but never displayed, only the last expression is
sn_id_agg_df['tot_dist'].max(), en_id_agg_df['tot_dist'].max()

In [None]:
# TODO: need a portion to flatten each line string, but keep the known vertices