In [1]:
# mike babb
# 2024 06 28
# what streets start and stop?

In [2]:
# standard
import os

In [3]:
# external
from itertools import combinations, product
import geopandas as gpd
from matplotlib.patches import Rectangle
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
from shapely.geometry import LineString, Point
from shapely import line_merge

In [4]:
# custom
from geodataio.geo_operations import points2distance, calculate_initial_compass_bearing
from utils import *

# load the street network

In [5]:
# file paths
# input_file_path: directory that holds the GeoPackage files read by this notebook.
# output_file_path: directory for the optional per-street intermediate files.
# NOTE(review): both are absolute paths on a local H: drive - adjust before running elsewhere.
input_file_path = 'H:/project/seattle_streets/data/' 
output_file_path = 'H:/project/seattle_streets/data/individual_streets'

In [6]:
file_name = 'Street_Network_Database_Seattle_working.gpkg'

In [7]:
fpn = os.path.join(input_file_path, file_name)

In [8]:
gdf = gpd.read_file(filename = fpn)

In [9]:
gdf.columns

Index(['f_intr_id', 't_intr_id', 'snd_id', 'snd_feacode', 'citycode',
       'stname_id', 'st_code', 'arterial_code', 'segment_type', 'agency_code',
       'access_code', 'divided_code', 'structure_type', 'legalloc_code',
       'vehicle_use_code', 'gis_seg_length', 'l_adrs_from', 'l_adrs_to',
       'r_adrs_from', 'r_adrs_to', 'ord_pre_dir', 'ord_street_name',
       'ord_street_type', 'ord_suf_dir', 'ord_stname_concat', 'l_city',
       'l_state', 'l_zip', 'r_city', 'r_state', 'r_zip', 'sndseg_update',
       'compkey', 'comptype', 'unitid', 'unitid2', 'city_portion',
       'ord_st_type', 'snd_group', 'geometry'],
      dtype='object')

# load the node data

In [10]:
input_file_name = 'Street_Network_Nodes.gpkg'

In [11]:
fpn = os.path.join(input_file_path, input_file_name)

In [12]:
node_gdf = gpd.read_file(filename = fpn)

In [13]:
node_gdf.head()

Unnamed: 0,node_id,geometry
0,17740,POINT (-122.32287 47.52982)
1,11391,POINT (-122.32402 47.61849)
2,982,POINT (-122.29193 47.7649)
3,11695,POINT (-122.30782 47.61411)
4,6257,POINT (-122.38214 47.67456)


In [14]:
# store the first coordinate tuple of every node geometry in its own column
node_gdf['coords'] = [pt.coords[0] for pt in node_gdf['geometry']]

In [15]:
node_gdf.head()

Unnamed: 0,node_id,geometry,coords
0,17740,POINT (-122.32287 47.52982),"(-122.32287444824671, 47.529819957875)"
1,11391,POINT (-122.32402 47.61849),"(-122.32401964037119, 47.61848906651073)"
2,982,POINT (-122.29193 47.7649),"(-122.29192836110747, 47.764904310419276)"
3,11695,POINT (-122.30782 47.61411),"(-122.30781531879083, 47.61410595110372)"
4,6257,POINT (-122.38214 47.67456),"(-122.38214200036303, 47.67455801597557)"


In [16]:
# zap this into a dictionary: node_id -> coordinate tuple.
# zipping the two columns gives the same mapping as walking the rows with
# iterrows, without building a Series for every row.
node_dict = dict(zip(node_gdf['node_id'], node_gdf['coords']))

# COUNT UNIQUE STREET GROUPS

In [17]:
# street name, street type and snd_group id
col_names = ['ord_stname_concat', 'ord_st_type', 'snd_group']

# keep one row per distinct (name, type, snd_group) combination and then
# count how many snd_groups each (name, type) pair has
gdf_agg = (
    gdf[col_names]
    .drop_duplicates()
    .groupby(col_names[:-1])
    .agg(snd_group_count = ('snd_group', 'size'))
    .reset_index()
)

In [18]:
gdf_agg.shape

(2551, 3)

In [19]:
gdf_agg.head()

Unnamed: 0,ord_stname_concat,ord_st_type,snd_group_count
0,10TH AVE,AVE,4
1,10TH AVE E,AVE,4
2,10TH AVE NE,AVE,1
3,10TH AVE NW,AVE,5
4,10TH AVE S,AVE,8


In [20]:
gdf_agg['snd_group_count'].describe()

count    2551.000000
mean        2.444924
std         2.358133
min         1.000000
25%         1.000000
50%         1.000000
75%         3.000000
max        15.000000
Name: snd_group_count, dtype: float64

In [21]:
# number of streets that are split across more than one snd_group
is_split = gdf_agg['snd_group_count'] > 1
split_streets = len(gdf_agg[is_split])
split_streets

1159

In [22]:
# share of all streets that are split - a fraction between 0 and 1, not a percentage
split_streets / gdf_agg.shape[0]

0.45433163465307724

In [23]:
# number of streets that consist of exactly one snd_group
is_complete = gdf_agg['snd_group_count'] == 1
complete_streets = len(gdf_agg[is_complete])
complete_streets

1392

In [24]:
# share of all streets with a single snd_group - a fraction between 0 and 1, not a percentage
complete_streets / gdf_agg.shape[0]

0.5456683653469228

In [25]:
# 55 percent of streets are single segment
# 45 percent of streets are multi-segments.
# TODO: figure out how much of this is road miles.

In [26]:
# IDENTIFY THE STREET SUFFIX TYPE - IF NOT ALREADY DONE.

In [27]:
gdf.columns

Index(['f_intr_id', 't_intr_id', 'snd_id', 'snd_feacode', 'citycode',
       'stname_id', 'st_code', 'arterial_code', 'segment_type', 'agency_code',
       'access_code', 'divided_code', 'structure_type', 'legalloc_code',
       'vehicle_use_code', 'gis_seg_length', 'l_adrs_from', 'l_adrs_to',
       'r_adrs_from', 'r_adrs_to', 'ord_pre_dir', 'ord_street_name',
       'ord_street_type', 'ord_suf_dir', 'ord_stname_concat', 'l_city',
       'l_state', 'l_zip', 'r_city', 'r_state', 'r_zip', 'sndseg_update',
       'compkey', 'comptype', 'unitid', 'unitid2', 'city_portion',
       'ord_st_type', 'snd_group', 'geometry'],
      dtype='object')

In [28]:
gdf['ord_st_type'].isna().unique()

array([False])

# FIND MISSING SEGMENTS

In [29]:
# create a weight variable from the gis_segment_length variable
gdf['weight'] = gdf['gis_seg_length']

In [30]:
gdf.columns

Index(['f_intr_id', 't_intr_id', 'snd_id', 'snd_feacode', 'citycode',
       'stname_id', 'st_code', 'arterial_code', 'segment_type', 'agency_code',
       'access_code', 'divided_code', 'structure_type', 'legalloc_code',
       'vehicle_use_code', 'gis_seg_length', 'l_adrs_from', 'l_adrs_to',
       'r_adrs_from', 'r_adrs_to', 'ord_pre_dir', 'ord_street_name',
       'ord_street_type', 'ord_suf_dir', 'ord_stname_concat', 'l_city',
       'l_state', 'l_zip', 'r_city', 'r_state', 'r_zip', 'sndseg_update',
       'compkey', 'comptype', 'unitid', 'unitid2', 'city_portion',
       'ord_st_type', 'snd_group', 'geometry', 'weight'],
      dtype='object')

In [31]:
# get the list of unique street names
s_name = list(gdf_agg['ord_stname_concat'].unique())

In [32]:
# let's create another dictionary with the st name and the street type;

In [33]:
# street name -> street type lookup
st_name_type_dict = dict(zip(gdf_agg['ord_stname_concat'], gdf_agg['ord_st_type']))

In [34]:
gdf_agg['ord_st_type'].unique()

array(['AVE', 'PL', 'RD', 'BR', 'CT', 'WKWY', 'RN', 'LN', 'TER', 'ST',
       'CIR', 'WAY', 'AL', 'RR', 'DR', 'BLVD', 'STCR', 'IS', 'PKWY',
       'MALL', 'PZ', 'FLYOVER', 'SR', 'VIEW', 'LOOP'], dtype=object)

In [None]:
# For every street name: collect its known segments and, when the street is
# split across several snd_groups, the straight-line "missing" segments that
# connect those groups. One GeoDataFrame per street is appended to
# output_gdf_list. The same_snd_group column is 0 for an added (missing)
# segment, 1 for a known segment on a split street, and -1 for a segment on a
# street with a single snd_group.

# set to True to also write per-street files for inspection
write_intermediate = False
# only becomes True once the loop below has run to completion
did_it_work = False
output_gdf_list = []

# columns taken from the known street segments, and the names they get in the
# output so that they line up with the columns of the added segments
known_col_names = ['ord_stname_concat', 'ord_st_type', 'snd_group', 'f_intr_id', 't_intr_id', 'gis_seg_length', 'geometry']
known_col_rename = {'f_intr_id':'sn_id', 't_intr_id':'en_id', 'gis_seg_length':'dist'}

# slice s_name here (e.g. s_name[:10]) to try the loop on a few streets only
for sn in s_name:
    print(sn)
    # subset the gdf by streetname
    temp_gdf = gdf.loc[gdf['ord_stname_concat'] == sn, :].copy()

    if write_intermediate:
        output_file_name = '_'.join(sn.split()) + '.gpkg'
        write_gdf(gdf = temp_gdf, output_file_path = output_file_path, output_file_name = output_file_name)

    # create the graph for a single street network.
    # edge_attr = True attaches the remaining columns, weight included, to each edge
    fg = nx.from_pandas_edgelist(df = temp_gdf, source = 'f_intr_id', target = 't_intr_id', edge_attr=True)

    # the snd groups present on this street - this is necessary to
    # match street segments to nodes
    snd_group_id_list = temp_gdf['snd_group'].unique()

    # full nodes - all nodes in the line segments.
    if write_intermediate:
        curr_node_df = pd.DataFrame(data = {'node_id':fg.nodes()})
        node_subset_gdf = subset_node_gdf(node_gdf = node_gdf, other_node_df = curr_node_df)
        output_file_name = 'full_nodes_' + '_'.join(sn.split()) + '.gpkg'
        write_gdf(gdf = node_subset_gdf, output_file_path = output_file_path, output_file_name = output_file_name)

    # for now, only the streets with two or more snd groups
    if len(snd_group_id_list) > 1:
        # a node - an intersection - can be touched by more than one segment.
        # map every node to the snd group of a segment that touches it;
        # a later row overwrites the entry written by an earlier row
        node_snd_group_dict = {}
        for myrow in temp_gdf.itertuples():
            node_snd_group_dict[myrow.f_intr_id] = myrow.snd_group
            node_snd_group_dict[myrow.t_intr_id] = myrow.snd_group

        # create the list of available edges - the candidate missing segments.
        # these are formed from the non-edges of the graph.
        avail_edges = []
        # the distance of every candidate, keyed by the node pair in both
        # directions, so that the distance is only calculated once
        node_dist_dict = {}
        for u, v in nx.non_edges(fg):
            # only nodes that are in different snd groups form a candidate;
            # this decreases the potential solution space
            if node_snd_group_dict[u] != node_snd_group_dict[v]:
                # calculate the straight-line distance between the two nodes and
                # convert from miles to feet to match the existing weight
                weight = points2distance(node_dict[u], node_dict[v], unit = 'miles') * 5280
                node_dist_dict[(u, v)] = weight
                node_dist_dict[(v, u)] = weight
                avail_edges.append((u, v, {'weight':weight}))

        # these are the missing segments.
        # nx.k_edge_augmentation with k = 1 picks, from the available edges, a set
        # of edges that connects the graph, using the 'weight' entry of each
        # candidate. It returns a generator of the added (from node, to node) pairs.
        augmented_edges = nx.k_edge_augmentation(G = fg, k = 1, avail = avail_edges, weight = 'weight')

        # the street type is the same for every added edge of this street
        st_type = st_name_type_dict[sn]
        data_list = []
        line_list = []
        for i_ae, (fn, tn) in enumerate(augmented_edges):
            # the running number of the added edge is stored in the snd_group column
            data_list.append([sn, st_type, i_ae, fn, tn, node_dist_dict[(fn, tn)]])
            # straight line between the two nodes
            line_list.append(LineString([node_dict[fn], node_dict[tn]]))

        # build a gpd.GeoDataFrame - these are the "missing" segments
        ms_gdf = gpd.GeoDataFrame(data = data_list,
                                  columns = ['ord_stname_concat', 'ord_st_type', 'snd_group', 'sn_id', 'en_id', 'dist'],
                                  geometry = line_list, crs = 'epsg:4326')
        ms_gdf['same_snd_group'] = int(0)

        # now, we need to get the known segments
        ks_gdf = temp_gdf[known_col_names].copy()
        ks_gdf['same_snd_group'] = int(1)
        ks_gdf = ks_gdf.rename(columns = known_col_rename)

        # stack the geodataframes
        output_gdf = pd.concat([ms_gdf, ks_gdf])

        # write intermediate
        if write_intermediate:
            output_file_name = 'missing_segments_' + '_'.join(sn.split()) + '.gpkg'
            ofpn = os.path.join(output_file_path, output_file_name)
            output_gdf.to_file(filename = ofpn, driver = 'GPKG', index = False)

    else:
        # gather the streets with no missing segments. Solid segments
        output_gdf = temp_gdf[known_col_names].copy()
        output_gdf['same_snd_group'] = int(-1)
        output_gdf = output_gdf.rename(columns = known_col_rename)

    output_gdf_list.append(output_gdf)

did_it_work = True

In [None]:
# stack the per-street results into a single GeoDataFrame.
# ignore_index = True gives every row its own label; without it the row labels
# of the individual frames repeat, and repeated labels can break label-aligned
# operations and plotting further down.
ms_gdf = pd.concat(objs = output_gdf_list, ignore_index = True)

In [None]:
# segment length in miles (5,280 feet to the mile)
ms_gdf['dist_miles'] = ms_gdf['dist'].div(5280)

# save every segment - known and missing - to a single geopackage
output_file_path = 'H:/project/seattle_streets/data'
output_file_name = 'missing_segments.gpkg'
ofpn = os.path.join(output_file_path, output_file_name)
ms_gdf.to_file(filename = ofpn, driver = 'GPKG', index = False)

# show whether the segment-building loop ran to completion
did_it_work

In [None]:
ms_gdf.shape

In [None]:
ms_gdf.head()

In [None]:
# keep only the added (missing) segments for the group by and describe
wms_gdf = ms_gdf[ms_gdf['same_snd_group'] == 0].copy()

In [None]:
wms_gdf.groupby('ord_st_type')['dist_miles'].describe()

In [None]:
# LET'S MAKE A GRAPHIC SHOWING THE DISTRIBUTION OF ADDED STREETS

In [None]:
# count the rows in each same_snd_group category:
# 0 = added (missing) segment, 1 = known segment on a split street,
# -1 = segment on a street with a single snd_group
ms_gdf['same_snd_group'].value_counts()

In [None]:
gdf['ord_stname_concat'].unique().shape

# LONGEST MISSING STREETS

In [None]:
# dense rank of the missing segments by length; rank 1 is the longest and ties share a rank
wms_gdf['dist_rank'] = wms_gdf['dist'].rank(method = 'dense', ascending = False)

In [None]:
wms_gdf.loc[wms_gdf['dist_rank'] <= 10, 'ord_stname_concat'].tolist()

In [None]:
# base-10 log of the length in miles; this is the x variable of the distribution plots
wms_gdf['dist_miles_log'] = np.log10(wms_gdf['dist_miles'])

In [None]:
wms_gdf['dist_miles_log'].describe()

In [None]:
# candidate tick positions in feet
# a few short distances ...
short_marks = [100, 250, 500]
# ... every quarter mile up to one mile ...
quarter_mile_marks = [int((5280 * (q / 4))) for q in range(1, 5)]
# ... and every whole mile from one to five miles
whole_mile_marks = [int((5280 * (m / 1)) + (5280 * k)) for m in range(1, 2) for k in range(0, 5)]

my_start_list = short_marks + quarter_mile_marks + whole_mile_marks

# independent copy of the list
x_tick_list = list(my_start_list)

In [None]:
x_tick_list

In [None]:
# let's make a graphic showing the distances:
# a histogram of the log10 length (in miles) of the missing segments
sns.set_theme(style = "whitegrid")
f, ax = plt.subplots(figsize = (20, 5))

# draw on the axes created above rather than on whichever axes is current
my_plot = sns.histplot(data = wms_gdf, x = 'dist_miles_log',
                       color = 'darkgreen', bins = 100, ax = ax)

# y axis: counts from 0 to 175 in steps of 25
y_ticks = list(range(0, 176, 25))
y_tick_labels_formatted = ['{:,}'.format(ytl) for ytl in y_ticks]

my_plot.set_yticks(ticks = y_ticks)
my_plot.set_yticklabels(labels = y_tick_labels_formatted, rotation=0)

my_plot.set_title(label = "Histogram of missing segment length")
my_plot.set_xlabel(xlabel = 'Missing segment length')
my_plot.set_ylabel(ylabel = "Count")

# x axis: tick values are lengths in miles (feet are divided by 5280); the
# ticks are placed at their log10 because the plotted variable is log10(miles)
x_tick_labels = [50 / 5280, 100 / 5280, 250 / 5280, 500 / 5280, .25, .5, .75, 1, 1.5, 2, 3]
x_ticks = [np.log10(xtl) for xtl in x_tick_labels]

# F = feet, M = miles
x_tick_label_text = ['50 F', '100 F', '250 F',  '500 F', '1/4 M', '1/2 M', '3/4 M', '1 M', '1.5 M', '2 M', '3 M']

my_plot.set_xticks(ticks = x_ticks)
my_plot.set_xticklabels(labels = x_tick_label_text, rotation=0)

# build the path from its parts so the separator is right on every platform;
# a literal backslash is only a path separator on Windows
output_file_name = os.path.join('..', 'assets', 'dist_histogram.png')
print(output_file_name)
my_plot.get_figure().savefig(fname = output_file_name)

In [None]:
# restrict the missing segments to the AVE and ST street types
temp_wms_gdf = wms_gdf[wms_gdf['ord_st_type'].isin(['AVE', 'ST'])]

In [None]:
# density of the log10 missing-segment length, one filled curve per street type
sns.set_theme(style = "whitegrid")
f, ax = plt.subplots(figsize = (20, 5))

my_plot = sns.kdeplot(
    data = temp_wms_gdf,
    x = 'dist_miles_log',
    hue = 'ord_st_type',
    fill = True,
    ax = ax,
)

In [None]:
file_name = '../testo.geojson'

In [None]:
# every AVE / ST missing segment whose street name contains 'GALER'.
# str.find returns 0 for a name that starts with 'GALER' (and -1 when it is
# absent), so a `> 0` test drops names that begin with it; str.contains matches
# the substring at any position. regex = False takes the pattern literally and
# na = False treats a missing name as a non-match.
tt = temp_wms_gdf.loc[temp_wms_gdf['ord_stname_concat'].str.contains('GALER', regex = False, na = False), :].copy()

In [None]:
# write the GALER subset for inspection.
# 'GeoJSON' is the canonical short name of the driver; NOTE(review): some I/O
# engines look the name up case-sensitively, so the lower-case spelling may be rejected.
tt.to_file(filename = file_name, driver = 'GeoJSON', index = False)

In [None]:
# Label and save the AVE / ST density plot held in my_plot.
# Everything goes through the Axes object instead of plt.*: this cell runs
# after the cell that created the figure, and plt.* acts on whichever figure
# is current at that moment, which need not be the one being saved.
my_plot.set_title(label = "Density of missing segment length by street type")
my_plot.set_xlabel(xlabel = 'Missing segment length')
# the y axis of a kde plot is a density, so the count ticks (0 to 175) used
# for the histogram are not applied here
my_plot.set_ylabel(ylabel = "Density")

# x axis: tick values are lengths in miles (feet are divided by 5280); the
# ticks are placed at their log10 because the plotted variable is log10(miles)
x_tick_labels = [50 / 5280, 100 / 5280, 250 / 5280, 500 / 5280, .25, .5, .75, 1, 1.5, 2, 3]
x_ticks = [np.log10(xtl) for xtl in x_tick_labels]

# F = feet, M = miles
x_tick_label_text = ['50 F', '100 F', '250 F',  '500 F', '1/4 M', '1/2 M', '3/4 M', '1 M', '1.5 M', '2 M', '3 M']

my_plot.set_xticks(ticks = x_ticks)
my_plot.set_xticklabels(labels = x_tick_label_text, rotation=0)

# save under its own name so that the histogram image is not overwritten;
# the path is built from its parts so the separator is right on every platform
output_file_name = os.path.join('..', 'assets', 'dist_kde.png')
print(output_file_name)
my_plot.get_figure().savefig(fname = output_file_name)