# Part 03: Merge data and prep to find discontinuities
michael babb  
2025 02 18

In [1]:
# standard
import os

In [None]:
# external
import geopandas as gpd
import numpy as np
import pandas as pd
from shapely.geometry import LineString, Point

In [3]:
# custom
import run_constants as rc
from geo_data_io.df_operations import hey_what_is_na
from geo_data_io.fc_df_spatial import write_gdf
from utils import get_sort_order

# load the working seattle street network data

In [4]:
fpn = os.path.join(rc.OUTPUT_FILE_PATH, rc.S03_SND_WORKING_IN_FILE_NAME)

In [5]:
gdf = gpd.read_file(filename = fpn)

In [6]:
gdf.shape

(25773, 39)

In [7]:
gdf.columns

Index(['f_intr_id', 't_intr_id', 'snd_id', 'snd_feacode', 'citycode',
       'stname_id', 'st_code', 'arterial_code', 'segment_type', 'agency_code',
       'access_code', 'divided_code', 'structure_type', 'legalloc_code',
       'vehicle_use_code', 'gis_seg_length', 'l_adrs_from', 'l_adrs_to',
       'r_adrs_from', 'r_adrs_to', 'ord_pre_dir', 'ord_street_name',
       'ord_street_type', 'ord_suf_dir', 'ord_stname_concat', 'l_city',
       'l_state', 'l_zip', 'r_city', 'r_state', 'r_zip', 'sndseg_update',
       'compkey', 'comptype', 'unitid', 'unitid2', 'city_sector',
       'ord_stname_unique', 'geometry'],
      dtype='object')

In [8]:
gdf['city_sector'].value_counts()

city_sector
S       6602
NE      4426
SW      3896
N       2919
NW      2498
E       1944
W       1833
CNTR    1655
Name: count, dtype: int64

# load the manually created street group data
These data were created in Step 6, but we'll bring them into the Step 3 workflow.

In [9]:
fpn = os.path.join(rc.INPUT_FILE_PATH, rc.S03_STREET_GROUP_IN_FILE_NAME)

In [10]:
index_cols = ['sort_order', 'ord_street_name', 'ord_street_type', 'city_sector']
sg_df = pd.read_excel(io = fpn, index_col=[0, 1, 2, 3])
sg_df = sg_df.reset_index()

In [11]:
sg_df.head()

Unnamed: 0,sort_order,ord_street_name,ord_street_type,city_sector,group_id,progress,count
0,1,1ST,AVE,CNTR,2,0.000824,1
1,1,1ST,AVE,N,2,0.001647,2
2,1,1ST,AVE,NE,2,0.002471,3
3,1,1ST,AVE,NW,1,0.003295,4
4,1,1ST,AVE,S,2,0.004119,5


In [12]:
sg_df['city_sector'].value_counts()

city_sector
S       223
NE      213
SW      189
NW      151
N       133
CNTR    119
W        96
E        90
Name: count, dtype: int64

In [13]:
drop_cols = ['sort_order', 'progress', 'count']
sg_df = sg_df.drop(labels = drop_cols, axis = 1)

In [14]:
sg_df.loc[
        (sg_df['ord_street_name']=='55TH') &
        (sg_df['ord_street_type'] == 'PL'), :]

Unnamed: 0,ord_street_name,ord_street_type,city_sector,group_id
452,55TH,PL,NE,1
453,55TH,PL,NW,2


In [15]:
sg_df.head()

Unnamed: 0,ord_street_name,ord_street_type,city_sector,group_id
0,1ST,AVE,CNTR,2
1,1ST,AVE,N,2
2,1ST,AVE,NE,2
3,1ST,AVE,NW,1
4,1ST,AVE,S,2


In [16]:
gdf = pd.merge(left = gdf, right = sg_df, how = 'left', indicator=True)
gdf['group_id'] = gdf['group_id'].fillna(1).astype(int)

In [17]:
sg_df['city_sector'].value_counts()

city_sector
S       223
NE      213
SW      189
NW      151
N       133
CNTR    119
W        96
E        90
Name: count, dtype: int64

In [18]:
gdf.shape

(25773, 41)

In [19]:
gdf['_merge'].value_counts()

_merge
both          19032
left_only      6741
right_only        0
Name: count, dtype: int64

In [20]:
gdf.columns

Index(['f_intr_id', 't_intr_id', 'snd_id', 'snd_feacode', 'citycode',
       'stname_id', 'st_code', 'arterial_code', 'segment_type', 'agency_code',
       'access_code', 'divided_code', 'structure_type', 'legalloc_code',
       'vehicle_use_code', 'gis_seg_length', 'l_adrs_from', 'l_adrs_to',
       'r_adrs_from', 'r_adrs_to', 'ord_pre_dir', 'ord_street_name',
       'ord_street_type', 'ord_suf_dir', 'ord_stname_concat', 'l_city',
       'l_state', 'l_zip', 'r_city', 'r_state', 'r_zip', 'sndseg_update',
       'compkey', 'comptype', 'unitid', 'unitid2', 'city_sector',
       'ord_stname_unique', 'geometry', 'group_id', '_merge'],
      dtype='object')

# Manually update groups of streets
In Northeast Seattle, there is a 55TH PL NE and NE 55TH PL.  
These are different streets, and not one street with a discontinuity.  
NE 55TH PL runs parallel to NE 55TH ST.  
55TH PL NE is curved, beginning at the northern end of 55TH AVE NE.

In [21]:
tgdf = gdf.loc[
        (gdf['ord_street_name']=='55TH') &
        (gdf['ord_street_type'] == 'PL'), :]

In [22]:
tgdf.head()

Unnamed: 0,f_intr_id,t_intr_id,snd_id,snd_feacode,citycode,stname_id,st_code,arterial_code,segment_type,agency_code,...,sndseg_update,compkey,comptype,unitid,unitid2,city_sector,ord_stname_unique,geometry,group_id,_merge
3414,5551,5553,7609,1,1,456,0,0,1,1,...,2007-04-22 00:00:00+00:00,7734,68,4225,750,NE,55TH PL NE,"LINESTRING (-122.26899 47.68334, -122.26899 47...",1,both
5390,6828,6913,9865,5,1,3461,0,1,1,1,...,2005-05-04 00:00:00+00:00,18443,68,14545,550,NW,NW 55TH PL,"LINESTRING (-122.36082 47.66943, -122.36091 47...",2,both
8152,6959,6960,13081,1,1,1864,0,0,1,1,...,2005-05-04 00:00:00+00:00,17020,68,13370,170,NE,NE 55TH PL,"LINESTRING (-122.30887 47.66894, -122.30709 47...",1,both


In [23]:
for cn in tgdf.columns:
    print(cn)
    print(tgdf[cn])

f_intr_id
3414    5551
5390    6828
8152    6959
Name: f_intr_id, dtype: int64
t_intr_id
3414    5553
5390    6913
8152    6960
Name: t_intr_id, dtype: int64
snd_id
3414     7609
5390     9865
8152    13081
Name: snd_id, dtype: int64
snd_feacode
3414    1
5390    5
8152    1
Name: snd_feacode, dtype: int64
citycode
3414    1
5390    1
8152    1
Name: citycode, dtype: int64
stname_id
3414     456
5390    3461
8152    1864
Name: stname_id, dtype: int64
st_code
3414    0
5390    0
8152    0
Name: st_code, dtype: int64
arterial_code
3414    0
5390    1
8152    0
Name: arterial_code, dtype: int64
segment_type
3414    1
5390    1
8152    1
Name: segment_type, dtype: int64
agency_code
3414    1
5390    1
8152    1
Name: agency_code, dtype: int64
access_code
3414    1
5390    1
8152    1
Name: access_code, dtype: int64
divided_code
3414    1
5390    1
8152    1
Name: divided_code, dtype: int64
structure_type
3414    1
5390    1
8152    1
Name: structure_type, dtype: int64
legalloc_code
3414   

In [24]:
# manually assign to a different groups
gdf.loc[gdf['ord_stname_concat'] == 'NE 55TH PL', 'group_id'] = 3

In [25]:
# TODO: THIS IS WHERE i LEFT OFF: 2025 01 25
hey_what_is_na(gdf)

unitid
False    25716
True        57
Name: count, dtype: int64
unitid2
False    25716
True        57
Name: count, dtype: int64


In [26]:
gdf = gdf.drop(labels = ["_merge"], axis = 1)

# create an id across street groups

In [27]:
col_names = ['ord_street_name', 'ord_street_type',  'ord_stname_concat', 'ord_stname_unique', 'city_sector', 'group_id']
id_df = gdf[col_names].drop_duplicates().reset_index(drop=True)

In [28]:
id_df['ord_stname_type'] = id_df['ord_street_name'] + ' ' + id_df['ord_street_type']

In [29]:
id_df['sort_order'] = id_df['ord_street_name'].map(get_sort_order)

In [30]:
id_df = id_df.sort_values(by = ['sort_order', 'ord_street_type', 'city_sector']).reset_index(drop = True)

In [31]:
id_df.head()

Unnamed: 0,ord_street_name,ord_street_type,ord_stname_concat,ord_stname_unique,city_sector,group_id,ord_stname_type,sort_order
0,1ST,AVE,1ST AVE,1ST AVE CNTR,CNTR,2,1ST AVE,1
1,1ST,AVE,1ST AVE N,1ST AVE N,N,2,1ST AVE,1
2,1ST,AVE,1ST AVE NE,1ST AVE NE,NE,2,1ST AVE,1
3,1ST,AVE,1ST AVE NW,1ST AVE NW,NW,1,1ST AVE,1
4,1ST,AVE,1ST AVE S,1ST AVE S,S,2,1ST AVE,1


In [32]:
id_df.shape

(2457, 8)

In [33]:
id_df.loc[
        (id_df['ord_street_name']=='55TH') &
        (id_df['ord_street_type'] == 'PL'), :]

Unnamed: 0,ord_street_name,ord_street_type,ord_stname_concat,ord_stname_unique,city_sector,group_id,ord_stname_type,sort_order
500,55TH,PL,55TH PL NE,55TH PL NE,NE,1,55TH PL,55
501,55TH,PL,NE 55TH PL,NE 55TH PL,NE,3,55TH PL,55
502,55TH,PL,NW 55TH PL,NW 55TH PL,NW,2,55TH PL,55


In [34]:
# let's get the groups! can we aggregate this?

In [35]:
col_names = ['sort_order', 'ord_street_name', 'ord_street_type', 'ord_stname_type', 'group_id', 'city_sector']
agg_id_df = id_df[col_names].groupby(col_names[:-1]).agg(
    city_sector_group = ('city_sector', lambda x: '_'.join(sorted(set(x)))),
    n_groups = ('city_sector', lambda x: len(set(x)))).reset_index()

In [36]:
agg_id_df.head()

Unnamed: 0,sort_order,ord_street_name,ord_street_type,ord_stname_type,group_id,city_sector_group,n_groups
0,1,1ST,AVE,1ST AVE,1,NW_SW_W,3
1,1,1ST,AVE,1ST AVE,2,CNTR_N_NE_S,4
2,1,1ST,PL,1ST PL,1,NE,1
3,2,2ND,AVE,2ND AVE,1,NW_SW_W,3
4,2,2ND,AVE,2ND AVE,2,CNTR_N_NE_S,4


In [37]:
agg_id_df.head()

Unnamed: 0,sort_order,ord_street_name,ord_street_type,ord_stname_type,group_id,city_sector_group,n_groups
0,1,1ST,AVE,1ST AVE,1,NW_SW_W,3
1,1,1ST,AVE,1ST AVE,2,CNTR_N_NE_S,4
2,1,1ST,PL,1ST PL,1,NE,1
3,2,2ND,AVE,2ND AVE,1,NW_SW_W,3
4,2,2ND,AVE,2ND AVE,2,CNTR_N_NE_S,4


In [38]:
agg_id_df.loc[
        (agg_id_df['ord_street_name']=='55TH') &
        (agg_id_df['ord_street_type'] == 'PL'), :]

Unnamed: 0,sort_order,ord_street_name,ord_street_type,ord_stname_type,group_id,city_sector_group,n_groups
244,55,55TH,PL,55TH PL,1,NE,1
245,55,55TH,PL,55TH PL,2,NW,1
246,55,55TH,PL,55TH PL,3,NE,1


# import the revised city_sector_group

In [39]:
fpn = os.path.join(rc.INPUT_FILE_PATH, 'city_sector_check_working.xlsx')

In [40]:
cpg_df = pd.read_excel(io = fpn)

In [41]:
cpg_df.head()

Unnamed: 0,city_sector_group,ord_street_type,n_groups,city_sector_group_revised
0,CNTR,AVE,1,
1,CNTR,BR,1,
2,CNTR,CT,1,
3,CNTR,DR,1,
4,CNTR,PL,1,


In [42]:
cpg_df.loc[cpg_df['ord_street_type'] == 'PL', :]

Unnamed: 0,city_sector_group,ord_street_type,n_groups,city_sector_group_revised
4,CNTR,PL,1,
33,E,PL,1,
48,N,PL,1,
59,NE,PL,1,
72,NW,PL,1,
90,S,PL,1,
102,SW,PL,1,
121,W,PL,1,


In [43]:
# build a row selector. Less code!
row_select = cpg_df['city_sector_group_revised'].isna()

In [44]:
row_select.value_counts()

city_sector_group_revised
True     87
False    39
Name: count, dtype: int64

In [45]:
cpg_df.loc[row_select, 'city_sector_group_revised'] = cpg_df.loc[row_select, 'city_sector_group']

In [46]:
cpg_df.head()

Unnamed: 0,city_sector_group,ord_street_type,n_groups,city_sector_group_revised
0,CNTR,AVE,1,CNTR
1,CNTR,BR,1,CNTR
2,CNTR,CT,1,CNTR
3,CNTR,DR,1,CNTR
4,CNTR,PL,1,CNTR


In [47]:
cpg_df.loc[cpg_df['ord_street_type'] == 'PL']

Unnamed: 0,city_sector_group,ord_street_type,n_groups,city_sector_group_revised
4,CNTR,PL,1,CNTR
33,E,PL,1,E
48,N,PL,1,N
59,NE,PL,1,NE
72,NW,PL,1,NW
90,S,PL,1,S
102,SW,PL,1,SW
121,W,PL,1,W


In [48]:
agg_id_df.head()

Unnamed: 0,sort_order,ord_street_name,ord_street_type,ord_stname_type,group_id,city_sector_group,n_groups
0,1,1ST,AVE,1ST AVE,1,NW_SW_W,3
1,1,1ST,AVE,1ST AVE,2,CNTR_N_NE_S,4
2,1,1ST,PL,1ST PL,1,NE,1
3,2,2ND,AVE,2ND AVE,1,NW_SW_W,3
4,2,2ND,AVE,2ND AVE,2,CNTR_N_NE_S,4


In [49]:
cpg_df.loc[cpg_df['ord_street_type'] == 'PL', ]

Unnamed: 0,city_sector_group,ord_street_type,n_groups,city_sector_group_revised
4,CNTR,PL,1,CNTR
33,E,PL,1,E
48,N,PL,1,N
59,NE,PL,1,NE
72,NW,PL,1,NW
90,S,PL,1,S
102,SW,PL,1,SW
121,W,PL,1,W


In [50]:
agg_id_df.loc[agg_id_df['ord_street_type'] == 'PL', 'city_sector_group'].value_counts()

city_sector_group
S       142
NE       95
SW       92
W        47
N        44
NW       42
E        31
CNTR     17
S_SW      1
Name: count, dtype: int64

In [51]:
test_join = pd.merge(left = agg_id_df, right = cpg_df, how = 'outer', indicator=True)

In [52]:
test_join['_merge'].value_counts()

_merge
both          1815
left_only        1
right_only       0
Name: count, dtype: int64

In [53]:
test_join.loc[test_join['_merge'] == 'left_only', :].head()

Unnamed: 0,sort_order,ord_street_name,ord_street_type,ord_stname_type,group_id,city_sector_group,n_groups,city_sector_group_revised,_merge
1109,4,4TH,PL,4TH PL,1,S_SW,2,,left_only


In [54]:
test_join.head()

Unnamed: 0,sort_order,ord_street_name,ord_street_type,ord_stname_type,group_id,city_sector_group,n_groups,city_sector_group_revised,_merge
0,BOREALIS,BOREALIS,AVE,BOREALIS AVE,1,CNTR,1,CNTR,both
1,ERIE,ERIE,AVE,ERIE AVE,1,CNTR,1,CNTR,both
2,EUCLID,EUCLID,AVE,EUCLID AVE,1,CNTR,1,CNTR,both
3,FULLERTON,FULLERTON,AVE,FULLERTON AVE,1,CNTR,1,CNTR,both
4,GRAND,GRAND,AVE,GRAND AVE,1,CNTR,1,CNTR,both


In [55]:
agg_id_df = test_join.drop(labels = ['city_sector_group', '_merge'], axis = 1)

In [56]:
hey_what_is_na(df = agg_id_df)    

city_sector_group_revised
False    1815
True        1
Name: count, dtype: int64


In [57]:
test_join = pd.merge(left = gdf, right = agg_id_df, how = 'outer', indicator=True)

In [58]:
test_join['_merge'].value_counts()

_merge
both          25773
left_only         0
right_only        0
Name: count, dtype: int64

In [59]:
test_join['city_sector_group'] = test_join['city_sector_group_revised']

In [60]:
gdf = test_join.drop(labels = ['_merge', 'city_sector_group_revised', 'n_groups'], axis = 1)

In [61]:
gdf.head()

Unnamed: 0,f_intr_id,t_intr_id,snd_id,snd_feacode,citycode,stname_id,st_code,arterial_code,segment_type,agency_code,...,comptype,unitid,unitid2,city_sector,ord_stname_unique,geometry,group_id,sort_order,ord_stname_type,city_sector_group
0,3836,3893,4787,5,1,1955,0,1,1,1,...,68,14050,40,NW,NW 100TH PL,"LINESTRING (-122.36206 47.7035, -122.36342 47....",1,100,100TH PL,NW
1,3893,3906,4802,5,1,1955,0,1,1,1,...,68,14050,43,NW,NW 100TH PL,"LINESTRING (-122.36342 47.70275, -122.36386 47...",1,100,100TH PL,NW
2,3906,3973,4864,5,1,1955,0,1,1,1,...,68,14050,70,NW,NW 100TH PL,"LINESTRING (-122.36386 47.70251, -122.36535 47...",1,100,100TH PL,NW
3,48444,48443,49871,1,0,3767,0,0,1,1,...,0,0,0,SW,SW 100TH PL,"LINESTRING (-122.3403 47.51334, -122.34036 47....",2,100,100TH PL,SW
4,3943,3939,4836,1,1,1956,0,0,1,1,...,68,14055,240,NW,NW 100TH ST,"LINESTRING (-122.38757 47.70155, -122.38762 47...",1,100,100TH ST,NW_N_NE


In [62]:
gdf.columns

Index(['f_intr_id', 't_intr_id', 'snd_id', 'snd_feacode', 'citycode',
       'stname_id', 'st_code', 'arterial_code', 'segment_type', 'agency_code',
       'access_code', 'divided_code', 'structure_type', 'legalloc_code',
       'vehicle_use_code', 'gis_seg_length', 'l_adrs_from', 'l_adrs_to',
       'r_adrs_from', 'r_adrs_to', 'ord_pre_dir', 'ord_street_name',
       'ord_street_type', 'ord_suf_dir', 'ord_stname_concat', 'l_city',
       'l_state', 'l_zip', 'r_city', 'r_state', 'r_zip', 'sndseg_update',
       'compkey', 'comptype', 'unitid', 'unitid2', 'city_sector',
       'ord_stname_unique', 'geometry', 'group_id', 'sort_order',
       'ord_stname_type', 'city_sector_group'],
      dtype='object')

In [63]:
gdf['ord_stname_type_group'] = gdf['ord_stname_type'] + ' ' + gdf['city_sector_group']

In [64]:
# export to disk
write_gdf(gdf = gdf, output_file_path=rc.OUTPUT_FILE_PATH, output_file_name=rc.S03_CLEANED_STREET_GROUP_OUT_FILE_NAME)