# Part 03: Merge data and prep to find discontinuities
michael babb  
2025 02 18

In [None]:
# standard
import os

In [None]:
# external
import geopandas as gpd
import networkx as nx
import numpy as np
import pandas as pd
from shapely.geometry import LineString, Point

In [None]:
# custom
import run_constants as rc
from utils import *
from geodataio import df_operations as dfo

# Load the working Seattle street network data

In [None]:
fpn = os.path.join(rc.OUTPUT_FILE_PATH, rc.S03_SND_WORKING_IN_FILE_NAME)

In [None]:
gdf = gpd.read_file(filename = fpn)

In [None]:
gdf.shape

## load the manually created street group data
These data were created in Step 6, but we'll bring them into the Step 3 workflow.

In [None]:
fpn = os.path.join(rc.INPUT_FILE_PATH, rc.S03_STREET_GROUP_IN_FILE_NAME)

In [None]:
index_cols = ['sort_order', 'ord_street_name', 'ord_street_type', 'city_sector']
sg_df = pd.read_excel(io = fpn, index_col=[0, 1, 2, 3])
sg_df = sg_df.reset_index()

In [None]:
drop_cols = ['sort_order', 'progress', 'count']
sg_df = sg_df.drop(labels = drop_cols, axis = 1)

In [None]:
# Left join: every row of gdf is kept. No `on=` is given, so pandas joins on
# all column names that gdf and sg_df share. indicator=True adds a `_merge`
# column recording where each row came from.
gdf = pd.merge(left = gdf, right = sg_df, how = 'left', indicator=True)
# rows without a manual group assignment default to group 1
gdf['group_id'] = gdf['group_id'].fillna(1).astype(int)

In [None]:
# shape after the merge; a higher row count than before would mean the join
# duplicated rows
gdf.shape

In [None]:
# count of rows matched to the street group data ('both') vs. unmatched ('left_only')
gdf['_merge'].value_counts()

In [None]:
gdf.columns

In [None]:
# TODO (left off here 2025 01 25): review the missing values in the merged data
# hey_what_is_na comes from utils; presumably it reports NA values per column - confirm
hey_what_is_na(gdf)

In [None]:
# the merge indicator has been inspected above and is no longer needed
gdf = gdf.drop(labels = ["_merge"], axis = 1)

# create an id across street groups

In [None]:
col_names = ['ord_street_name', 'ord_street_type',  'ord_stname_concat', 'ord_stname_unique', 'city_sector', 'group_id']
id_df = gdf[col_names].drop_duplicates()

In [None]:
id_df['ord_stname_type'] = id_df['ord_street_name'] + ' ' + id_df['ord_street_type']

In [None]:
id_df['sort_order'] = id_df['ord_street_name'].map(get_sort_order)

In [None]:
id_df = id_df.sort_values(by = ['sort_order', 'ord_street_type', 'city_sector'])

In [None]:
id_df.head()

In [None]:
id_df.shape

In [None]:
# let's get the groups! can we aggregate this?

In [None]:
col_names = ['sort_order', 'ord_street_name', 'ord_street_type', 'ord_stname_type', 'group_id', 'city_sector']
agg_id_df = id_df[col_names].groupby(col_names[:-1]).agg(city_sector_group = ('city_sector', lambda x: '_'.join(sorted(set(x)))),
                                                        n_groups = ('city_sector', lambda x: len(set(x)))).reset_index()

In [None]:
agg_id_df.head()

In [None]:
agg_id_df['city_sector_group'].unique().shape

In [None]:
agg_id_df.loc[agg_id_df['n_groups'] > 1, ].shape

In [None]:
agg_id_df.head()

In [None]:
# import the revised city_sector_group

In [None]:
fpn = os.path.join(rc.INPUT_FILE_PATH, 'city_sector_check_working.xlsx')

In [None]:
cpg_df = pd.read_excel(io = fpn)

In [None]:
# build a row selector. Less code!
row_select = cpg_df['city_sector_group_revised'].isna()

In [None]:
cpg_df.loc[row_select, 'city_sector_group_revised'] = cpg_df.loc[row_select, 'city_sector_group']

In [None]:
cpg_df.head()

In [None]:
agg_id_df.head()

In [None]:
test_join = pd.merge(left = agg_id_df, right = cpg_df, how = 'outer', indicator=True)

In [None]:
test_join['_merge'].value_counts()

In [None]:
test_join.head()

In [None]:
agg_id_df = test_join.drop(labels = ['city_sector_group', '_merge'], axis = 1)

In [None]:
hey_what_is_na(df = agg_id_df)    

In [None]:
# Attach the revised sector groups to gdf. No `on=` is given, so the join uses
# every column name that gdf and agg_id_df share.
# NOTE(review): how='outer' also keeps rows that exist only in agg_id_df
# ('right_only'); such rows would carry no values for the gdf-only columns.
# Confirm the value counts below show none before relying on the output.
test_join = pd.merge(left = gdf, right = agg_id_df, how = 'outer', indicator=True)

In [None]:
test_join['_merge'].value_counts()

In [None]:
# expose the revised value under the plain city_sector_group name
test_join['city_sector_group'] = test_join['city_sector_group_revised']

In [None]:
# drop the helper columns and make the merged result the working gdf
gdf = test_join.drop(labels = ['_merge', 'city_sector_group_revised', 'n_groups'], axis = 1)

In [None]:
gdf.head()

In [None]:
gdf.columns

In [None]:
# label made of ord_stname_type and city_sector_group separated by a space
gdf['ord_stname_type_group'] = gdf['ord_stname_type'] + ' ' + gdf['city_sector_group']

In [None]:
# write the finished gdf to the output directory using the project's write_gdf helper
write_gdf(
    gdf=gdf,
    output_file_path=rc.OUTPUT_FILE_PATH,
    output_file_name=rc.S03_CLEANED_STREET_GROUP_OUT_FILE_NAME,
)