# Part 06A: Prep data for export for analysis for v2
michael babb  
2024 11 24

In [24]:
# standard
import os
import re
import collections

In [25]:
# external
import geopandas as gpd
import matplotlib as mpl
from matplotlib.gridspec import GridSpec
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [26]:
# custom
import run_constants as rc
from utils import *

In [27]:
# process flow variables


# load the street network data

In [28]:
rc.S03_SND_WORKING_IN_FILE_NAME

'Street_Network_Database_Seattle_working.gpkg'

In [29]:
# load the list of streets - this is before any work is done to compute the missing sections. 
fpn = os.path.join(rc.OUTPUT_FILE_PATH, rc.S03_SND_WORKING_IN_FILE_NAME)

In [30]:
gdf = gpd.read_file(filename = fpn)

In [31]:
gdf.columns

Index(['f_intr_id', 't_intr_id', 'snd_id', 'snd_feacode', 'citycode',
       'stname_id', 'st_code', 'arterial_code', 'segment_type', 'agency_code',
       'access_code', 'divided_code', 'structure_type', 'legalloc_code',
       'vehicle_use_code', 'gis_seg_length', 'l_adrs_from', 'l_adrs_to',
       'r_adrs_from', 'r_adrs_to', 'ord_pre_dir', 'ord_street_name',
       'ord_street_type', 'ord_suf_dir', 'ord_stname_concat', 'l_city',
       'l_state', 'l_zip', 'r_city', 'r_state', 'r_zip', 'sndseg_update',
       'compkey', 'comptype', 'unitid', 'unitid2', 'city_portion', 'geometry'],
      dtype='object')

In [32]:
# constant counter column: every segment row counts as one road
gdf['n_roads'] = 1

In [33]:

gdf['ord_stname_concat'].unique().shape

(2455,)

# group data by street type for use in version 2.0

In [34]:
# keep one row per distinct combination of street name, street type,
# concatenated street name and city portion
col_names = ['ord_street_name', 'ord_street_type', 'ord_stname_concat', 'city_portion']
st_name_type_df = gdf[col_names].drop_duplicates()

In [35]:
my_counter = collections.Counter(st_name_type_df['ord_stname_concat'])


In [36]:
my_counter.most_common()

[('UNIVERSITY BR', 2),
 ('AURORA BR', 2),
 ('BALLARD BR', 2),
 ('8TH AVE S', 1),
 ('BELMONT AVE E', 1),
 ('NW 63RD ST', 1),
 ('MCCLINTOCK AVE S', 1),
 ('NORTH PARK AVE N', 1),
 ('8TH AVE', 1),
 ('37TH AVE S', 1),
 ('25TH AVE NE', 1),
 ('S WAITE ST', 1),
 ('NE 107TH ST', 1),
 ('NW 42ND ST', 1),
 ('LINDEN AVE N', 1),
 ('NE 40TH ST', 1),
 ('FAUNTLEROY PL SW', 1),
 ('W GALER ST', 1),
 ('SW WALKER ST', 1),
 ('20TH AVE NW', 1),
 ('S BYRON ST', 1),
 ('S FOREST ST', 1),
 ('VICTORY LN NE', 1),
 ('18TH AVE SW', 1),
 ('26TH AVE NW', 1),
 ('29TH AVE S', 1),
 ('27TH AVE NE', 1),
 ('17TH AVE NW', 1),
 ('KIRKWOOD PL N', 1),
 ('7TH AVE NW', 1),
 ('ROWAN RD S', 1),
 ('28TH AVE SW', 1),
 ('S 130TH ST', 1),
 ('38TH AVE E', 1),
 ('E MARION ST', 1),
 ('S HOLLY ST', 1),
 ('N 95TH ST', 1),
 ('E OLIVE ST', 1),
 ('E LOUISA ST', 1),
 ('E REPUBLICAN ST', 1),
 ('15TH AVE SW', 1),
 ('5TH AVE W', 1),
 ('45TH AVE NE', 1),
 ('W RAYE ST', 1),
 ('S DAKOTA ST', 1),
 ('NE 91ST ST', 1),
 ('NW 70TH ST', 1),
 ('18TH AVE NE'

In [None]:
# the number of unique streets
st_name_type_df.shape

In [19]:
st_name_type_df = st_name_type_df.sort_values(by = ['ord_street_name', 'ord_street_type', 'ord_stname_concat'])

In [None]:
st_name_type_df.head(n=10)

In [None]:
# give every distinct ord_street_name a sequential id (osn_id), numbered from
# zero in sorted name order
osn_df = (
    st_name_type_df[['ord_street_name']]
    .drop_duplicates()
    .sort_values(by = 'ord_street_name')
    .reset_index(drop = True)
)
osn_df['osn_id'] = range(len(osn_df))
osn_df.head()

In [None]:
# ids for each distinct (ord_street_name, ord_street_type) pair
col_names = ['ord_street_name', 'ord_street_type']
osn_ost_df = (
    st_name_type_df[col_names]
    .drop_duplicates()
    .sort_values(by = col_names)
    .reset_index(drop = True)
)
# readable label: "<name> <type>"
name_part = osn_ost_df['ord_street_name']
type_part = osn_ost_df['ord_street_type']
osn_ost_df['osn_ost'] = name_part + ' ' + type_part
# sequential id, numbered from zero in sorted order
osn_ost_df['osn_ost_id'] = range(len(osn_ost_df))
osn_ost_df.head()


In [None]:
# give every distinct ord_stname_concat a sequential id (ostc_id), numbered
# from zero in sorted order
ostc_df = (
    st_name_type_df[['ord_stname_concat']]
    .drop_duplicates()
    .sort_values(by = 'ord_stname_concat')
    .reset_index(drop = True)
)
ostc_df['ostc_id'] = range(len(ostc_df))
ostc_df.head()


In [17]:
# attach the id columns from each lookup table in turn; no join keys are
# given, so each merge joins on the columns the two frames have in common
for id_lookup_df in (osn_df, osn_ost_df, ostc_df):
    st_name_type_df = st_name_type_df.merge(right = id_lookup_df)

In [None]:
st_name_type_df.shape

In [None]:
st_name_type_df.head()

In [None]:
st_name_type_df['ord_street_type'].unique()

In [None]:
st_name_type_df['city_portion'].unique()

In [22]:
check = st_name_type_df.loc[st_name_type_df['city_portion'] == 'WB', ]

In [None]:
check.head()

In [None]:
# let's connect the ord_street_name and the ord_street_type
st_name_type_df['osn_ost'].unique().shape

In [25]:
# LET'S COUNT HOW MANY STREETS WE NEED TO PLOT AND MANUALLY REVIEW

In [26]:
# constant counter column that the pivot table below sums
st_name_type_df['n_roads'] = 1

In [None]:
st_name_type_df.columns

In [28]:
# sum n_roads per (street name, street type) row and city portion column;
# missing combinations become 0 and margins adds the totals
pv_df = st_name_type_df.pivot_table(
    values = ['n_roads'],
    index = ['ord_street_name', 'ord_street_type'],
    columns = ['city_portion'],
    aggfunc = 'sum',
    fill_value = 0,
    margins = True,
)

In [None]:
pv_df.head()

In [30]:
# the pivot produces two-part column labels; keep only the second part
pv_df.columns = [second_part for _first_part, second_part in pv_df.columns]

In [None]:
pv_df.columns

In [32]:
pv_df = pv_df.reset_index()

In [None]:
pv_df.tail()

In [None]:
pv_df.shape

In [35]:
# each pivot row is one ord_street_name / ord_street_type pair and the 'All'
# column is its total across the city portions.
# keep only the rows whose total is two or more
osn_count_df = pv_df[pv_df['All'].ge(2)].copy()

In [36]:
# remove the totals row, whose ord_street_name is 'All'
is_street_row = osn_count_df['ord_street_name'].ne('All')
osn_count_df = osn_count_df[is_street_row]

In [None]:
osn_count_df.head()

In [38]:
# reshape wide to long: one row per street / city_portion with its n_roads
osnt_df = osn_count_df.melt(
    id_vars = ['ord_street_name', 'ord_street_type'],
    var_name = 'city_portion',
    value_name = 'n_roads',
)

In [39]:
# keep only the street / city_portion rows with a positive n_roads
osnt_df = osnt_df[osnt_df['n_roads'].gt(0)].copy()

In [None]:
osnt_df.head()

In [41]:
# the osnt_df is effectively the same shape as the input gdf, but by pivoting we
# can very easily count the number of roads that span multiple sections by using
# built-in tools. 

In [42]:
# discard the rows whose city_portion is 'All'
osnt_df = osnt_df[osnt_df['city_portion'].ne('All')].copy()

In [43]:
# sort it...
osnt_df = osnt_df.sort_values(by = ['ord_street_name', 'ord_street_type', 'city_portion'])

In [44]:
# we don't need this column
osnt_df = osnt_df.drop(labels = ['n_roads'], axis = 1)

In [None]:
osnt_df.head()

In [46]:
# get the numeric value of the streets, for the sort order...
# NOTE(review): get_sort_order is not defined in this notebook; presumably it
# comes from `from utils import *` - confirm what it returns for street names
# that are not numbered.
osnt_df['sort_order'] = osnt_df['ord_street_name'].map(get_sort_order)

In [None]:
osnt_df.head()

In [48]:
# put sort_order first, then sort the rows by all four columns, left to right
col_names = ['sort_order', 'ord_street_name', 'ord_street_type', 'city_portion']
osnt_df = osnt_df[col_names]
osnt_df = osnt_df.sort_values(by = col_names)

In [49]:
# move the first four col_names entries into the index - this is helpful for
# when we write this to excel: easier to read for manual review
index_cols = col_names[:4]
osnt_df = osnt_df.set_index(keys = index_cols)

In [None]:
osnt_df.head()

In [None]:
osnt_df.shape

In [None]:
osnt_df.columns

In [52]:
# write the street groups to an excel workbook in the analysis output folder
ofpn = os.path.join(
    rc.ANALYSIS_OUTPUT_FILE_PATH,
    rc.S06_STREET_GROUP_OUT_FILE_NAME,
)
osnt_df.to_excel(ofpn)


# PART 2: make a simple plot of each street. 

In [53]:
# each plot will feature the city portion boundaries as created by the convex-hull 
# in step 2 and the different road segments for each street.

In [54]:
# one fixed Dark2 colour per city portion: cdm maps portion -> colour and
# my_cmap holds the same colours in dir_list order
dir_list = ['CNTR', 'E', 'N', 'NE', 'NW', 'S', 'SW', 'W']
dark2_colors = mpl.colormaps["Dark2"].colors
cdm = {portion: dark2_colors[position] for position, portion in enumerate(dir_list)}

my_cmap = mpl.colors.ListedColormap([cdm[portion] for portion in dir_list])

In [55]:
# shared plot extent, ordered [x min, y min, x max, y max]; the plotting loop
# feeds items 0 and 2 to set_xlim and items 1 and 3 to set_ylim
bounds = [
    -122.4197794277490061,
    47.4803548409661005,
    -122.2200188105690017,
    47.7341482423694004,
]

In [56]:
# load the boundaries of the city sections
ifpn = os.path.join(rc.OUTPUT_FILE_PATH, rc.S02_CITY_SECTIONS_OUT_FILE_NAME)
cs_gdf = gpd.read_file(filename=ifpn)

In [57]:
osnt_df = osnt_df.reset_index()

In [None]:
osnt_df.head()

In [None]:
osn_ost_df.shape

In [60]:
wosnt_df = osnt_df[['ord_street_name', 'ord_street_type']].drop_duplicates()

In [None]:
wosnt_df.shape

In [62]:
t_wosnt_df = wosnt_df.iloc[1:10]

In [None]:
t_wosnt_df.head()

In [None]:
wosnt_df.shape

In [None]:
# Draw one review map per street (ord_street_name + ord_street_type): the city
# section polygons in the background and the street's segments, dissolved by
# city portion, on top. Each map is saved as "<name>_<type>.png" and the file
# names are collected in output_file_name_list.
make_plots = True
output_file_name_list = []
if make_plots:
    street_pairs = zip(wosnt_df['ord_street_name'], wosnt_df['ord_street_type'])
    for osn, ost in street_pairs:
        output_file_name = f"{osn}_{ost}.png"
        ofpn = os.path.join(rc.S06_PLOT_OUTPUT_FILE_PATH, output_file_name)
        print(ofpn)
        output_file_name_list.append(output_file_name)

        # now, let's make a map...
        wgdf = gdf.loc[(gdf['ord_street_name'] == osn) &
                       (gdf['ord_street_type'] == ost), :]
        # merge the segments so there is one geometry per city portion
        to_draw = wgdf[['city_portion', 'geometry']].dissolve(by = 'city_portion', as_index = False)
        # label position: the centroid of each dissolved geometry
        to_draw['coords'] = to_draw['geometry'].map(lambda geom: geom.centroid.coords[0])

        # Pin the category list so a given city portion always receives the
        # same my_cmap colour. Without it only the portions present for this
        # street are spread across the colormap, so colours shift from map to
        # map. Fall back to the default when a portion is not in dir_list.
        # NOTE(review): with the list pinned, the legend is expected to show
        # every entry of dir_list - confirm that is acceptable.
        if to_draw['city_portion'].isin(dir_list).all():
            plot_categories = dir_list
        else:
            plot_categories = None

        fig = plt.figure(layout = 'constrained', figsize = (5, 10))
        gs = GridSpec(1, 1, figure = fig, height_ratios = [1])
        ax1 = fig.add_subplot(gs[0, 0])
        ax1.set_xlim(bounds[0], bounds[2])
        ax1.set_ylim(bounds[1], bounds[3])
        cs_gdf.plot(ax = ax1, column = 'city_portion', alpha = .2)
        to_draw.plot(ax = ax1, column = 'city_portion', cmap = my_cmap,
                     categories = plot_categories, linewidth = 5, legend = True)

        # write the city portion name next to each piece of the street
        for portion, label_xy in zip(to_draw['city_portion'], to_draw['coords']):
            ax1.annotate(text = portion, xy = label_xy, fontsize = 16)

        ax1.set_axis_off()
        ax1.set_title(label = f"Street Name: {osn} {ost}")

        fig.savefig(fname = ofpn)
        # close this specific figure so figures do not accumulate over the loop
        plt.close(fig)

# manually review each plot to identify the groups within each street.

In general, avenues run north-south and streets run east-west. 
Avenues can be grouped by:
* NE, E, CNTR, S  
* N, CNTR, S
* NW, W, SW

Streets can be grouped by:
* NW, N, NE
* W, N, CNTR, E
* SW, S

# BONUS: is every state name represented as a street?

In [168]:
fpn = os.path.join(rc.INPUT_FILE_PATH, 'state_name.csv')

In [169]:
state_df = pd.read_csv(filepath_or_buffer=fpn)

In [None]:
state_df.head()

In [171]:
# collect the state names, stripped of surrounding whitespace and lower-cased,
# into a set
state_name_set = {state_name.strip().lower() for state_name in state_df['Name']}

In [None]:
len(state_name_set)

In [173]:
# do the same for the street names after dropping the streets that start with
# a number. Missing names are removed first so NaN cannot end up in the set.
street_names = gdf['ord_street_name'].dropna()
starts_with_digit = street_names.str[0].isin(list('0123456789'))
# `~` is the boolean "not" for a pandas mask
street_name_set = set(street_names.loc[~starts_with_digit].str.lower())

In [None]:
len(street_name_set)

In [175]:
test_intersection = state_name_set.intersection(street_name_set)

In [None]:
len(test_intersection)

In [177]:
# so, 20 states are represented...

In [None]:
test_intersection

In [None]:
# what's not represented?
state_name_set.difference(street_name_set)

In [None]:
# what about states with directions in the name and DC?

In [None]:
'dakota' in street_name_set

In [None]:
'columbia' in street_name_set

In [None]:
'virginia' in street_name_set

In [None]:
'carolina' in street_name_set

In [184]:
# if we count Dakota as ND and SD
# columbia as Washington DC
# Virginia as both VA and West Virginia
# Carolina as both NC and SC
# that brings us to: 26. 
# so, over half. That's kind of cool.