Commit fc6a101: adding address point workflow code #3
Nathanael Rosenheim committed Jun 29, 2022
1 parent 4cc81d6
Showing 4 changed files with 665 additions and 0 deletions.
@@ -0,0 +1,201 @@
import matplotlib.pyplot as plt
import math as math
import numpy as np
import geopandas as gpd
import pandas as pd
import shapely
import descartes

import folium as fm  # folium has more dynamic maps - but requires internet connection
import sys
import os  # For saving output to path

def read_in_zip_shapefile_data(geolevel, year, url_list):
    # Read data from www2.census.gov
    census_url = url_list[geolevel][year]
    print(f'Obtaining Census {geolevel} data from:', census_url)
    gdf = gpd.read_file(census_url)

    return gdf

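# Usage sketch: the url_list dictionary must be keyed as url_list[geolevel][year],
# mirroring the call inside obtain_join_block_place_puma_data below. The literal
# here is an abbreviated illustration, not part of this commit.
# url_list = {'block': {'2010': 'https://www2.census.gov/geo/tiger/.../tl_2020_48167_tabblock10.zip'}}
# block_gdf = read_in_zip_shapefile_data(geolevel='block', year='2010', url_list=url_list)
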
def add_representative_point(gdf, year):
    yr = year[2:4]
    test_crs = gdf.crs
    if test_crs == "EPSG:4269":
        # EPSG 4269 uses NAD 83 which will have slightly different lat lon points
        # when compared to EPSG 4326 which uses WGS 84.
        # Add Representative Point
        gdf.loc[gdf.index, f'rppnt{yr}4269'] = gdf['geometry'].representative_point()
        gdf[f'rppnt{yr}4269'].label = f"Representative Point {year} EPSG 4269 (WKT)"
        gdf[f'rppnt{yr}4269'].notes = f"Internal Point within census block {year} poly EPSG 4269"

        # Add Column that Duplicates Polygon Geometry -
        # allows for switching between point and polygon geometries for spatial join
        gdf.loc[gdf.index, f'blk{yr}4269'] = gdf['geometry']
        gdf[f'blk{yr}4269'].label = f"{year} Census Block Polygon EPSG 4269 (WKT)"
        gdf[f'blk{yr}4269'].notes = f"Polygon Shape Points for {year} Census Block EPSG 4269"
    else:
        print("Check Census Geography CRS - Expected EPSG 4269")

    return gdf

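# Note: representative_point() is used above rather than centroid because it is
# guaranteed to fall inside the polygon, while a centroid can land outside a
# concave or multipart block. A small illustration (not part of this commit):
# from shapely.geometry import Polygon
# poly = Polygon([(0, 0), (4, 0), (4, 1), (1, 1), (1, 4), (0, 4)])  # L-shaped block
# poly.centroid.within(poly)                 # False - the centroid sits outside the L
# poly.representative_point().within(poly)   # True - always an interior point
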
def spatial_join_points_to_poly(points_gdf,
                                poly_gdf,
                                point_var,
                                poly_var,
                                geolevel,
                                varlist):
    """
    Function adds polygon variables to block points.
    point_var: Variable with WKT Point Geometry for Block Polygon
    poly_var: Variable with WKT Polygon Geometry for Block Polygon
    geolevel: Geography level of the polygon - example Place or PUMA
    varlist: Variable list to be transferred from Polygon File to Block File
    """

    # TO DO - Add CRS Check - Assumes that both block and poly gdf are in the same CRS

    # Find the bounds of the Census Block File
    minx = points_gdf.bounds.minx.min()
    miny = points_gdf.bounds.miny.min()
    maxx = points_gdf.bounds.maxx.max()
    maxy = points_gdf.bounds.maxy.max()
    points_gdf_bounds = [minx, miny, maxx, maxy]

    # Select polygons within Bounds of Study Area
    # build the r-tree index - for Places
    sindex_poly_gdf = poly_gdf.sindex
    possible_matches_index = list(sindex_poly_gdf.intersection(points_gdf_bounds))
    area_poly_gdf = poly_gdf.iloc[possible_matches_index]
    print("Identified", area_poly_gdf.shape[0], geolevel, "polygons to spatially join.")

    # build the r-tree index - Using Representative Point
    points_gdf.loc[points_gdf.index, 'geometry'] = points_gdf[point_var]
    sindex_points_gdf = points_gdf.sindex

    # find the points that intersect with each subpolygon and add ID to Point
    for index, poly in area_poly_gdf.iterrows():
        # find approximate matches with r-tree, then precise matches from those approximate ones
        possible_matches_index = list(sindex_points_gdf.intersection(poly['geometry'].bounds))
        possible_matches = points_gdf.iloc[possible_matches_index]
        precise_matches = possible_matches[possible_matches.intersects(poly['geometry'])]
        for var in varlist:
            points_gdf.loc[precise_matches.index, geolevel + var] = poly[var]

    # Switch Geometry back to Polygon
    points_gdf.loc[points_gdf.index, 'geometry'] = points_gdf[poly_var]

    return points_gdf

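# Usage sketch: this mirrors the place/puma loop in obtain_join_block_place_puma_data
# below; the variable names and the two-item varlist are illustrative only.
# block_gdf = spatial_join_points_to_poly(points_gdf=block_gdf,
#                                         poly_gdf=place_gdf,
#                                         point_var='rppnt104269',
#                                         poly_var='blk104269',
#                                         geolevel='place',
#                                         varlist=['GEOID10', 'NAME10'])
# After the join each block row carries placeGEOID10 and placeNAME10 copied from
# the place polygon that contains its representative point.
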
def add_address_point_counts(addpt_df,
                             block_gdf,
                             merge_id_old,
                             merge_id):
    # Convert blockid Parcel ID to a String
    addpt_df[merge_id] = addpt_df[merge_id_old].apply(lambda x: str(x))

    # Merge Address Point Count with Block Data
    block_gdf = pd.merge(block_gdf, addpt_df,
                         left_on=merge_id, right_on=merge_id, how='left')

    return block_gdf

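# Usage sketch (argument values are illustrative, not from this commit):
# block_gdf = add_address_point_counts(addpt_df=address_point_count_df,
#                                      block_gdf=census_block_gdf,
#                                      merge_id_old='blockid',
#                                      merge_id='BLOCKID10str')
# The left merge keeps every census block; blocks with no address points end up
# with missing (NaN) count values.
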
def obtain_join_block_place_puma_data(county_fips: str = '48167',
                                      state: str = 'TEXAS',
                                      year: str = '2010',
                                      output_folder: str = 'hua_workflow'):
    """
    Function obtains and cleans Census Block Data
    Function uses County FIPS Code and URL list to look up Census ZIP Files
    Census TIGER ZIP Files include GIS Shapefiles.
    county_fips: 5 character string county FIPS code
    """

    # Find State FIPS Code from County FIPS Code
    # TO DO - Check that State FIPS code is 2 characters - example California should be '06'
    state_fips = county_fips[0:2]
    # TO DO - Use US package to get the State name - in all caps
    # Find 2 digit year - used in variable names
    yr = year[2:4]

    # List of URLs for Census Geography Files
    base_url = 'https://www2.census.gov/geo/tiger/'
    block_mid_url = f'TIGER2020PL/STATE/{state_fips}_{state}/{county_fips}/tl_2020_'
    url_list = \
        {'block':
            {'2010': f'{base_url}{block_mid_url}{county_fips}_tabblock10.zip',
             '2020': f'{base_url}{block_mid_url}{county_fips}_tabblock20.zip'},
         'place':
            {'2010': f'{base_url}TIGER2010/PLACE/2010/tl_2010_{state_fips}_place10.zip',
             '2020': f'{base_url}TIGER2020/PLACE/tl_2020_{state_fips}_place.zip'},
         'puma':
            {'2010': f'{base_url}TIGER2010/PUMA5/2010/tl_2010_{state_fips}_puma10.zip',
             '2020': f'{base_url}TIGER2020/PUMA/tl_2020_{state_fips}_puma10.zip'}}

    # start empty geodataframe dictionary to store geolevel gdfs
    gdf = {}
    join_cols = {}
    geolevels = ['block', 'place', 'puma']
    for geolevel in geolevels:
        gdf[geolevel] = read_in_zip_shapefile_data(geolevel=geolevel, year=year, url_list=url_list)
        join_cols[geolevel] = [col for col in gdf[geolevel] if col.startswith("GEOID")]
        join_cols[geolevel] = join_cols[geolevel] + [col for col in gdf[geolevel] if col.startswith("NAME")]

    gdf['block'] = add_representative_point(gdf=gdf['block'], year=year)

    # Add Place Name (Cities) and PUMA ids To Blocks
    # Place names provide a link to population demographics for cities and places defined by the Census.
    # The Census communicates with cities and updates city boundaries based on political boundaries set by communities.

    geolevels = ['place', 'puma']
    for geolevel in geolevels:
        print('Spatially Join', geolevel, 'Columns', join_cols[geolevel], 'with Block Data.')
        gdf['block'] = spatial_join_points_to_poly(gdf['block'], gdf[geolevel], f'rppnt{yr}4269', f'blk{yr}4269', geolevel, join_cols[geolevel])

    # Save Work at this point as CSV
    savefile = sys.path[0] + "/" + output_folder + "/" + f"tl_{year}_{county_fips}_tabblockplacepuma{yr}" + "EPSG4269.csv"
    gdf['block'].to_csv(savefile)

    return gdf['block']

def single_layer_folium_map(gdf, layer_name, output_folder):
    # Find the bounds of the Census Block File
    minx = gdf.bounds.minx.min()
    miny = gdf.bounds.miny.min()
    maxx = gdf.bounds.maxx.max()
    maxy = gdf.bounds.maxy.max()

    # plot the intersections and the city
    gdf_map = fm.Map(location=[(miny+maxy)/2, (minx+maxx)/2], zoom_start=10)
    # fm.GeoJson(gdf).add_to(gdf_map)

    style_function = lambda x: {'color': 'green', 'fillColor': 'transparent'}

    fm.GeoJson(gdf[['geometry']], name=layer_name, style_function=style_function).add_to(gdf_map)

    fm.LayerControl().add_to(gdf_map)

    gdf_map.save(output_folder + '/' + layer_name + '.html')

    return gdf_map

''' example code
census_block_place_puma_gdf = obtain_join_block_place_puma_data(county_fips = '48167',
                                                                state = 'TEXAS',
                                                                year = '2010',
                                                                output_folder = output_folder)
census_block_place_puma_gdf[['GEOID10','placeGEOID10','pumaGEOID10']].astype(str).describe()
census_block_place_puma_gdf = obtain_join_block_place_puma_data(county_fips = '48167',
                                                                state = 'TEXAS',
                                                                year = '2020',
                                                                output_folder = output_folder)
census_block_place_puma_gdf[['GEOID20','placeGEOID','pumaGEOID10']].astype(str).describe()
map = single_layer_folium_map(census_block_place_puma_gdf, 'Census Blocks 2010', output_folder)
'''
@@ -0,0 +1,173 @@

import matplotlib.pyplot as plt
import math as math
import numpy as np
import geopandas as gpd
import pandas as pd
import shapely
import descartes

import folium as fm  # folium has more dynamic maps - but requires internet connection

import os  # For saving output to path
import sys

from shapely.geometry import Point  # Shapely for converting latitude/longitude to geometry
from shapely.wkt import loads
from pyproj import CRS

''' Code to convert to functions

# Store output files in set folder
output_folder = "hua_workflow"
programname = "IN_CORE_1cv2_Galveston_CleanBuildingInventory_2021-10-20"

## Read in Building Inventory
The building inventory provides a basic understanding of where address points can be located.

sys.path[0]

# Galveston Building inventory
filename = 'Nofal_Galveston_Buildings_2021-10-20\\Galveston_Bldgs_Points_Damage.shp'
bldg_inv_gdf = gpd.read_file(filename)
bldg_inv_gdf.head()

# creating a geometry column
geometry = [Point(xy) for xy in zip(bldg_inv_gdf['X_Longit'], bldg_inv_gdf['Y_Latit'])]
# Coordinate reference system : WGS84
crs = CRS('epsg:4326')
# Creating a Geographic data frame
bldg_inv_gdf = gpd.GeoDataFrame(bldg_inv_gdf, crs=crs, geometry=geometry)

# bldg_inv_gdf[['F_Arch','Bldg_Area']].groupby(['F_Arch']).describe()
# bldg_inv_gdf[['W_Arch','N_Units','Prop_Use','Occupancy']].head()
# pd.pivot_table(bldg_inv_gdf, values='OBJECTID', index=['W_Arch','F_Arch'],
#                margins = True, margins_name = 'Total',
#                columns=['N_Units','Prop_Use'], aggfunc='count')

## Check Unique Building ID
# Building ID will be important for linking Address Point Inventory to Buildings and Critical Infrastructure Inventories to Buildings.
## ID must be unique and non-missing.

## Count the number of Unique Values
# bldg_inv_gdf.OBJECTID.astype(str).describe()
## Count the number of Unique Values
# bldg_inv_gdf.OBJECTID.nunique()
# bldg_inv_gdf.loc[bldg_inv_gdf['OBJECTID'] == 568208]
## Are there any missing values for the unique id?
# bldg_inv_gdf.loc[bldg_inv_gdf.OBJECTID.isnull()]

# Move Primary Key Column Building ID to first Column
# cols = ['guid'] + [col for col in bldg_inv_gdf if col != 'guid']
# cols
# bldg_inv_gdf = bldg_inv_gdf[cols]
# bldg_inv_gdf.head()
# Confirm Building ID is Unique and Non-Missing
# bldg_inv_gdf.guid.describe()

## Read in Census Block Data
# Census Blocks provide an estimate of how many residential
# address points (housing units) should be located in each block.
source_file = 'tl_2010_48167_tabblockplacepuma10'
census_blocks_csv = output_folder + "/" + source_file + "EPSG4269.csv"
census_blocks_df = pd.read_csv(census_blocks_csv)
census_blocks_gdf = gpd.GeoDataFrame(census_blocks_df)
census_blocks_gdf.head()

# Use shapely.wkt loads to convert WKT to GeoSeries
census_blocks_gdf['geometry'] = census_blocks_gdf['geometry'].apply(lambda x: loads(x))
# census_blocks_gdf['geometry'].geom_type.describe()
census_blocks_gdf = census_blocks_gdf.set_geometry(census_blocks_gdf['geometry'])
census_blocks_gdf.crs = CRS('epsg:4269')
# census_blocks_gdf.head()

## Check CRS for Building Centroid and Block
# census_blocks_gdf.crs
# bldg_inv_gdf.crs
# Convert Census Block CRS to Buildings CRS
census_blocks_gdf = census_blocks_gdf.to_crs(bldg_inv_gdf.crs)
# census_blocks_gdf.crs
# Check change in Geometry
census_blocks_gdf['blk104269'] = \
    census_blocks_gdf['blk104269'].apply(lambda x: loads(x))
# census_blocks_gdf[['geometry','blk104269']].\
#     loc[census_blocks_gdf['geometry'] != census_blocks_gdf['blk104269']]

## Convert BLOCKID10 to a string
census_blocks_gdf['BLOCKID10'] = \
    census_blocks_gdf['GEOID10'].apply(lambda x: str(int(x)))
## Add State, County, and Census Block ID to Each Footprint

## Select Blocks within Bounding Box of Buildings
# Find the bounds of the Buildings to select Census Blocks
# Add Small Buffer for blocks on the edges
buffer = 0.001
minx = bldg_inv_gdf.bounds.minx.min() - buffer  # subtract buffer from minimum values
miny = bldg_inv_gdf.bounds.miny.min() - buffer  # subtract buffer from minimum values
maxx = bldg_inv_gdf.bounds.maxx.max() + buffer
maxy = bldg_inv_gdf.bounds.maxy.max() + buffer
building_gdf_bounds = [minx, miny, maxx, maxy]
building_gdf_bounds

# Select blocks within Bounds of Study Area
# build the r-tree index - for census blocks
sindex_census_blocks_gdf = census_blocks_gdf.sindex
possible_matches_index = list(sindex_census_blocks_gdf.intersection(building_gdf_bounds))
building_census_blocks_gdf = census_blocks_gdf.iloc[possible_matches_index]
building_census_blocks_gdf['BLOCKID10'].describe()

## Add Census Geography Details to Buildings
# Significant help from: https://geoffboeing.com/2016/10/r-tree-spatial-index-python/
# Significant help from: https://github.com/gboeing/urban-data-science/blob/master/19-Spatial-Analysis-and-Cartography/rtree-spatial-indexing.ipynb
# build the r-tree index - Using Representative Point
sindex_bldg_inv_gdf = bldg_inv_gdf.sindex
sindex_bldg_inv_gdf

# find the points that intersect with each subpolygon and add ID to Point
for index, block in building_census_blocks_gdf.iterrows():
    if index % 100 == 0:
        print('.', end="")
    # find approximate matches with r-tree, then precise matches from those approximate ones
    possible_matches_index = list(sindex_bldg_inv_gdf.intersection(block['geometry'].bounds))
    possible_matches = bldg_inv_gdf.iloc[possible_matches_index]
    precise_matches = possible_matches[possible_matches.intersects(block['geometry'])]
    bldg_inv_gdf.loc[precise_matches.index, 'BLOCKID10'] = block['BLOCKID10']
    bldg_inv_gdf.loc[precise_matches.index, 'placeGEOID10'] = block['placeGEOID10']
    bldg_inv_gdf.loc[precise_matches.index, 'placeNAME10'] = block['placeNAME10']

# Move Foreign Key Columns Block ID State, County, Tract to first Columns
first_columns = ['OBJECTID','BLOCKID10','placeGEOID10','placeNAME10']
cols = first_columns + [col for col in bldg_inv_gdf if col not in first_columns]
bldg_inv_gdf = bldg_inv_gdf[cols]
bldg_inv_gdf.head()

### How many buildings do not have block id information?
bldg_noblock_gdf = bldg_inv_gdf.loc[(bldg_inv_gdf['BLOCKID10'].isnull())]
bldg_noblock_gdf

# if there are missing buildings this code will help identify where they are -
# every building should have a block
# plot the building with missing block data
# Find the bounds of the Census Block File
minx = bldg_inv_gdf.bounds.minx.min()
miny = bldg_inv_gdf.bounds.miny.min()
maxx = bldg_inv_gdf.bounds.maxx.max()
maxy = bldg_inv_gdf.bounds.maxy.max()
blockstyle_function = lambda x: {'color': 'green', 'fillColor': 'transparent'}
bldg_inv_gdf_map = fm.Map(location=[(miny+maxy)/2, (minx+maxx)/2], zoom_start=10)
fm.GeoJson(bldg_noblock_gdf).add_to(bldg_inv_gdf_map)
fm.GeoJson(census_blocks_gdf['geometry'], name='All Census Blocks', style_function=blockstyle_function).add_to(bldg_inv_gdf_map)
fm.GeoJson(building_census_blocks_gdf['geometry'], name='Selected Census Blocks').add_to(bldg_inv_gdf_map)
bldg_inv_gdf_map.save(output_folder + '/' + 'buildings_noblocks.html')
# Error Displaying Map display(neosho_place_gdf_map)

### Save Work
# Check Columns
cols = [col for col in bldg_inv_gdf]
# Save Work at this point as CSV
savefile = sys.path[0] + "/" + output_folder + "/" + programname + "_EPSG4326.csv"
bldg_inv_gdf.to_csv(savefile)
'''
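

# A possible first step toward the "Code to convert to functions" note above:
# a hedged sketch that wraps the building-inventory read from the commented
# workflow. The function name and parameter defaults are assumptions, not part
# of this commit; the column names are copied from the workflow above.
def read_building_inventory(filename, lon_col='X_Longit', lat_col='Y_Latit'):
    """Read a building inventory shapefile and attach WGS84 point geometry."""
    bldg_inv_gdf = gpd.read_file(filename)
    geometry = [Point(xy) for xy in zip(bldg_inv_gdf[lon_col], bldg_inv_gdf[lat_col])]
    return gpd.GeoDataFrame(bldg_inv_gdf, crs=CRS('epsg:4326'), geometry=geometry)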