Commit: adding address point workflow code #3

Nathanael Rosenheim committed Jun 29, 2022
1 parent 4cc81d6 commit fc6a101

Showing 4 changed files with 665 additions and 0 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -78,6 +78,7 @@ All program files (python files) are named with the following convention:
- 04 - Data Visualization
- 05 - Data Output (combined workflow steps for producing csv files)
- 06 - Data Production (codebook, data sharing, archiving, etc.)
  - 07 - Data Workflows (produce files for other programs)
* Program run order step (e.g. run a before b, or b depends on code in a)
* Version (e.g. v2 - version 2 of the file)
- GitHub is used for incremental version control; the version number is updated when a program file has a major change.
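For example, `pyncoda/ncoda_02b_cleanblockdata.py` (added in this commit) appears to encode the `ncoda` prefix, workflow step `02`, run order step `b`, and a short task description under this convention.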
201 changes: 201 additions & 0 deletions pyncoda/ncoda_02b_cleanblockdata.py
@@ -0,0 +1,201 @@
import matplotlib.pyplot as plt
import math
import numpy as np
import geopandas as gpd
import pandas as pd
import shapely
import descartes

import folium as fm # folium has more dynamic maps - but requires internet connection
import sys
import os # For saving output to path


def read_in_zip_shapefile_data(geolevel, year, url_list):
    # Read zipped shapefile data directly from www2.census.gov
    census_url = url_list[geolevel][year]
    print(f'Obtaining Census {geolevel} data from: {census_url}')
    gdf = gpd.read_file(census_url)

    return gdf
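# Example usage (a minimal sketch; 'block_urls' is a hypothetical dictionary shaped
# like the url_list built in obtain_join_block_place_puma_data below):
''' example code
block_urls = {'block':
                  {'2010': 'https://www2.census.gov/geo/tiger/TIGER2020PL/'
                           'STATE/48_TEXAS/48167/tl_2020_48167_tabblock10.zip'}}
block_gdf = read_in_zip_shapefile_data(geolevel='block', year='2010', url_list=block_urls)
print(block_gdf.crs)  # Census TIGER files are expected in EPSG:4269 (NAD 83)
'''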

def add_representative_point(gdf, year):
    yr = year[2:4]
    test_crs = gdf.crs
    if test_crs == "EPSG:4269":
        # EPSG 4269 uses NAD 83, which will have slightly different lat/lon points
        # when compared to EPSG 4326, which uses WGS 84.
        # Add representative point (an internal point, unlike the centroid)
        gdf.loc[gdf.index, f'rppnt{yr}4269'] = gdf['geometry'].representative_point()
        # Note: pandas does not persist attributes set on a column, so the
        # .label and .notes assignments below serve as inline documentation.
        gdf[f'rppnt{yr}4269'].label = f"Representative Point {year} EPSG 4269 (WKT)"
        gdf[f'rppnt{yr}4269'].notes = f"Internal point within census block {year} polygon EPSG 4269"

        # Add column that duplicates the polygon geometry -
        # allows for switching between point and polygon geometries for spatial join
        gdf.loc[gdf.index, f'blk{yr}4269'] = gdf['geometry']
        gdf[f'blk{yr}4269'].label = f"{year} Census Block Polygon EPSG 4269 (WKT)"
        gdf[f'blk{yr}4269'].notes = f"Polygon shape points for {year} census block EPSG 4269"
    else:
        print("Check Census Geography CRS - Expected EPSG 4269")

    return gdf
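# Why representative_point() instead of centroid: a polygon's centroid can fall
# outside the polygon itself (e.g., an L-shaped block), while representative_point()
# is guaranteed to return a point within it. A minimal sketch:
''' example code
from shapely.geometry import Polygon
l_shape = Polygon([(0, 0), (4, 0), (4, 4), (3, 4), (3, 1), (0, 1)])
print(l_shape.centroid.within(l_shape))                # False - centroid falls outside the L
print(l_shape.representative_point().within(l_shape))  # True by construction
'''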

def spatial_join_points_to_poly(points_gdf,
                                poly_gdf,
                                point_var,
                                poly_var,
                                geolevel,
                                varlist):
    """
    Function adds polygon variables to block points.
    point_var: Variable with WKT point geometry for block polygon
    poly_var: Variable with WKT polygon geometry for block polygon
    geolevel: Geography level of the polygon - example Place or PUMA
    varlist: Variable list to be transferred from polygon file to block file
    """

    # TO DO - Add CRS check - assumes that both block and poly gdf are in the same CRS

    # Find the bounds of the census block file
    minx = points_gdf.bounds.minx.min()
    miny = points_gdf.bounds.miny.min()
    maxx = points_gdf.bounds.maxx.max()
    maxy = points_gdf.bounds.maxy.max()
    points_gdf_bounds = [minx, miny, maxx, maxy]

    # Select polygons within bounds of study area
    # build the r-tree index - for places
    sindex_poly_gdf = poly_gdf.sindex
    possible_matches_index = list(sindex_poly_gdf.intersection(points_gdf_bounds))
    area_poly_gdf = poly_gdf.iloc[possible_matches_index]
    print("Identified", area_poly_gdf.shape[0], geolevel, "polygons to spatially join.")

    # build the r-tree index - using representative point
    points_gdf.loc[points_gdf.index, 'geometry'] = points_gdf[point_var]
    sindex_points_gdf = points_gdf.sindex

    # find the points that intersect with each subpolygon and add ID to point
    for index, poly in area_poly_gdf.iterrows():
        # find approximate matches with r-tree, then precise matches from those approximate ones
        possible_matches_index = list(sindex_points_gdf.intersection(poly['geometry'].bounds))
        possible_matches = points_gdf.iloc[possible_matches_index]
        precise_matches = possible_matches[possible_matches.intersects(poly['geometry'])]
        for var in varlist:
            points_gdf.loc[precise_matches.index, geolevel + var] = poly[var]

    # Switch geometry back to polygon
    points_gdf.loc[points_gdf.index, 'geometry'] = points_gdf[poly_var]

    return points_gdf
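# Example usage (a minimal sketch, assuming blocks_gdf already carries the
# 'rppnt104269' and 'blk104269' columns from add_representative_point, and
# place_gdf is a TIGER place polygon file):
''' example code
blocks_gdf = spatial_join_points_to_poly(points_gdf=blocks_gdf,
                                         poly_gdf=place_gdf,
                                         point_var='rppnt104269',
                                         poly_var='blk104269',
                                         geolevel='place',
                                         varlist=['GEOID10', 'NAME10'])
# each block now carries placeGEOID10 and placeNAME10 from its containing place
'''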

def add_address_point_counts(addpt_df,
                             block_gdf,
                             merge_id_old,
                             merge_id):
    # Convert block ID to a string so the merge keys have matching types
    addpt_df[merge_id] = addpt_df[merge_id_old].apply(lambda x: str(x))

    # Merge address point count with block data
    block_gdf = pd.merge(block_gdf, addpt_df,
                         left_on=merge_id, right_on=merge_id, how='left')

    return block_gdf
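# Example usage (a minimal sketch; 'addpt_counts_df' is a hypothetical table of
# address point counts with a 'blockid' column aggregated to the block level):
''' example code
block_gdf = add_address_point_counts(addpt_df=addpt_counts_df,
                                     block_gdf=block_gdf,
                                     merge_id_old='blockid',
                                     merge_id='BLOCKID10')
# blocks with no address points keep NaN counts because of the left merge
'''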

def obtain_join_block_place_puma_data(county_fips: str = '48167',
                                      state: str = 'TEXAS',
                                      year: str = '2010',
                                      output_folder: str = 'hua_workflow'):
    """
    Function obtains and cleans Census block data.
    Function uses the county FIPS code and a URL list to look up Census ZIP files.
    Census TIGER ZIP files include GIS shapefiles.
    county_fips: 5 character string county FIPS code
    """

    # Find state FIPS code from county FIPS code
    # TO DO - Check that state FIPS code is 2 characters - example California should be '06'
    state_fips = county_fips[0:2]
    # TO DO - Use US package to get the state name - in all caps
    # Find 2 digit year - used in variable names
    yr = year[2:4]

    # List of URLs for Census geography files
    base_url = 'https://www2.census.gov/geo/tiger/'
    block_mid_url = f'TIGER2020PL/STATE/{state_fips}_{state}/{county_fips}/tl_2020_'
    url_list = \
        {'block':
            {'2010': f'{base_url}{block_mid_url}{county_fips}_tabblock10.zip',
             '2020': f'{base_url}{block_mid_url}{county_fips}_tabblock20.zip'},
         'place':
            {'2010': f'{base_url}TIGER2010/PLACE/2010/tl_2010_{state_fips}_place10.zip',
             '2020': f'{base_url}TIGER2020/PLACE/tl_2020_{state_fips}_place.zip'},
         'puma':
            {'2010': f'{base_url}TIGER2010/PUMA5/2010/tl_2010_{state_fips}_puma10.zip',
             '2020': f'{base_url}TIGER2020/PUMA/tl_2020_{state_fips}_puma10.zip'}}

    # start empty geodataframe dictionary to store geolevel gdfs
    gdf = {}
    join_cols = {}
    geolevels = ['block', 'place', 'puma']
    for geolevel in geolevels:
        gdf[geolevel] = read_in_zip_shapefile_data(geolevel=geolevel, year=year, url_list=url_list)
        # Collect the ID and name columns (GEOID*, NAME*) to transfer in the spatial join
        join_cols[geolevel] = [col for col in gdf[geolevel] if col.startswith("GEOID")]
        join_cols[geolevel] = join_cols[geolevel] + [col for col in gdf[geolevel] if col.startswith("NAME")]

    gdf['block'] = add_representative_point(gdf=gdf['block'], year=year)

    # Add place name (cities) and PUMA IDs to blocks.
    # Place names provide a link to population demographics for cities and places defined by the Census.
    # The Census communicates with cities and updates city boundaries based on political boundaries set by communities.

    geolevels = ['place', 'puma']
    for geolevel in geolevels:
        print('Spatially join', geolevel, 'columns', join_cols[geolevel], 'with block data.')
        gdf['block'] = spatial_join_points_to_poly(gdf['block'], gdf[geolevel],
                                                   f'rppnt{yr}4269', f'blk{yr}4269',
                                                   geolevel, join_cols[geolevel])

    # Save work at this point as CSV
    savefile = sys.path[0]+"/"+output_folder+"/"+f"tl_{year}_{county_fips}_tabblockplacepuma{yr}"+"EPSG4269.csv"
    gdf['block'].to_csv(savefile)

    return gdf['block']

def single_layer_folium_map(gdf, layer_name, output_folder):
    # Find the bounds of the census block file
    minx = gdf.bounds.minx.min()
    miny = gdf.bounds.miny.min()
    maxx = gdf.bounds.maxx.max()
    maxy = gdf.bounds.maxy.max()

    # Center the map on the midpoint of the bounds (folium expects [lat, lon])
    gdf_map = fm.Map(location=[(miny+maxy)/2, (minx+maxx)/2], zoom_start=10)
    # fm.GeoJson(gdf).add_to(gdf_map)

    style_function = lambda x: {'color': 'green', 'fillColor': 'transparent'}

    fm.GeoJson(gdf[['geometry']], name=layer_name, style_function=style_function).add_to(gdf_map)

    fm.LayerControl().add_to(gdf_map)

    gdf_map.save(output_folder+'/'+layer_name+'.html')

    return gdf_map


''' example code
census_block_place_puma_gdf = obtain_join_block_place_puma_data(county_fips='48167',
                                                                state='TEXAS',
                                                                year='2010',
                                                                output_folder=output_folder)
census_block_place_puma_gdf[['GEOID10', 'placeGEOID10', 'pumaGEOID10']].astype(str).describe()
census_block_place_puma_gdf = obtain_join_block_place_puma_data(county_fips='48167',
                                                                state='TEXAS',
                                                                year='2020',
                                                                output_folder=output_folder)
census_block_place_puma_gdf[['GEOID20', 'placeGEOID', 'pumaGEOID10']].astype(str).describe()
blocks_map = single_layer_folium_map(census_block_place_puma_gdf, 'Census Blocks 2010', output_folder)
'''
173 changes: 173 additions & 0 deletions pyncoda/ncoda_02c_cleanbuildingdata.py
@@ -0,0 +1,173 @@

import matplotlib.pyplot as plt
import math
import numpy as np
import geopandas as gpd
import pandas as pd
import shapely
import descartes

import folium as fm # folium has more dynamic maps - but requires internet connection

import os # For saving output to path
import sys

from shapely.geometry import Point # Shapely for converting latitude/longitude to geometry
from shapely.wkt import loads
from pyproj import CRS

''' Code to convert to functions
# Store output files in set folder
output_folder = "hua_workflow"
programname = "IN_CORE_1cv2_Galveston_CleanBuildingInventory_2021-10-20"
## Read in Building Inventory
The building inventory provides a basic understanding of where address points can be located.
sys.path[0]
# Galveston Building inventory
filename = 'Nofal_Galveston_Buildings_2021-10-20\\Galveston_Bldgs_Points_Damage.shp'
bldg_inv_gdf = gpd.read_file(filename)
bldg_inv_gdf.head()
# creating a geometry column
geometry = [Point(xy) for xy in zip(bldg_inv_gdf['X_Longit'], bldg_inv_gdf['Y_Latit'])]
# Coordinate reference system : WGS84
crs = CRS('epsg:4326')
# Creating a Geographic data frame
bldg_inv_gdf = gpd.GeoDataFrame(bldg_inv_gdf, crs=crs, geometry=geometry)
# bldg_inv_gdf[['F_Arch','Bldg_Area']].groupby(['F_Arch']).describe()
# bldg_inv_gdf[['W_Arch','N_Units','Prop_Use','Occupancy']].head()
# pd.pivot_table(bldg_inv_gdf, values='OBJECTID', index=['W_Arch','F_Arch'],
# margins = True, margins_name = 'Total',
# columns=['N_Units','Prop_Use'], aggfunc='count')
## Check Unique Building ID
# Building ID will be important for linking Address Point Inventory to Buildings and Critical Infrastructure Inventories to Buildings.
## ID must be unique and non-missing.
## Count the number of Unique Values
# bldg_inv_gdf.OBJECTID.astype(str).describe()
## Count the number of Unique Values
# bldg_inv_gdf.OBJECTID.nunique()
# bldg_inv_gdf.loc[bldg_inv_gdf['OBJECTID'] == 568208]
## Are there any missing values for the unique id?
# bldg_inv_gdf.loc[bldg_inv_gdf.OBJECTID.isnull()]
# Move Primary Key Column Building ID to first Column
# cols = ['guid'] + [col for col in bldg_inv_gdf if col != 'guid']
# cols
# bldg_inv_gdf = bldg_inv_gdf[cols]
# bldg_inv_gdf.head()
# Confirm Building ID is Unique and Non-Missing
#bldg_inv_gdf.guid.describe()
## Read in Census Block Data
# Census Blocks provide an estimate of how many residential
# address points (housing units) should be located in each block.
source_file = 'tl_2010_48167_tabblockplacepuma10'
census_blocks_csv = output_folder+"/"+source_file+"EPSG4269.csv"
census_blocks_df = pd.read_csv(census_blocks_csv)
census_blocks_gdf = gpd.GeoDataFrame(census_blocks_df)
census_blocks_gdf.head()
# Use shapely.wkt loads to convert WKT to GeoSeries
census_blocks_gdf['geometry'] = census_blocks_gdf['geometry'].apply(lambda x: loads(x))
#census_blocks_gdf['geometry'].geom_type.describe()
census_blocks_gdf = census_blocks_gdf.set_geometry(census_blocks_gdf['geometry'])
census_blocks_gdf.crs = CRS('epsg:4269')
#census_blocks_gdf.head()
## Check CRS for Building Centroid and Block
# census_blocks_gdf.crs
# bldg_inv_gdf.crs
# Convert Census Block CRS to Buildings CRS
census_blocks_gdf = census_blocks_gdf.to_crs(bldg_inv_gdf.crs)
#census_blocks_gdf.crs
# Check change in Geometry
census_blocks_gdf['blk104269'] = \
    census_blocks_gdf['blk104269'].apply(lambda x: loads(x))
#census_blocks_gdf[['geometry','blk104269']].\
#    loc[census_blocks_gdf['geometry'] != census_blocks_gdf['blk104269']]
## Convert BLOCKID10 to a string
census_blocks_gdf['BLOCKID10'] = \
    census_blocks_gdf['GEOID10'].apply(lambda x: str(int(x)))
## Add State, County, and Census Block ID to Each Footprint
## Select Blocks within Bounding Box of Buildings
# Find the bounds of the Buildings to select Census Blocks
# Add Small Buffer for blocks on the edges
buffer = 0.001
minx = bldg_inv_gdf.bounds.minx.min() - buffer # subtract buffer from minimum values
miny = bldg_inv_gdf.bounds.miny.min() - buffer # subtract buffer from minimum values
maxx = bldg_inv_gdf.bounds.maxx.max() + buffer
maxy = bldg_inv_gdf.bounds.maxy.max() + buffer
building_gdf_bounds = [minx, miny, maxx, maxy]
building_gdf_bounds
# Select pumas within Bounds of Study Area
# build the r-tree index - for census blocks
sindex_census_blocks_gdf = census_blocks_gdf.sindex
possible_matches_index = list(sindex_census_blocks_gdf.intersection(building_gdf_bounds))
building_census_blocks_gdf = census_blocks_gdf.iloc[possible_matches_index]
building_census_blocks_gdf['BLOCKID10'].describe()
## Add Census Geography Details to Buildings
# Significant help from: https://geoffboeing.com/2016/10/r-tree-spatial-index-python/
# Significant help from: https://github.com/gboeing/urban-data-science/blob/master/19-Spatial-Analysis-and-Cartography/rtree-spatial-indexing.ipynb
# build the r-tree index - Using Representative Point
sindex_bldg_inv_gdf = bldg_inv_gdf.sindex
sindex_bldg_inv_gdf
# find the points that intersect with each subpolygon and add ID to Point
for index, block in building_census_blocks_gdf.iterrows():
    if index % 100 == 0:
        print('.', end="")
    # find approximate matches with r-tree, then precise matches from those approximate ones
    possible_matches_index = list(sindex_bldg_inv_gdf.intersection(block['geometry'].bounds))
    possible_matches = bldg_inv_gdf.iloc[possible_matches_index]
    precise_matches = possible_matches[possible_matches.intersects(block['geometry'])]
    bldg_inv_gdf.loc[precise_matches.index, 'BLOCKID10'] = block['BLOCKID10']
    bldg_inv_gdf.loc[precise_matches.index, 'placeGEOID10'] = block['placeGEOID10']
    bldg_inv_gdf.loc[precise_matches.index, 'placeNAME10'] = block['placeNAME10']
# Move Foreign Key Columns Block ID State, County, Tract to first Columns
first_columns = ['OBJECTID','BLOCKID10','placeGEOID10','placeNAME10']
cols = first_columns + [col for col in bldg_inv_gdf if col not in first_columns]
bldg_inv_gdf = bldg_inv_gdf[cols]
bldg_inv_gdf.head()
### How many buildings do not have block id information?
bldg_noblock_gdf = bldg_inv_gdf.loc[(bldg_inv_gdf['BLOCKID10'].isnull())]
bldg_noblock_gdf
# if there are missing buildings this code will help identify where they are -
# every building should have a block
# plot the building with missing block data
# Find the bounds of the Census Block File
minx = bldg_inv_gdf.bounds.minx.min()
miny = bldg_inv_gdf.bounds.miny.min()
maxx = bldg_inv_gdf.bounds.maxx.max()
maxy = bldg_inv_gdf.bounds.maxy.max()
blockstyle_function = lambda x: {'color':'green','fillColor': 'transparent' }
bldg_inv_gdf_map = fm.Map(location=[(miny+maxy)/2,(minx+maxx)/2], zoom_start=10)
fm.GeoJson(bldg_noblock_gdf).add_to(bldg_inv_gdf_map)
fm.GeoJson(census_blocks_gdf['geometry'],name='All Census Blocks',style_function=blockstyle_function).add_to(bldg_inv_gdf_map)
fm.GeoJson(building_census_blocks_gdf['geometry'],name='Selected Census Blocks').add_to(bldg_inv_gdf_map)
bldg_inv_gdf_map.save(output_folder + '/' + 'buildings_noblocks.html')
# Error Displaying Map display(bldg_inv_gdf_map)
### Save Work
# Check Columns
cols = [col for col in bldg_inv_gdf]
# Save Work at this point as CSV
savefile = sys.path[0]+"/"+output_folder+"/"+programname+"_EPSG4326.csv"
bldg_inv_gdf.to_csv(savefile)
'''
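# The block above is flagged "Code to convert to functions"; one possible wrapper,
# sketched here with assumed parameter names (not part of the committed code):
''' example code
def clean_building_inventory(building_shapefile, census_blocks_csv, output_folder='hua_workflow'):
    """Read a building inventory, rebuild block geometry from WKT, and match CRS."""
    bldg_inv_gdf = gpd.read_file(building_shapefile)
    census_blocks_gdf = gpd.GeoDataFrame(pd.read_csv(census_blocks_csv))
    census_blocks_gdf['geometry'] = census_blocks_gdf['geometry'].apply(loads)
    census_blocks_gdf = census_blocks_gdf.set_geometry('geometry')
    census_blocks_gdf.crs = CRS('epsg:4269')
    census_blocks_gdf = census_blocks_gdf.to_crs(bldg_inv_gdf.crs)
    # ...continue with the r-tree spatial join from the commented block above...
    return bldg_inv_gdf
'''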