Commit: adding address point workflow code #3

Nathanael Rosenheim committed Jun 29, 2022
1 parent 4cc81d6 commit fc6a101

Showing 4 changed files with 665 additions and 0 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -78,6 +78,7 @@ All program files (python files) are named with the following convention:
- 04 - Data Visualization
- 05 - Data Output (combined workflow steps for producing csv files)
- 06 - Data Production (codebook, data sharing, archiving, etc.)
  - 07 - Data Workflows (produce files for other programs)
* Program run order step (e.g. run a before b, or b depends on code in a)
* Version (e.g. v2 - version 2 of the file)
- GitHub is used for incremental version control; the version number is updated when a program file has a major change.
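For example, `pyncoda/ncoda_02b_cleanblockdata.py` (added in this commit) appears to encode the `ncoda` prefix, workflow step `02`, run order step `b`, and a short task description under this convention.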
201 changes: 201 additions & 0 deletions pyncoda/ncoda_02b_cleanblockdata.py
@@ -0,0 +1,201 @@
import matplotlib.pyplot as plt
import math
import numpy as np
import geopandas as gpd
import pandas as pd
import shapely
import descartes

import folium as fm # folium has more dynamic maps - but requires internet connection
import sys
import os # For saving output to path


def read_in_zip_shapefile_data(geolevel, year, url_list):
    # Read zipped shapefile data directly from www2.census.gov
    census_url = url_list[geolevel][year]
    print(f'Obtaining Census {geolevel} data from: {census_url}')
    gdf = gpd.read_file(census_url)

    return gdf
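# Example usage (a minimal sketch; 'block_urls' is a hypothetical dictionary shaped
# like the url_list built in obtain_join_block_place_puma_data below):
''' example code
block_urls = {'block':
                  {'2010': 'https://www2.census.gov/geo/tiger/TIGER2020PL/'
                           'STATE/48_TEXAS/48167/tl_2020_48167_tabblock10.zip'}}
block_gdf = read_in_zip_shapefile_data(geolevel='block', year='2010', url_list=block_urls)
print(block_gdf.crs)  # Census TIGER files are expected in EPSG:4269 (NAD 83)
'''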

def add_representative_point(gdf, year):
    yr = year[2:4]
    test_crs = gdf.crs
    if test_crs == "EPSG:4269":
        # EPSG 4269 uses NAD 83, which will have slightly different lat/lon points
        # when compared to EPSG 4326, which uses WGS 84.
        # Add representative point (an internal point, unlike the centroid)
        gdf.loc[gdf.index, f'rppnt{yr}4269'] = gdf['geometry'].representative_point()
        # Note: pandas does not persist attributes set on a column, so the
        # .label and .notes assignments below serve as inline documentation.
        gdf[f'rppnt{yr}4269'].label = f"Representative Point {year} EPSG 4269 (WKT)"
        gdf[f'rppnt{yr}4269'].notes = f"Internal point within census block {year} polygon EPSG 4269"

        # Add column that duplicates the polygon geometry -
        # allows for switching between point and polygon geometries for spatial join
        gdf.loc[gdf.index, f'blk{yr}4269'] = gdf['geometry']
        gdf[f'blk{yr}4269'].label = f"{year} Census Block Polygon EPSG 4269 (WKT)"
        gdf[f'blk{yr}4269'].notes = f"Polygon shape points for {year} census block EPSG 4269"
    else:
        print("Check Census Geography CRS - Expected EPSG 4269")

    return gdf
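# Why representative_point() instead of centroid: a polygon's centroid can fall
# outside the polygon itself (e.g., an L-shaped block), while representative_point()
# is guaranteed to return a point within it. A minimal sketch:
''' example code
from shapely.geometry import Polygon
l_shape = Polygon([(0, 0), (4, 0), (4, 4), (3, 4), (3, 1), (0, 1)])
print(l_shape.centroid.within(l_shape))                # False - centroid falls outside the L
print(l_shape.representative_point().within(l_shape))  # True by construction
'''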

def spatial_join_points_to_poly(points_gdf,
                                poly_gdf,
                                point_var,
                                poly_var,
                                geolevel,
                                varlist):
    """
    Function adds polygon variables to block points.
    point_var: Variable with WKT point geometry for block polygon
    poly_var: Variable with WKT polygon geometry for block polygon
    geolevel: Geography level of the polygon - example Place or PUMA
    varlist: Variable list to be transferred from polygon file to block file
    """

    # TO DO - Add CRS check - assumes that both block and poly gdf are in the same CRS

    # Find the bounds of the census block file
    minx = points_gdf.bounds.minx.min()
    miny = points_gdf.bounds.miny.min()
    maxx = points_gdf.bounds.maxx.max()
    maxy = points_gdf.bounds.maxy.max()
    points_gdf_bounds = [minx, miny, maxx, maxy]

    # Select polygons within bounds of study area
    # build the r-tree index - for places
    sindex_poly_gdf = poly_gdf.sindex
    possible_matches_index = list(sindex_poly_gdf.intersection(points_gdf_bounds))
    area_poly_gdf = poly_gdf.iloc[possible_matches_index]
    print("Identified", area_poly_gdf.shape[0], geolevel, "polygons to spatially join.")

    # build the r-tree index - using representative point
    points_gdf.loc[points_gdf.index, 'geometry'] = points_gdf[point_var]
    sindex_points_gdf = points_gdf.sindex

    # find the points that intersect with each subpolygon and add ID to point
    for index, poly in area_poly_gdf.iterrows():
        # find approximate matches with r-tree, then precise matches from those approximate ones
        possible_matches_index = list(sindex_points_gdf.intersection(poly['geometry'].bounds))
        possible_matches = points_gdf.iloc[possible_matches_index]
        precise_matches = possible_matches[possible_matches.intersects(poly['geometry'])]
        for var in varlist:
            points_gdf.loc[precise_matches.index, geolevel + var] = poly[var]

    # Switch geometry back to polygon
    points_gdf.loc[points_gdf.index, 'geometry'] = points_gdf[poly_var]

    return points_gdf
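# Example usage (a minimal sketch, assuming blocks_gdf already carries the
# 'rppnt104269' and 'blk104269' columns from add_representative_point, and
# place_gdf is a TIGER place polygon file):
''' example code
blocks_gdf = spatial_join_points_to_poly(points_gdf=blocks_gdf,
                                         poly_gdf=place_gdf,
                                         point_var='rppnt104269',
                                         poly_var='blk104269',
                                         geolevel='place',
                                         varlist=['GEOID10', 'NAME10'])
# each block now carries placeGEOID10 and placeNAME10 from its containing place
'''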

def add_address_point_counts(addpt_df,
                             block_gdf,
                             merge_id_old,
                             merge_id):
    # Convert block ID to a string so the merge keys have matching types
    addpt_df[merge_id] = addpt_df[merge_id_old].apply(lambda x: str(x))

    # Merge address point count with block data
    block_gdf = pd.merge(block_gdf, addpt_df,
                         left_on=merge_id, right_on=merge_id, how='left')

    return block_gdf
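# Example usage (a minimal sketch; 'addpt_counts_df' is a hypothetical table of
# address point counts with a 'blockid' column aggregated to the block level):
''' example code
block_gdf = add_address_point_counts(addpt_df=addpt_counts_df,
                                     block_gdf=block_gdf,
                                     merge_id_old='blockid',
                                     merge_id='BLOCKID10')
# blocks with no address points keep NaN counts because of the left merge
'''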

def obtain_join_block_place_puma_data(county_fips: str = '48167',
                                      state: str = 'TEXAS',
                                      year: str = '2010',
                                      output_folder: str = 'hua_workflow'):
    """
    Function obtains and cleans Census block data.
    Function uses the county FIPS code and a URL list to look up Census ZIP files.
    Census TIGER ZIP files include GIS shapefiles.
    county_fips: 5 character string county FIPS code
    """

    # Find state FIPS code from county FIPS code
    # TO DO - Check that state FIPS code is 2 characters - example California should be '06'
    state_fips = county_fips[0:2]
    # TO DO - Use US package to get the state name - in all caps
    # Find 2 digit year - used in variable names
    yr = year[2:4]

    # List of URLs for Census geography files
    base_url = 'https://www2.census.gov/geo/tiger/'
    block_mid_url = f'TIGER2020PL/STATE/{state_fips}_{state}/{county_fips}/tl_2020_'
    url_list = \
        {'block':
            {'2010': f'{base_url}{block_mid_url}{county_fips}_tabblock10.zip',
             '2020': f'{base_url}{block_mid_url}{county_fips}_tabblock20.zip'},
         'place':
            {'2010': f'{base_url}TIGER2010/PLACE/2010/tl_2010_{state_fips}_place10.zip',
             '2020': f'{base_url}TIGER2020/PLACE/tl_2020_{state_fips}_place.zip'},
         'puma':
            {'2010': f'{base_url}TIGER2010/PUMA5/2010/tl_2010_{state_fips}_puma10.zip',
             '2020': f'{base_url}TIGER2020/PUMA/tl_2020_{state_fips}_puma10.zip'}}

    # start empty geodataframe dictionary to store geolevel gdfs
    gdf = {}
    join_cols = {}
    geolevels = ['block', 'place', 'puma']
    for geolevel in geolevels:
        gdf[geolevel] = read_in_zip_shapefile_data(geolevel=geolevel, year=year, url_list=url_list)
        # Collect the ID and name columns (GEOID*, NAME*) to transfer in the spatial join
        join_cols[geolevel] = [col for col in gdf[geolevel] if col.startswith("GEOID")]
        join_cols[geolevel] = join_cols[geolevel] + [col for col in gdf[geolevel] if col.startswith("NAME")]

    gdf['block'] = add_representative_point(gdf=gdf['block'], year=year)

    # Add place name (cities) and PUMA IDs to blocks.
    # Place names provide a link to population demographics for cities and places defined by the Census.
    # The Census communicates with cities and updates city boundaries based on political boundaries set by communities.

    geolevels = ['place', 'puma']
    for geolevel in geolevels:
        print('Spatially join', geolevel, 'columns', join_cols[geolevel], 'with block data.')
        gdf['block'] = spatial_join_points_to_poly(gdf['block'], gdf[geolevel],
                                                   f'rppnt{yr}4269', f'blk{yr}4269',
                                                   geolevel, join_cols[geolevel])

    # Save work at this point as CSV
    savefile = sys.path[0]+"/"+output_folder+"/"+f"tl_{year}_{county_fips}_tabblockplacepuma{yr}"+"EPSG4269.csv"
    gdf['block'].to_csv(savefile)

    return gdf['block']

def single_layer_folium_map(gdf, layer_name, output_folder):
    # Find the bounds of the census block file
    minx = gdf.bounds.minx.min()
    miny = gdf.bounds.miny.min()
    maxx = gdf.bounds.maxx.max()
    maxy = gdf.bounds.maxy.max()

    # Center the map on the midpoint of the bounds (folium expects [lat, lon])
    gdf_map = fm.Map(location=[(miny+maxy)/2, (minx+maxx)/2], zoom_start=10)
    # fm.GeoJson(gdf).add_to(gdf_map)

    style_function = lambda x: {'color': 'green', 'fillColor': 'transparent'}

    fm.GeoJson(gdf[['geometry']], name=layer_name, style_function=style_function).add_to(gdf_map)

    fm.LayerControl().add_to(gdf_map)

    gdf_map.save(output_folder+'/'+layer_name+'.html')

    return gdf_map


''' example code
census_block_place_puma_gdf = obtain_join_block_place_puma_data(county_fips='48167',
                                                                state='TEXAS',
                                                                year='2010',
                                                                output_folder=output_folder)
census_block_place_puma_gdf[['GEOID10', 'placeGEOID10', 'pumaGEOID10']].astype(str).describe()
census_block_place_puma_gdf = obtain_join_block_place_puma_data(county_fips='48167',
                                                                state='TEXAS',
                                                                year='2020',
                                                                output_folder=output_folder)
census_block_place_puma_gdf[['GEOID20', 'placeGEOID', 'pumaGEOID10']].astype(str).describe()
blocks_map = single_layer_folium_map(census_block_place_puma_gdf, 'Census Blocks 2010', output_folder)
'''
173 changes: 173 additions & 0 deletions pyncoda/ncoda_02c_cleanbuildingdata.py
@@ -0,0 +1,173 @@

import matplotlib.pyplot as plt
import math
import numpy as np
import geopandas as gpd
import pandas as pd
import shapely
import descartes

import folium as fm # folium has more dynamic maps - but requires internet connection

import os # For saving output to path
import sys

from shapely.geometry import Point # Shapely for converting latitude/longitude to geometry
from shapely.wkt import loads
from pyproj import CRS

''' Code to convert to functions
# Store output files in set folder
output_folder = "hua_workflow"
programname = "IN_CORE_1cv2_Galveston_CleanBuildingInventory_2021-10-20"
## Read in Building Inventory
The building inventory provides a basic understanding of where address points can be located.
sys.path[0]
# Galveston Building inventory
filename = 'Nofal_Galveston_Buildings_2021-10-20\\Galveston_Bldgs_Points_Damage.shp'
bldg_inv_gdf = gpd.read_file(filename)
bldg_inv_gdf.head()
# creating a geometry column
geometry = [Point(xy) for xy in zip(bldg_inv_gdf['X_Longit'], bldg_inv_gdf['Y_Latit'])]
# Coordinate reference system : WGS84
crs = CRS('epsg:4326')
# Creating a Geographic data frame
bldg_inv_gdf = gpd.GeoDataFrame(bldg_inv_gdf, crs=crs, geometry=geometry)
# bldg_inv_gdf[['F_Arch','Bldg_Area']].groupby(['F_Arch']).describe()
# bldg_inv_gdf[['W_Arch','N_Units','Prop_Use','Occupancy']].head()
# pd.pivot_table(bldg_inv_gdf, values='OBJECTID', index=['W_Arch','F_Arch'],
# margins = True, margins_name = 'Total',
# columns=['N_Units','Prop_Use'], aggfunc='count')
## Check Unique Building ID
# Building ID will be important for linking Address Point Inventory to Buildings and Critical Infrastructure Inventories to Buildings.
## ID must be unique and non-missing.
## Count the number of Unique Values
# bldg_inv_gdf.OBJECTID.astype(str).describe()
## Count the number of Unique Values
# bldg_inv_gdf.OBJECTID.nunique()
# bldg_inv_gdf.loc[bldg_inv_gdf['OBJECTID'] == 568208]
## Are there any missing values for the unique id?
# bldg_inv_gdf.loc[bldg_inv_gdf.OBJECTID.isnull()]
# Move Primary Key Column Building ID to first Column
# cols = ['guid'] + [col for col in bldg_inv_gdf if col != 'guid']
# cols
# bldg_inv_gdf = bldg_inv_gdf[cols]
# bldg_inv_gdf.head()
# Confirm Building ID is Unique and Non-Missing
#bldg_inv_gdf.guid.describe()
## Read in Census Block Data
# Census Blocks provide an estimate of how many residential
# address points (housing units) should be located in each block.
source_file = 'tl_2010_48167_tabblockplacepuma10'
census_blocks_csv = output_folder+"/"+source_file+"EPSG4269.csv"
census_blocks_df = pd.read_csv(census_blocks_csv)
census_blocks_gdf = gpd.GeoDataFrame(census_blocks_df)
census_blocks_gdf.head()
# Use shapely.wkt loads to convert WKT to GeoSeries
census_blocks_gdf['geometry'] = census_blocks_gdf['geometry'].apply(lambda x: loads(x))
#census_blocks_gdf['geometry'].geom_type.describe()
census_blocks_gdf = census_blocks_gdf.set_geometry(census_blocks_gdf['geometry'])
census_blocks_gdf.crs = CRS('epsg:4269')
#census_blocks_gdf.head()
## Check CRS for Building Centroid and Block
# census_blocks_gdf.crs
# bldg_inv_gdf.crs
# Convert Census Block CRS to Buildings CRS
census_blocks_gdf = census_blocks_gdf.to_crs(bldg_inv_gdf.crs)
#census_blocks_gdf.crs
# Check change in Geometry
census_blocks_gdf['blk104269'] = \
    census_blocks_gdf['blk104269'].apply(lambda x: loads(x))
#census_blocks_gdf[['geometry','blk104269']].\
#    loc[census_blocks_gdf['geometry'] != census_blocks_gdf['blk104269']]
## Convert BLOCKID10 to a string
census_blocks_gdf['BLOCKID10'] = \
    census_blocks_gdf['GEOID10'].apply(lambda x: str(int(x)))
## Add State, County, and Census Block ID to Each Footprint
## Select Blocks within Bounding Box of Buildings
# Find the bounds of the Buildings to select Census Blocks
# Add Small Buffer for blocks on the edges
buffer = 0.001
minx = bldg_inv_gdf.bounds.minx.min() - buffer # subtract buffer from minimum values
miny = bldg_inv_gdf.bounds.miny.min() - buffer # subtract buffer from minimum values
maxx = bldg_inv_gdf.bounds.maxx.max() + buffer
maxy = bldg_inv_gdf.bounds.maxy.max() + buffer
building_gdf_bounds = [minx, miny, maxx, maxy]
building_gdf_bounds
# Select pumas within Bounds of Study Area
# build the r-tree index - for census blocks
sindex_census_blocks_gdf = census_blocks_gdf.sindex
possible_matches_index = list(sindex_census_blocks_gdf.intersection(building_gdf_bounds))
building_census_blocks_gdf = census_blocks_gdf.iloc[possible_matches_index]
building_census_blocks_gdf['BLOCKID10'].describe()
## Add Census Geography Details to Buildings
# Significant help from: https://geoffboeing.com/2016/10/r-tree-spatial-index-python/
# Significant help from: https://github.com/gboeing/urban-data-science/blob/master/19-Spatial-Analysis-and-Cartography/rtree-spatial-indexing.ipynb
# build the r-tree index - Using Representative Point
sindex_bldg_inv_gdf = bldg_inv_gdf.sindex
sindex_bldg_inv_gdf
# find the points that intersect with each subpolygon and add ID to Point
for index, block in building_census_blocks_gdf.iterrows():
    if index % 100 == 0:
        print('.', end="")
    # find approximate matches with r-tree, then precise matches from those approximate ones
    possible_matches_index = list(sindex_bldg_inv_gdf.intersection(block['geometry'].bounds))
    possible_matches = bldg_inv_gdf.iloc[possible_matches_index]
    precise_matches = possible_matches[possible_matches.intersects(block['geometry'])]
    bldg_inv_gdf.loc[precise_matches.index, 'BLOCKID10'] = block['BLOCKID10']
    bldg_inv_gdf.loc[precise_matches.index, 'placeGEOID10'] = block['placeGEOID10']
    bldg_inv_gdf.loc[precise_matches.index, 'placeNAME10'] = block['placeNAME10']
# Move Foreign Key Columns Block ID State, County, Tract to first Columns
first_columns = ['OBJECTID','BLOCKID10','placeGEOID10','placeNAME10']
cols = first_columns + [col for col in bldg_inv_gdf if col not in first_columns]
bldg_inv_gdf = bldg_inv_gdf[cols]
bldg_inv_gdf.head()
### How many buildings do not have block id information?
bldg_noblock_gdf = bldg_inv_gdf.loc[(bldg_inv_gdf['BLOCKID10'].isnull())]
bldg_noblock_gdf
# if there are missing buildings this code will help identify where they are -
# every building should have a block
# plot the building with missing block data
# Find the bounds of the Census Block File
minx = bldg_inv_gdf.bounds.minx.min()
miny = bldg_inv_gdf.bounds.miny.min()
maxx = bldg_inv_gdf.bounds.maxx.max()
maxy = bldg_inv_gdf.bounds.maxy.max()
blockstyle_function = lambda x: {'color':'green','fillColor': 'transparent' }
bldg_inv_gdf_map = fm.Map(location=[(miny+maxy)/2,(minx+maxx)/2], zoom_start=10)
fm.GeoJson(bldg_noblock_gdf).add_to(bldg_inv_gdf_map)
fm.GeoJson(census_blocks_gdf['geometry'],name='All Census Blocks',style_function=blockstyle_function).add_to(bldg_inv_gdf_map)
fm.GeoJson(building_census_blocks_gdf['geometry'],name='Selected Census Blocks').add_to(bldg_inv_gdf_map)
bldg_inv_gdf_map.save(output_folder + '/' + 'buildings_noblocks.html')
# Error Displaying Map display(bldg_inv_gdf_map)
### Save Work
# Check Columns
cols = [col for col in bldg_inv_gdf]
# Save Work at this point as CSV
savefile = sys.path[0]+"/"+output_folder+"/"+programname+"_EPSG4326.csv"
bldg_inv_gdf.to_csv(savefile)
'''
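# The block above is flagged "Code to convert to functions"; one possible wrapper,
# sketched here with assumed parameter names (not part of the committed code):
''' example code
def clean_building_inventory(building_shapefile, census_blocks_csv, output_folder='hua_workflow'):
    """Read a building inventory, rebuild block geometry from WKT, and match CRS."""
    bldg_inv_gdf = gpd.read_file(building_shapefile)
    census_blocks_gdf = gpd.GeoDataFrame(pd.read_csv(census_blocks_csv))
    census_blocks_gdf['geometry'] = census_blocks_gdf['geometry'].apply(loads)
    census_blocks_gdf = census_blocks_gdf.set_geometry('geometry')
    census_blocks_gdf.crs = CRS('epsg:4269')
    census_blocks_gdf = census_blocks_gdf.to_crs(bldg_inv_gdf.crs)
    # ...continue with the r-tree spatial join from the commented block above...
    return bldg_inv_gdf
'''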