Here we import data from a few spreadsheets, strip out what we don't need, and combine the rest into one table. The code is written to run in Google Colab (with the files stored in Drive), but it can be modified to run locally.

In [None]:
#hello
import geopandas as gpd
import pandas as pd
!pip install mapclassify

In [None]:
# Mount Google Drive at /content/drive so the CSV paths used by this notebook
# (written relative to /content, e.g. 'drive/MyDrive/...') can be opened.
# Colab-only: skip this cell and adjust the paths when running locally.
from google.colab import drive
drive.mount('/content/drive')

In [110]:
# Locations of the input spreadsheets; the trailing comment on each line gives
# the download source. The paths are relative, so they only resolve when the
# working directory is the parent of the Drive mount point ('/content' in Colab).
health = 'drive/MyDrive/walkability/health.csv'      # https://data.cdc.gov/500-Cities-Places/500-Cities-Local-Data-for-Better-Health-2019-relea/6vp6-wxuq/about_data
epacsv = 'drive/MyDrive/walkability/epdownload.csv'  # https://edg.epa.gov/EPADataCommons/public/OA/EPA_SmartLocationDatabase_V3_Jan_2021_Final.csv
incomesource = 'drive/MyDrive/walkability/ACSDT5Y2017.B19013-2024-03-26T221220.csv'      # https://data.census.gov/table/ACSDT5Y2017.B19013?q=b19013&g=040XX00US11$1400000&moe=false&tp=true
income = 'drive/MyDrive/walkability/income.csv'      # output: the cleaned-up copy of `incomesource` is written here

In [None]:
# Flatten the raw Census export into a two-column CSV (Tract, Income) by plain
# text substitution, then save the result to `income`.
# NOTE(review): the patterns below imply that each tract occupies two rows in
# the raw file (a "Census Tract N, District of Columbia, ..." label row followed
# by an "    Estimate" row) -- confirm against the downloaded file.
# The encoding is explicit because the '\ufeff' (byte-order mark) pattern only
# matches when the file is decoded as UTF-8; the platform default may differ
# when this is run outside Colab.
with open(incomesource, 'r', encoding='utf-8') as incomefile:
  incomestr = (
      incomefile.read()
      # join each tract label row with its estimate row
      .replace(', District of Columbia, District of Columbia",""\n"    Estimate', '')
      # shorten the two header cells
      .replace('Median household income in the past 12 months (in 2017 inflation-adjusted dollars)', 'Income')
      .replace('\ufeff"Label (Grouping)"', 'Tract')
      # leave only the tract number in the first column
      .replace('Census Tract ', '')
  )
with open(income, 'w', encoding='utf-8') as incomefile:
  incomefile.write(incomestr)

# read_file may or may not attach an (empty) geometry column to a plain CSV,
# depending on the geopandas I/O engine; drop it only if it is there.
inc = gpd.read_file(income).drop(columns='geometry', errors='ignore')

# Remove thousands separators and actually store the numeric result (the
# original called inc.apply(pd.to_numeric, ...) and discarded the return value,
# so Income stayed a string column). Values that are still not numeric after
# the comma is removed become NaN because of errors='coerce'.
inc['Income'] = pd.to_numeric(inc['Income'].str.replace(',', ''), errors='coerce')

# Tract numbers such as "2.01" become integer codes (201) so they can be matched
# against the integer TRACTCE column of the EPA table. Rounding before the int
# cast guards against float error (2.01 * 100 == 200.99999999999997).
inc['Tract'] = (inc['Tract'].astype(float) * 100).round().astype(int)
inc.set_index('Tract', inplace=True)
inc

In [None]:
# EPA Smart Location Database columns that are not needed downstream
# (one line per column-name prefix group, original order kept).
columns_to_drop = [
    'D1A', 'D1B', 'D1C',
    'D1C5_RET', 'D1C5_OFF', 'D1C5_IND', 'D1C5_SVC', 'D1C5_ENT',
    'D1C8_RET', 'D1C8_OFF', 'D1C8_IND', 'D1C8_SVC', 'D1C8_ENT',
    'D1C8_ED', 'D1C8_HLTH', 'D1C8_PUB',
    'D1D', 'D1_FLAG',
    'D2A_JPHH', 'D2B_E5MIX', 'D2B_E5MIXA', 'D2B_E8MIX', 'D2B_E8MIXA', 'D2A_EPHHM',
    'D2C_TRPMX1', 'D2C_TRPMX2', 'D2C_TRIPEQ',
    'D2R_JOBPOP', 'D2R_WRKEMP', 'D2A_WRKEMP', 'D2C_WREMLX',
    'D4D', 'D4E',
    'D5CR', 'D5CRI', 'D5CE', 'D5CEI', 'D5DR', 'D5DRI', 'D5DE', 'D5DEI',
    'D2A_Ranked', 'D2B_Ranked', 'D3B_Ranked', 'D4A_Ranked',
]

# Load only the rows with state FIPS 11 (DC), then discard the unused columns.
dc_rows = gpd.read_file(epacsv, where="STATEFP='11'")
epa = dc_rows.drop(columns=columns_to_drop)

# Integer tract codes are what the later joins match on.
epa['TRACTCE'] = epa['TRACTCE'].astype(int)
epa = epa.set_index('OBJECTID')
epa

In [None]:
#gdb = 'drive/MyDrive/smartlocation/smartlocdb/SmartLocationDatabase.gdb'
#fc = gpd.read_file(gdb, where="STATEFP='11'")

In [None]:
# Load the CDC health data, keeping only DC census-tract rows and the columns
# used below.
hl = gpd.read_file(
    health,
    where="StateAbbr='DC' AND GeographicLevel='Census Tract'",
    include_fields=['StateAbbr', 'GeographicLevel', 'UniqueID', 'MeasureId',
                    'CityFIPS', 'TractFIPS', 'Data_Value'],
)

# Derive the integer tract code by stripping the state + county FIPS prefix.
# That prefix is the 5 characters '11001'; the previous 6-character prefix
# '110010' also consumed the first digit of the tract code, which only worked
# while that digit was '0'. The int cast drops the remaining leading zeros, so
# the result is unchanged for codes that do start with '0'.
hl['TractCE'] = hl['TractFIPS'].str.removeprefix('11001').astype(int)

# One row per tract, one column per health measure.
# pivot raises if a (TractCE, MeasureId) pair occurs more than once.
hlp = hl.pivot(index='TractCE', columns='MeasureId', values='Data_Value')
hlp

In [None]:
# Both joins share the same shape: match the left table's TRACTCE column
# against the right table's index, keep every left row, and require the
# right-hand keys to be unique (many-to-one).
tract_join = dict(on='TRACTCE', how='left', validate='m:1')

firstjoin = epa.join(hlp, **tract_join)          # EPA + CDC health measures
secondjoin = firstjoin.join(inc, **tract_join)   # ... + census income data
secondjoin