In [8]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#geographic plotting
import geopandas as gpd
from shapely.geometry import Point, Polygon
#interactive plotting
import plotly.graph_objects as go
import plotly.express as px
#bokeh
import bokeh

In [3]:
#change directory for your path
#set the EHEALTH_DATA_DIR environment variable to point at your own copy of the
#data folder; when it is not set, the original hard-coded location is used
DATA_DIR = os.environ.get(
    'EHEALTH_DATA_DIR',
    'E:\\OneDrive\\Documents\\GitHub\\eHealthEquity\\Research\\Data Source',
)
os.chdir(DATA_DIR)

In [4]:
#load the merged table from the working directory
df = pd.read_csv('acs5y_2021_brfss_2021_merged.csv')

#identifier columns
index_list = ['state', 'county']

#brfss_ columns, leaving out those ending in crdprv / upper / lower
brfss_list = [
    col for col in df.columns
    if col.startswith('brfss_')
    and not col.endswith(('crdprv', 'upper', 'lower'))
]

#every pct_ column (no suffix filtering is applied here)
acs_list = [col for col in df.columns if col.startswith('pct_')]

#keep identifiers first, then the brfss_ columns, then the pct_ columns
all_columns = index_list + brfss_list + acs_list
df = df[all_columns]

In [5]:
#preview the first rows of the column-filtered frame
df.head()

Unnamed: 0,state,county,brfss_access2_ageadjprv,brfss_checkup_ageadjprv,brfss_csmoking_ageadjprv,brfss_depression_ageadjprv,brfss_diabetes_ageadjprv,brfss_ghlth_ageadjprv,brfss_mhlth_ageadjprv,brfss_obesity_ageadjprv,...,pct_occ_svc_upper,pct_occ_sales_lower,pct_occ_sales,pct_occ_sales_upper,pct_occ_nat_res_lower,pct_occ_nat_res,pct_occ_nat_res_upper,pct_occ_prod_lower,pct_occ_prod,pct_occ_prod_upper
0,Alaska,Aleutians East,16.9,60.7,18.6,13.1,12.6,19.1,12.9,31.3,...,0.157253,0.094894,0.116584,0.138274,0.100768,0.126525,0.152282,0.300045,0.386353,0.472662
1,Alaska,Aleutians West,13.7,61.6,15.5,13.4,10.9,15.3,12.2,30.3,...,0.18933,0.187141,0.232558,0.277975,0.080711,0.109439,0.138167,0.279891,0.34145,0.40301
2,Alaska,Anchorage,10.4,62.9,14.9,19.5,7.8,13.5,14.5,32.9,...,0.189401,0.200293,0.21001,0.219726,0.076038,0.082726,0.089413,0.104849,0.111143,0.117436
3,Alaska,Bethel,19.2,62.6,35.4,21.2,15.9,30.0,21.4,43.1,...,0.231484,0.154323,0.24357,0.332817,0.057174,0.073908,0.090641,0.103192,0.132631,0.16207
4,Alaska,Bristol Bay,10.3,61.4,18.7,18.8,8.6,14.4,15.4,35.6,...,0.164786,0.121896,0.17833,0.234763,0.063205,0.176072,0.288939,0.085779,0.158014,0.230248


In [16]:
#using county and state, find the geographic coordinates using geopandas

#first download the shapefile from the census
def download_shapefile(url, extract_to=None, timeout=60):
    """Download the zip archive at ``url`` and unpack all of its members.

    Parameters
    ----------
    url : str
        Address of the zip archive to fetch.
    extract_to : str or None, optional
        Directory that receives the archive members. ``None`` (the default)
        extracts into the current working directory.
    timeout : float, optional
        Seconds ``requests`` waits on the server before giving up, so a
        stalled connection cannot hang the notebook indefinitely.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    zipfile.BadZipFile
        If the downloaded content is not a valid zip archive.
    """
    import requests, zipfile, io
    r = requests.get(url, timeout=timeout)
    #fail on 4xx/5xx here rather than handing an error page to ZipFile
    r.raise_for_status()
    #context manager closes the in-memory archive once extraction is done
    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        z.extractall(extract_to)

#2021 shapefile
url = 'https://www2.census.gov/geo/tiger/TIGER2021/COUNTY/tl_2021_us_county.zip'
#skip the download when the extracted .shp is already in the working directory,
#so re-running the notebook does not fetch the whole archive again
if not os.path.exists('tl_2021_us_county.shp'):
    download_shapefile(url)



In [17]:
#read in the shapefile (relative path, so it is looked up in the working directory)
#the preview that used to follow this line was never rendered: a notebook cell
#only displays its last expression, and this cell ends with its own head() call
shapefile = gpd.read_file('tl_2021_us_county.shp')

#lookup used to convert STATEFP codes into state names
#(two-digit code -> name; 50 states plus the District of Columbia)
_statefp_name_pairs = [
    ('01', 'Alabama'), ('02', 'Alaska'), ('04', 'Arizona'),
    ('05', 'Arkansas'), ('06', 'California'), ('08', 'Colorado'),
    ('09', 'Connecticut'), ('10', 'Delaware'), ('11', 'District of Columbia'),
    ('12', 'Florida'), ('13', 'Georgia'), ('15', 'Hawaii'),
    ('16', 'Idaho'), ('17', 'Illinois'), ('18', 'Indiana'),
    ('19', 'Iowa'), ('20', 'Kansas'), ('21', 'Kentucky'),
    ('22', 'Louisiana'), ('23', 'Maine'), ('24', 'Maryland'),
    ('25', 'Massachusetts'), ('26', 'Michigan'), ('27', 'Minnesota'),
    ('28', 'Mississippi'), ('29', 'Missouri'), ('30', 'Montana'),
    ('31', 'Nebraska'), ('32', 'Nevada'), ('33', 'New Hampshire'),
    ('34', 'New Jersey'), ('35', 'New Mexico'), ('36', 'New York'),
    ('37', 'North Carolina'), ('38', 'North Dakota'), ('39', 'Ohio'),
    ('40', 'Oklahoma'), ('41', 'Oregon'), ('42', 'Pennsylvania'),
    ('44', 'Rhode Island'), ('45', 'South Carolina'), ('46', 'South Dakota'),
    ('47', 'Tennessee'), ('48', 'Texas'), ('49', 'Utah'),
    ('50', 'Vermont'), ('51', 'Virginia'), ('53', 'Washington'),
    ('54', 'West Virginia'), ('55', 'Wisconsin'), ('56', 'Wyoming'),
]
state_dict = dict(_statefp_name_pairs)

#look up the state name for every STATEFP code and store it as a new column
state_names = shapefile['STATEFP'].map(state_dict)
shapefile['state'] = state_names
#preview the result, now including the state column
shapefile.head()

Unnamed: 0,STATEFP,COUNTYFP,COUNTYNS,GEOID,NAME,NAMELSAD,LSAD,CLASSFP,MTFCC,CSAFP,CBSAFP,METDIVFP,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,geometry,state
0,31,39,835841,31039,Cuming,Cuming County,6,H1,G4020,,,,A,1477645345,10690204,41.9158651,-96.7885168,"POLYGON ((-96.55515 41.91587, -96.55515 41.914...",Nebraska
1,53,69,1513275,53069,Wahkiakum,Wahkiakum County,6,H1,G4020,,,,A,680976231,61568965,46.2946377,-123.4244583,"POLYGON ((-123.49077 46.38358, -123.48813 46.3...",Washington
2,35,11,933054,35011,De Baca,De Baca County,6,H1,G4020,,,,A,6016818946,29090018,34.3592729,-104.3686961,"POLYGON ((-104.38368 34.69213, -104.37658 34.6...",New Mexico
3,31,109,835876,31109,Lancaster,Lancaster County,6,H1,G4020,339.0,30700.0,,A,2169272970,22847034,40.7835474,-96.6886584,"POLYGON ((-96.68140 41.04566, -96.68139 41.045...",Nebraska
4,31,129,835886,31129,Nuckolls,Nuckolls County,6,H1,G4020,,,,A,1489645185,1718484,40.1764918,-98.0468422,"POLYGON ((-98.04802 40.35066, -98.04674 40.350...",Nebraska


In [27]:
#remove the ' Cty' marker from the county names of both frames
cty_marker = ' Cty'
shapefile['NAME'] = shapefile['NAME'].str.replace(cty_marker, '')
df['county'] = df['county'].str.replace(cty_marker, '')

In [28]:
#assess matching between shapefile and df for state and county names
#build the same state+county key on both frames
shapefile['state_county'] = shapefile['state'] + shapefile['NAME']
df['state_county'] = df['state'] + df['county']

#count the distinct state and county combinations (shapefile first, then df)
shapefile_keys = set(shapefile['state_county'].unique())
df_keys = set(df['state_county'].unique())
print(len(shapefile_keys))
print(len(df_keys))

#non-matching state and county combinations: in df but not in the shapefile
unmatched_keys = df_keys - shapefile_keys
print(len(unmatched_keys))

#display non-matching state and county combinations
unmatched_keys



3138
3070
3


{'NevadaCarson', 'VirginiaCharles', 'VirginiaJames'}

In [None]:
#create 