In [249]:
# This notebook adds the unique FIPS code to each county demographic entry scraped from indexmundi.
# The FIPS code is the key used to merge this demographic data with the COVID-19 case data and
# the health care quality data.

In [250]:
import pandas as pd

In [251]:
# Load the county demographics scraped from indexmundi.
demographics_df = pd.read_csv('scrape_demographics.csv')

# Load the county list returned by the Census API:
# https://api.census.gov/data/2010/dec/sf1?get=NAME&for=county:* => gives FIPS code for every county in the US
# The saved response is a table of [NAME, state, county] rows whose first row is the header.
# dtype=False switches off pandas' dtype inference so the state/county codes are always kept
# as text. If they were ever parsed as numbers, leading zeros would be lost ("05" -> 5) and the
# concatenated FIPS code built from them would be wrong.
fips_df = pd.read_json('api_fips.json', dtype=False)
fips_df.columns = ['name', 'state_code', 'county_code']

In [252]:
# Build the county FIPS code by joining the state code and the county code as text,
# state first (e.g. '05' followed by '131' gives '05131').
fips_df['fips_code'] = fips_df['state_code'].str.cat(fips_df['county_code'])
fips_df

Unnamed: 0,name,state_code,county_code,fips_code
0,NAME,state,county,statecounty
1,"Sebastian County, Arkansas",05,131,05131
2,"Sevier County, Arkansas",05,133,05133
3,"Sharp County, Arkansas",05,135,05135
4,"Stone County, Arkansas",05,137,05137
...,...,...,...,...
3217,"Eau Claire County, Wisconsin",55,035,55035
3218,"Florence County, Wisconsin",55,037,55037
3219,"Fond du Lac County, Wisconsin",55,039,55039
3220,"Forest County, Wisconsin",55,041,55041


In [253]:
# Remove the JSON header record (the row holding the literal column titles 'NAME' / 'state').
# Filtering on the header's content, rather than dropping index label 0, keeps this cell safe
# to re-run: a second execution simply finds nothing to remove instead of raising KeyError.
is_header_row = fips_df['name'].eq('NAME') & fips_df['state_code'].eq('state')
# .copy() gives an independent frame so the column assignment below does not warn
# about writing to a slice.
fips_df = fips_df.loc[~is_header_row].copy()

# Upper-case the names so they can be compared with the upper-case names in the demographics data
fips_df['name'] = fips_df['name'].str.upper()

fips_df

Unnamed: 0,name,state_code,county_code,fips_code
1,"SEBASTIAN COUNTY, ARKANSAS",05,131,05131
2,"SEVIER COUNTY, ARKANSAS",05,133,05133
3,"SHARP COUNTY, ARKANSAS",05,135,05135
4,"STONE COUNTY, ARKANSAS",05,137,05137
5,"UNION COUNTY, ARKANSAS",05,139,05139
...,...,...,...,...
3217,"EAU CLAIRE COUNTY, WISCONSIN",55,035,55035
3218,"FLORENCE COUNTY, WISCONSIN",55,037,55037
3219,"FOND DU LAC COUNTY, WISCONSIN",55,039,55039
3220,"FOREST COUNTY, WISCONSIN",55,041,55041


In [254]:
# Two-word state names contain a '-' between the words (i.e. 'West-Virginia');
# swap every hyphen for a space so they read 'West Virginia'.
demographics_df['State'] = demographics_df['State'].replace(to_replace='-', value=' ', regex=True)


In [255]:
# Compose the "<County>, <State>" label that is later matched against fips_df['name'].
demographics_df['full_name'] = demographics_df['County'].str.cat(demographics_df['State'], sep=", ")
demographics_df

Unnamed: 0.1,Unnamed: 0,State,County,"Population estimates, July 1, 2019, (V2019)","Population, percent change - April 1, 2010 (estimates base) to July 1, 2019, (V2019)","Population estimates base, April 1, 2010, (V2019)","Population, Census, April 1, 2010","Persons under 5 years, percent","Persons under 18 years, percent","Persons 65 years and over, percent",...,"Nonveteran-owned firms, 2012","Black-owned firms, percent, 2007","American Indian- and Alaska Native-owned firms, percent, 2007","Asian-owned firms, percent, 2007","Native Hawaiian- and Other Pacific Islander-owned firms, percent, 2007","Hispanic-owned firms, percent, 2007","Women-owned firms, percent, 2007","Population per square mile, 2010","Land area in square miles, 2010",full_name
0,0,ALABAMA,AUTAUGA COUNTY,55601,1.9%,54574,54571,6.1%,23.7%,15.6%,...,2401,15.2%,0.0%,1.3%,0.0%,0.7%,31.7%,91.8,594.44,"AUTAUGA COUNTY, ALABAMA"
1,1,ALABAMA,BALDWIN COUNTY,218022,19.6%,182264,182265,5.5%,21.6%,20.4%,...,16183,2.7%,0.4%,1.0%,0.0%,1.3%,27.3%,114.6,1589.78,"BALDWIN COUNTY, ALABAMA"
2,2,ALABAMA,BARBOUR COUNTY,24881,-9.4%,27457,27457,5.2%,20.9%,19.4%,...,1445,0.0%,0.0%,0.0%,0.0%,0.0%,27.0%,31.0,884.88,"BARBOUR COUNTY, ALABAMA"
3,3,ALABAMA,BIBB COUNTY,22400,-2.3%,22920,22915,5.7%,20.5%,16.5%,...,967,14.9%,0.0%,0.0%,0.0%,0.0%,0.0%,36.8,622.58,"BIBB COUNTY, ALABAMA"
4,4,ALABAMA,BLOUNT COUNTY,57840,0.9%,57321,57322,6.0%,23.2%,18.2%,...,4030,0.0%,0.0%,0.0%,0.0%,0.0%,23.2%,88.9,644.78,"BLOUNT COUNTY, ALABAMA"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3138,3138,WYOMING,SWEETWATER COUNTY,43051,-1.7%,43806,43806,6.9%,26.2%,12.1%,...,2890,0.0%,0.0%,0.7%,0.0%,3.8%,27.2%,4.2,10426.65,"SWEETWATER COUNTY, WYOMING"
3139,3139,WYOMING,TETON COUNTY,23081,8.4%,21298,21294,4.9%,18.4%,15.4%,...,5362,0.0%,0.0%,0.5%,0.0%,3.3%,25.3%,5.3,3995.38,"TETON COUNTY, WYOMING"
3140,3140,WYOMING,UINTA COUNTY,20299,-3.9%,21121,21118,7.3%,28.8%,14.1%,...,1651,0.0%,0.0%,0.0%,0.0%,2.2%,15.9%,10.1,2081.26,"UINTA COUNTY, WYOMING"
3141,3141,WYOMING,WASHAKIE COUNTY,7885,-7.5%,8528,8533,5.1%,22.7%,21.7%,...,685,0.0%,0.0%,0.0%,0.0%,0.0%,26.9%,3.8,2238.55,"WASHAKIE COUNTY, WYOMING"


In [256]:
# Rewrite the scraped names so they line up with the names in fips_df['name'].
# Each rule is a (regex pattern, replacement) pair and the rules are applied in order.
# Order matters: the two Virginia rules undo what the general 'CITY COUNTY,' rule
# does to places whose real name ends in 'CITY COUNTY'.
full_name_fixes = [
    # Many places have cities instead of counties
    ('CITY COUNTY,', 'CITY,'),
    # Except for two places in VA
    ('CHARLES CITY, VIRGINIA', 'CHARLES CITY COUNTY, VIRGINIA'),
    ('JAMES CITY, VIRGINIA', 'JAMES CITY COUNTY, VIRGINIA'),
    # Many places have parishes instead of counties
    ('PARISH COUNTY,', 'PARISH,'),
    # Alaska doesn't have counties
    (' COUNTY, ALASKA', ', ALASKA'),
    # The ñ of Doña Ana shows up as '±' in the scraped text; map it to '?'.
    # NOTE(review): presumably '?' is how the character appears in the Census names being
    # matched against. This looks like a file-encoding problem rather than a Python
    # limitation - confirm the encodings of both input files.
    ('DO±A ANA COUNTY, NEW MEXICO', 'DO?A ANA COUNTY, NEW MEXICO'),
]

for pattern, replacement in full_name_fixes:
    demographics_df['full_name'] = demographics_df['full_name'].replace(
        to_replace=pattern, value=replacement, regex=True
    )

# Drop rows where State and County are the same (a mistake in scraping).
# Reassigning instead of dropping in place leaves any previously displayed frame untouched.
is_scrape_artifact = demographics_df['State'] == demographics_df['County']
demographics_df = demographics_df.drop(index=demographics_df.index[is_scrape_artifact])


In [257]:
# Attach the FIPS code to each demographics row by matching full_name against the Census name.
# A left join keeps every row of the left frame (the demographics data); rows whose name has
# no counterpart in fips_df are kept with missing values in the columns taken from fips_df.
encoded_df = demographics_df.merge(
    fips_df.loc[:, ['fips_code', 'name']],
    how='left',
    left_on='full_name',
    right_on='name',
)

In [258]:
# Remove the helper columns that should not end up in the output:
#   'Unnamed: 0' - apparently a row counter carried over from the scraped CSV (see the preview)
#   'name'       - the join key taken from fips_df (redundant next to 'full_name')
# Reassigning with errors='ignore' (instead of an in-place drop) keeps the cell re-runnable:
# a second execution no longer raises KeyError because the columns are already gone.
encoded_df = encoded_df.drop(columns=['Unnamed: 0', 'name'], errors='ignore')

encoded_df

Unnamed: 0,State,County,"Population estimates, July 1, 2019, (V2019)","Population, percent change - April 1, 2010 (estimates base) to July 1, 2019, (V2019)","Population estimates base, April 1, 2010, (V2019)","Population, Census, April 1, 2010","Persons under 5 years, percent","Persons under 18 years, percent","Persons 65 years and over, percent","Female persons, percent",...,"Black-owned firms, percent, 2007","American Indian- and Alaska Native-owned firms, percent, 2007","Asian-owned firms, percent, 2007","Native Hawaiian- and Other Pacific Islander-owned firms, percent, 2007","Hispanic-owned firms, percent, 2007","Women-owned firms, percent, 2007","Population per square mile, 2010","Land area in square miles, 2010",full_name,fips_code
0,ALABAMA,AUTAUGA COUNTY,55601,1.9%,54574,54571,6.1%,23.7%,15.6%,51.4%,...,15.2%,0.0%,1.3%,0.0%,0.7%,31.7%,91.8,594.44,"AUTAUGA COUNTY, ALABAMA",01001
1,ALABAMA,BALDWIN COUNTY,218022,19.6%,182264,182265,5.5%,21.6%,20.4%,51.5%,...,2.7%,0.4%,1.0%,0.0%,1.3%,27.3%,114.6,1589.78,"BALDWIN COUNTY, ALABAMA",01003
2,ALABAMA,BARBOUR COUNTY,24881,-9.4%,27457,27457,5.2%,20.9%,19.4%,47.2%,...,0.0%,0.0%,0.0%,0.0%,0.0%,27.0%,31.0,884.88,"BARBOUR COUNTY, ALABAMA",01005
3,ALABAMA,BIBB COUNTY,22400,-2.3%,22920,22915,5.7%,20.5%,16.5%,46.8%,...,14.9%,0.0%,0.0%,0.0%,0.0%,0.0%,36.8,622.58,"BIBB COUNTY, ALABAMA",01007
4,ALABAMA,BLOUNT COUNTY,57840,0.9%,57321,57322,6.0%,23.2%,18.2%,50.7%,...,0.0%,0.0%,0.0%,0.0%,0.0%,23.2%,88.9,644.78,"BLOUNT COUNTY, ALABAMA",01009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3130,WYOMING,SWEETWATER COUNTY,43051,-1.7%,43806,43806,6.9%,26.2%,12.1%,48.5%,...,0.0%,0.0%,0.7%,0.0%,3.8%,27.2%,4.2,10426.65,"SWEETWATER COUNTY, WYOMING",56037
3131,WYOMING,TETON COUNTY,23081,8.4%,21298,21294,4.9%,18.4%,15.4%,48.4%,...,0.0%,0.0%,0.5%,0.0%,3.3%,25.3%,5.3,3995.38,"TETON COUNTY, WYOMING",56039
3132,WYOMING,UINTA COUNTY,20299,-3.9%,21121,21118,7.3%,28.8%,14.1%,49.3%,...,0.0%,0.0%,0.0%,0.0%,2.2%,15.9%,10.1,2081.26,"UINTA COUNTY, WYOMING",56041
3133,WYOMING,WASHAKIE COUNTY,7885,-7.5%,8528,8533,5.1%,22.7%,21.7%,49.4%,...,0.0%,0.0%,0.0%,0.0%,0.0%,26.9%,3.8,2238.55,"WASHAKIE COUNTY, WYOMING",56043


In [259]:
# Assert that every entry in the demographic data has been encoded.
# On failure the message lists (up to 20 of) the names that found no FIPS code,
# so the missing name fix-ups can be identified directly.
unmatched_names = encoded_df.loc[encoded_df['fips_code'].isnull(), 'full_name']
assert unmatched_names.empty, (
    "{} demographic rows have no FIPS code, e.g.: {}".format(
        len(unmatched_names), unmatched_names.tolist()[:20]
    )
)

In [260]:
# Write the FIPS-encoded demographics to a new csv file.
# The row index is written as the first, unnamed column (the to_csv default).
# NOTE(review): fips_code values carry leading zeros; anything reading this file back
# will presumably need to load that column as text to keep them - confirm downstream.
encoded_df.to_csv(path_or_buf='fips_encoded_demographics.csv')