In [1]:
##import packages used in this session
import pandas as pd
from random import sample

## Data collection and cleaning

### COVID-19 cases by county, race, and ethnicity
This data source was obtained from the New York Times public GitHub repository. The unit of observation is the county; this data frame reports the number of reported COVID-19 cases by race and ethnicity for counties that provided this information to the CDC. The timeframe is from the beginning of the pandemic until the end of May 2020. Source: https://github.com/nytimes/covid-19-data/issues/381

In [3]:
## load the NYT county-level case file; 'fips' is kept as dtype object so the codes are not parsed as numbers
county_data = pd.read_csv(
    "Data/data.csv",
    dtype={'fips': object},
)

In [4]:
## dimensions of the NYT data frame as (rows, columns)
county_data.shape

(974, 23)

There are 974 counties represented in this data. There are approximately 3,007 counties in the United States. 

In [5]:
## column headers
county_data.columns

Index(['fips', 'state', 'county', 'cases', 'white_cases', 'black_cases',
       'hispanic_cases', 'native_cases', 'asian_cases', 'white_rate',
       'black_rate', 'hispanic_rate', 'native_rate', 'asian_rate', 'known_pct',
       'pop_white', 'pop_black', 'pop_hispanic', 'pct_white', 'pct_black',
       'pct_hispanic', 'pct_asian', 'pct_native'],
      dtype='object')

The NYT data set contains pre-calculated COVID-19 case rates amongst different racial and ethnic populations and also contains population data for white, black, and hispanic populations in each of the 974 counties. Note that the population data for asian and native american populations is not included. Since the outcome variable for the statistical learning component will rely on population data for all minority populations represented in this data (black, hispanic, native american, and asian), only the COVID-19 cases will remain in this data set, and population and COVID-19 rates for each racial/ethnic category in each county will be calculated later on using the most recent population data from the U.S. Census. 

In [6]:
## keep relevant columns (i.e., county fips code; state; county; total cases; cases by race)
county_data = county_data.filter(
    [
        'fips', 'state', 'county',
        'cases',
        'white_cases', 'black_cases', 'hispanic_cases', 'native_cases', 'asian_cases',
    ],
    axis="columns",
)

In [7]:
## preview the first 10 rows to confirm only the selected columns remain
county_data.head(10)

Unnamed: 0,fips,state,county,cases,white_cases,black_cases,hispanic_cases,native_cases,asian_cases
0,1001,Alabama,Autauga County,119,44,21,0,0,0
1,1003,Alabama,Baldwin County,201,144,6,0,0,0
2,1005,Alabama,Barbour County,91,5,32,0,0,0
3,1007,Alabama,Bibb County,50,17,5,0,0,0
4,1009,Alabama,Blount County,37,15,0,0,0,0
5,1011,Alabama,Bullock County,125,0,106,0,0,0
6,1013,Alabama,Butler County,263,65,166,0,0,0
7,1015,Alabama,Calhoun County,124,72,23,0,0,0
8,1017,Alabama,Chambers County,301,81,181,0,0,0
9,1021,Alabama,Chilton County,65,27,0,0,0,0


For ease of analysis and visualization later on, the state names will be converted to two-letter abbreviations. A publicly available dictionary, created and made available by Roger Allen (https://gist.github.com/rogerallen/1583593), will be used for this conversion.

In [8]:
## state/territory name -> two-letter postal abbreviation
## (mapping published by Roger Allen: https://gist.github.com/rogerallen/1583593)
us_state_abbrev = {
    "Alabama": "AL", "Alaska": "AK", "American Samoa": "AS", "Arizona": "AZ",
    "Arkansas": "AR", "California": "CA", "Colorado": "CO", "Connecticut": "CT",
    "Delaware": "DE", "District of Columbia": "DC", "Florida": "FL", "Georgia": "GA",
    "Guam": "GU", "Hawaii": "HI", "Idaho": "ID", "Illinois": "IL",
    "Indiana": "IN", "Iowa": "IA", "Kansas": "KS", "Kentucky": "KY",
    "Louisiana": "LA", "Maine": "ME", "Maryland": "MD", "Massachusetts": "MA",
    "Michigan": "MI", "Minnesota": "MN", "Mississippi": "MS", "Missouri": "MO",
    "Montana": "MT", "Nebraska": "NE", "Nevada": "NV", "New Hampshire": "NH",
    "New Jersey": "NJ", "New Mexico": "NM", "New York": "NY", "North Carolina": "NC",
    "North Dakota": "ND", "Northern Mariana Islands": "MP", "Ohio": "OH", "Oklahoma": "OK",
    "Oregon": "OR", "Pennsylvania": "PA", "Puerto Rico": "PR", "Rhode Island": "RI",
    "South Carolina": "SC", "South Dakota": "SD", "Tennessee": "TN", "Texas": "TX",
    "Utah": "UT", "Vermont": "VT", "Virgin Islands": "VI", "Virginia": "VA",
    "Washington": "WA", "West Virginia": "WV", "Wisconsin": "WI", "Wyoming": "WY",
}

In [9]:
##convert state names to two-letter abbreviations
## values that are not keys of us_state_abbrev (including values that are already
## abbreviations) are left unchanged, so re-running this cell does not turn the column into None
county_data['state'] = county_data['state'].map(lambda name: us_state_abbrev.get(name, name))

In [10]:
##view dataframe; confirm changes.
county_data.sample(20)

Unnamed: 0,fips,state,county,cases,white_cases,black_cases,hispanic_cases,native_cases,asian_cases
204,13121,GA,Fulton County,2196,121,441,0,0,0
5,1011,AL,Bullock County,125,0,106,0,0,0
825,42133,PA,York County,968,171,43,179,0,0
918,53005,WA,Benton County,600,215,0,183,0,16
475,27067,MN,Kandiyohi County,457,61,74,84,0,0
38,1095,AL,Marshall County,539,160,0,281,0,0
518,28045,MS,Hancock County,62,21,0,0,0,0
669,37067,NC,Forsyth County,1065,148,120,485,0,55
841,45045,SC,Greenville County,1125,382,244,173,0,0
206,13129,GA,Gordon County,71,31,0,0,0,0


In [11]:
##remove leading zeros from fips code
## vectorized string accessor: same result as stripping each value in a Python loop,
## but missing values stay missing instead of raising AttributeError
county_data['fips'] = county_data['fips'].str.lstrip('0')

In [12]:
## check dtypes
county_data.dtypes

fips              object
state             object
county            object
cases              int64
white_cases        int64
black_cases        int64
hispanic_cases     int64
native_cases       int64
asian_cases        int64
dtype: object

### Population: https://www.census.gov/newsroom/press-kits/2020/population-estimates-detailed.html
The NYT data set is missing population data for the Native American and Asian populations, so county population estimates are taken from the U.S. Census Bureau instead.

In [13]:
## load the Census county population estimates; the STATE and COUNTY code columns are kept as dtype object
county_pop = pd.read_csv(
    "Data/cc-est2019-alldata.csv",
    encoding='ISO-8859-1',
    dtype={'STATE': object, 'COUNTY': object},
)

In [14]:
## per the code book, YEAR == 12 denotes the 2019 estimate; keep only those rows
county_pop = county_pop[county_pop['YEAR'].eq(12)]

In [15]:
## AGEGRP == 0 is the all-ages total; keep only those rows
county_pop = county_pop[county_pop['AGEGRP'].eq(0)]

In [16]:
## keep the identifier columns, total population, and the male/female counts per race/ethnicity group
county_pop = county_pop.filter(
    [
        'STATE', 'COUNTY', 'STNAME', 'CTYNAME', 'TOT_POP',
        'H_MALE', 'H_FEMALE',
        'WA_MALE', 'WA_FEMALE',
        'BA_MALE', 'BA_FEMALE',
        'IA_MALE', 'IA_FEMALE',
        'AA_MALE', 'AA_FEMALE',
        'NA_MALE', 'NA_FEMALE',
        'TOM_MALE', 'TOM_FEMALE',
    ],
    axis="columns",
)

In [17]:
## view the filtered population data frame
county_pop

Unnamed: 0,STATE,COUNTY,STNAME,CTYNAME,TOT_POP,H_MALE,H_FEMALE,WA_MALE,WA_FEMALE,BA_MALE,BA_FEMALE,IA_MALE,IA_FEMALE,AA_MALE,AA_FEMALE,NA_MALE,NA_FEMALE,TOM_MALE,TOM_FEMALE
209,01,001,Alabama,Autauga County,55869,884,787,20878,21729,5237,6000,121,145,286,370,32,26,538,507
437,01,003,Alabama,Baldwin County,223234,5545,4989,94810,100388,9486,10107,903,839,932,1448,74,80,2042,2125
665,01,005,Alabama,Barbour County,24686,629,488,6389,5745,6311,5595,103,67,55,61,34,18,172,136
893,01,007,Alabama,Bibb County,22394,343,280,8766,8425,2941,1822,53,50,23,25,22,4,124,139
1121,01,009,Alabama,Blount County,57826,2950,2632,27258,28154,516,462,192,178,85,100,42,25,379,435
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715445,56,037,Wyoming,Sweetwater County,42343,3551,3221,20446,19252,347,251,324,302,203,250,35,34,453,446
715673,56,039,Wyoming,Teton County,23464,1884,1670,11567,10718,101,71,106,102,143,252,20,13,205,166
715901,56,041,Wyoming,Uinta County,20226,927,944,9753,9524,77,75,135,157,38,62,17,14,204,170
716129,56,043,Wyoming,Washakie County,7805,565,543,3759,3618,25,19,63,75,25,39,4,2,87,89


In [18]:
##combine state and county fips to make consistent with NYT dataframe, then remove leading zeros
## STATE and COUNTY are string columns, so '+' concatenates them element-wise;
## this replaces a row-by-row apply and a Python loop with vectorized string operations
county_pop['fips'] = (county_pop['STATE'] + county_pop['COUNTY']).str.lstrip('0')

In [19]:
## replace full state names in STNAME with two-letter abbreviations (names missing from the dictionary become None)
county_pop['STNAME'] = county_pop['STNAME'].map(us_state_abbrev.get)

In [20]:
## add one total-population column per group by summing its male and female counts
county_pop = county_pop.assign(
    WH=county_pop['WA_MALE'] + county_pop['WA_FEMALE'],
    BA=county_pop['BA_MALE'] + county_pop['BA_FEMALE'],
    AA=county_pop['AA_MALE'] + county_pop['AA_FEMALE'],
    IA=county_pop['IA_MALE'] + county_pop['IA_FEMALE'],
    HI=county_pop['H_MALE'] + county_pop['H_FEMALE'],
)

In [21]:
## retain only the merge key, the total population, and the per-group totals
county_pop = county_pop.filter(['fips', 'TOT_POP', 'WH', 'BA', 'AA', 'IA', 'HI'], axis="columns")

In [22]:
##view data frame.
county_pop

Unnamed: 0,fips,TOT_POP,WH,BA,AA,IA,HI
209,1001,55869,42607,11237,656,266,1671
437,1003,223234,195198,19593,2380,1742,10534
665,1005,24686,12134,11906,116,170,1117
893,1007,22394,17191,4763,48,103,623
1121,1009,57826,55412,978,185,370,5582
...,...,...,...,...,...,...,...
715445,56037,42343,39698,598,453,626,6772
715673,56039,23464,22285,172,395,208,3554
715901,56041,20226,19277,152,100,292,1871
716129,56043,7805,7377,44,64,138,1108


### Economic indicators

#### County GDP
County level GDP data is obtained from the Bureau of Economic Analysis at the U.S. Department of Commerce. The metric selected is GDP by county and metropolitan area across all industries. This data was last updated on December 12, 2019.
https://apps.bea.gov/iTable/iTable.cfm?reqid=70&step=1&isuri=1&acrdn=5#reqid=70&step=1&isuri=1&acrdn=5

In [23]:
## load the BEA county GDP table
gdp = pd.read_csv(
    "Data/GDP_county - GDP_county.csv",
)

In [24]:
## inspect the last 11 rows of the GDP data frame
gdp.tail(11)

Unnamed: 0,GeoFips,GeoName,2018
3115,56045,"Weston, WY",318545.0
3116,Legend / Footnotes:,,
3117,1/ Gross Domestic Product (GDP) is in thousand...,,
3118,"* Broomfield County, CO, was created from part...",,
3119,* Estimates from 2008 forward separate Skagway...,,
3120,* Virginia combination areas consist of one or...,,
3121,"* Shannon County, SD was renamed to Oglala Lak...",,
3122,"* Kalawao County, Hawaii is combined with Maui...",,
3123,Metropolitan Areas are defined (geographically...,,
3124,(NA) Not available.,,


The tail of the data frame includes irrelevant legend/footnote rows that need to be removed before merging this dataframe with the county-level COVID-19 cases dataframe.

In [25]:
##remove irrelevant rows (the legend/footnote lines at the end of the file)
## Footnote rows are identified by content, not by count: their GeoFips is text, so it
## fails numeric conversion. A fixed tail(10) drop also removed the last real county
## (Weston, WY, fips 56045), because only 9 footnote rows follow it, and it removed
## 10 more rows on every re-run. This filter keeps every county row and is safe to re-run.
gdp = gdp.loc[pd.to_numeric(gdp['GeoFips'], errors='coerce').notna()].copy()

In [26]:
## confirm removal
gdp.tail(11)

Unnamed: 0,GeoFips,GeoName,2018
3105,56025,"Natrona, WY",5672135
3106,56027,"Niobrara, WY",130556
3107,56029,"Park, WY",1460391
3108,56031,"Platte, WY",577915
3109,56033,"Sheridan, WY",1397518
3110,56035,"Sublette, WY",1245979
3111,56037,"Sweetwater, WY",3880016
3112,56039,"Teton, WY",2505534
3113,56041,"Uinta, WY",927537
3114,56043,"Washakie, WY",379984


The column headers need to be renamed to maintain consistency across all data frames that will be merged later on. 

In [27]:
## standardize the column names so they match the other data frames
gdp = gdp.rename(
    columns={
        'GeoFips': 'fips',
        'GeoName': 'county',
        '2018': 'GDP',
    }
)

In [28]:
## retain the fips code, the GDP value, and the county name
gdp = gdp.filter(['fips', 'GDP', 'county'], axis="columns")

In [29]:
## view the renamed and filtered GDP data frame
gdp

Unnamed: 0,fips,GDP,county
0,1001,1690937,"Autauga, AL"
1,1003,6606080,"Baldwin, AL"
2,1005,851956,"Barbour, AL"
3,1007,424510,"Bibb, AL"
4,1009,942904,"Blount, AL"
...,...,...,...
3111,56037,3880016,"Sweetwater, WY"
3112,56039,2505534,"Teton, WY"
3113,56041,927537,"Uinta, WY"
3114,56043,379984,"Washakie, WY"


Unlike the NYT data set, the county and state names are in a single column. This will be remedied by creating a separate column for states. The county column will have the word 'county' added to all observations to maintain consistency with the NYT data set.

In [30]:
## create a separate state column from the text after the comma in 'county' (e.g. "Autauga, AL")
## the split leaves a leading space (" AL"); strip it so the value is a clean two-letter code
gdp['state'] = gdp['county'].str.split(",").str[1].str.strip()

In [31]:
## keep just the part of 'county' that precedes the first comma, stored as str
gdp['county'] = gdp['county'].str.split(",", n=1).str.get(0).astype(str)

Unlike the NYT dataframe, the county column in this dataframe does not have the word county. To remedy this, first check if any of the observations in the NYT dataframe do not have the word county. 

In [32]:
## list the NYT county names that do not contain the word 'County'
county_exclude = county_data.loc[~county_data['county'].str.contains(r'County'), 'county']
county_exclude

56           Anchorage Municipality
57     Fairbanks North Star Borough
58          Juneau City and Borough
59        Ketchikan Gateway Borough
916                      Salem city
Name: county, dtype: object

In [33]:
## append ' County' to every value in the county column
## NOTE(review): no rows are excluded here -- names like those queried above
## (Borough, Municipality, city) would also get ' County' appended; confirm this is intended
gdp['county'] = gdp['county']+' County'  

In [34]:
## retain only the merge key and the GDP value
gdp = gdp.filter(['fips', 'GDP'], axis="columns")

In [35]:
## view data frame; confirm changes
gdp

Unnamed: 0,fips,GDP
0,1001,1690937
1,1003,6606080
2,1005,851956
3,1007,424510
4,1009,942904
...,...,...
3111,56037,3880016
3112,56039,2505534
3113,56041,927537
3114,56043,379984


#### County poverty and median household income
Poverty (poverty estimate for all ages and percent in poverty for all ages) and median household income by county were obtained from the U.S. Census Bureau, Small Area Income and Poverty Estimates (SAIPE) Program (i.e., 2018 Poverty and Median Household Income Estimates). This data was last revised in December 2019.
https://www.census.gov/data/datasets/2018/demo/saipe/2018-state-and-county.html

In [36]:
## import data; read the FIPS code columns as dtype object to preserve their leading zeroes
pov_inc = pd.read_csv(
    "Data/est18all.xlsx - est18ALL.csv",
    skiprows=1,
    dtype={'State FIPS Code': object, 'County FIPS Code': object},
)

In [37]:
##examine shape of data
pov_inc.shape

(3194, 31)

In [38]:
##view random sample of dataframe to understand the distribution of the data
pov_inc.sample(10)

Unnamed: 0,State FIPS Code,County FIPS Code,Postal Code,Name,"Poverty Estimate, All Ages",90% CI Lower Bound,90% CI Upper Bound,"Poverty Percent, All Ages",90% CI Lower Bound.1,90% CI Upper Bound.1,...,90% CI Upper Bound.5,Median Household Income,90% CI Lower Bound.6,90% CI Upper Bound.6,"Poverty Estimate, Age 0-4",90% CI Lower Bound.7,90% CI Upper Bound.7,"Poverty Percent, Age 0-4",90% CI Lower Bound.8,90% CI Upper Bound.8
2652,48,169,TX,Garza County,1096,810,1382,24.6,18.2,31.0,...,33.4,46490,41777,51203,.,.,.,.,.,.
2991,51,750,VA,Radford city,4513,3694,5332,30.4,24.9,35.9,...,21.8,39254,34914,43594,.,.,.,.,.,.
3094,54,105,WV,Wirt County,1086,841,1331,18.7,14.5,22.9,...,33.1,44837,40086,49588,.,.,.,.,.,.
1724,31,83,NE,Harlan County,371,281,461,11.1,8.4,13.8,...,19.1,49321,44114,54528,.,.,.,.,.,.
1981,37,113,NC,Macon County,5700,4701,6699,16.3,13.4,19.2,...,32.8,46426,42181,50671,.,.,.,.,.,.
186,5,141,AR,Van Buren County,2685,2045,3325,16.4,12.5,20.3,...,33.5,40486,37280,43692,.,.,.,.,.,.
1550,29,81,MO,Harrison County,1277,1017,1537,15.5,12.4,18.6,...,26.5,40639,36328,44950,.,.,.,.,.,.
1461,28,69,MS,Kemper County,2427,1858,2996,27.5,21.1,33.9,...,46.0,33437,31167,35707,.,.,.,.,.,.
2938,51,147,VA,Prince Edward County,3782,2807,4757,20.0,14.8,25.2,...,36.4,48450,44195,52705,.,.,.,.,.,.
64,1,125,AL,Tuscaloosa County,33330,28795,37865,16.8,14.5,19.1,...,24.9,52557,49017,56097,.,.,.,.,.,.


In [39]:
## examine column headers
pov_inc.columns

Index(['State FIPS Code', 'County FIPS Code', 'Postal Code', 'Name',
       'Poverty Estimate, All Ages', '90% CI Lower Bound',
       '90% CI Upper Bound', 'Poverty Percent, All Ages',
       '90% CI Lower Bound.1', '90% CI Upper Bound.1',
       'Poverty Estimate, Age 0-17', '90% CI Lower Bound.2',
       '90% CI Upper Bound.2', 'Poverty Percent, Age 0-17',
       '90% CI Lower Bound.3', '90% CI Upper Bound.3',
       'Poverty Estimate, Age 5-17 in Families', '90% CI Lower Bound.4',
       '90% CI Upper Bound.4', 'Poverty Percent, Age 5-17 in Families',
       '90% CI Lower Bound.5', '90% CI Upper Bound.5',
       'Median Household Income', '90% CI Lower Bound.6',
       '90% CI Upper Bound.6', 'Poverty Estimate, Age 0-4',
       '90% CI Lower Bound.7', '90% CI Upper Bound.7',
       'Poverty Percent, Age 0-4', '90% CI Lower Bound.8',
       '90% CI Upper Bound.8'],
      dtype='object')

For the purpose of this analysis, only the 'Poverty Percent,All ages' and the 'Median Household Income' variables will be used.

In [40]:
## retain the identifier columns plus the poverty-percent and median-income measures
pov_inc = pov_inc.filter(
    [
        'State FIPS Code',
        'Name',
        'County FIPS Code',
        'Poverty Percent, All Ages',
        'Median Household Income',
    ],
    axis="columns",
)

The format of the county fips code in this data is shortened when compared to the above data frame. The state fips code and county fips code will be combined below to create a consistent fips code. 

In [41]:
##join state and county fips codes, then remove leading zeros
## both columns hold strings, so '+' concatenates them element-wise; vectorized string
## operations replace the row-by-row apply and the Python loop
## (an all-zero code such as '00' + '000' becomes an empty string after stripping)
pov_inc['fips'] = (pov_inc['State FIPS Code'] + pov_inc['County FIPS Code']).str.lstrip('0')

In [42]:
##view dataframe to confirm creation of new column
pov_inc.sample(10)

Unnamed: 0,State FIPS Code,Name,County FIPS Code,"Poverty Percent, All Ages",Median Household Income,fips
2736,48,Montague County,337,13.8,50542,48337
572,16,Boise County,15,12.8,56837,16015
396,12,Walton County,131,11.4,60973,12131
2066,38,Sargent County,81,7.8,64756,38081
78,2,Haines Borough,100,10.5,58943,2100
1444,28,Forrest County,35,24.0,39061,28035
2534,47,Montgomery County,125,12.0,56102,47125
2786,48,Swisher County,437,19.2,43084,48437
950,20,Jewell County,89,15.0,44271,20089
746,18,Howard County,67,12.3,55365,18067


Since this data frame does not have state names included, the state fips code column will be converted to 
the corresponding two-letter state abbreviation. To do this, a dictionary of state fips codes and state names will be generated from the dataframe above.

In [43]:
## pull out the rows with county fips code '000' (the state-level rows) and
## keep only the state fips code and name columns
is_state_row = pov_inc['County FIPS Code'].eq('000')
fips_conv = pov_inc[is_state_row].filter(['State FIPS Code', 'Name'], axis="columns")
fips_conv

Unnamed: 0,State FIPS Code,Name
0,0,United States
1,1,Alabama
69,2,Alaska
99,4,Arizona
115,5,Arkansas
191,6,California
250,8,Colorado
315,9,Connecticut
324,10,Delaware
328,11,District of Columbia


In [44]:
## index by state fips code and transpose, giving a single 'Name' row with one column per code
fips_conv = fips_conv.set_index("State FIPS Code").transpose()

In [45]:
##view reshaped dataframe
fips_conv

State FIPS Code,00,01,02,04,05,06,08,09,10,11,...,46,47,48,49,50,51,53,54,55,56
Name,United States,Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,District of Columbia,...,South Dakota,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming


In [46]:
##convert to a list of dictionaries (one dictionary per row)
## use the full orient name 'records': the abbreviated form 'r' is deprecated and
## is rejected by newer pandas releases
fips_conv = fips_conv.to_dict('records')

In [47]:
## view list item
fips_conv[0]

{'00': 'United States',
 '01': 'Alabama',
 '02': 'Alaska',
 '04': 'Arizona',
 '05': 'Arkansas',
 '06': 'California',
 '08': 'Colorado',
 '09': 'Connecticut',
 '10': 'Delaware',
 '11': 'District of Columbia',
 '12': 'Florida',
 '13': 'Georgia',
 '15': 'Hawaii',
 '16': 'Idaho',
 '17': 'Illinois',
 '18': 'Indiana',
 '19': 'Iowa',
 '20': 'Kansas',
 '21': 'Kentucky',
 '22': 'Louisiana',
 '23': 'Maine',
 '24': 'Maryland',
 '25': 'Massachusetts',
 '26': 'Michigan',
 '27': 'Minnesota',
 '28': 'Mississippi',
 '29': 'Missouri',
 '30': 'Montana',
 '31': 'Nebraska',
 '32': 'Nevada',
 '33': 'New Hampshire',
 '34': 'New Jersey',
 '35': 'New Mexico',
 '36': 'New York',
 '37': 'North Carolina',
 '38': 'North Dakota',
 '39': 'Ohio',
 '40': 'Oklahoma',
 '41': 'Oregon',
 '42': 'Pennsylvania',
 '44': 'Rhode Island',
 '45': 'South Carolina',
 '46': 'South Dakota',
 '47': 'Tennessee',
 '48': 'Texas',
 '49': 'Utah',
 '50': 'Vermont',
 '51': 'Virginia',
 '53': 'Washington',
 '54': 'West Virginia',
 '55': 'Wis

In [48]:
## take the first (only) record from the list and keep it as a plain code -> name dictionary
fips_conv = {code: name for code, name in fips_conv[0].items()}

In [49]:
## translate each state fips code into the full state name via the lookup dictionary
pov_inc['State FIPS Code'] = pov_inc['State FIPS Code'].map(fips_conv.get)
## display the data frame to confirm the change
pov_inc

Unnamed: 0,State FIPS Code,Name,County FIPS Code,"Poverty Percent, All Ages",Median Household Income,fips
0,United States,United States,000,13.1,61937,
1,Alabama,Alabama,000,16.8,49881,1000
2,Alabama,Autauga County,001,13.8,59338,1001
3,Alabama,Baldwin County,003,9.8,57588,1003
4,Alabama,Barbour County,005,30.9,34382,1005
...,...,...,...,...,...,...
3189,Wyoming,Sweetwater County,037,8.4,73315,56037
3190,Wyoming,Teton County,039,6.3,99087,56039
3191,Wyoming,Uinta County,041,10.0,63401,56041
3192,Wyoming,Washakie County,043,11.9,55190,56043


In [50]:
## turn the full state names into two-letter abbreviations with the same dictionary used for the NYT dataframe
pov_inc['State FIPS Code'] = pov_inc['State FIPS Code'].map(us_state_abbrev.get)
## display the data frame to confirm the change
pov_inc

Unnamed: 0,State FIPS Code,Name,County FIPS Code,"Poverty Percent, All Ages",Median Household Income,fips
0,,United States,000,13.1,61937,
1,AL,Alabama,000,16.8,49881,1000
2,AL,Autauga County,001,13.8,59338,1001
3,AL,Baldwin County,003,9.8,57588,1003
4,AL,Barbour County,005,30.9,34382,1005
...,...,...,...,...,...,...
3189,WY,Sweetwater County,037,8.4,73315,56037
3190,WY,Teton County,039,6.3,99087,56039
3191,WY,Uinta County,041,10.0,63401,56041
3192,WY,Washakie County,043,11.9,55190,56043


In [51]:
## retain only the merge key and the two measures used in the analysis
pov_inc = pov_inc.filter(
    ['fips', 'Poverty Percent, All Ages', 'Median Household Income'],
    axis="columns",
)
 

In [52]:
## shorten the measure names for consistency across data frames
short_names = {
    'Poverty Percent, All Ages': 'pov_perc',
    'Median Household Income': 'Med_inc',
}
pov_inc = pov_inc.rename(columns=short_names)

In [53]:
##view data frame
pov_inc

Unnamed: 0,fips,pov_perc,Med_inc
0,,13.1,61937
1,1000,16.8,49881
2,1001,13.8,59338
3,1003,9.8,57588
4,1005,30.9,34382
...,...,...,...
3189,56037,8.4,73315
3190,56039,6.3,99087
3191,56041,10.0,63401
3192,56043,11.9,55190


#### Unemployment Rate
County level unemployment rate is obtained from the U.S. Bureau of Labor Statistics. The data table below represents 2019 annual average labor force data by county.  
https://www.bls.gov/lau/tables.htm

In [54]:
## import data; read the FIPS code columns as dtype object to preserve their leading zeroes
unemp = pd.read_csv(
    "Data/laucnty19.xlsx - laucnty19-2.csv",
    dtype={'State FIPSCode': object, 'County FIPS Code': object},
)

In [55]:
## view the unemployment data frame
unemp

Unnamed: 0,Code,State FIPSCode,County FIPS Code,County Name/State Abbreviation,Year,Labor Force,Employed,Unemployed,Unemployment Rate
0,CN0100100000000,01,001,"Autauga County, AL",2019,26172,25458,714,2.7
1,CN0100300000000,01,003,"Baldwin County, AL",2019,97328,94675,2653,2.7
2,CN0100500000000,01,005,"Barbour County, AL",2019,8537,8213,324,3.8
3,CN0100700000000,01,007,"Bibb County, AL",2019,8685,8419,266,3.1
4,CN0100900000000,01,009,"Blount County, AL",2019,25331,24655,676,2.7
...,...,...,...,...,...,...,...,...,...
3214,CN7214500000000,72,145,"Vega Baja Municipio, PR",2019,13037,11791,1246,9.6
3215,CN7214700000000,72,147,"Vieques Municipio, PR",2019,2585,2406,179,6.9
3216,CN7214900000000,72,149,"Villalba Municipio, PR",2019,7406,6231,1175,15.9
3217,CN7215100000000,72,151,"Yabucoa Municipio, PR",2019,8691,7552,1139,13.1


In [56]:
## retain the two FIPS code columns and the unemployment rate
unemp = unemp.filter(
    ['State FIPSCode', 'County FIPS Code', 'Unemployment Rate'],
    axis="columns",
)

In [57]:
## view data frame; confirm only the selected columns remain
unemp

Unnamed: 0,State FIPSCode,County FIPS Code,Unemployment Rate
0,01,001,2.7
1,01,003,2.7
2,01,005,3.8
3,01,007,3.1
4,01,009,2.7
...,...,...,...
3214,72,145,9.6
3215,72,147,6.9
3216,72,149,15.9
3217,72,151,13.1


Similar to the previous data frames, the county and state names are combined into a single column, and the state and county FIPS codes are in separate columns. This will be remedied using the same tools as above.

In [58]:
##join state and county fips codes, then remove leading zeros
## both columns hold strings, so '+' concatenates them element-wise; vectorized string
## operations replace the row-by-row apply and the Python loop
unemp['fips'] = (unemp['State FIPSCode'] + unemp['County FIPS Code']).str.lstrip('0')

In [59]:
## retain only the merge key and the unemployment rate
unemp = unemp.filter(['fips', 'Unemployment Rate'], axis="columns")

In [60]:
##view dataframe
unemp 

Unnamed: 0,fips,Unemployment Rate
0,1001,2.7
1,1003,2.7
2,1005,3.8
3,1007,3.1
4,1009,2.7
...,...,...
3214,72145,9.6
3215,72147,6.9
3216,72149,15.9
3217,72151,13.1


### County characteristics: Social, physical, environmental, and clinical factors
The county health rankings and roadmaps program is a collaboration between the Robert Wood Johnson Foundation and the University of Wisconsin Population Health Institute. This program collects data on a range of county-level metrics such as the number of uninsured adults in a county; average traffic volume in a county, and other metrics that are used to assess the overall health of a county. Since this analysis is aimed at exploring the county-level factors that contribute to racial and ethnic disparities in COVID-19 cases, the variables in this vast dataset  can serve as vital features for the statistical learning portion. The variables selected serve as proxies for social-determinants of health that are the root-causes of health disparities.
The variables are: 
Food environment index
Access to exercise opportunities
Income inequality
Violent crime rate


https://www.countyhealthrankings.org/explore-health-rankings/rankings-data-documentation

In [61]:
##import county ranking data; read FIPS code as object to preserve leading zeroes
## The first data row of this file holds variable codes (e.g. 'fipscode'), so the measure
## columns contain both text and numbers. low_memory=False makes pandas infer each column's
## dtype from the whole file instead of chunk by chunk, which avoids the mixed-type warning
## and values that are parsed differently in different parts of the same column.
county_rank = pd.read_csv(
    "Data/analytic_data2020_0.csv",
    dtype={'5-digit FIPS Code': object},
    low_memory=False,
)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [62]:
##shape of data
county_rank.shape

(3195, 786)

In [63]:
## keep the FIPS code plus the county-level measures selected for the analysis
county_rank = county_rank.filter(
    [
        "5-digit FIPS Code",
        "Traffic volume raw value",
        "Severe housing cost burden raw value",
        "Homeownership raw value",
        "Residential segregation - non-White/White raw value",
        "Food environment index raw value",
        "Ratio of population to primary care physicians.",
        "Drinking water violations raw value",
        "Air pollution - particulate matter raw value",
        "Uninsured adults raw value",
        "Access to exercise opportunities raw value",
        "Income inequality raw value",
        "Violent crime raw value",
    ],
    axis="columns",
)

In [64]:
## view the selected county health rankings columns
county_rank

Unnamed: 0,5-digit FIPS Code,Traffic volume raw value,Severe housing cost burden raw value,Homeownership raw value,Residential segregation - non-White/White raw value,Food environment index raw value,Ratio of population to primary care physicians.,Drinking water violations raw value,Air pollution - particulate matter raw value,Uninsured adults raw value,Access to exercise opportunities raw value,Income inequality raw value,Violent crime raw value
0,fipscode,v156_rawvalue,v154_rawvalue,v153_rawvalue,v142_rawvalue,v133_rawvalue,v004_other_data_1,v124_rawvalue,v125_rawvalue,v003_rawvalue,v132_rawvalue,v044_rawvalue,v043_rawvalue
1,00000,,0.1484650658,0.6384759732,46.77346382,7.6,1325.0905296,,8.6,0.1224754716,0.8416869169,4.9200178008,386.46489648
2,01000,166.00847228,0.1265381674,0.6855046233,50.777775905,5.8,1542.6414557,0.1492537313,11,0.1408970991,0.6111228737,5.2611357399,479.91918191
3,01001,88.457040416,0.1340348117,0.7489462467,23.628395199,7.2,2220.16,0,11.7,0.1114898915,0.6913012406,5.2345972691,272.28222006
4,01003,86.997429882,0.1172580795,0.7361934319,31.825343231,8,1371.7935484,0,10.3,0.1434852477,0.7371354895,4.4177666786,203.66039629
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3190,56037,154.755,0.083856,0.753702,25.3527,7.7,2720.88,1,5.1,0.169401,0.896863,3.97592,300.494
3191,56039,135.189,0.107691,0.577309,29.0447,8.2,861.667,1,4.9,0.175177,0.997182,3.77527,
3192,56041,96.1916,0.0770767,0.752683,11.5779,7.4,2277.22,1,5.9,0.165985,0.840184,4.35006,71.0065
3193,56043,82.8222,0.0681678,0.768264,10.8665,8.3,2016,0,4.8,0.193353,0.831829,3.32746,78.2661


In [65]:
## drop the first row, which holds variable codes (e.g. 'fipscode') rather than data
county_rank = county_rank.drop(index=county_rank.index[0])

In [66]:
##reset index so row labels start at 0 again after the first row was removed; the old index is discarded
county_rank=county_rank.reset_index(drop=True)

In [67]:
## give the selected measures short, consistent column names
## NOTE(review): 'Uninsured adults raw value' is not part of this mapping and therefore
## keeps its original name -- confirm whether that is intended
rank_column_names = {
    '5-digit FIPS Code': 'fips',
    'Traffic volume raw value': 'traffic_vol',
    'Severe housing cost burden raw value': 'house_burden',
    'Homeownership raw value': 'ownership',
    'Residential segregation - non-White/White raw value': 'residential_seg',
    'Food environment index raw value': 'food_index',
    'Ratio of population to primary care physicians.': 'pop_to_phys',
    'Drinking water violations raw value': 'water_violation',
    'Air pollution - particulate matter raw value': 'air_pollution',
    'Access to exercise opportunities raw value': 'exercise_opp',
    'Income inequality raw value': 'income_ineq',
    'Violent crime raw value': 'crime_violent',
}
county_rank = county_rank.rename(columns=rank_column_names)

In [68]:
##remove leading zeros from the fips column
## vectorized string accessor: same result as stripping each value in a Python loop,
## but missing values stay missing instead of raising AttributeError
## (an all-zero code such as '00000' becomes an empty string)
county_rank['fips'] = county_rank['fips'].str.lstrip('0')

In [69]:
##view dataframe
county_rank

Unnamed: 0,fips,traffic_vol,house_burden,ownership,residential_seg,food_index,pop_to_phys,water_violation,air_pollution,Uninsured adults raw value,exercise_opp,income_ineq,crime_violent
0,,,0.1484650658,0.6384759732,46.77346382,7.6,1325.0905296,,8.6,0.1224754716,0.8416869169,4.9200178008,386.46489648
1,1000,166.00847228,0.1265381674,0.6855046233,50.777775905,5.8,1542.6414557,0.1492537313,11,0.1408970991,0.6111228737,5.2611357399,479.91918191
2,1001,88.457040416,0.1340348117,0.7489462467,23.628395199,7.2,2220.16,0,11.7,0.1114898915,0.6913012406,5.2345972691,272.28222006
3,1003,86.997429882,0.1172580795,0.7361934319,31.825343231,8,1371.7935484,0,10.3,0.1434852477,0.7371354895,4.4177666786,203.66039629
4,1005,102.29176221,0.1405954631,0.6139777923,23.449712509,5.6,3158.75,0,11.5,0.1608583414,0.5316676986,5.6814100186,414.27786068
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3189,56037,154.755,0.083856,0.753702,25.3527,7.7,2720.88,1,5.1,0.169401,0.896863,3.97592,300.494
3190,56039,135.189,0.107691,0.577309,29.0447,8.2,861.667,1,4.9,0.175177,0.997182,3.77527,
3191,56041,96.1916,0.0770767,0.752683,11.5779,7.4,2277.22,1,5.9,0.165985,0.840184,4.35006,71.0065
3192,56043,82.8222,0.0681678,0.768264,10.8665,8.3,2016,0,4.8,0.193353,0.831829,3.32746,78.2661


## Merging
Now that the data frames have been cleaned and column headers (specifically for fips and county) have been standardized, the data frames will be combined into one, with the NYT data set serving as the main/basis data frame.

####  NYT + Population (master)

In [70]:
## build the master dataframe df: left-join the population data onto the NYT dataframe by fips
## indicator=True adds a '_merge' column recording where each row was matched
df = county_data.merge(county_pop, how='left', on='fips', indicator=True)

In [71]:
## show NYT rows that found no match in the population data (left-only), without the '_merge' column
df[df['_merge'] == "left_only"].drop("_merge", axis="columns")

Unnamed: 0,fips,state,county,cases,white_cases,black_cases,hispanic_cases,native_cases,asian_cases,TOT_POP,WH,BA,AA,IA,HI


All observations in the NYT COVID-19 dataframe have been merged with their corresponding population data in the county population dataframe.

In [72]:
## remove the '_merge' indicator column now that the join has been checked
df = df.drop("_merge", axis="columns")

In [73]:
##view dataframe
df.sample(10)

Unnamed: 0,fips,state,county,cases,white_cases,black_cases,hispanic_cases,native_cases,asian_cases,TOT_POP,WH,BA,AA,IA,HI
848,45063,SC,Lexington County,558,243,98,11,0,0,298750,237431,46912,6602,1593,18794
459,27009,MN,Benton County,158,56,21,0,0,0,40889,37135,2124,468,261,1125
472,27053,MN,Hennepin County,7793,2297,2452,1188,63,216,1265843,939500,175102,95308,14043,88473
317,19113,IA,Linn County,928,536,220,18,0,16,226706,199775,13893,5837,635,7675
352,20111,KS,Lyon County,391,101,0,226,0,0,33195,29936,845,887,476,7231
651,37015,NC,Bertie County,103,0,37,0,0,0,18947,6850,11575,145,137,446
839,45041,SC,Florence County,555,158,315,0,0,0,138293,73681,59687,2227,602,3889
441,26117,MI,Montcalm County,49,17,0,0,0,0,63888,60538,1594,272,399,2314
587,34029,NJ,Ocean County,8359,3969,255,648,0,103,607186,563128,21851,11838,1623,57818
42,1105,AL,Perry County,10,0,10,0,0,0,8923,2698,6063,56,29,144


### Master + GDP

In [74]:
## left-join the GDP data onto the master dataframe by fips
## indicator=True adds the _merge column used below to find rows without a GDP match
## validate="many_to_one" raises a MergeError if a fips code appears more than once in gdp,
## so the join can never silently duplicate rows of the master dataframe
df = pd.merge(
    left=df,
    right=gdp,
    how="left",
    on=["fips"],
    indicator=True,
    validate="many_to_one",
)

In [75]:
##show the master rows that found no GDP match
df[df["_merge"] == "left_only"]

Unnamed: 0,fips,state,county,cases,white_cases,black_cases,hispanic_cases,native_cases,asian_cases,TOT_POP,WH,BA,AA,IA,HI,GDP,_merge
873,51003,VA,Albemarle County,166,76,10,0,0,0,109330,89388,10600,6051,383,6313,,left_only
875,51015,VA,Augusta County,80,53,0,0,0,0,75558,70102,3518,490,191,2374,,left_only
879,51035,VA,Carroll County,70,38,0,11,0,0,29791,28964,263,68,98,1147,,left_only
882,51059,VA,Fairfax County,10209,1308,569,5073,0,722,1147532,742178,121954,231085,6108,189515,,left_only
886,51069,VA,Frederick County,292,121,0,47,0,0,89313,80867,4225,1573,431,8325,,left_only
888,51081,VA,Greensville County,54,0,10,0,0,0,11336,4281,6823,79,32,316,,left_only
891,51089,VA,Henry County,80,11,5,0,0,0,50557,37625,11434,254,180,2926,,left_only
892,51095,VA,James City County,179,113,0,0,0,0,76523,61473,10433,2023,336,4692,,left_only
897,51121,VA,Montgomery County,51,11,0,0,0,0,98535,85207,4238,6245,307,3424,,left_only
901,51143,VA,Pittsylvania County,28,6,0,0,0,0,60354,45990,12970,303,199,1663,,left_only


Nineteen counties in VA are missing from the GDP data. Let's look these counties up in the GDP dataframe by fips code to see whether this is a naming issue or whether these observations are simply not present in the GDP dataframe.

In [76]:
## keep only the rows flagged left_only, i.e. master rows with no GDP match
merge_fail_gdp = df[df["_merge"] == "left_only"]

In [77]:
##collect the fips codes of the rows without a GDP match as a plain list
##the list is built straight from df rather than from the previous value of merge_fail_gdp:
##this cell rebinds merge_fail_gdp from a dataframe to a list, so indexing the old value with
##["fips"] would fail if the cell were run a second time
merge_fail_gdp = df.loc[df["_merge"] == "left_only", "fips"].tolist()
merge_fail_gdp

['51003',
 '51015',
 '51035',
 '51059',
 '51069',
 '51081',
 '51089',
 '51095',
 '51121',
 '51143',
 '51149',
 '51153',
 '51161',
 '51165',
 '51175',
 '51177',
 '51191',
 '51199',
 '51775']

In [78]:
##look up every unmatched fips code in the GDP data; an empty result means none of them are present
gdp.loc[gdp["fips"].isin(merge_fail_gdp)]

Unnamed: 0,fips,GDP


Looks like these observations are not in the GDP dataframe. Let's do a spot check on a random sample of the fips codes to make sure.

In [79]:
##fips codes that did not merge, displayed again as the starting point for the spot check
merge_fail_gdp

['51003',
 '51015',
 '51035',
 '51059',
 '51069',
 '51081',
 '51089',
 '51095',
 '51121',
 '51143',
 '51149',
 '51153',
 '51161',
 '51165',
 '51175',
 '51177',
 '51191',
 '51199',
 '51775']

In [80]:
##create a list holding a random sample of (up to) 5 of the fips codes that failed to merge
##random_state is fixed so the same codes are drawn on every run and the spot check is reproducible
##n is capped at the list length so the cell does not error when fewer than 5 codes failed
merge_fail_samp = (
    pd.Series(merge_fail_gdp, dtype=object)
    .sample(n=min(5, len(merge_fail_gdp)), random_state=0)
    .tolist()
)

In [81]:
##view the random sample of fips codes drawn in the previous cell
merge_fail_samp

['51165', '51177', '51143', '51175', '51149']

In [82]:
##look up the sampled fips codes in the gdp dataframe; an empty result confirms they are absent
gdp[gdp["fips"].isin(merge_fail_samp)]

Unnamed: 0,fips,GDP


Since these fips codes are not in the GDP data, the _merge column will be dropped and the remaining dataframes will be merged. GDP stays missing for these counties.

In [83]:
##remove the GDP merge flag before the next join
df = df.drop(columns=["_merge"])

### Master + Poverty & Median Household Income

In [84]:
## left-join the poverty and median household income data onto the master dataframe by fips
## indicator=True adds the _merge column used below to find rows without a match
## validate="many_to_one" raises a MergeError if a fips code appears more than once in pov_inc,
## so the join can never silently duplicate rows of the master dataframe
df = pd.merge(
    left=df,
    right=pov_inc,
    how="left",
    on=["fips"],
    indicator=True,
    validate="many_to_one",
)

In [85]:
##show the master rows that found no poverty/income match
df[df["_merge"] == "left_only"]

Unnamed: 0,fips,state,county,cases,white_cases,black_cases,hispanic_cases,native_cases,asian_cases,TOT_POP,WH,BA,AA,IA,HI,GDP,pov_perc,Med_inc,_merge


Merge was successful.

In [86]:
##remove the poverty/income merge flag before the next join
df = df.drop(columns=["_merge"])

### Master + Unemployment Rate

In [87]:
## left-join the unemployment data onto the master dataframe by fips
## indicator=True adds the _merge column used below to find rows without a match
## validate="many_to_one" raises a MergeError if a fips code appears more than once in unemp,
## so the join can never silently duplicate rows of the master dataframe
df = pd.merge(
    left=df,
    right=unemp,
    how="left",
    on=["fips"],
    indicator=True,
    validate="many_to_one",
)

In [88]:
##show the master rows that found no unemployment match
df[df["_merge"] == "left_only"]

Unnamed: 0,fips,state,county,cases,white_cases,black_cases,hispanic_cases,native_cases,asian_cases,TOT_POP,WH,BA,AA,IA,HI,GDP,pov_perc,Med_inc,Unemployment Rate,_merge


Merge was successful.

In [89]:
##remove the unemployment merge flag before the next join
df = df.drop(columns=["_merge"])

### Master + County Rank Metrics

In [90]:
## left-join the county rankings data onto the master dataframe by fips
## indicator=True adds the _merge column used below to find rows without a match
## validate="many_to_one" raises a MergeError if a fips code appears more than once in county_rank,
## so the join can never silently duplicate rows of the master dataframe
df = pd.merge(
    left=df,
    right=county_rank,
    how="left",
    on=["fips"],
    indicator=True,
    validate="many_to_one",
)

In [91]:
##show the master rows that found no county rankings match
df[df["_merge"] == "left_only"]

Unnamed: 0,fips,state,county,cases,white_cases,black_cases,hispanic_cases,native_cases,asian_cases,TOT_POP,...,residential_seg,food_index,pop_to_phys,water_violation,air_pollution,Uninsured adults raw value,exercise_opp,income_ineq,crime_violent,_merge


Merge was successful.

In [92]:
##remove the county rankings merge flag; the master dataframe is now complete
df = df.drop(columns=["_merge"])

In [93]:
## preview 20 randomly chosen rows of the final dataframe
df.sample(n=20)

Unnamed: 0,fips,state,county,cases,white_cases,black_cases,hispanic_cases,native_cases,asian_cases,TOT_POP,...,ownership,residential_seg,food_index,pop_to_phys,water_violation,air_pollution,Uninsured adults raw value,exercise_opp,income_ineq,crime_violent
155,12077,FL,Liberty County,191,98,81,0,0,0,8354,...,0.7530449391,32.004902272,6.8,,0,9.3,0.1755190078,0.195218171,4.3771788487,53.683268715
189,13059,GA,Clarke County,110,16,5,5,0,0,128331,...,0.3900559038,24.753940793,5.0,1530.8915663,0,11.0,0.1877373515,0.8226005449,6.3518829982,404.78640757
33,1085,AL,Lowndes County,144,0,105,0,0,0,9726,...,0.7366028708,25.472077125,4.6,10076.0,0,10.9,0.1381935047,0.0486768741,6.4781650131,671.01922379
427,26067,MI,Ionia County,131,62,0,12,0,0,64697,...,0.77089,35.6111,8.4,3383.74,1,10.6,0.0699978,0.745779,3.84236,246.525
166,12097,FL,Osceola County,567,106,28,325,0,0,375751,...,0.611145848,20.65131322,8.0,2445.6944444,1,8.1,0.1998102834,0.7792768484,4.0502995561,425.63477221
660,37045,NC,Cleveland County,62,27,5,0,0,0,97947,...,0.675126,26.9024,7.0,1908.51,1,10.6,0.150142,0.553457,4.96936,188.185
788,42007,PA,Beaver County,565,51,0,0,0,0,163929,...,0.732621,57.4928,7.5,2340.0,1,11.0,0.0565427,0.843531,4.39826,265.152
106,8029,CO,Delta County,27,22,0,0,0,0,31162,...,0.7104022892,30.438304533,8.0,1273.6666667,1,5.3,0.1269114096,0.7325213233,4.5823898046,84.296945152
780,41051,OR,Multnomah County,1056,451,43,272,0,59,812855,...,0.541734,25.3405,7.9,686.113,0,8.4,0.0977241,0.974595,4.96963,474.461
421,26049,MI,Genesee County,1964,660,662,20,0,0,405813,...,0.69506,60.3895,6.6,1157.34,1,10.2,0.0815784,0.857193,4.78067,653.199


### Export complete dataframe to csv file
The data collection and high-level cleaning process is complete. The merged dataframe will be exported as a csv file to be used for visualizations and modeling. 

In [94]:
##write the merged dataframe to disk as csv, leaving out the row index
df.to_csv(path_or_buf="Data/COVID_project_PPOL564.csv", index=False)

**Since there will be multiple modeling attempts with different features, a second csv file with 15 additional variables from the county rankings and roadmaps data set will be created.**

In [95]:
##import county ranking data again for the second feature set (this replaces the earlier county_rank)
##read FIPS code as object to maintain leading and trailing zeroes
##low_memory=False makes pandas infer each column's type from the whole file instead of chunk by chunk,
##so no column ends up holding a mix of strings and numbers
##(presumably the source of the warning this cell printed with the default setting - confirm on re-run)
county_rank = pd.read_csv(
    "Data/analytic_data2020_0.csv",
    dtype={'5-digit FIPS Code': object},
    low_memory=False,
)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [96]:
## keep the FIPS code plus the 15 additional variables, one column name per line
county_rank = county_rank.filter(
    items=[
        "5-digit FIPS Code",
        "High school graduation raw value",
        "Some college raw value",
        "Social associations raw value",
        "Percentage of households with high housing costs",
        "Percentage of households with overcrowding",
        "Percentage of households with lack of kitchen or plumbing facilities",
        "Long commute - driving alone raw value",
        "Life expectancy raw value",
        "Premature age-adjusted mortality raw value",
        "Frequent physical distress raw value",
        "Frequent mental distress raw value",
        "Diabetes prevalence raw value",
        "Food insecurity raw value",
        "Limited access to healthy foods raw value",
        "% Rural raw value",
    ]
)

In [97]:
## drop the first row and keep everything after it
## (presumably that row holds variable codes rather than county data - confirm against the csv)
county_rank = county_rank.iloc[1:, :]

In [98]:
##reset index so row labels start at 0 again after the first row was removed; drop=True discards the old labels
county_rank=county_rank.reset_index(drop=True)

In [99]:
##give every selected column a short snake_case name; the FIPS column becomes "fips" to match the master dataframe
county_rank = county_rank.rename(
    columns={
        "5-digit FIPS Code": "fips",
        "High school graduation raw value": "high_sch_grad",
        "Some college raw value": "some_college",
        "Social associations raw value": "soc_association",
        "Percentage of households with high housing costs": "perc_high_cost",
        "Percentage of households with overcrowding": "perc_overcrowding",
        "Percentage of households with lack of kitchen or plumbing facilities": "perc_plumb",
        "Long commute - driving alone raw value": "lng_commute",
        "Life expectancy raw value": "life_exp",
        "Premature age-adjusted mortality raw value": "pre_mortality",
        "Frequent physical distress raw value": "phys_distress",
        "Frequent mental distress raw value": "ment_distress",
        "Diabetes prevalence raw value": "diab_prevalence",
        "Food insecurity raw value": "food_insecure",
        "Limited access to healthy foods raw value": "food_access",
        "% Rural raw value": "perc_rural",
    }
)

In [100]:
##remove leading zeros from the fips column so the codes match the fips format of the master dataframe
##the vectorized .str accessor leaves missing values as missing, whereas calling .lstrip on each
##element in a Python loop raises AttributeError on the first missing (non-string) value
county_rank['fips'] = county_rank['fips'].str.lstrip('0')

### Master + County Rank Metrics (additional 15 variables added)

In [101]:
## left-join the 15 additional county rankings variables onto the master dataframe by fips
## indicator=True adds the _merge column used below to find rows without a match
## validate="many_to_one" raises a MergeError if a fips code appears more than once in county_rank,
## so the join can never silently duplicate rows of the master dataframe
df = pd.merge(
    left=df,
    right=county_rank,
    how="left",
    on=["fips"],
    indicator=True,
    validate="many_to_one",
)

In [102]:
##show the master rows that found no match among the additional county rankings variables
df[df["_merge"] == "left_only"]

Unnamed: 0,fips,state,county,cases,white_cases,black_cases,hispanic_cases,native_cases,asian_cases,TOT_POP,...,lng_commute,life_exp,pre_mortality,phys_distress,ment_distress,diab_prevalence,food_insecure,food_access,perc_rural,_merge


Merge was successful.

In [103]:
##remove the merge flag so it is not written to the exported file
df = df.drop(columns=["_merge"])

In [104]:
## preview 20 randomly chosen rows of the extended dataframe
df.sample(n=20)

Unnamed: 0,fips,state,county,cases,white_cases,black_cases,hispanic_cases,native_cases,asian_cases,TOT_POP,...,perc_plumb,lng_commute,life_exp,pre_mortality,phys_distress,ment_distress,diab_prevalence,food_insecure,food_access,perc_rural
756,39129,OH,Pickaway County,788,366,252,0,0,0,58457,...,0.00443286,0.491,76.0878,451.442,0.117105,0.125693,0.138,0.122,0.0200877,0.498653
781,41053,OR,Polk County,89,55,0,0,0,0,86085,...,0.0177546,0.342,80.5167,291.442,0.11262,0.134534,0.117,0.122,0.100949,0.199263
647,36123,NY,Yates County,34,11,0,0,0,0,24913,...,0.00944386,0.3,78.2102,321.068,0.109131,0.127513,0.143,0.099,0.00500011,0.712246
67,4019,AZ,Pima County,2044,769,67,788,57,5,1047279,...,0.0111408989,0.349,79.566414097,348.2490477,0.1168049064,0.124380782,0.084,0.136,0.0768111718,0.0752349114
772,41003,OR,Benton County,40,5,0,0,0,0,93053,...,0.0178389,0.187,82.7978,206.214,0.117529,0.135543,0.055,0.146,0.0707513,0.187639
518,28045,MS,Hancock County,62,21,0,0,0,0,47632,...,0.0085197,0.456,76.8785,425.361,0.127614,0.133031,0.126,0.139,0.148234,0.425596
648,37001,NC,Alamance County,283,94,43,65,0,0,169509,...,0.00878253,0.328,77.4638,402.177,0.127895,0.129919,0.126,0.144,0.100207,0.28558
919,53007,WA,Chelan County,179,21,0,92,0,0,77200,...,0.0126838,0.161,81.0909,259.442,0.11525,0.12823,0.065,0.095,0.060438,0.272245
560,28143,MS,Tunica County,38,0,7,0,0,0,9632,...,0.005,0.35,70.5428,690.711,0.163832,0.155601,0.107,0.28,0.0963186,0.660234
185,13037,GA,Calhoun County,45,0,5,0,0,0,6189,...,0.0021621622,0.366,76.936390509,516.41626672,0.1508897936,0.1481468019,0.188,0.224,0.0569390151,1.0


In [105]:
##write the extended dataframe to its own csv file, leaving out the row index
df.to_csv(path_or_buf="Data/COVID_project_PPOL564_v2.csv", index=False)