In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

url_corona_case = 'https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv'

corona_case = pd.read_csv(url_corona_case)
corona_case.head(6)

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061.0,1,0
1,2020-01-22,Snohomish,Washington,53061.0,1,0
2,2020-01-23,Snohomish,Washington,53061.0,1,0
3,2020-01-24,Cook,Illinois,17031.0,1,0
4,2020-01-24,Snohomish,Washington,53061.0,1,0
5,2020-01-25,Orange,California,6059.0,1,0


## b. US Zipcodes to County State to FIPS Crosswalk

Kaggle data source: https://www.kaggle.com/danofer/zipcodes-county-fips-crosswalk

In [7]:
code_mapping = pd.read_csv('ZIP-COUNTY-FIPS_2017-06.csv')
code_mapping.head(6)

Unnamed: 0,ZIP,COUNTYNAME,STATE,STCOUNTYFP,CLASSFP
0,36003,Autauga County,AL,1001,H1
1,36006,Autauga County,AL,1001,H1
2,36067,Autauga County,AL,1001,H1
3,36066,Autauga County,AL,1001,H1
4,36703,Autauga County,AL,1001,H1
5,36701,Autauga County,AL,1001,H1


## c. US Population by County

United State Census Bureau: https://www.census.gov/data/datasets/time-series/demo/popest/2010s-counties-total.html#content  

In [8]:
us_pop = pd.read_csv('co-est2019-alldata.csv')
us_pop.head(6)


Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RDOMESTICMIG2019,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015,RNETMIG2016,RNETMIG2017,RNETMIG2018,RNETMIG2019
0,40,3,6,1,0,Alabama,Alabama,4779736,4780125,4785437,...,1.917501,0.578434,1.186314,1.522549,0.563489,0.626357,0.745172,1.090366,1.773786,2.483744
1,50,3,6,1,1,Alabama,Autauga County,54571,54597,54773,...,4.84731,6.018182,-6.226119,-3.902226,1.970443,-1.712875,4.777171,0.849656,0.540916,4.560062
2,50,3,6,1,3,Alabama,Baldwin County,182265,182265,183112,...,24.017829,16.64187,17.488579,22.751474,20.184334,17.725964,21.279291,22.398256,24.727215,24.380567
3,50,3,6,1,5,Alabama,Barbour County,27457,27455,27327,...,-5.690302,0.292676,-6.897817,-8.132185,-5.140431,-15.724575,-18.238016,-24.998528,-8.754922,-5.165664
4,50,3,6,1,7,Alabama,Bibb County,22915,22915,22870,...,1.385134,-4.998356,-3.787545,-5.797999,1.331144,1.329817,-0.708717,-3.234669,-6.857092,1.831952
5,50,3,6,1,9,Alabama,Blount County,57322,57322,57376,...,1.020788,0.208812,-1.650165,-0.347225,-2.04959,-1.338525,-1.391062,6.193562,-0.069229,1.124597


# 2. Data Manipulation

In [9]:
# a. Subset corona_case
corona_case = corona_case[['date', 'county', 'state', 'cases', 'deaths']]

In [10]:
# b. Convert state to abbrev for corona_case

def state_to_abbrev(state):
    us_state_abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'American Samoa': 'AS',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'District of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Guam': 'GU',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Northern Mariana Islands':'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY'
    }
    assert state in us_state_abbrev.keys()
    return us_state_abbrev[state]

In [11]:
n_row = len(corona_case['state'] )
for i in range(0, n_row):
    corona_case['state'][i] = state_to_abbrev(corona_case['state'][i])

AssertionError: 

In [12]:
corona_case['state'].unique()

array(['WA', 'IL', 'CA', 'AZ', 'MA', 'WI', 'TX', 'NE', 'UT', 'OR', 'FL',
       'NY', 'RI', 'GA', 'NH', 'NC', 'NJ', 'CO', 'MD', 'NV', 'TN', 'HI',
       'IN', 'KY', 'MN', 'OK', 'PA', 'SC', 'DC', 'KS', 'MO', 'VT', 'VA',
       'CT', 'IA', 'LA', 'OH', 'MI', 'SD', 'AR', 'DE', 'MS', 'NM', 'ND',
       'WY', 'AK', 'ME', 'AL', 'ID', 'MT', 'PR', 'VI', 'GU', 'WV', 'MP'],
      dtype=object)

In [13]:
corona_case.head()

Unnamed: 0,date,county,state,cases,deaths
0,2020-01-21,Snohomish,WA,1,0
1,2020-01-22,Snohomish,WA,1,0
2,2020-01-23,Snohomish,WA,1,0
3,2020-01-24,Cook,IL,1,0
4,2020-01-24,Snohomish,WA,1,0


In [14]:
# c. Remove State total
us_pop = us_pop[us_pop['COUNTY']!=0]

In [15]:
# d. Select 2019 estimated population
us_pop = us_pop[['STNAME', 'CTYNAME', 'POPESTIMATE2019']]
us_pop.reset_index(inplace=True)

In [16]:
def cleanse_county(county):
    return county.replace('County', '')

In [17]:
n_row = len(us_pop)
for i in range(0, n_row):
    us_pop['CTYNAME'][i] = cleanse_county(us_pop['CTYNAME'][i]).strip()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [18]:
# e. Convert state to abbrev for us_pop

for i in range(0, n_row):
    us_pop['STNAME'][i] = state_to_abbrev(us_pop['STNAME'][i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [19]:
# f. Merge case and population

corona_case_pop = pd.merge(corona_case, us_pop,  how='left', left_on=['county','state'], right_on = ['CTYNAME','STNAME'])

corona_case_pop = corona_case_pop[['date', 'county', 'state',  'POPESTIMATE2019', 'cases', 'deaths']]

corona_case_pop.head()

Unnamed: 0,date,county,state,POPESTIMATE2019,cases,deaths
0,2020-01-21,Snohomish,WA,822083.0,1,0
1,2020-01-22,Snohomish,WA,822083.0,1,0
2,2020-01-23,Snohomish,WA,822083.0,1,0
3,2020-01-24,Cook,IL,5150233.0,1,0
4,2020-01-24,Snohomish,WA,822083.0,1,0


In [20]:
# g. Summarise corona_case_pop and select top 20 Counties

grouped_corona = corona_case_pop.groupby(['county'])

In [21]:
top_20 = grouped_corona.sum().sort_values(by=['cases', 'deaths'], ascending=False).head(20)
print(top_20)

# It appears that Bronx, kings, New York County should be combined into New York City before joining with corona case





               POPESTIMATE2019    cases  deaths
county                                         
New York City              0.0  2014472  109132
Suffolk            127002496.0   473948    9813
Nassau              66052129.0   441014   16168
Westchester         45472782.0   399504    8448
Cook               448432941.0   266404    8388
Wayne               88994859.0   214053   12022
Middlesex          112617424.0   201428    6353
Essex               64845392.0   186422    9065
Bergen              43813494.0   178635    7925
Los Angeles        853324095.0   171350    5496
Unknown                    0.0   156424    8355
Rockland            14660505.0   145201    2761
Orange             350310096.0   138190    2475
Jefferson          118300991.0   132931    5114
Miami-Dade         108677600.0   126608    1805
Hudson              28240422.0   126523    3868
Union               38880369.0   115225    3495
Philadelphia        64946624.0   108322    2579
Orleans              1819503.0   107033 