In [1]:
# import following packages first
%matplotlib inline
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)

In [2]:
%pwd

'/Users/meghnadiwan/Downloads/Hackathon/covid-hackathon'

### Import John Hopkin's Dataset

In [3]:
from os import listdir
from os.path import join

# Source path for John Hopkin's dauly reports
source_path = '/Users/meghnadiwan/Downloads/Hackathon/COVID-19-master/csse_covid_19_data/csse_covid_19_daily_reports/'
files = [f for f in listdir(source_path) if f.endswith('.csv')]

In [4]:
df = pd.concat([pd.read_csv(join(source_path, f_name)) for f_name in files], sort = False)
#subset to US only
df = df[df['Country/Region'] == "US"]
df.shape

(1560, 17)

In [5]:
# keep on columns needed
cols = ['Province/State', 'Country/Region', 'Last Update', 'Confirmed',
       'Deaths', 'Recovered','Latitude', 'Longitude']
df = df[cols]

In [6]:
# convert date to datetime
df['Last Update'] = pd.to_datetime(df['Last Update'])

In [7]:
# remove (From Diamond Princess)
df['Province/State'] = df['Province/State'].str.replace("(From Diamond Princess)", "")
df['Province/State'] = df['Province/State'].str.strip(" ()")

In [8]:
#Split Province/State into Province and State
split = df['Province/State'].str.split(',', expand=True)
df['Province'] = split[0].str.strip()
df['State'] = split[1].str.strip()

In [9]:
# Replace missing State values using dictionary below
st_dict = {'Unassigned Location': 'Unassigned Location', 'U.S.': 'Unassigned Location',
              'Washington':'WA', 'New York':'NY', 'California':'CA',
              'Massachusetts':'MA','Georgia':'GA','Colorado':'CO', 
              'Florida':'FL', 'New Jersey':'NJ', 'Oregon':'OR', 
              'Texas':'TX', 'Illinois':'IL', 'Pennsylvania':'PA', 
              'Iowa':'IA', 'Maryland':'MD', 'North Carolina':'NC',
              'South Carolina':'SC', 'Tennessee':'TN', 'Virginia':'VA', 
              'Arizona':'AZ', 'Indiana':'IN','Kentucky':'KY', 'D.C.':'DC',
              'District of Columbia':'DC', 'Nevada':'NV', 'New Hampshire':'NH',
              'Minnesota':'MN', 'Nebraska':'NE', 'Ohio':'OH', 'Rhode Island':'RI', 
              'Wisconsin':'WI','Connecticut':'CT', 'Hawaii':'HI', 'Oklahoma':'OK', 
              'Utah':'UT', 'Kansas':'KS', 'Louisiana':'LA','Missouri':'MO', 
              'Vermont':'VT', 'Alaska':'AK', 'Arkansas':'AR', 'Delaware':'DE', 
              'Idaho':'ID','Maine':'ME', 'Michigan':'MI', 'Mississippi':'MS', 
              'Montana':'MT', 'New Mexico':'NM','North Dakota':'ND', 'South Dakota':'SD', 
              'West Virginia':'WV', 'Wyoming':'WY','Alabama':'AL', 'Puerto Rico':'PR', 
              'Guam': 'GU','Virgin Islands':'VI', 'United States Virgin Islands': 'VI', 'Chicago':'IL'}
df['State'] = np.where(df['State'] == 'U.S.', 'Unassigned Location', df['State'])
df['State'] = np.where(df['State'] == 'D.C.', 'DC', df['State'])
df['State'] = df['State'].fillna(df.Province.map(st_dict))

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1560 entries, 36 to 254
Data columns (total 10 columns):
Province/State    1560 non-null object
Country/Region    1560 non-null object
Last Update       1560 non-null datetime64[ns]
Confirmed         1560 non-null float64
Deaths            1530 non-null float64
Recovered         1530 non-null float64
Latitude          1186 non-null float64
Longitude         1186 non-null float64
Province          1560 non-null object
State             1529 non-null object
dtypes: datetime64[ns](1), float64(5), object(4)
memory usage: 134.1+ KB


In [11]:
df.State.unique()

array(['Unassigned Location', 'IL', 'CA', 'MA', 'WI', 'TX', 'WA', 'AZ',
       'NE', 'NY', nan, 'GA', 'CO', 'FL', 'NJ', 'OR', 'PA', 'IA', 'MD',
       'NC', 'SC', 'TN', 'VA', 'IN', 'KY', 'DC', 'NV', 'NH', 'MN', 'OH',
       'RI', 'CT', 'HI', 'OK', 'UT', 'KS', 'LA', 'MO', 'VT', 'AK', 'AR',
       'DE', 'ID', 'ME', 'MI', 'MS', 'MT', 'NM', 'ND', 'SD', 'WV', 'WY',
       'AL', 'PR', 'GU', 'VI'], dtype=object)

### Create State Health Measure Columns 

In [12]:
df['Free Treatment'] = 0
df['Early RX Refills'] = 0
df['SEP'] = 0
df['Waiver 1135'] = 0
df['Paid Sick Leave'] = 0

In [13]:
# No cost for treatment
df['Free Treatment'] = np.where((df['Last Update'] >= '2020-03-12') & (df.State == 'D.C.'), 1, df['Free Treatment'])
df['Free Treatment'] = np.where((df['Last Update'] >= '2020-03-06') & (df.State == 'MA'), 1, df['Free Treatment'])
df['Free Treatment'] = np.where((df['Last Update'] >= '2020-03-13') & (df.State == 'NM'), 1, df['Free Treatment'])

In [14]:
df['Free Treatment'].value_counts()

0    1520
1      40
Name: Free Treatment, dtype: int64

In [15]:
# Early Prescription Refills
df['Early RX Refills'] = np.where((df['Last Update'] >= '2020-03-05') & (df.State == 'WA'), 1, df['Early RX Refills'])
df['Early RX Refills'] = np.where((df['Last Update'] >= '2020-03-18') & (df.State == 'CA'), 1, df['Early RX Refills'])
df['Early RX Refills'] = np.where((df['Last Update'] >= '2020-03-03') & (df.State == 'AK'), 1, df['Early RX Refills'])
df['Early RX Refills'] = np.where((df['Last Update'] >= '2020-03-09') & (df.State == 'CO'), 1, df['Early RX Refills'])
df['Early RX Refills'] = np.where((df['Last Update'] >= '2020-03-17') & (df.State == 'LA'), 1, df['Early RX Refills'])
df['Early RX Refills'] = np.where((df['Last Update'] >= '2020-03-20') & (df.State == 'GA'), 1, df['Early RX Refills'])
df['Early RX Refills'] = np.where((df['Last Update'] >= '2020-03-10') & (df.State == 'FL'), 1, df['Early RX Refills'])
df['Early RX Refills'] = np.where((df['Last Update'] >= '2020-03-09') & (df.State == 'KY'), 1, df['Early RX Refills'])
df['Early RX Refills'] = np.where((df['Last Update'] >= '2020-03-13') & (df.State == 'WV'), 1, df['Early RX Refills'])
df['Early RX Refills'] = np.where((df['Last Update'] >= '2020-03-12') & (df.State == 'ME'), 1, df['Early RX Refills'])
df['Early RX Refills'] = np.where((df['Last Update'] >= '2020-03-10') & (df.State == 'NH'), 1, df['Early RX Refills'])
df['Early RX Refills'] = np.where((df['Last Update'] >= '2020-03-26') & (df.State == 'MA'), 1, df['Early RX Refills'])
df['Early RX Refills'] = np.where((df['Last Update'] >= '2020-03-13') & (df.State == 'RI'), 1, df['Early RX Refills'])
df['Early RX Refills'] = np.where((df['Last Update'] >= '2020-03-09') & (df.State == 'DE'), 1, df['Early RX Refills'])
df['Early RX Refills'] = np.where((df['Last Update'] >= '2020-03-06') & (df.State == 'MD'), 1, df['Early RX Refills'])
df['Early RX Refills'] = np.where((df['Last Update'] >= '2020-03-12') & (df.State == 'DC'), 1, df['Early RX Refills'])

In [16]:
df['Early RX Refills'].value_counts()

0    1386
1     174
Name: Early RX Refills, dtype: int64

In [17]:
# SEP
df['SEP'] = np.where((df['Last Update'] >= '2020-02-18') & (df.State == 'CA'), 1, df['SEP'])
df['SEP'] = np.where((df['Last Update'] >= '2020-03-19') & (df.State == 'CO'), 1, df['SEP'])
df['SEP'] = np.where((df['Last Update'] >= '2020-03-17') & (df.State == 'CT'), 1, df['SEP'])
df['SEP'] = np.where((df['Last Update'] >= '2020-02-25') & (df.State == 'D.C.'), 1, df['SEP'])
df['SEP'] = np.where((df['Last Update'] >= '2020-03-13') & (df.State == 'MD'), 1, df['SEP'])
df['SEP'] = np.where((df['Last Update'] >= '2020-03-11') & (df.State == 'MA'), 1, df['SEP'])
df['SEP'] = np.where((df['Last Update'] >= '2020-03-17') & (df.State == 'NV'), 1, df['SEP'])
df['SEP'] = np.where((df['Last Update'] >= '2020-03-16') & (df.State == 'NY'), 1, df['SEP'])
df['SEP'] = np.where((df['Last Update'] >= '2020-03-23') & (df.State == 'RI'), 1, df['SEP'])
df['SEP'] = np.where((df['Last Update'] >= '2020-03-20') & (df.State == 'VT'), 1, df['SEP'])
df['SEP'] = np.where((df['Last Update'] >= '2020-03-10') & (df.State == 'WA'), 1, df['SEP'])
df['SEP'] = np.where((df['Last Update'] >= '2020-03-20') & (df.State == 'MN'), 1, df['SEP'])

In [18]:
df['SEP'].value_counts()

0    1337
1     223
Name: SEP, dtype: int64

In [19]:
# Section 1135 waiver
df['Waiver 1135'] = np.where((df['Last Update'] >= '2020-03-23') & (df.State == 'AL'), 1, df['Waiver 1135'])
df['Waiver 1135'] = np.where((df['Last Update'] >= '2020-03-23') & (df.State == 'AZ'), 1, df['Waiver 1135'])
df['Waiver 1135'] = np.where((df['Last Update'] >= '2020-03-23') & (df.State == 'CA'), 1, df['Waiver 1135'])
df['Waiver 1135'] = np.where((df['Last Update'] >= '2020-03-26') & (df.State == 'CO'), 1, df['Waiver 1135'])
df['Waiver 1135'] = np.where((df['Last Update'] >= '2020-03-27') & (df.State == 'CT'), 1, df['Waiver 1135'])
df['Waiver 1135'] = np.where((df['Last Update'] >= '2020-03-27') & (df.State == 'DE'), 1, df['Waiver 1135'])
df['Waiver 1135'] = np.where((df['Last Update'] >= '2020-03-16') & (df.State == 'FL'), 1, df['Waiver 1135'])
df['Waiver 1135'] = np.where((df['Last Update'] >= '2020-03-26') & (df.State == 'HI'), 1, df['Waiver 1135'])
df['Waiver 1135'] = np.where((df['Last Update'] >= '2020-03-26') & (df.State == 'ID'), 1, df['Waiver 1135'])
df['Waiver 1135'] = np.where((df['Last Update'] >= '2020-03-23') & (df.State == 'IL'), 1, df['Waiver 1135'])
df['Waiver 1135'] = np.where((df['Last Update'] >= '2020-03-25') & (df.State == 'IN'), 1, df['Waiver 1135'])
df['Waiver 1135'] = np.where((df['Last Update'] >= '2020-03-25') & (df.State == 'IA'), 1, df['Waiver 1135'])
df['Waiver 1135'] = np.where((df['Last Update'] >= '2020-03-25') & (df.State == 'KS'), 1, df['Waiver 1135'])
df['Waiver 1135'] = np.where((df['Last Update'] >= '2020-03-25') & (df.State == 'KY'), 1, df['Waiver 1135'])
df['Waiver 1135'] = np.where((df['Last Update'] >= '2020-03-23') & (df.State == 'LA'), 1, df['Waiver 1135'])
df['Waiver 1135'] = np.where((df['Last Update'] >= '2020-03-26') & (df.State == 'MD'), 1, df['Waiver 1135'])
df['Waiver 1135'] = np.where((df['Last Update'] >= '2020-03-26') & (df.State == 'MA'), 1, df['Waiver 1135'])
df['Waiver 1135'] = np.where((df['Last Update'] >= '2020-03-27') & (df.State == 'MN'), 1, df['Waiver 1135'])
df['Waiver 1135'] = np.where((df['Last Update'] >= '2020-03-23') & (df.State == 'MS'), 1, df['Waiver 1135'])
df['Waiver 1135'] = np.where((df['Last Update'] >= '2020-03-25') & (df.State == 'MO'), 1, df['Waiver 1135'])
df['Waiver 1135'] = np.where((df['Last Update'] >= '2020-03-23') & (df.State == 'NH'), 1, df['Waiver 1135'])
df['Waiver 1135'] = np.where((df['Last Update'] >= '2020-03-23') & (df.State == 'NJ'), 1, df['Waiver 1135'])
df['Waiver 1135'] = np.where((df['Last Update'] >= '2020-03-23') & (df.State == 'NM'), 1, df['Waiver 1135'])
df['Waiver 1135'] = np.where((df['Last Update'] >= '2020-03-26') & (df.State == 'NY'), 1, df['Waiver 1135'])
df['Waiver 1135'] = np.where((df['Last Update'] >= '2020-03-23') & (df.State == 'NC'), 1, df['Waiver 1135'])
df['Waiver 1135'] = np.where((df['Last Update'] >= '2020-03-24') & (df.State == 'ND'), 1, df['Waiver 1135'])
df['Waiver 1135'] = np.where((df['Last Update'] >= '2020-03-24') & (df.State == 'OK'), 1, df['Waiver 1135'])
df['Waiver 1135'] = np.where((df['Last Update'] >= '2020-03-25') & (df.State == 'OR'), 1, df['Waiver 1135'])
df['Waiver 1135'] = np.where((df['Last Update'] >= '2020-03-27') & (df.State == 'PA'), 1, df['Waiver 1135'])
df['Waiver 1135'] = np.where((df['Last Update'] >= '2020-03-25') & (df.State == 'RI'), 1, df['Waiver 1135'])
df['Waiver 1135'] = np.where((df['Last Update'] >= '2020-03-24') & (df.State == 'SD'), 1, df['Waiver 1135'])
df['Waiver 1135'] = np.where((df['Last Update'] >= '2020-03-23') & (df.State == 'VA'), 1, df['Waiver 1135'])
df['Waiver 1135'] = np.where((df['Last Update'] >= '2020-03-15') & (df.State == 'WA'), 1, df['Waiver 1135'])
df['Waiver 1135'] = np.where((df['Last Update'] >= '2020-03-27') & (df.State == 'WY'), 1, df['Waiver 1135'])

In [20]:
df['Waiver 1135'].value_counts()

0    1547
1      13
Name: Waiver 1135, dtype: int64

In [21]:
# Paid sick leave
df['Paid Sick Leave'] = np.where((df['Last Update'] >= '2018-01-01') & (df.State == 'WA'), 1, df['Paid Sick Leave'])
df['Paid Sick Leave'] = np.where((df['Last Update'] >= '2016-01-01') & (df.State == 'OR'), 1, df['Paid Sick Leave'])
df['Paid Sick Leave'] = np.where((df['Last Update'] >= '2018-07-01') & (df.State == 'CA'), 1, df['Paid Sick Leave'])
df['Paid Sick Leave'] = np.where((df['Last Update'] >= '2017-07-01') & (df.State == 'AZ'), 1, df['Paid Sick Leave'])
df['Paid Sick Leave'] = np.where((df['Last Update'] >= '2020-03-10') & (df.State == 'CO'), 1, df['Paid Sick Leave'])
df['Paid Sick Leave'] = np.where((df['Last Update'] >= '2020-03-17') & (df.State == 'NY'), 1, df['Paid Sick Leave'])
df['Paid Sick Leave'] = np.where((df['Last Update'] >= '2018-01-01') & (df.State == 'VT'), 1, df['Paid Sick Leave'])
df['Paid Sick Leave'] = np.where((df['Last Update'] >= '2015-07-01') & (df.State == 'MA'), 1, df['Paid Sick Leave'])
df['Paid Sick Leave'] = np.where((df['Last Update'] >= '2018-07-01') & (df.State == 'RI'), 1, df['Paid Sick Leave'])
df['Paid Sick Leave'] = np.where((df['Last Update'] >= '2012-01-01') & (df.State == 'CT'), 1, df['Paid Sick Leave'])
df['Paid Sick Leave'] = np.where((df['Last Update'] >= '2018-10-01') & (df.State == 'NJ'), 1, df['Paid Sick Leave'])
df['Paid Sick Leave'] = np.where((df['Last Update'] >= '2018-02-01') & (df.State == 'MD'), 1, df['Paid Sick Leave'])
df['Paid Sick Leave'] = np.where((df['Last Update'] >= '2014-01-01') & (df.State == 'D.C.'), 1, df['Paid Sick Leave'])

#df['Paid Sick Leave'] = np.where((df['Last Update'] >= '2019-03-01') & (df.State == 'MI'), 1, df['Paid Sick Leave'])
#df['Paid Sick Leave'] = np.where((df['Last Update'] >= '2020-01-01') & (df.State == 'NV'), 1, df['Paid Sick Leave'])

In [22]:
df['Paid Sick Leave'].value_counts()

0    887
1    673
Name: Paid Sick Leave, dtype: int64

In [23]:
#Extract Date from Last Update
df['Date'] = df['Last Update'].dt.date

In [24]:
df.shape

(1560, 16)

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1560 entries, 36 to 254
Data columns (total 16 columns):
Province/State      1560 non-null object
Country/Region      1560 non-null object
Last Update         1560 non-null datetime64[ns]
Confirmed           1560 non-null float64
Deaths              1530 non-null float64
Recovered           1530 non-null float64
Latitude            1186 non-null float64
Longitude           1186 non-null float64
Province            1560 non-null object
State               1529 non-null object
Free Treatment      1560 non-null int64
Early RX Refills    1560 non-null int64
SEP                 1560 non-null int64
Waiver 1135         1560 non-null int64
Paid Sick Leave     1560 non-null int64
Date                1560 non-null object
dtypes: datetime64[ns](1), float64(5), int64(5), object(5)
memory usage: 207.2+ KB


### Read Luis' portion containing state measures and at-risk adults

In [26]:
us_measures = pd.read_csv('us_states_daily_measures_and_effects_fixed.csv', encoding='latin-1')
us_measures

Unnamed: 0,date,state,stay_at_home,non_essential_business_closure,large_gathering_ban,school_closure,bar_restaurant_limits,primary_election_postponed,emergency_declaration,adult_under_60,adult_over_60,risk_under_60,risk_over_60,hospital_beds,beds_per_thousand,total_chc,total_chc_delivery_sites
0,2020-01-22,Alabama,0,0,999,0,0,0,0,2620083,1190177,566450,1190180,15278,3.13,15,144
1,2020-01-23,Alabama,0,0,999,0,0,0,0,2620083,1190177,566450,1190180,15278,3.13,15,144
2,2020-01-24,Alabama,0,0,999,0,0,0,0,2620083,1190177,566450,1190180,15278,3.13,15,144
3,2020-01-25,Alabama,0,0,999,0,0,0,0,2620083,1190177,566450,1190180,15278,3.13,15,144
4,2020-01-26,Alabama,0,0,999,0,0,0,0,2620083,1190177,566450,1190180,15278,3.13,15,144
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3310,2020-03-22,Wyoming,0,0,10,1,1,0,1,308190,133680,45840,133690,2015,3.49,6,14
3311,2020-03-23,Wyoming,0,0,10,1,1,0,1,308190,133680,45840,133690,2015,3.49,6,14
3312,2020-03-24,Wyoming,0,0,10,1,1,0,1,308190,133680,45840,133690,2015,3.49,6,14
3313,2020-03-25,Wyoming,0,0,10,1,1,0,1,308190,133680,45840,133690,2015,3.49,6,14


In [27]:
# Create State abbreviation columns to merge on 

st_abb = {'Alabama':'AL', 'Alaska':'AK', 'Arizona':'AZ', 'Arkansas':'AR', 'California':'CA',
       'Colorado':'CO', 'Connecticut':'CT', 'Delaware':'DE', 'District of Columbia':'DC', 
       'Florida':'FL', 'Georgia':'GA', 'New York':'NY', 'Minnesota':'MN',
       'Hawaii':'HI', 'Idaho':'ID', 'Illinois':'IL', 'Indiana':'IN', 'Iowa':'IA', 'Kansas':'KS',
       'Kentucky':'KY', 'Louisiana':'LA', 'Maine':'ME', 'Maryland':'MD', 'Massachusetts':'MA',
       'Michigan':'MI', 'Mississippi':'MS', 'Missouri':'MO', 'Montana':'MT', 'Nebraska':'NE',
       'Nevada':'NV', 'New Hampshire':'NH', 'New Jersey':'NJ', 'New Mexico':'NM',
       'North Carolina':'NC', 'North Dakota':'ND', 'Ohio':'OH', 'Oklahoma':'OK', 'Oregon':'OR',
       'Pennsylvania':'PA', 'Rhode Island':'RI', 'South Carolina':'SC', 'South Dakota':'SD',
       'Tennessee':'TN', 'Texas':'TX', 'Utah':'UT', 'Vermont':'VT', 'Virginia':'VA', 'Washington':'WA',
       'West Virginia':'WV', 'Wisconsin':'WI', 'Wyoming':'WY'}
us_measures['State'] = us_measures.state.map(st_abb)

In [28]:
# Create a Date string column to merge
us_measures.date = pd.to_datetime(us_measures.date)
us_measures['Date'] = us_measures.date.dt.date
us_measures.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3315 entries, 0 to 3314
Data columns (total 19 columns):
date                              3315 non-null datetime64[ns]
state                             3315 non-null object
stay_at_home                      3315 non-null int64
non_essential_business_closure    3315 non-null int64
large_gathering_ban               3315 non-null int64
school_closure                    3315 non-null int64
bar_restaurant_limits             3315 non-null int64
primary_election_postponed        3315 non-null int64
emergency_declaration             3315 non-null int64
adult_under_60                    3315 non-null object
adult_over_60                     3315 non-null object
risk_under_60                     3315 non-null object
risk_over_60                      3315 non-null object
hospital_beds                     3315 non-null object
beds_per_thousand                 3315 non-null float64
total_chc                         3315 non-null int64
total_chc_de

### Merge Datasets

In [29]:
df_merged = df.merge(us_measures, on=['Date', 'State'], how = 'left')
df_merged.shape

(1560, 33)

In [30]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1560 entries, 0 to 1559
Data columns (total 33 columns):
Province/State                    1560 non-null object
Country/Region                    1560 non-null object
Last Update                       1560 non-null datetime64[ns]
Confirmed                         1560 non-null float64
Deaths                            1530 non-null float64
Recovered                         1530 non-null float64
Latitude                          1186 non-null float64
Longitude                         1186 non-null float64
Province                          1560 non-null object
State                             1529 non-null object
Free Treatment                    1560 non-null int64
Early RX Refills                  1560 non-null int64
SEP                               1560 non-null int64
Waiver 1135                       1560 non-null int64
Paid Sick Leave                   1560 non-null int64
Date                              1560 non-null object
date 

### Subset to Final Columns and Rows

In [31]:
cols = ['Date', 'State', 'state', 'Province/State', 'Country/Region', 'Last Update', 'Confirmed',
       'Deaths', 'Recovered', 'Latitude', 'Longitude', 
       'Free Treatment', 'Early RX Refills', 'SEP', 'Waiver 1135',
       'Paid Sick Leave', 'stay_at_home','non_essential_business_closure', 'large_gathering_ban',
       'school_closure', 'bar_restaurant_limits', 'primary_election_postponed',
       'emergency_declaration', 'adult_under_60', 'adult_over_60',
       'risk_under_60', 'risk_over_60', 'hospital_beds', 'beds_per_thousand',
       'total_chc', 'total_chc_delivery_sites']
final = df_merged[cols]
final.rename(columns={'State':'State_abb','state':'State'}, inplace=True)
final = final[~final.State_abb.isin(['PR', 'GU', 'VI'])] # delete rows with territories 
final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1537 entries, 0 to 1559
Data columns (total 31 columns):
Date                              1537 non-null object
State_abb                         1506 non-null object
State                             1489 non-null object
Province/State                    1537 non-null object
Country/Region                    1537 non-null object
Last Update                       1537 non-null datetime64[ns]
Confirmed                         1537 non-null float64
Deaths                            1507 non-null float64
Recovered                         1507 non-null float64
Latitude                          1163 non-null float64
Longitude                         1163 non-null float64
Free Treatment                    1537 non-null int64
Early RX Refills                  1537 non-null int64
SEP                               1537 non-null int64
Waiver 1135                       1537 non-null int64
Paid Sick Leave                   1537 non-null int64
stay_

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


In [32]:
final

Unnamed: 0,Date,State_abb,State,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered,Latitude,Longitude,Free Treatment,Early RX Refills,SEP,Waiver 1135,Paid Sick Leave,stay_at_home,non_essential_business_closure,large_gathering_ban,school_closure,bar_restaurant_limits,primary_election_postponed,emergency_declaration,adult_under_60,adult_over_60,risk_under_60,risk_over_60,hospital_beds,beds_per_thousand,total_chc,total_chc_delivery_sites
0,2020-02-26,Unassigned Location,,Unassigned Location,US,2020-02-26 20:03:06,42.0,0.0,0.0,,,0,0,0,0,0,,,,,,,,,,,,,,,
1,2020-02-09,IL,Illinois,"Chicago, IL",US,2020-02-09 19:03:03,2.0,0.0,2.0,,,0,0,0,0,0,0.0,0.0,999.0,0.0,0.0,0.0,0.0,7044811,2865849,1064160,2865850,32066,2.52,44.0,402
2,2020-02-03,CA,California,"San Benito, CA",US,2020-02-03 03:53:02,2.0,0.0,0.0,,,0,0,0,0,1,0.0,0.0,999.0,0.0,0.0,0.0,0.0,22523811,8282499,3122640,8282500,72511,1.84,177.0,1740
3,2020-02-21,CA,California,"San Diego County, CA",US,2020-02-21 05:43:02,2.0,0.0,1.0,,,0,0,1,0,1,0.0,0.0,999.0,0.0,0.0,0.0,0.0,22523811,8282499,3122640,8282500,72511,1.84,177.0,1740
4,2020-02-21,CA,California,"Santa Clara, CA",US,2020-02-21 05:23:04,2.0,0.0,1.0,,,0,0,1,0,1,0.0,0.0,999.0,0.0,0.0,0.0,0.0,22523811,8282499,3122640,8282500,72511,1.84,177.0,1740
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1555,2020-03-06,CA,California,"Yolo County, CA",US,2020-03-06 20:13:14,1.0,0.0,0.0,38.7646,-121.9018,0,0,1,0,1,0.0,0.0,999.0,0.0,0.0,0.0,0.0,22523811,8282499,3122640,8282500,72511,1.84,177.0,1740
1556,2020-02-24,TX,Texas,"Lackland, TX",US,2020-02-24 23:33:02,0.0,0.0,0.0,29.3829,-98.6134,0,0,0,0,0,0.0,0.0,999.0,0.0,0.0,0.0,0.0,16091113,5384517,2443450,5384520,65671,2.29,72.0,537
1557,2020-03-07,TX,Texas,"Montgomery County, TX",US,2020-03-07 19:53:02,0.0,0.0,0.0,30.3213,-95.4778,0,0,0,0,0,0.0,0.0,999.0,0.0,0.0,0.0,0.0,16091113,5384517,2443450,5384520,65671,2.29,72.0,537
1558,2020-02-24,NE,Nebraska,"Omaha, NE",US,2020-02-24 23:33:02,0.0,0.0,0.0,41.2545,-95.9758,0,0,0,0,0,0.0,0.0,999.0,0.0,0.0,0.0,0.0,1025529,435081,148520,435080,6842,3.55,7.0,45


In [33]:
# Fill missing values

final.State_abb = final.State_abb.fillna("Unassigned Location")
final.State = final.State.fillna("Unassigned Location")
final['stay_at_home'] = final['stay_at_home'].fillna(0)
final['non_essential_business_closure'] = final['non_essential_business_closure'].fillna(0)
final['large_gathering_ban'] = final['large_gathering_ban'].fillna(999)
final['school_closure'] = final['school_closure'].fillna(0)
final['bar_restaurant_limits'] = final['bar_restaurant_limits'].fillna(0)
final['primary_election_postponed'] = final['primary_election_postponed'].fillna(0)
final['emergency_declaration'] = final['emergency_declaration'].fillna(0)

In [34]:
final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1537 entries, 0 to 1559
Data columns (total 31 columns):
Date                              1537 non-null object
State_abb                         1537 non-null object
State                             1537 non-null object
Province/State                    1537 non-null object
Country/Region                    1537 non-null object
Last Update                       1537 non-null datetime64[ns]
Confirmed                         1537 non-null float64
Deaths                            1507 non-null float64
Recovered                         1507 non-null float64
Latitude                          1163 non-null float64
Longitude                         1163 non-null float64
Free Treatment                    1537 non-null int64
Early RX Refills                  1537 non-null int64
SEP                               1537 non-null int64
Waiver 1135                       1537 non-null int64
Paid Sick Leave                   1537 non-null int64
stay_

In [35]:
final.State.value_counts()

California              305
Washington               92
Texas                    70
Massachusetts            69
Arizona                  63
Illinois                 58
Unassigned Location      48
Florida                  46
Wisconsin                46
New York                 42
Oregon                   40
Nebraska                 34
Georgia                  27
Colorado                 26
North Carolina           23
Pennsylvania             22
New Hampshire            22
Nevada                   21
Rhode Island             21
Tennessee                21
New Jersey               19
South Carolina           19
Indiana                  18
Kentucky                 18
Maryland                 18
Minnesota                17
Oklahoma                 15
Utah                     15
Connecticut              15
Hawaii                   15
Missouri                 14
District of Columbia     14
Vermont                  14
Kansas                   14
Virginia                 14
Louisiana           

In [36]:
final.State_abb.value_counts()

CA                     305
WA                      92
TX                      70
MA                      69
AZ                      63
IL                      58
Unassigned Location     48
WI                      46
FL                      46
NY                      42
OR                      40
NE                      34
GA                      27
CO                      26
NC                      23
NH                      22
PA                      22
NV                      21
TN                      21
RI                      21
SC                      19
NJ                      19
MD                      18
KY                      18
IN                      18
MN                      17
CT                      15
UT                      15
OK                      15
HI                      15
VT                      14
MO                      14
VA                      14
DC                      14
KS                      14
LA                      13
IA                      13
M

In [38]:
#Output dataset
final.to_csv('us_covid19_w_measures_and_risk_pop.csv')