In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('model_data.csv').drop(columns = ['Unnamed: 0'])
df.head()

Unnamed: 0,state_name,SD,LD,NESC,PC,GS,CPV,start_date,cases_y,infection_rate,deaths_y,death_rate
0,California,0,0,0,0,0,0,1/26/20,1.025641,0.126244,0.0,0.0
1,California,1,0,0,0,0,0,3/5/20,17.166667,0.24494,0.333333,0.003505
2,California,1,0,0,0,1,0,3/11/20,38.0,0.263889,0.0,0.0
3,California,1,0,0,0,3,0,3/12/20,46.0,0.252747,1.0,0.005495
4,California,1,0,0,1,4,0,3/13/20,63.0,0.247435,0.5,0.001845


### Additional Data Source
- Aging
- Popularity density
- [Number of Airports](https://www.globalair.com/airport/state.aspx)
- [Area](https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_area)
- [Education Index](http://www.usa.com/rank/us--average-education-index--state-rank.htm)
- [Medical]()

In [3]:
df_aux = pd.read_csv('aux_data.csv').drop(columns = ['Unnamed: 0'])
df_aux.head()

Unnamed: 0,state_name,aging,pop_density_sqkm,n_airports,area_sqkm,education_index
0,California,0.143,95.0,492,423967,13.44
1,Colorado,0.142,19.6,265,269601,14.14
2,District of Columbia,0.122,4088.4,3,177,14.93
3,Florida,0.205,140.8,482,170312,13.42
4,Georgia,0.138,67.1,328,153910,13.41


In [4]:
df_aux['airport_density_sqkm'] = df_aux['n_airports'] / df_aux['area_sqkm']
df_aux.drop(columns=['n_airports', 'area_sqkm'], inplace=True)
df_aux.head()

Unnamed: 0,state_name,aging,pop_density_sqkm,education_index,airport_density_sqkm
0,California,0.143,95.0,13.44,0.00116
1,Colorado,0.142,19.6,14.14,0.000983
2,District of Columbia,0.122,4088.4,14.93,0.016949
3,Florida,0.205,140.8,13.42,0.00283
4,Georgia,0.138,67.1,13.41,0.002131


In [5]:
df = df.merge(df_aux, left_on='state_name', right_on='state_name')
df

Unnamed: 0,state_name,SD,LD,NESC,PC,GS,CPV,start_date,cases_y,infection_rate,deaths_y,death_rate,aging,pop_density_sqkm,education_index,airport_density_sqkm
0,California,0,0,0,0,0,0,1/26/20,1.025641,0.126244,0.000000,0.000000,0.143,95.0,13.44,0.001160
1,California,1,0,0,0,0,0,3/5/20,17.166667,0.244940,0.333333,0.003505,0.143,95.0,13.44,0.001160
2,California,1,0,0,0,1,0,3/11/20,38.000000,0.263889,0.000000,0.000000,0.143,95.0,13.44,0.001160
3,California,1,0,0,0,3,0,3/12/20,46.000000,0.252747,1.000000,0.005495,0.143,95.0,13.44,0.001160
4,California,1,0,0,1,4,0,3/13/20,63.000000,0.247435,0.500000,0.001845,0.143,95.0,13.44,0.001160
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,Washington,0,0,0,0,0,0,1/21/20,0.414634,0.111007,0.146341,0.021889,0.154,40.5,13.94,0.001922
61,Washington,0,0,0,1,0,0,3/2/20,17.000000,0.567148,1.666667,0.086420,0.154,40.5,13.94,0.001922
62,Washington,1,0,0,1,0,0,3/5/20,39.166667,0.287014,2.833333,0.023776,0.154,40.5,13.94,0.001922
63,Washington,1,0,0,1,3,0,3/11/20,76.800000,0.178568,3.800000,0.008671,0.154,40.5,13.94,0.001922


In [6]:
df_medical = pd.read_csv('medical_data.csv')
df_medical

Unnamed: 0,state_name,avg_principle_cost,surgical_quality
0,California,1097181,31.2401
1,District of Columbia,16630,0.578
2,Florida,854244,20.2232
3,Georgia,1852784,7.6079
4,Illinois,1389450,6.6204
5,Kansas,774205,3.0284
6,Massachusetts,232123,2.1411
7,Michigan,1269577,4.9486
8,Nevada,161156,2.2824
9,New Jersey,360964,11.1371


In [7]:
df = df.merge(df_medical, left_on='state_name', right_on='state_name')
df

Unnamed: 0,state_name,SD,LD,NESC,PC,GS,CPV,start_date,cases_y,infection_rate,deaths_y,death_rate,aging,pop_density_sqkm,education_index,airport_density_sqkm,avg_principle_cost,surgical_quality
0,California,0,0,0,0,0,0,1/26/20,1.025641,0.126244,0.000000,0.000000,0.143,95.0,13.44,0.001160,1097181,31.2401
1,California,1,0,0,0,0,0,3/5/20,17.166667,0.244940,0.333333,0.003505,0.143,95.0,13.44,0.001160,1097181,31.2401
2,California,1,0,0,0,1,0,3/11/20,38.000000,0.263889,0.000000,0.000000,0.143,95.0,13.44,0.001160,1097181,31.2401
3,California,1,0,0,0,3,0,3/12/20,46.000000,0.252747,1.000000,0.005495,0.143,95.0,13.44,0.001160,1097181,31.2401
4,California,1,0,0,1,4,0,3/13/20,63.000000,0.247435,0.500000,0.001845,0.143,95.0,13.44,0.001160,1097181,31.2401
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,Washington,0,0,0,0,0,0,1/21/20,0.414634,0.111007,0.146341,0.021889,0.154,40.5,13.94,0.001922,591656,5.6037
57,Washington,0,0,0,1,0,0,3/2/20,17.000000,0.567148,1.666667,0.086420,0.154,40.5,13.94,0.001922,591656,5.6037
58,Washington,1,0,0,1,0,0,3/5/20,39.166667,0.287014,2.833333,0.023776,0.154,40.5,13.94,0.001922,591656,5.6037
59,Washington,1,0,0,1,3,0,3/11/20,76.800000,0.178568,3.800000,0.008671,0.154,40.5,13.94,0.001922,591656,5.6037


In [8]:
for name in df.columns:
    print(name)

state_name
SD
LD
NESC
PC
GS
CPV
start_date
cases_y
infection_rate
deaths_y
death_rate
aging
pop_density_sqkm
education_index
airport_density_sqkm
avg_principle_cost
surgical_quality


In [9]:
df.to_csv('model_data_final.csv')