# Data Extraction

## 1. COVID19 Global Forecasting (Week 4)

Source: https://www.kaggle.com/c/covid19-global-forecasting-week-4/overview

In [35]:
import pandas as pd
 
# Load the COVID-19 training set from the CSV that sits next to this notebook.
corona = pd.read_csv(filepath_or_buffer='train.csv')

In [36]:
# Preview the first five rows of the case data.
corona.head(n=5)

Unnamed: 0,Id,Province_State,Country_Region,Date,ConfirmedCases,Fatalities
0,27019,Alabama,US,1/22/2020,0,0
1,27020,Alabama,US,1/23/2020,0,0
2,27021,Alabama,US,1/24/2020,0,0
3,27022,Alabama,US,1/25/2020,0,0
4,27023,Alabama,US,1/26/2020,0,0


## 2. United States by Density 2020

Source: https://worldpopulationreview.com/states/state-densities/

In [37]:
import requests
from bs4 import BeautifulSoup as bs

# Page holding the state population-density table that is scraped below.
url = "https://worldpopulationreview.com/states/state-densities/"

# requests waits indefinitely by default, so give the call a timeout to keep
# the cell from hanging on an unresponsive server. raise_for_status() stops
# the notebook on an HTTP error instead of silently parsing an error page.
res = requests.get(url, timeout=30)
res.raise_for_status()
soup = bs(res.content, 'html.parser')

In [38]:
# Collect every <table> element found in the parsed page.
tables = soup.find_all(name='table')
# Cell output: how many tables were found (only one exists on this page).
len(tables)

1

In [39]:
# Build the `pops` frame from the first scraped table: one list of stripped
# cell texts per <tr> of the table body, labelled with the column names below.
header = ['state', 'density', 'pop_2020', 'area_mi']
tbl_rows = tables[0].tbody.find_all('tr')


def _row_texts(row):
    """Return the whitespace-stripped text of every <td> cell in `row`."""
    return [cell.getText().strip() for cell in row.find_all('td')]


df_rows = list(map(_row_texts, tbl_rows))
pops = pd.DataFrame(df_rows, columns=header)
print(pops)

                   state density    pop_2020  area_mi
0   District of Columbia  11,815     720,687       61
1             New Jersey   1,215   8,936,574    7,354
2           Rhode Island   1,021   1,056,161    1,034
3          Massachusetts     894   6,976,597    7,800
4            Connecticut     736   3,563,077    4,842
5               Maryland     627   6,083,116    9,707
6               Delaware     504     982,895    1,949
7               New York     413  19,440,469   47,126
8                Florida     410  21,992,985   53,625
9                   Ohio     288  11,747,694   40,861
10          Pennsylvania     287  12,820,878   44,743
11            California     256  39,937,489  155,779
12              Illinois     228  12,659,682   55,519
13                Hawaii     220   1,412,687    6,423
14              Virginia     218   8,626,207   39,490
15        North Carolina     218  10,611,862   48,618
16               Indiana     188   6,745,354   35,826
17               Georgia    

In [40]:
# Cleanse formatted string data into numeric data.
#
# The scraped cells are strings with thousands separators (e.g. '11,815').
# Strip the commas column-wise and cast to float.
#
# Whole-column assignment replaces the former element-wise
# `pops[col][i] = ...` loop: that chained indexing raises
# SettingWithCopyWarning and does not modify `pops` when pandas copy-on-write
# is enabled. `astype(str)` lets the cell be re-run safely after the columns
# have already been converted to float.
numeric_cols = ['density', 'pop_2020', 'area_mi']
for col in numeric_cols:
    pops[col] = (
        pops[col]
        .astype(str)
        .str.replace(',', '', regex=False)
        .astype(float)
    )
pops.head()

Unnamed: 0,state,density,pop_2020,area_mi
0,District of Columbia,11815.0,720687.0,61.0
1,New Jersey,1215.0,8936574.0,7354.0
2,Rhode Island,1021.0,1056161.0,1034.0
3,Massachusetts,894.0,6976597.0,7800.0
4,Connecticut,736.0,3563077.0,4842.0


## 3. US Census Data 2018 (Economic and Demographic Characteristics)

Source: https://www.census.gov/acs/www/data/data-tables-and-tools/data-profiles/

In [41]:
# Load the 2018 demographic table (DP05_* columns) from the local CSV and
# preview its first five rows.
demo_2018 = pd.read_csv(filepath_or_buffer='demo_2018.csv')
demo_2018.head(n=5)

Unnamed: 0,GEO_ID,NAME,DP05_0031PM,DP05_0032E,DP05_0032M,DP05_0032PE,DP05_0032PM,DP05_0033E,DP05_0033M,DP05_0033PE,...,DP05_0029M,DP05_0029PE,DP05_0029PM,DP05_0030E,DP05_0030M,DP05_0030PE,DP05_0030PM,DP05_0031E,DP05_0031M,DP05_0031PE
0,0400000US01,Alabama,0.1,76.8,0.1,(X),(X),4864680,*****,4864680,...,626,783832,(X),340401,450,43.4,0.1,443431,395,56.6
1,0400000US02,Alaska,0.2,98.6,0.7,(X),(X),738516,*****,738516,...,317,78428,(X),38937,185,49.6,0.2,39491,226,50.4
2,0400000US04,Arizona,0.1,85.5,0.1,(X),(X),6946685,*****,6946685,...,305,1158320,(X),533914,210,46.1,0.1,624406,192,53.9
3,0400000US05,Arkansas,0.1,79.8,0.2,(X),(X),2990671,*****,2990671,...,716,487536,(X),216404,410,44.4,0.1,271132,540,55.6
4,0400000US06,California,0.1,79.7,0.1,(X),(X),39148760,*****,39148760,...,600,5315457,(X),2357611,429,44.4,0.1,2957846,433,55.6


In [42]:
# Load the 2018 economic table (DP03_* columns) from the local CSV and
# preview its first five rows.
eco_2018 = pd.read_csv(filepath_or_buffer='eco_2018.csv')
eco_2018.head(n=5)

Unnamed: 0,GEO_ID,NAME,DP03_0001E,DP03_0001M,DP03_0001PE,DP03_0001PM,DP03_0002E,DP03_0002M,DP03_0002PE,DP03_0002PM,...,DP03_0135PE,DP03_0135PM,DP03_0136E,DP03_0136M,DP03_0136PE,DP03_0136PM,DP03_0137E,DP03_0137M,DP03_0137PE,DP03_0137PM
0,0400000US02,Alaska,571528,513,571528,(X),396353,1893,69.3,0.3,...,5.4,0.5,(X),(X),8.6,0.5,(X),(X),18.9,0.8
1,0400000US15,Hawaii,1147445,656,1147445,(X),749527,2913,65.3,0.3,...,8.0,0.4,(X),(X),6.9,0.3,(X),(X),24.3,0.6
2,0400000US10,Delaware,769793,600,769793,(X),482623,2729,62.7,0.4,...,6.6,0.5,(X),(X),9.2,0.4,(X),(X),23.6,0.8
3,0400000US13,Georgia,8082220,2829,8082220,(X),5091132,11115,63.0,0.1,...,10.3,0.2,(X),(X),13.7,0.2,(X),(X),26.9,0.3
4,0400000US11,District of Columbia,573275,457,573275,(X),401740,2377,70.1,0.4,...,14.5,1.0,(X),(X),14.8,0.7,(X),(X),20.1,0.7


## 4. Merge Tables

In [43]:
# Attach the density/population/area columns to every case row.
# Left join: all rows of `corona` are kept; a Province_State with no matching
# `state` in `pops` gets NaN in the added columns.
train = pd.merge(
    left=corona,
    right=pops,
    how='left',
    left_on='Province_State',
    right_on='state',
)
train.head(n=5)

Unnamed: 0,Id,Province_State,Country_Region,Date,ConfirmedCases,Fatalities,state,density,pop_2020,area_mi
0,27019,Alabama,US,1/22/2020,0,0,Alabama,97.0,4908621.0,50645.0
1,27020,Alabama,US,1/23/2020,0,0,Alabama,97.0,4908621.0,50645.0
2,27021,Alabama,US,1/24/2020,0,0,Alabama,97.0,4908621.0,50645.0
3,27022,Alabama,US,1/25/2020,0,0,Alabama,97.0,4908621.0,50645.0
4,27023,Alabama,US,1/26/2020,0,0,Alabama,97.0,4908621.0,50645.0


In [44]:
# Attach the economic columns, matching Province_State against the `NAME`
# column of `eco_2018`. Left join, so every existing `train` row is kept.
# Note: this reassigns `train`, so re-running the cell merges a second time.
train = pd.merge(
    left=train,
    right=eco_2018,
    how='left',
    left_on='Province_State',
    right_on='NAME',
)
train.head(n=5)

Unnamed: 0,Id,Province_State,Country_Region,Date,ConfirmedCases,Fatalities,state,density,pop_2020,area_mi,...,DP03_0135PE,DP03_0135PM,DP03_0136E,DP03_0136M,DP03_0136PE,DP03_0136PM,DP03_0137E,DP03_0137M,DP03_0137PE,DP03_0137PM
0,27019,Alabama,US,1/22/2020,0,0,Alabama,97.0,4908621.0,50645.0,...,10.3,0.2,(X),(X),14.6,0.3,(X),(X),30.9,0.4
1,27020,Alabama,US,1/23/2020,0,0,Alabama,97.0,4908621.0,50645.0,...,10.3,0.2,(X),(X),14.6,0.3,(X),(X),30.9,0.4
2,27021,Alabama,US,1/24/2020,0,0,Alabama,97.0,4908621.0,50645.0,...,10.3,0.2,(X),(X),14.6,0.3,(X),(X),30.9,0.4
3,27022,Alabama,US,1/25/2020,0,0,Alabama,97.0,4908621.0,50645.0,...,10.3,0.2,(X),(X),14.6,0.3,(X),(X),30.9,0.4
4,27023,Alabama,US,1/26/2020,0,0,Alabama,97.0,4908621.0,50645.0,...,10.3,0.2,(X),(X),14.6,0.3,(X),(X),30.9,0.4


In [45]:
# Attach the demographic columns, matching Province_State against the `NAME`
# column of `demo_2018`. Left join, so every existing `train` row is kept.
# NOTE(review): `train` already carries GEO_ID/NAME from the eco_2018 merge,
# so pandas presumably suffixes the duplicates (_x/_y) - confirm this is the
# intended column layout.
train = pd.merge(
    left=train,
    right=demo_2018,
    how='left',
    left_on='Province_State',
    right_on='NAME',
)
train.head(n=5)

Unnamed: 0,Id,Province_State,Country_Region,Date,ConfirmedCases,Fatalities,state,density,pop_2020,area_mi,...,DP05_0029M,DP05_0029PE,DP05_0029PM,DP05_0030E,DP05_0030M,DP05_0030PE,DP05_0030PM,DP05_0031E,DP05_0031M,DP05_0031PE
0,27019,Alabama,US,1/22/2020,0,0,Alabama,97.0,4908621.0,50645.0,...,626,783832.0,(X),340401.0,450,43.4,0.1,443431.0,395,56.6
1,27020,Alabama,US,1/23/2020,0,0,Alabama,97.0,4908621.0,50645.0,...,626,783832.0,(X),340401.0,450,43.4,0.1,443431.0,395,56.6
2,27021,Alabama,US,1/24/2020,0,0,Alabama,97.0,4908621.0,50645.0,...,626,783832.0,(X),340401.0,450,43.4,0.1,443431.0,395,56.6
3,27022,Alabama,US,1/25/2020,0,0,Alabama,97.0,4908621.0,50645.0,...,626,783832.0,(X),340401.0,450,43.4,0.1,443431.0,395,56.6
4,27023,Alabama,US,1/26/2020,0,0,Alabama,97.0,4908621.0,50645.0,...,626,783832.0,(X),340401.0,450,43.4,0.1,443431.0,395,56.6


In [47]:
# Persist the merged frame. The row index is written as the first (unnamed)
# column, which is pandas' default; it is spelled out here so readers of
# train_final.csv know to expect it.
train.to_csv(path_or_buf='train_final.csv', index=True)
