In [1]:
# Dependencies
import requests
import json
import pandas as pd
from sqlalchemy import create_engine

### Extract JSON into DataFrames

In [2]:
# base url
base_url = "https://data.cdc.gov/api/views/r8kw-7aab/rows.json"

# run a request using our params dictionary
response = requests.get(base_url)

In [3]:
# convert response to json
cdc_data = response.json()

# Print the json (pretty printed)
print(json.dumps(cdc_data, indent=4, sort_keys=True))

{
    "data": [
        [
            "row-tdqu.fken.qxav",
            "00000000-0000-0000-CF48-DD4B3C265EFF",
            0,
            1590502771,
            null,
            1590502771,
            null,
            "{ }",
            "2020-05-26T00:00:00",
            "2020-02-01T00:00:00",
            "2020-02-01T00:00:00",
            "By week",
            "United States",
            "Week-ending",
            "0",
            "57585",
            "0.97",
            "3717",
            "0",
            "475",
            "4192",
            null
        ],
        [
            "row-8ue4.ghqb~866z",
            "00000000-0000-0000-EE19-D7FBF87D6329",
            0,
            1590502771,
            null,
            1590502771,
            null,
            "{ }",
            "2020-05-26T00:00:00",
            "2020-02-08T00:00:00",
            "2020-02-08T00:00:00",
            "By week",
            "United States",
            "Week-ending",
            "1",
         

In [4]:
print(cdc_data["data"][0])

['row-tdqu.fken.qxav', '00000000-0000-0000-CF48-DD4B3C265EFF', 0, 1590502771, None, 1590502771, None, '{ }', '2020-05-26T00:00:00', '2020-02-01T00:00:00', '2020-02-01T00:00:00', 'By week', 'United States', 'Week-ending', '0', '57585', '0.97', '3717', '0', '475', '4192', None]


### Transform CDC Data into DataFrame

In [5]:
# Creating an empty Dataframe with column names only
cdc_data_df = pd.DataFrame(columns=['state','start_week', 'COVID_deaths', 'pneumonia_deaths','pneumonia_and_COVID_deaths',
                                    'influenza_deaths', 'pneumonia_influenzaor_COVID_deaths', 'total_deaths'])

In [6]:
# use iterrows to iterate through pandas dataframe
index = 0
for row in cdc_data["data"]:
    row = row
    try:
        cdc_data_df.loc[index, 'start_week'] = row[9]
        cdc_data_df.loc[index, 'state'] = row[12]
        cdc_data_df.loc[index, 'COVID_deaths'] = row[14]
        cdc_data_df.loc[index, 'pneumonia_deaths'] = row[17]
        cdc_data_df.loc[index, 'pneumonia_and_COVID_deaths'] =row[18]
        cdc_data_df.loc[index, 'influenza_deaths'] = row[19]
        cdc_data_df.loc[index, 'pneumonia_influenzaor_COVID_deaths'] = row[20]
        cdc_data_df.loc[index, 'total_deaths'] = row[15]
        
    except (KeyError, IndexError):
        print("Missing field/result... skipping.")
    index = index +1

In [7]:
cdc_data_df['start_week'] = pd.to_datetime(cdc_data_df['start_week']).dt.strftime('%m/%d/%Y')

In [8]:
# Removed Null values
cdc_data_df = cdc_data_df.fillna(0)
cdc_data_df

Unnamed: 0,state,start_week,COVID_deaths,pneumonia_deaths,pneumonia_and_COVID_deaths,influenza_deaths,pneumonia_influenzaor_COVID_deaths,total_deaths
0,United States,02/01/2020,0,3717,0,475,4192,57585
1,United States,02/08/2020,1,3716,0,508,4225,58248
2,United States,02/15/2020,0,3750,0,541,4291,57590
3,United States,02/22/2020,2,3611,0,554,4167,57642
4,United States,02/29/2020,5,3731,3,629,4362,57956
...,...,...,...,...,...,...,...,...
913,Puerto Rico,04/25/2020,16,60,0,0,73,418
914,Puerto Rico,05/02/2020,13,40,0,0,51,298
915,Puerto Rico,05/09/2020,0,18,0,0,21,129
916,Puerto Rico,05/16/2020,0,0,0,0,0,37


### Cleaning the CDC Data to megre with John Hopkins Data

In [9]:
# deleting row 0 to 16 as they are for united states as a whole
cdc_data_df = cdc_data_df[16:]
cdc_data_df

Unnamed: 0,state,start_week,COVID_deaths,pneumonia_deaths,pneumonia_and_COVID_deaths,influenza_deaths,pneumonia_influenzaor_COVID_deaths,total_deaths
16,United States,05/23/2020,128,106,48,0,186,1523
17,Alabama,02/01/2020,0,56,0,14,70,1019
18,Alabama,02/08/2020,0,61,0,10,71,1107
19,Alabama,02/15/2020,0,75,0,0,81,1080
20,Alabama,02/22/2020,0,67,0,0,71,1081
...,...,...,...,...,...,...,...,...
913,Puerto Rico,04/25/2020,16,60,0,0,73,418
914,Puerto Rico,05/02/2020,13,40,0,0,51,298
915,Puerto Rico,05/09/2020,0,18,0,0,21,129
916,Puerto Rico,05/16/2020,0,0,0,0,0,37


In [10]:
cdc_data_df = cdc_data_df.astype({'COVID_deaths': 'int64'})
cdc_data_df = cdc_data_df.astype({'pneumonia_deaths':'int64'})
cdc_data_df = cdc_data_df.astype({'pneumonia_and_COVID_deaths': 'int64'})
cdc_data_df = cdc_data_df.astype({'influenza_deaths':'int64'})
cdc_data_df = cdc_data_df.astype({'pneumonia_influenzaor_COVID_deaths': 'int64'})
cdc_data_df = cdc_data_df.astype({'total_deaths':'int64'})

In [11]:
cdc_data_df.dtypes

state                                 object
start_week                            object
COVID_deaths                           int64
pneumonia_deaths                       int64
pneumonia_and_COVID_deaths             int64
influenza_deaths                       int64
pneumonia_influenzaor_COVID_deaths     int64
total_deaths                           int64
dtype: object

In [12]:
#selecting only data from 04/25/2020 to 05/20/2020
cdc_may_data = cdc_data_df.loc[cdc_data_df["start_week"] >= "04/25/2020"]
cdc_may_data

Unnamed: 0,state,start_week,COVID_deaths,pneumonia_deaths,pneumonia_and_COVID_deaths,influenza_deaths,pneumonia_influenzaor_COVID_deaths,total_deaths
16,United States,05/23/2020,128,106,48,0,186,1523
29,Alabama,04/25/2020,69,69,17,0,122,1012
30,Alabama,05/02/2020,63,72,17,0,119,958
31,Alabama,05/09/2020,61,64,18,0,107,761
32,Alabama,05/16/2020,19,16,0,0,30,327
...,...,...,...,...,...,...,...,...
913,Puerto Rico,04/25/2020,16,60,0,0,73,418
914,Puerto Rico,05/02/2020,13,40,0,0,51,298
915,Puerto Rico,05/09/2020,0,18,0,0,21,129
916,Puerto Rico,05/16/2020,0,0,0,0,0,37


In [13]:
cdc_may_group_data = cdc_may_data.groupby(['state']).sum()
cdc_may_group_data

Unnamed: 0_level_0,COVID_deaths,pneumonia_deaths,pneumonia_and_COVID_deaths,influenza_deaths,pneumonia_influenzaor_COVID_deaths,total_deaths
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Alabama,212,221,52,0,378,3074
Alaska,0,0,0,0,0,178
Arizona,307,372,151,0,529,4381
Arkansas,43,122,0,0,159,2062
California,1388,1845,761,0,2476,18321
Colorado,505,384,242,0,647,3248
Connecticut,99,15,15,0,99,99
Delaware,144,72,54,0,162,526
District of Columbia,119,133,119,0,133,379
Florida,829,1212,420,0,1631,14687


In [14]:
# Save Data to csv
cdc_may_group_data.to_csv("cdc_may_group_data.csv")

### Create database connection

In [15]:
connection_string = "postgres:postgres@localhost:5432/cdc_may_data_db"
engine = create_engine(f'postgresql://{connection_string}')

In [16]:
# Confirm tables
engine.table_names()

['cdc_covid_data',
 'john_hopkins_data',
 'combine_cdc_hopkins_data',
 'nytimes_nursing_home_data']

### Load DataFrames into database

In [17]:
cdc_may_group_data.to_sql(name='cdc_covid_data', con=engine, if_exists='append', index=True)

#### Since we are having issues with backup and restore. Importing directly the CSV file of the team member

In [18]:
john_hopkins_file = "criselda_ETL/raw_data/john_hopkins.csv"
john_hopkins_df = pd.read_csv(john_hopkins_file)
john_hopkins_df.head()

Unnamed: 0,state,Confirmed,Deaths,Recovered,Active,People_Tested,People_Hospitalized
0,Alabama,268365.0,10469.0,0.0,257896.0,3271388.0,33973.0
1,Alaska,11345.0,294.0,8568.0,4545.0,726773.0,261.0
2,American Samoa,0.0,0.0,0.0,0.0,1873.0,0.0
3,Arizona,296916.0,13583.0,66156.0,230418.0,3143680.0,40766.0
4,Arkansas,111560.0,2345.0,67785.0,51013.0,1792689.0,12537.0


In [19]:
john_hopkins_df.to_sql(name='john_hopkins_data', con=engine, if_exists='append', index=True)

In [20]:
combined_data = pd.merge(john_hopkins_df,cdc_may_group_data, on='state')
combined_data.set_index("state", inplace=True)
combined_data

Unnamed: 0_level_0,Confirmed,Deaths,Recovered,Active,People_Tested,People_Hospitalized,COVID_deaths,pneumonia_deaths,pneumonia_and_COVID_deaths,influenza_deaths,pneumonia_influenzaor_COVID_deaths,total_deaths
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Alabama,268365.0,10469.0,0.0,257896.0,3271388.0,33973.0,212,221,52,0,378,3074
Alaska,11345.0,294.0,8568.0,4545.0,726773.0,261.0,0,0,0,0,0,178
Arizona,296916.0,13583.0,66156.0,230418.0,3143680.0,40766.0,307,372,151,0,529,4381
Arkansas,111560.0,2345.0,67785.0,51013.0,1792689.0,12537.0,43,122,0,0,159,2062
California,1833427.0,73742.0,0.0,1759685.0,25160433.0,39006.0,1388,1845,761,0,2476,18321
Colorado,529952.0,27484.0,73634.0,439770.0,2703381.0,93679.0,505,384,242,0,647,3248
Connecticut,940155.0,79582.0,107815.0,752758.0,3589061.0,203891.0,99,15,15,0,99,99
Delaware,171738.0,5875.0,61578.0,112275.0,853420.0,2297.0,144,72,54,0,162,526
District of Columbia,163811.0,8224.0,24496.0,137589.0,789052.0,3216.0,119,133,119,0,133,379
Florida,1158800.0,45876.0,0.0,1112924.0,14713034.0,203058.0,829,1212,420,0,1631,14687


In [21]:
# Save Data to csv
combined_data.to_csv("combined_data.csv")

In [22]:
combined_data.to_sql(name='combine_cdc_hopkins_data', con=engine, if_exists='append', index=True)

### Importing data from NYTimes

In [23]:
nytimes_file = "criselda_ETL/raw_data/nytimes.csv"
nytimes_nursing_df = pd.read_csv(nytimes_file)
nytimes_nursing_df.head()

Unnamed: 0,name,city,state,zip,cases,deaths
0,Crowne Health Care nursing home,Mobile,Alabama,36605,98,13
1,Arbor Springs Health and Rehab Center,Opelika,Alabama,36801,95,15
2,Sapphire Nursing and Rehab,Tucson,Arizona,85714,94,0
3,Briarwood Nursing and Rehabilitation,Little Rock,Arkansas,72205,60,5
4,Walnut Ridge Nursing and Rehab Center,Walnut Ridge,Arkansas,72476,79,4


In [24]:
nytimes_nursing_df = nytimes_nursing_df.rename({'name': 'Nursing_home_Name', 'cases': 'Covid-19_cases', 'deaths': 'Nursing_home_deaths'}, axis=1)
nytimes_nursing_df.set_index("state", inplace=True)
nytimes_nursing_df.head()

Unnamed: 0_level_0,Nursing_home_Name,city,zip,Covid-19_cases,Nursing_home_deaths
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alabama,Crowne Health Care nursing home,Mobile,36605,98,13
Alabama,Arbor Springs Health and Rehab Center,Opelika,36801,95,15
Arizona,Sapphire Nursing and Rehab,Tucson,85714,94,0
Arkansas,Briarwood Nursing and Rehabilitation,Little Rock,72205,60,5
Arkansas,Walnut Ridge Nursing and Rehab Center,Walnut Ridge,72476,79,4


In [25]:
nytimes_nursing_df.to_sql(name='nytimes_nursing_home_data', con=engine, if_exists='append', index=True)

### Validate the data

In [26]:
pd.read_sql_query('select * from cdc_covid_data', con=engine).head()

Unnamed: 0,state,COVID_deaths,pneumonia_deaths,pneumonia_and_COVID_deaths,influenza_deaths,pneumonia_influenzaor_COVID_deaths,total_deaths
0,Alabama,153,195,29,0,309,2871
1,Alaska,0,0,0,0,0,159
2,Arizona,258,334,114,0,474,4165
3,Arkansas,26,106,0,0,142,1936
4,California,1099,1615,615,0,2101,16950


In [27]:
pd.read_sql_query('select * from john_hopkins_data', con=engine).head()

Unnamed: 0,index,state,Confirmed,Deaths,Recovered,Active,People_Tested,People_Hospitalized
0,0,Alabama,268365.0,10469.0,0.0,257896.0,3271388.0,33973.0
1,1,Alaska,11345.0,294.0,8568.0,4545.0,726773.0,261.0
2,2,American Samoa,0.0,0.0,0.0,0.0,1873.0,0.0
3,3,Arizona,296916.0,13583.0,66156.0,230418.0,3143680.0,40766.0
4,4,Arkansas,111560.0,2345.0,67785.0,51013.0,1792689.0,12537.0


In [28]:
pd.read_sql_query('select * from combine_cdc_hopkins_data', con=engine).head()

Unnamed: 0,state,Confirmed,Deaths,Recovered,Active,People_Tested,People_Hospitalized,COVID_deaths,pneumonia_deaths,pneumonia_and_COVID_deaths,influenza_deaths,pneumonia_influenzaor_COVID_deaths,total_deaths
0,Alabama,268365.0,10469.0,0.0,257896.0,3271388.0,33973.0,190,210,48,0,350,2990
1,Alaska,11345.0,294.0,8568.0,4545.0,726773.0,261.0,0,0,0,0,0,159
2,Arizona,296916.0,13583.0,66156.0,230418.0,3143680.0,40766.0,258,334,114,0,474,4165
3,Arkansas,111560.0,2345.0,67785.0,51013.0,1792689.0,12537.0,43,122,0,0,159,2037
4,California,1833427.0,73742.0,0.0,1759685.0,25160433.0,39006.0,1179,1680,661,0,2200,17288


In [29]:
pd.read_sql_query('select * from nytimes_nursing_home_data', con=engine).head()

Unnamed: 0,index,Nursing_home_Name,city,state,zip,Covid-19_cases,Nursing_home_deaths
0,0.0,Crowne Health Care nursing home,Mobile,Alabama,36605,98,13
1,1.0,Arbor Springs Health and Rehab Center,Opelika,Alabama,36801,95,15
2,2.0,Sapphire Nursing and Rehab,Tucson,Arizona,85714,94,0
3,3.0,Briarwood Nursing and Rehabilitation,Little Rock,Arkansas,72205,60,5
4,4.0,Walnut Ridge Nursing and Rehab Center,Walnut Ridge,Arkansas,72476,79,4
