In [1]:
# Dependencies
import requests
import json
import pandas as pd
from sqlalchemy import create_engine

### Extract JSON into DataFrames

In [2]:
# base url
base_url = "https://data.cdc.gov/api/views/r8kw-7aab/rows.json"

# run a request using our params dictionary
response = requests.get(base_url)

In [3]:
# convert response to json
cdc_data = response.json()

# Print the json (pretty printed)
print(json.dumps(cdc_data, indent=4, sort_keys=True))

{
    "data": [
        [
            "row-v8c6-4u8p~xnb3",
            "00000000-0000-0000-2684-A445700CEB78",
            0,
            1590071875,
            null,
            1590071875,
            null,
            "{ }",
            "2020-05-21T00:00:00",
            "2020-02-01T00:00:00",
            "2020-02-01T00:00:00",
            "By week",
            "United States",
            "Week-ending",
            "0",
            "57581",
            "0.97",
            "3711",
            "0",
            "475",
            "4186",
            null
        ],
        [
            "row-ubnx_reve_ashn",
            "00000000-0000-0000-E579-8E4D0ABB6F6F",
            0,
            1590071875,
            null,
            1590071875,
            null,
            "{ }",
            "2020-05-21T00:00:00",
            "2020-02-08T00:00:00",
            "2020-02-08T00:00:00",
            "By week",
            "United States",
            "Week-ending",
            "1",
         

In [4]:
print(cdc_data["data"][0])

['row-v8c6-4u8p~xnb3', '00000000-0000-0000-2684-A445700CEB78', 0, 1590071875, None, 1590071875, None, '{ }', '2020-05-21T00:00:00', '2020-02-01T00:00:00', '2020-02-01T00:00:00', 'By week', 'United States', 'Week-ending', '0', '57581', '0.97', '3711', '0', '475', '4186', None]


### Transform CDC Data into DataFrame

In [5]:
# Creating an empty Dataframe with column names only
cdc_data_df = pd.DataFrame(columns=['state','start_week', 'COVID_deaths', 'pneumonia_deaths','pneumonia_and_COVID_deaths',
                                    'influenza_deaths', 'pneumonia_influenzaor_COVID_deaths', 'total_deaths'])

In [6]:
# use iterrows to iterate through pandas dataframe
index = 0
for row in cdc_data["data"]:
    row = row
    try:
        cdc_data_df.loc[index, 'start_week'] = row[9]
        cdc_data_df.loc[index, 'state'] = row[12]
        cdc_data_df.loc[index, 'COVID_deaths'] = row[14]
        cdc_data_df.loc[index, 'pneumonia_deaths'] = row[17]
        cdc_data_df.loc[index, 'pneumonia_and_COVID_deaths'] =row[18]
        cdc_data_df.loc[index, 'influenza_deaths'] = row[19]
        cdc_data_df.loc[index, 'pneumonia_influenzaor_COVID_deaths'] = row[20]
        cdc_data_df.loc[index, 'total_deaths'] = row[15]
        
    except (KeyError, IndexError):
        print("Missing field/result... skipping.")
    index = index +1

In [7]:
cdc_data_df['start_week'] = pd.to_datetime(cdc_data_df['start_week']).dt.strftime('%m/%d/%Y')

In [8]:
# Set index
#cdc_data_df.set_index("state", inplace=True)
cdc_data_df = cdc_data_df.fillna(0)
cdc_data_df

Unnamed: 0,state,start_week,COVID_deaths,pneumonia_deaths,pneumonia_and_COVID_deaths,influenza_deaths,pneumonia_influenzaor_COVID_deaths,total_deaths
0,United States,02/01/2020,0,3711,0,475,4186,57581
1,United States,02/08/2020,1,3714,0,507,4222,58243
2,United States,02/15/2020,0,3746,0,540,4286,57579
3,United States,02/22/2020,2,3610,0,552,4164,57634
4,United States,02/29/2020,6,3727,4,630,4359,57952
...,...,...,...,...,...,...,...,...
859,Puerto Rico,04/18/2020,18,56,0,0,68,439
860,Puerto Rico,04/25/2020,17,53,0,0,65,370
861,Puerto Rico,05/02/2020,13,30,0,0,41,222
862,Puerto Rico,05/09/2020,0,15,0,0,16,90


### Cleaning the CDC Data to megre with John Hopkins Data

In [9]:
# deleting row 0 to 16 as they are for united states as a whole
cdc_data_df = cdc_data_df[16:]
cdc_data_df

Unnamed: 0,state,start_week,COVID_deaths,pneumonia_deaths,pneumonia_and_COVID_deaths,influenza_deaths,pneumonia_influenzaor_COVID_deaths,total_deaths
16,Alabama,02/01/2020,0,55,0,14,69,1019
17,Alabama,02/08/2020,0,61,0,10,71,1107
18,Alabama,02/15/2020,0,75,0,0,81,1079
19,Alabama,02/22/2020,0,67,0,0,71,1080
20,Alabama,02/29/2020,0,63,0,14,77,1143
...,...,...,...,...,...,...,...,...
859,Puerto Rico,04/18/2020,18,56,0,0,68,439
860,Puerto Rico,04/25/2020,17,53,0,0,65,370
861,Puerto Rico,05/02/2020,13,30,0,0,41,222
862,Puerto Rico,05/09/2020,0,15,0,0,16,90


In [10]:
cdc_data_df = cdc_data_df.astype({'COVID_deaths': 'int64'})
cdc_data_df = cdc_data_df.astype({'pneumonia_deaths':'int64'})
cdc_data_df = cdc_data_df.astype({'pneumonia_and_COVID_deaths': 'int64'})
cdc_data_df = cdc_data_df.astype({'influenza_deaths':'int64'})
cdc_data_df = cdc_data_df.astype({'pneumonia_influenzaor_COVID_deaths': 'int64'})
cdc_data_df = cdc_data_df.astype({'total_deaths':'int64'})

In [11]:
cdc_data_df.dtypes

state                                 object
start_week                            object
COVID_deaths                           int64
pneumonia_deaths                       int64
pneumonia_and_COVID_deaths             int64
influenza_deaths                       int64
pneumonia_influenzaor_COVID_deaths     int64
total_deaths                           int64
dtype: object

In [12]:
#selecting only data from 04/25/2020 to 05/20/2020
cdc_may_data = cdc_data_df.loc[cdc_data_df["start_week"] >= "04/25/2020"]
cdc_may_data

Unnamed: 0,state,start_week,COVID_deaths,pneumonia_deaths,pneumonia_and_COVID_deaths,influenza_deaths,pneumonia_influenzaor_COVID_deaths,total_deaths
28,Alabama,04/25/2020,61,67,16,0,113,984
29,Alabama,05/02/2020,50,67,13,0,105,923
30,Alabama,05/09/2020,31,50,0,0,72,682
31,Alabama,05/16/2020,11,11,0,0,19,282
44,Alaska,04/25/2020,0,0,0,0,0,48
...,...,...,...,...,...,...,...,...
847,Wyoming,05/16/2020,0,0,0,0,0,35
860,Puerto Rico,04/25/2020,17,53,0,0,65,370
861,Puerto Rico,05/02/2020,13,30,0,0,41,222
862,Puerto Rico,05/09/2020,0,15,0,0,16,90


In [17]:
cdc_may_group_data = cdc_may_data.groupby(['state']).sum()
cdc_may_group_data

Unnamed: 0_level_0,COVID_deaths,pneumonia_deaths,pneumonia_and_COVID_deaths,influenza_deaths,pneumonia_influenzaor_COVID_deaths,total_deaths
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Alabama,153,195,29,0,309,2871
Alaska,0,0,0,0,0,159
Arizona,258,334,114,0,474,4165
Arkansas,26,106,0,0,142,1936
California,1099,1615,615,0,2101,16950
Colorado,464,363,219,0,599,3071
Connecticut,45,0,0,0,45,45
Delaware,120,54,39,0,137,479
District of Columbia,100,114,100,0,114,356
Florida,723,1134,353,0,1507,14111


In [18]:
# Save Data to csv
cdc_may_group_data.to_csv("cdc_may_group_data.csv")

# Visualize to confirm cdc data appears
cdc_may_group_data.head(20)

Unnamed: 0_level_0,COVID_deaths,pneumonia_deaths,pneumonia_and_COVID_deaths,influenza_deaths,pneumonia_influenzaor_COVID_deaths,total_deaths
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Alabama,153,195,29,0,309,2871
Alaska,0,0,0,0,0,159
Arizona,258,334,114,0,474,4165
Arkansas,26,106,0,0,142,1936
California,1099,1615,615,0,2101,16950
Colorado,464,363,219,0,599,3071
Connecticut,45,0,0,0,45,45
Delaware,120,54,39,0,137,479
District of Columbia,100,114,100,0,114,356
Florida,723,1134,353,0,1507,14111


### Create database connection

In [14]:
connection_string = "postgres:postgres@localhost:5432/cdc_may_data_db"
engine = create_engine(f'postgresql://{connection_string}')

In [15]:
# Confirm tables
engine.table_names()

['cdc_covid_data']

### Load DataFrames into database

In [16]:
cdc_may_group_data.to_sql(name='cdc_covid_data', con=engine, if_exists='append', index=True)