# Target Manipulation:  COVID-19 County Data, Confirmed Cases/Mortality

## Exploration & Cleaning

In [2]:
import datetime as dt
import re
import sqlite3

import numpy as np
import pandas as pd
import plotly.express as px
import requests
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn_pandas import DataFrameMapper

In [3]:
# Connection to the SQLite database file that the to_sql / read_sql calls in this notebook use.
conn=sqlite3.connect('COVID19_county_data.db')
# NOTE(review): in the visible cells `cursor` is only referenced by a commented-out line in query_data.
cursor= conn.cursor()

In [4]:
#Create function to query SQL data
def query_data(sql_statement, connection=None, params=None):
    """Run a SQL query and return the result rows as a list of dicts.

    Parameters
    ----------
    sql_statement : str
        SQL text to execute.
    connection : DB-API connection, optional
        Connection to run the query on. When omitted, the notebook-level
        ``conn`` is used, so existing ``query_data(sql)`` calls are unchanged.
    params : sequence or mapping, optional
        Values bound to the placeholders in ``sql_statement``; forwarded to
        ``pandas.read_sql``. Use this instead of formatting values into the
        SQL string.

    Returns
    -------
    list of dict
        One ``{column_name: value}`` dict per result row, in query order.
    """
    if connection is None:
        # Looked up at call time so the function follows the current notebook connection.
        connection = conn
    result = pd.read_sql(sql_statement, connection, params=params)
    return result.to_dict('records')

In [5]:
#Function to get COVID-19 confirmed cases & mortality data for US
def get_data(data_set='confirmed'):
    """Download one of the JHU CSSE US county-level COVID-19 time series.

    Parameters
    ----------
    data_set : {'confirmed', 'mortality'}
        Which time series to fetch.

    Returns
    -------
    list
        ``[df, data_set]`` where ``df`` is the CSV loaded with ``pd.read_csv``
        and ``data_set`` is the label that was requested.

    Raises
    ------
    ValueError
        If ``data_set`` is not one of the supported names. Previously this
        case fell through and returned ``None``, which only surfaced later as
        an unrelated ``TypeError`` at the caller's ``[0]``.
    """
    base_url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/"
    file_names = {
        'confirmed': "time_series_covid19_confirmed_US.csv",
        'mortality': "time_series_covid19_deaths_US.csv",
    }
    if data_set not in file_names:
        raise ValueError(
            f"Unknown data_set {data_set!r}; expected one of {sorted(file_names)}"
        )
    df = pd.read_csv(base_url + file_names[data_set])
    return [df, data_set]

In [6]:
#Inspect and remove NaN values from confirmed cases data

In [7]:
confirmed_df=get_data('confirmed')[0]

In [8]:
confirmed_df.head()

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,5/14/20,5/15/20,5/16/20,5/17/20,5/18/20,5/19/20,5/20/20,5/21/20,5/22/20,5/23/20
0,16,AS,ASM,16,60.0,,American Samoa,US,-14.271,-170.132,...,0,0,0,0,0,0,0,0,0,0.0
1,316,GU,GUM,316,66.0,,Guam,US,13.4443,144.7937,...,152,154,154,154,154,154,154,165,165,165.0
2,580,MP,MNP,580,69.0,,Northern Mariana Islands,US,15.0979,145.6739,...,19,19,21,21,21,21,21,22,22,22.0
3,630,PR,PRI,630,72.0,,Puerto Rico,US,18.2208,-66.5901,...,2427,2542,2589,2646,2710,2805,2866,2913,3030,3100.0
4,850,VI,VIR,850,78.0,,Virgin Islands,US,18.3358,-64.8963,...,69,69,69,69,69,69,69,69,69,69.0


In [9]:
confirmed_df.isna().sum().sort_values(ascending=False)[:5]

FIPS       10
Admin2      7
5/23/20     0
2/22/20     0
2/28/20     0
dtype: int64

In [10]:
confirmed_df[confirmed_df['FIPS'].isna()]

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,5/14/20,5/15/20,5/16/20,5/17/20,5/18/20,5/19/20,5/20/20,5/21/20,5/22/20,5/23/20
3251,84070002,US,USA,840,,Dukes and Nantucket,Massachusetts,US,41.406747,-70.687635,...,36,36,37,38,38,38,38,39,39,39.0
3252,84070003,US,USA,840,,Kansas City,Missouri,US,39.0997,-94.5786,...,838,848,863,895,901,902,924,927,967,991.0
3253,84070004,US,USA,840,,Michigan Department of Corrections (MDOC),Michigan,US,0.0,0.0,...,2171,2227,2227,2538,3051,3122,3195,3257,3275,3289.0
3254,84070005,US,USA,840,,Federal Correctional Institution (FCI),Michigan,US,0.0,0.0,...,115,115,116,117,118,118,122,125,125,129.0
3255,84070015,US,USA,840,,Bear River,Utah,US,41.521068,-113.083282,...,83,84,87,88,90,89,93,97,98,102.0
3256,84070016,US,USA,840,,Central Utah,Utah,US,39.372319,-111.575868,...,29,29,29,30,30,30,31,32,34,33.0
3257,84070017,US,USA,840,,Southeast Utah,Utah,US,38.996171,-110.701396,...,13,13,14,14,15,16,16,17,18,18.0
3258,84070018,US,USA,840,,Southwest Utah,Utah,US,37.854472,-111.441876,...,187,200,209,219,236,245,260,271,281,294.0
3259,84070019,US,USA,840,,TriCounty,Utah,US,40.124915,-109.517442,...,16,16,19,19,22,19,20,20,21,20.0
3260,84070020,US,USA,840,,Weber-Morgan,Utah,US,41.27116,-111.914512,...,214,216,221,226,234,234,242,251,252,256.0


In [11]:
confirmed_df[confirmed_df['Admin2'].isna()]

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,5/14/20,5/15/20,5/16/20,5/17/20,5/18/20,5/19/20,5/20/20,5/21/20,5/22/20,5/23/20
0,16,AS,ASM,16,60.0,,American Samoa,US,-14.271,-170.132,...,0,0,0,0,0,0,0,0,0,0.0
1,316,GU,GUM,316,66.0,,Guam,US,13.4443,144.7937,...,152,154,154,154,154,154,154,165,165,165.0
2,580,MP,MNP,580,69.0,,Northern Mariana Islands,US,15.0979,145.6739,...,19,19,21,21,21,21,21,22,22,22.0
3,630,PR,PRI,630,72.0,,Puerto Rico,US,18.2208,-66.5901,...,2427,2542,2589,2646,2710,2805,2866,2913,3030,3100.0
4,850,VI,VIR,850,78.0,,Virgin Islands,US,18.3358,-64.8963,...,69,69,69,69,69,69,69,69,69,69.0
3198,84088888,US,USA,840,88888.0,,Diamond Princess,US,0.0,0.0,...,49,49,49,49,49,49,49,49,49,49.0
3250,84099999,US,USA,840,99999.0,,Grand Princess,US,0.0,0.0,...,103,103,103,103,103,103,103,103,103,103.0


In [12]:
# Drop every row that has a NaN in any column. In the NaN summary above only FIPS and Admin2
# contain NaNs, so this removes the rows listed by the two previous cells.
# The result is rebound to the same name, so the unfiltered frame is no longer available afterwards.
confirmed_df=confirmed_df.dropna()

In [13]:
confirmed_df.isna().sum().sort_values(ascending=False)[:5]

5/23/20    0
2/29/20    0
2/15/20    0
2/16/20    0
2/17/20    0
dtype: int64

In [14]:
#Inspect and remove NaN values from mortality data

In [15]:
mortality_df=get_data('mortality')[0]

In [16]:
mortality_df.head()

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,5/14/20,5/15/20,5/16/20,5/17/20,5/18/20,5/19/20,5/20/20,5/21/20,5/22/20,5/23/20
0,16,AS,ASM,16,60.0,,American Samoa,US,-14.271,-170.132,...,0,0,0,0,0,0,0,0,0,0.0
1,316,GU,GUM,316,66.0,,Guam,US,13.4443,144.7937,...,5,5,5,5,5,5,5,5,5,5.0
2,580,MP,MNP,580,69.0,,Northern Mariana Islands,US,15.0979,145.6739,...,2,2,2,2,2,2,2,2,2,2.0
3,630,PR,PRI,630,72.0,,Puerto Rico,US,18.2208,-66.5901,...,117,122,122,123,124,124,125,126,126,127.0
4,850,VI,VIR,850,78.0,,Virgin Islands,US,18.3358,-64.8963,...,6,6,6,6,6,6,6,6,6,6.0


In [17]:
mortality_df.isna().sum().sort_values(ascending=False)[:5]

FIPS       10
Admin2      7
5/23/20     0
2/21/20     0
2/27/20     0
dtype: int64

In [18]:
# Drop every row that has a NaN in any column. The NaN summary above reports NaNs only in
# FIPS (10 rows) and Admin2 (7 rows).
# The result is rebound to the same name, so the unfiltered frame is no longer available afterwards.
mortality_df=mortality_df.dropna()

In [19]:
#Function to primarily reorganize date columns into one column while preserving/renaming other categorical columns
def reorg_date_df(df):
    """Reshape a wide per-county time series into long format.

    The input has one row per county and one column per date (labels such as
    ``'1/22/20'``). The output has one row per county per date, ordered county
    by county and, within a county, in the input's date-column order.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Admin2``, ``Province_State``, ``Combined_Key`` and
        ``FIPS`` (without NaN, since it is cast to int), plus the date columns.
        ``Population`` is carried over when present.

    Returns
    -------
    pandas.DataFrame
        Columns ``Date`` (the original column label, still a string),
        ``Count``, ``County``, ``State``, ``County, State``, ``FIPS`` (int)
        and, if available in the input, ``Population``.
    """
    # Recognise date columns by their M/D/YY shape. The previous '"/20" in col'
    # test only worked for 2020 labels: '1/1/21' was silently dropped while
    # '1/20/21' was kept.
    date_pattern = re.compile(r'^\d{1,2}/\d{1,2}/\d{2}$')
    date_cols = [col for col in df.columns if date_pattern.match(str(col))]
    n_dates = len(date_cols)

    def per_date(column):
        # Repeat each county-level value once per date column.
        return np.repeat(df[column].values, n_dates)

    long_columns = {
        # Date labels cycle through the date columns once per county row.
        'Date': np.tile(np.array(date_cols, dtype=object), len(df)),
        # Row-major flattening gives county 1's dates, then county 2's, ...
        'Count': df[date_cols].values.ravel(),
        'County': per_date('Admin2'),
        'State': per_date('Province_State'),
        'County, State': per_date('Combined_Key'),
        'FIPS': np.repeat(df['FIPS'].astype(int).values, n_dates),
    }
    if 'Population' in df.columns:
        long_columns['Population'] = per_date('Population')

    return pd.DataFrame(long_columns)

In [20]:
clean_confirmed_df=reorg_date_df(confirmed_df)

In [21]:
clean_confirmed_df.head()

Unnamed: 0,Count,County,"County, State",Date,FIPS,State
0,0.0,Autauga,"Autauga, Alabama, US",1/22/20,1001,Alabama
1,0.0,Autauga,"Autauga, Alabama, US",1/23/20,1001,Alabama
2,0.0,Autauga,"Autauga, Alabama, US",1/24/20,1001,Alabama
3,0.0,Autauga,"Autauga, Alabama, US",1/25/20,1001,Alabama
4,0.0,Autauga,"Autauga, Alabama, US",1/26/20,1001,Alabama


In [22]:
# Parse the M/D/YY date labels in a single vectorised call instead of calling pd.to_datetime once per row.
clean_confirmed_df['Date']=pd.to_datetime(clean_confirmed_df['Date'], format='%m/%d/%y')

In [23]:
clean_confirmed_df.head()

Unnamed: 0,Count,County,"County, State",Date,FIPS,State
0,0.0,Autauga,"Autauga, Alabama, US",2020-01-22,1001,Alabama
1,0.0,Autauga,"Autauga, Alabama, US",2020-01-23,1001,Alabama
2,0.0,Autauga,"Autauga, Alabama, US",2020-01-24,1001,Alabama
3,0.0,Autauga,"Autauga, Alabama, US",2020-01-25,1001,Alabama
4,0.0,Autauga,"Autauga, Alabama, US",2020-01-26,1001,Alabama


In [24]:
clean_confirmed_df.tail()

Unnamed: 0,Count,County,"County, State",Date,FIPS,State
399007,0.0,Unassigned,"Unassigned, Wyoming, US",2020-05-19,90056,Wyoming
399008,0.0,Unassigned,"Unassigned, Wyoming, US",2020-05-20,90056,Wyoming
399009,0.0,Unassigned,"Unassigned, Wyoming, US",2020-05-21,90056,Wyoming
399010,0.0,Unassigned,"Unassigned, Wyoming, US",2020-05-22,90056,Wyoming
399011,0.0,Unassigned,"Unassigned, Wyoming, US",2020-05-23,90056,Wyoming


In [37]:
len(clean_confirmed_df[clean_confirmed_df['County']=='Unassigned'])

6273

In [44]:
#Preview the frame without 'Unassigned' counties (display only; the filtered result is not assigned back)
clean_confirmed_df[~(clean_confirmed_df['County'].isin(['Unassigned']))].tail()

Unnamed: 0,Count,County,"County, State",Date,FIPS,State
392734,0.0,Out of WY,"Out of WY, Wyoming, US",2020-05-19,80056,Wyoming
392735,0.0,Out of WY,"Out of WY, Wyoming, US",2020-05-20,80056,Wyoming
392736,0.0,Out of WY,"Out of WY, Wyoming, US",2020-05-21,80056,Wyoming
392737,0.0,Out of WY,"Out of WY, Wyoming, US",2020-05-22,80056,Wyoming
392738,0.0,Out of WY,"Out of WY, Wyoming, US",2020-05-23,80056,Wyoming


In [47]:
'Out of' in 'Out of WY'

True

In [52]:
def remove_out_counties(df=None):
    """Drop rows that do not belong to a real county.

    A row is removed when its ``County`` value contains the text ``'Out of'``
    (e.g. ``'Out of WY'``) or equals ``'Unassigned'``.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame with a ``County`` column. When omitted, the notebook-level
        ``clean_confirmed_df`` is used, looked up at call time. The previous
        default captured whatever frame existed when the function was defined,
        which goes stale as soon as that name is reassigned.

    Returns
    -------
    pandas.DataFrame
        A new frame with the remaining rows and a fresh 0..n-1 index. The
        input is not modified, and the columns are kept even if no rows remain.
    """
    if df is None:
        df = clean_confirmed_df
    county = df['County']
    # Plain substring match (not a regex); missing county names are treated as "not out of state".
    is_out_of_state = county.str.contains('Out of', regex=False, na=False)
    is_unassigned = county == 'Unassigned'
    return df[~(is_out_of_state | is_unassigned)].reset_index(drop=True)

In [56]:
clean_confirmed_df=remove_out_counties(df=clean_confirmed_df)

In [57]:
clean_confirmed_df.tail()

Unnamed: 0,Count,County,"County, State",Date,FIPS,State
386461,0.0,Weston,"Weston, Wyoming, US",2020-05-19,56045,Wyoming
386462,0.0,Weston,"Weston, Wyoming, US",2020-05-20,56045,Wyoming
386463,0.0,Weston,"Weston, Wyoming, US",2020-05-21,56045,Wyoming
386464,0.0,Weston,"Weston, Wyoming, US",2020-05-22,56045,Wyoming
386465,0.0,Weston,"Weston, Wyoming, US",2020-05-23,56045,Wyoming


In [62]:
clean_confirmed_df.dtypes

Count                   float64
County                   object
County, State            object
Date             datetime64[ns]
FIPS                      int64
State                    object
dtype: object

In [58]:
clean_confirmed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 386466 entries, 0 to 386465
Data columns (total 6 columns):
Count            386466 non-null float64
County           386466 non-null object
County, State    386466 non-null object
Date             386466 non-null datetime64[ns]
FIPS             386466 non-null int64
State            386466 non-null object
dtypes: datetime64[ns](1), float64(1), int64(1), object(3)
memory usage: 17.7+ MB


In [59]:
clean_mortality_df=reorg_date_df(mortality_df)

In [60]:
# Parse the M/D/YY date labels in a single vectorised call instead of calling pd.to_datetime once per row.
clean_mortality_df['Date']=pd.to_datetime(clean_mortality_df['Date'], format='%m/%d/%y')

In [30]:
clean_mortality_df.head()

Unnamed: 0,Count,County,"County, State",Date,FIPS,Population,State
0,0.0,Autauga,"Autauga, Alabama, US",2020-01-22,1001,55869,Alabama
1,0.0,Autauga,"Autauga, Alabama, US",2020-01-23,1001,55869,Alabama
2,0.0,Autauga,"Autauga, Alabama, US",2020-01-24,1001,55869,Alabama
3,0.0,Autauga,"Autauga, Alabama, US",2020-01-25,1001,55869,Alabama
4,0.0,Autauga,"Autauga, Alabama, US",2020-01-26,1001,55869,Alabama


In [31]:
clean_mortality_df.tail()

Unnamed: 0,Count,County,"County, State",Date,FIPS,Population,State
399007,9.0,Unassigned,"Unassigned, Wyoming, US",2020-05-19,90056,0,Wyoming
399008,10.0,Unassigned,"Unassigned, Wyoming, US",2020-05-20,90056,0,Wyoming
399009,11.0,Unassigned,"Unassigned, Wyoming, US",2020-05-21,90056,0,Wyoming
399010,11.0,Unassigned,"Unassigned, Wyoming, US",2020-05-22,90056,0,Wyoming
399011,11.0,Unassigned,"Unassigned, Wyoming, US",2020-05-23,90056,0,Wyoming


In [63]:
clean_mortality_df=remove_out_counties(df=clean_mortality_df)

In [64]:
clean_mortality_df.tail()

Unnamed: 0,Count,County,"County, State",Date,FIPS,Population,State
386461,0.0,Weston,"Weston, Wyoming, US",2020-05-19,56045,6927,Wyoming
386462,0.0,Weston,"Weston, Wyoming, US",2020-05-20,56045,6927,Wyoming
386463,0.0,Weston,"Weston, Wyoming, US",2020-05-21,56045,6927,Wyoming
386464,0.0,Weston,"Weston, Wyoming, US",2020-05-22,56045,6927,Wyoming
386465,0.0,Weston,"Weston, Wyoming, US",2020-05-23,56045,6927,Wyoming


In [66]:
clean_mortality_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 386466 entries, 0 to 386465
Data columns (total 7 columns):
Count            386466 non-null float64
County           386466 non-null object
County, State    386466 non-null object
Date             386466 non-null datetime64[ns]
FIPS             386466 non-null int64
Population       386466 non-null int64
State            386466 non-null object
dtypes: datetime64[ns](1), float64(1), int64(2), object(3)
memory usage: 20.6+ MB


In [67]:
#Function to add a log10 column for plotting; rows whose value is 0 get 0 instead of log10(0) = -inf
def log_val_df(df, transform_col='Count'):
    """Add a ``<transform_col>_log`` column holding log10 of ``transform_col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place (the new column is added to
        it) and also returned.
    transform_col : str
        Name of the numeric column to transform.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the extra float column. Rows where the
        source value is 0 get 0.0; every other row gets ``np.log10(value)``.
    """
    vals = np.asarray(df[transform_col], dtype=float)
    # Start from zeros so the column is always float, even when every value is 0.
    log_counts = np.zeros(vals.shape, dtype=float)
    nonzero = vals != 0
    # Only take the log where it is defined for display purposes; zeros keep the 0.0 placeholder.
    log_counts[nonzero] = np.log10(vals[nonzero])
    df[f'{transform_col}_log'] = log_counts
    return df

In [68]:
clean_confirmed_df=log_val_df(clean_confirmed_df, transform_col='Count')

In [69]:
clean_confirmed_df.head() 

Unnamed: 0,Count,County,"County, State",Date,FIPS,State,Count_log
0,0.0,Autauga,"Autauga, Alabama, US",2020-01-22,1001,Alabama,0.0
1,0.0,Autauga,"Autauga, Alabama, US",2020-01-23,1001,Alabama,0.0
2,0.0,Autauga,"Autauga, Alabama, US",2020-01-24,1001,Alabama,0.0
3,0.0,Autauga,"Autauga, Alabama, US",2020-01-25,1001,Alabama,0.0
4,0.0,Autauga,"Autauga, Alabama, US",2020-01-26,1001,Alabama,0.0


In [70]:
FIPS_df=clean_confirmed_df[['FIPS', 'State', 'County']].drop_duplicates()
FIPS_df.head()

Unnamed: 0,FIPS,State,County
0,1001,Alabama,Autauga
123,1003,Alabama,Baldwin
246,1005,Alabama,Barbour
369,1007,Alabama,Bibb
492,1009,Alabama,Blount


In [71]:
FIPS_records=FIPS_df.to_dict('records')

In [72]:
# Build "County, State" labels. Each field needs its own placeholder: formatting the pair inside one
# placeholder renders the tuple repr, e.g. "('Autauga', 'Alabama')".
fips_county_state= [f"{record['County']}, {record['State']}" for record in FIPS_records]

In [73]:
FIPS_df['County_State']=fips_county_state

In [74]:
FIPS_df.to_sql('fips_codes', conn, index_label='id', if_exists='replace')

In [75]:
clean_mortality_df=log_val_df(clean_mortality_df, transform_col='Count')

In [80]:
clean_mortality_df.head()

Unnamed: 0,Count,County,"County, State",Date,FIPS,Population,State,Count_log
0,0.0,Autauga,"Autauga, Alabama, US",2020-01-22,1001,55869,Alabama,0.0
1,0.0,Autauga,"Autauga, Alabama, US",2020-01-23,1001,55869,Alabama,0.0
2,0.0,Autauga,"Autauga, Alabama, US",2020-01-24,1001,55869,Alabama,0.0
3,0.0,Autauga,"Autauga, Alabama, US",2020-01-25,1001,55869,Alabama,0.0
4,0.0,Autauga,"Autauga, Alabama, US",2020-01-26,1001,55869,Alabama,0.0


In [76]:
clean_confirmed_df.to_sql('confirmed_covid', conn, index_label='id', if_exists='replace')


The spaces in these column names will not be changed. In pandas versions < 0.14, spaces were converted to underscores.



In [77]:
clean_mortality_df.to_sql('mortality_covid', conn, index_label='id', if_exists='replace')

In [121]:
# Inner-join the stored confirmed and mortality tables on FIPS and Date so that each row carries both
# counts (raw and log10) plus Population, which is taken from the mortality table.
# The Date values come back from SQLite as text such as '2020-05-19 00:00:00' (see the output below).
confirmed_mortality_df=pd.DataFrame(query_data('''SELECT confirmed_covid.State, confirmed_covid.County, confirmed_covid."County, State", confirmed_covid.FIPS, 
                                    confirmed_covid.Date,  confirmed_covid.Count as Confirmed_Count , confirmed_covid.Count_log as Confirmed_Count_Log,
                                    mortality_covid.Count as Mortality_Count, mortality_covid.Count_log as Mortality_Count_Log, mortality_covid.Population
                                    FROM confirmed_covid
                                    JOIN mortality_covid ON mortality_covid.FIPS= confirmed_covid.FIPS AND mortality_covid.Date= confirmed_covid.Date
                                    '''))

In [122]:
confirmed_mortality_df[confirmed_mortality_df.County=='New York'].tail()

Unnamed: 0,Confirmed_Count,Confirmed_Count_Log,County,"County, State",Date,FIPS,Mortality_Count,Mortality_Count_Log,Population,State
228652,193821.0,5.287401,New York,"New York City, New York, US",2020-05-19 00:00:00,36061,20887.0,4.319876,8336817,New York
228653,194550.0,5.289031,New York,"New York City, New York, US",2020-05-20 00:00:00,36061,20934.0,4.320852,8336817,New York
228654,195675.0,5.291535,New York,"New York City, New York, US",2020-05-21 00:00:00,36061,21003.0,4.322281,8336817,New York
228655,196484.0,5.293327,New York,"New York City, New York, US",2020-05-22 00:00:00,36061,21086.0,4.323994,8336817,New York
228656,197266.0,5.295052,New York,"New York City, New York, US",2020-05-23 00:00:00,36061,21138.0,4.325064,8336817,New York


In [101]:
# NOTE(review): this cell's execution count (101) is lower than that of the join cell above (121), and
# the frames displayed after it have no Mortality_Rate column, so it appears to have been run before the
# frame was last rebuilt. Plain division also yields NaN where both counts are 0, whereas
# percent_mortality (defined further down) returns 0 for rows with no confirmed cases.
confirmed_mortality_df['Mortality_Rate']= confirmed_mortality_df['Mortality_Count']/confirmed_mortality_df['Confirmed_Count']

In [123]:
confirmed_mortality_df[confirmed_mortality_df.County=='New York'].tail()

Unnamed: 0,Confirmed_Count,Confirmed_Count_Log,County,"County, State",Date,FIPS,Mortality_Count,Mortality_Count_Log,Population,State
228652,193821.0,5.287401,New York,"New York City, New York, US",2020-05-19 00:00:00,36061,20887.0,4.319876,8336817,New York
228653,194550.0,5.289031,New York,"New York City, New York, US",2020-05-20 00:00:00,36061,20934.0,4.320852,8336817,New York
228654,195675.0,5.291535,New York,"New York City, New York, US",2020-05-21 00:00:00,36061,21003.0,4.322281,8336817,New York
228655,196484.0,5.293327,New York,"New York City, New York, US",2020-05-22 00:00:00,36061,21086.0,4.323994,8336817,New York
228656,197266.0,5.295052,New York,"New York City, New York, US",2020-05-23 00:00:00,36061,21138.0,4.325064,8336817,New York


In [124]:
confirmed_mortality_df[confirmed_mortality_df.County=='New York'].head()

Unnamed: 0,Confirmed_Count,Confirmed_Count_Log,County,"County, State",Date,FIPS,Mortality_Count,Mortality_Count_Log,Population,State
228534,0.0,0.0,New York,"New York City, New York, US",2020-01-22 00:00:00,36061,0.0,0.0,8336817,New York
228535,0.0,0.0,New York,"New York City, New York, US",2020-01-23 00:00:00,36061,0.0,0.0,8336817,New York
228536,0.0,0.0,New York,"New York City, New York, US",2020-01-24 00:00:00,36061,0.0,0.0,8336817,New York
228537,0.0,0.0,New York,"New York City, New York, US",2020-01-25 00:00:00,36061,0.0,0.0,8336817,New York
228538,0.0,0.0,New York,"New York City, New York, US",2020-01-26 00:00:00,36061,0.0,0.0,8336817,New York


In [125]:
def percent_mortality(df, mortality_col='Mortality_Count',confirmed_col='Confirmed_Count'):
    """Add a ``Percent_Mortality`` column: deaths as a percentage of confirmed cases.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place (the new column is added to
        it) and also returned.
    mortality_col : str
        Name of the column holding death counts.
    confirmed_col : str
        Name of the column holding confirmed-case counts.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the extra float column. Rows with 0
        confirmed cases get 0.0 rather than a division-by-zero result.
    """
    # Work on positional arrays: looking rows up with series[i] is label-based and
    # fails (or reads the wrong row) when the frame's index is not 0..n-1, e.g.
    # after filtering.
    mortality = np.asarray(df[mortality_col], dtype=float)
    confirmed = np.asarray(df[confirmed_col], dtype=float)
    percent = np.zeros(confirmed.shape, dtype=float)
    has_cases = confirmed != 0
    percent[has_cases] = (mortality[has_cases] / confirmed[has_cases]) * 100
    df['Percent_Mortality'] = percent
    return df

In [126]:
confirmed_mortality_df=percent_mortality(confirmed_mortality_df, mortality_col='Mortality_Count',confirmed_col='Confirmed_Count')

In [129]:
confirmed_mortality_df.head()

Unnamed: 0,Confirmed_Count,Confirmed_Count_Log,County,"County, State",Date,FIPS,Mortality_Count,Mortality_Count_Log,Population,State,Percent_Mortality
0,0.0,0.0,Autauga,"Autauga, Alabama, US",2020-01-22 00:00:00,1001,0.0,0.0,55869,Alabama,0.0
1,0.0,0.0,Autauga,"Autauga, Alabama, US",2020-01-23 00:00:00,1001,0.0,0.0,55869,Alabama,0.0
2,0.0,0.0,Autauga,"Autauga, Alabama, US",2020-01-24 00:00:00,1001,0.0,0.0,55869,Alabama,0.0
3,0.0,0.0,Autauga,"Autauga, Alabama, US",2020-01-25 00:00:00,1001,0.0,0.0,55869,Alabama,0.0
4,0.0,0.0,Autauga,"Autauga, Alabama, US",2020-01-26 00:00:00,1001,0.0,0.0,55869,Alabama,0.0


In [130]:
confirmed_mortality_df.tail()

Unnamed: 0,Confirmed_Count,Confirmed_Count_Log,County,"County, State",Date,FIPS,Mortality_Count,Mortality_Count_Log,Population,State,Percent_Mortality
386461,0.0,0.0,Weston,"Weston, Wyoming, US",2020-05-19 00:00:00,56045,0.0,0.0,6927,Wyoming,0.0
386462,0.0,0.0,Weston,"Weston, Wyoming, US",2020-05-20 00:00:00,56045,0.0,0.0,6927,Wyoming,0.0
386463,0.0,0.0,Weston,"Weston, Wyoming, US",2020-05-21 00:00:00,56045,0.0,0.0,6927,Wyoming,0.0
386464,0.0,0.0,Weston,"Weston, Wyoming, US",2020-05-22 00:00:00,56045,0.0,0.0,6927,Wyoming,0.0
386465,0.0,0.0,Weston,"Weston, Wyoming, US",2020-05-23 00:00:00,56045,0.0,0.0,6927,Wyoming,0.0


In [132]:
confirmed_mortality_df.to_sql('confirmed_mortality_covid', conn, index_label='id', if_exists='replace')

In [134]:
new_confirmed_df=pd.DataFrame(query_data('SELECT * FROM confirmed_covid'))

In [135]:
new_confirmed_df.tail()

Unnamed: 0,Count,Count_log,County,"County, State",Date,FIPS,State,id
386461,0.0,0.0,Weston,"Weston, Wyoming, US",2020-05-19 00:00:00,56045,Wyoming,386461
386462,0.0,0.0,Weston,"Weston, Wyoming, US",2020-05-20 00:00:00,56045,Wyoming,386462
386463,0.0,0.0,Weston,"Weston, Wyoming, US",2020-05-21 00:00:00,56045,Wyoming,386463
386464,0.0,0.0,Weston,"Weston, Wyoming, US",2020-05-22 00:00:00,56045,Wyoming,386464
386465,0.0,0.0,Weston,"Weston, Wyoming, US",2020-05-23 00:00:00,56045,Wyoming,386465


In [136]:
new_confirmed_df.describe()

Unnamed: 0,Count,Count_log,FIPS,id
count,386466.0,386466.0,386466.0,386466.0
mean,133.24234,0.552613,30383.649268,193232.5
std,2079.615834,0.86618,15160.11492,111563.268903
min,0.0,0.0,1001.0,0.0
25%,0.0,0.0,18177.0,96616.25
50%,0.0,0.0,29176.0,193232.5
75%,10.0,1.0,45081.0,289848.75
max,197266.0,5.295052,56045.0,386465.0


In [137]:
new_confirmed_df['Count_log'].hist(bins=5)

<matplotlib.axes._subplots.AxesSubplot at 0x1a3bf01a90>

In [138]:
new_mortality_df=pd.DataFrame(query_data('SELECT * FROM mortality_covid'))

In [139]:
new_mortality_df.head()

Unnamed: 0,Count,Count_log,County,"County, State",Date,FIPS,Population,State,id
0,0.0,0.0,Autauga,"Autauga, Alabama, US",2020-01-22 00:00:00,1001,55869,Alabama,0
1,0.0,0.0,Autauga,"Autauga, Alabama, US",2020-01-23 00:00:00,1001,55869,Alabama,1
2,0.0,0.0,Autauga,"Autauga, Alabama, US",2020-01-24 00:00:00,1001,55869,Alabama,2
3,0.0,0.0,Autauga,"Autauga, Alabama, US",2020-01-25 00:00:00,1001,55869,Alabama,3
4,0.0,0.0,Autauga,"Autauga, Alabama, US",2020-01-26 00:00:00,1001,55869,Alabama,4


In [140]:
new_mortality_df.describe()

Unnamed: 0,Count,Count_log,FIPS,Population,id
count,386466.0,386466.0,386466.0,386466.0,386466.0
mean,7.455854,0.120906,30383.649268,106603.3,193232.5
std,200.225704,0.388726,15160.11492,363295.9,111563.268903
min,0.0,0.0,1001.0,86.0,0.0
25%,0.0,0.0,18177.0,10901.0,96616.25
50%,0.0,0.0,29176.0,25726.0,193232.5
75%,0.0,0.0,45081.0,68098.0,289848.75
max,21138.0,4.325064,56045.0,10039110.0,386465.0


Check the most recent date in the mortality data, then select weekly snapshots (every 7th date, counting backwards)

In [142]:
new_mortality_df['Date'].iloc[-1]

'2020-05-23 00:00:00'

In [143]:
# Take the Date values of a single county (Autauga), start at the 9th row from the end and step
# backwards 7 rows at a time, giving one date per week in newest-to-oldest order.
# NOTE(review): the -9 offset and the 7-row step are hard-coded; with the data shown, the newest
# selected date is 2020-05-15 although the latest date in the table is 2020-05-23.
sel_dates=list((new_mortality_df[new_mortality_df['County']=='Autauga']['Date'].iloc[-9::-7]).unique())
sel_dates

['2020-05-15 00:00:00',
 '2020-05-08 00:00:00',
 '2020-05-01 00:00:00',
 '2020-04-24 00:00:00',
 '2020-04-17 00:00:00',
 '2020-04-10 00:00:00',
 '2020-04-03 00:00:00',
 '2020-03-27 00:00:00',
 '2020-03-20 00:00:00',
 '2020-03-13 00:00:00',
 '2020-03-06 00:00:00',
 '2020-02-28 00:00:00',
 '2020-02-21 00:00:00',
 '2020-02-14 00:00:00',
 '2020-02-07 00:00:00',
 '2020-01-31 00:00:00',
 '2020-01-24 00:00:00']

In [144]:
selected_mortality_df=new_mortality_df[new_mortality_df['Date'].isin(sel_dates)]

In [145]:
selected_mortality_df.iloc[:10,:]

Unnamed: 0,Count,Count_log,County,"County, State",Date,FIPS,Population,State,id
2,0.0,0.0,Autauga,"Autauga, Alabama, US",2020-01-24 00:00:00,1001,55869,Alabama,2
9,0.0,0.0,Autauga,"Autauga, Alabama, US",2020-01-31 00:00:00,1001,55869,Alabama,9
16,0.0,0.0,Autauga,"Autauga, Alabama, US",2020-02-07 00:00:00,1001,55869,Alabama,16
23,0.0,0.0,Autauga,"Autauga, Alabama, US",2020-02-14 00:00:00,1001,55869,Alabama,23
30,0.0,0.0,Autauga,"Autauga, Alabama, US",2020-02-21 00:00:00,1001,55869,Alabama,30
37,0.0,0.0,Autauga,"Autauga, Alabama, US",2020-02-28 00:00:00,1001,55869,Alabama,37
44,0.0,0.0,Autauga,"Autauga, Alabama, US",2020-03-06 00:00:00,1001,55869,Alabama,44
51,0.0,0.0,Autauga,"Autauga, Alabama, US",2020-03-13 00:00:00,1001,55869,Alabama,51
58,0.0,0.0,Autauga,"Autauga, Alabama, US",2020-03-20 00:00:00,1001,55869,Alabama,58
65,0.0,0.0,Autauga,"Autauga, Alabama, US",2020-03-27 00:00:00,1001,55869,Alabama,65


In [158]:
# Drop the old 'id' column and write the frame's index back under that name. The column name belongs in
# index_label; `index` is only the boolean "write the index?" flag, as in the other to_sql calls here.
selected_mortality_df.drop(columns='id').to_sql('weekly_mortality_covid', conn, index_label='id', if_exists='replace')

In [147]:
len(selected_mortality_df['FIPS'].unique())

3142

In [148]:
selected_mortality_df.columns

Index(['Count', 'Count_log', 'County', 'County, State', 'Date', 'FIPS',
       'Population', 'State', 'id'],
      dtype='object')

In [150]:
selected_confirmed_mortality_df=confirmed_mortality_df[confirmed_mortality_df['Date'].isin(sel_dates)]

In [152]:
selected_confirmed_mortality_df.tail()

Unnamed: 0,Confirmed_Count,Confirmed_Count_Log,County,"County, State",Date,FIPS,Mortality_Count,Mortality_Count_Log,Population,State,Percent_Mortality
386429,0.0,0.0,Weston,"Weston, Wyoming, US",2020-04-17 00:00:00,56045,0.0,0.0,6927,Wyoming,0.0
386436,0.0,0.0,Weston,"Weston, Wyoming, US",2020-04-24 00:00:00,56045,0.0,0.0,6927,Wyoming,0.0
386443,0.0,0.0,Weston,"Weston, Wyoming, US",2020-05-01 00:00:00,56045,0.0,0.0,6927,Wyoming,0.0
386450,0.0,0.0,Weston,"Weston, Wyoming, US",2020-05-08 00:00:00,56045,0.0,0.0,6927,Wyoming,0.0
386457,0.0,0.0,Weston,"Weston, Wyoming, US",2020-05-15 00:00:00,56045,0.0,0.0,6927,Wyoming,0.0


In [156]:
# NOTE(review): unlike the other to_sql calls in this notebook, no index_label is passed here, so the
# index column is presumably written under pandas' default name rather than 'id'; confirm this is intended.
selected_confirmed_mortality_df.to_sql('weekly_confirmed_mortality_covid', conn, if_exists='replace')