## Feature Exploration & Cleaning Kinsa Fever Data, US Counties

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import requests
import sqlite3
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn_pandas import DataFrameMapper

In [2]:
conn=sqlite3.connect('COVID19_county_data.db')
cursor= conn.cursor()

In [3]:
#Create function to query SQL data
def query_data(sql_statement):
    df=pd.read_sql(sql_statement, conn)
    #cursor.execute(sql_statement)
    return df.to_dict('records')

In [4]:
#Apply for access to kinsa api here to receive url instructions: https://content.kinsahealth.com/us-health-weather-map-public-api-application
def get_kinsa_data(param='US'):
    #url=f'https://{THE_URL}.com/{param}_data.json' #uncomment and replace "THE_URL" with actual kinsa api url
    response=requests.get(url)
    if response.status_code == 200:
        r_list=response.json()
        column_names=r_list['columns']
        data=r_list['data']
        df=pd.DataFrame(data, columns=column_names)
        return df
    else:
        return None

In [5]:
df_NY=get_kinsa_data('NY')
df_NY.head()

Unnamed: 0,region_id,region_name,region_type,state,observed_ili,atypical_ili,atypical_ili_delta,anomaly_fevers,forecast_expected,forecast_lower,forecast_upper,date
0,36111,Ulster County,county,NY,7.811896,,0.0,0.0,,,,2020-02-16
1,36109,Tompkins County,county,NY,6.744609,,0.0,0.0,,,,2020-02-16
2,36107,Tioga County,county,NY,6.962238,,0.0,0.0,,,,2020-02-16
3,36105,Sullivan County,county,NY,7.168137,,0.0,0.0,,,,2020-02-16
4,36103,Suffolk County,county,NY,7.219955,,0.0,0.0,,,,2020-02-16


In [6]:
df_NY['region_name'].unique()

array(['Ulster County', 'Tompkins County', 'Tioga County',
       'Sullivan County', 'Suffolk County', 'Steuben County',
       'Seneca County', 'Schuyler County', 'Schoharie County',
       'Schenectady County', 'Saratoga County', 'St. Lawrence County',
       'Rockland County', 'Warren County', 'Washington County',
       'Yates County', 'Wyoming County', 'Westchester County',
       'Wayne County', 'Richmond County', 'Rensselaer County',
       'Livingston County', 'Lewis County', 'Kings County',
       'Jefferson County', 'Herkimer County', 'Hamilton County',
       'Greene County', 'Genesee County', 'Fulton County',
       'Franklin County', 'Essex County', 'Erie County',
       'Dutchess County', 'Delaware County', 'Madison County',
       'Monroe County', 'Queens County', 'Putnam County', 'Otsego County',
       'Oswego County', 'Orleans County', 'Orange County',
       'Ontario County', 'Onondaga County', 'Oneida County',
       'Niagara County', 'New York County', 'Nassau Coun

In [7]:
state_url="https://www.nrcs.usda.gov/wps/portal/nrcs/detail/national/home/?cid=nrcs143_013697"
df_fips=pd.read_html(state_url)[0]

In [8]:
header = df_fips.iloc[0] 
df_fips = df_fips[1:] #take the data less the header row
df_fips.columns = header
df_fips.head()

Unnamed: 0,FIPS,Name,State
1,1001,Autauga,AL
2,1003,Baldwin,AL
3,1005,Barbour,AL
4,1007,Bibb,AL
5,1009,Blount,AL


In [9]:
states_list=list(df_fips['State'].unique())
states_list[:3]

['AL', 'AK', 'AZ']

In [10]:
def compile_US_df(states_list):
    state_dicts={}
    for state in states_list:
        df=get_kinsa_data(state)
        if df is not None:
            state_dicts[state]=df
        #state_dicts.append(df)
    return state_dicts

In [11]:
states_df=pd.concat(compile_US_df(states_list), ignore_index=True)

In [12]:
states_df.head()

Unnamed: 0,region_id,region_name,region_type,state,observed_ili,atypical_ili,atypical_ili_delta,anomaly_fevers,forecast_expected,forecast_lower,forecast_upper,date
0,1001,Autauga County,county,AL,5.288498,,0.0,0.0,,,,2020-02-16
1,1133,Winston County,county,AL,5.158465,,,,,,,2020-02-16
2,1131,Wilcox County,county,AL,4.954751,,,,,,,2020-02-16
3,1129,Washington County,county,AL,4.984101,,,,,,,2020-02-16
4,1127,Walker County,county,AL,4.880931,,,,,,,2020-02-16


In [13]:
states_df.keys()

Index(['region_id', 'region_name', 'region_type', 'state', 'observed_ili',
       'atypical_ili', 'atypical_ili_delta', 'anomaly_fevers',
       'forecast_expected', 'forecast_lower', 'forecast_upper', 'date'],
      dtype='object')

In [14]:
unique_states=states_df['state'].unique()
unique_states

array(['AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'IA',
       'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN',
       'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY',
       'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA',
       'VT', 'WA', 'WI', 'WV', 'WY'], dtype=object)

In [15]:
len(unique_states)

49

In [16]:
states_df.isna().sum()

region_id                  0
region_name                0
region_type                0
state                      0
observed_ili          217630
atypical_ili          516124
atypical_ili_delta    330252
anomaly_fevers        330252
forecast_expected      46635
forecast_lower         46635
forecast_upper         46635
date                       0
dtype: int64

In [17]:
states_df['date']=states_df['date'].apply(lambda x: pd.to_datetime(x))

In [18]:
states_df['date'].iloc[-1]

Timestamp('2020-08-07 00:00:00')

In [19]:
pd.to_datetime(dt.date.today())

Timestamp('2020-05-30 00:00:00')

In [20]:
states_df['date'].iloc[-1]>pd.to_datetime(dt.date.today())

True

In [21]:
future_time=states_df['date'].apply(lambda date: True if date>=pd.to_datetime(dt.date.today()) else False)

In [22]:
#Remove dates that correspond to future dates
states_df=states_df[~future_time]

In [23]:
states_df.iloc[-10:, :]

Unnamed: 0,region_id,region_name,region_type,state,observed_ili,atypical_ili,atypical_ili_delta,anomaly_fevers,forecast_expected,forecast_lower,forecast_upper,date
539346,56041,Uinta County,county,WY,0.0,,,,0.000156,0.0,0.856408,2020-05-29
539347,56019,Johnson County,county,WY,0.0,,,,9.8e-05,0.0,0.834358,2020-05-29
539348,56037,Sweetwater County,county,WY,0.0,,,,0.00027,0.0,0.826365,2020-05-29
539349,56039,Teton County,county,WY,0.0,,,,0.138593,0.0,0.98606,2020-05-29
539350,56023,Lincoln County,county,WY,0.0,,,,0.000156,0.0,0.840003,2020-05-29
539351,56001,Albany County,county,WY,0.0,,,,8.5e-05,0.0,0.831943,2020-05-29
539352,56007,Carbon County,county,WY,0.0,,,,0.000609,0.0,0.830845,2020-05-29
539353,56009,Converse County,county,WY,0.0,,,,0.11462,0.0,0.902359,2020-05-29
539354,56003,Big Horn County,county,WY,0.0,,,,0.0,0.0,0.835832,2020-05-29
539355,56005,Campbell County,county,WY,0.0,,0.0,0.0,0.119358,0.0,0.933972,2020-05-29


In [24]:
states_df.isna().sum()

region_id                  0
region_name                0
region_type                0
state                      0
observed_ili               0
atypical_ili          298494
atypical_ili_delta    197392
anomaly_fevers        197392
forecast_expected      46635
forecast_lower         46635
forecast_upper         46635
date                       0
dtype: int64

In [25]:
states_df.dtypes

region_id                     object
region_name                   object
region_type                   object
state                         object
observed_ili                 float64
atypical_ili                 float64
atypical_ili_delta           float64
anomaly_fevers               float64
forecast_expected            float64
forecast_lower               float64
forecast_upper               float64
date                  datetime64[ns]
dtype: object

In [26]:
num_df=states_df.select_dtypes('number')

In [27]:
na_num_df=num_df.loc[:, num_df.isna().any().values]

In [28]:
na_num_df.isna().sum()

atypical_ili          298494
atypical_ili_delta    197392
anomaly_fevers        197392
forecast_expected      46635
forecast_lower         46635
forecast_upper         46635
dtype: int64

In [29]:
steps=[]
for col in na_num_df.columns:
    steps.append(([col], [SimpleImputer(strategy='constant', fill_value=0)]))
    steps.append(([col], [MissingIndicator()], {'alias':f'{col}_isna'}))
steps[0:4]

[(['atypical_ili'],
  [SimpleImputer(add_indicator=False, copy=True, fill_value=0, missing_values=nan,
                 strategy='constant', verbose=0)]),
 (['atypical_ili'],
  [MissingIndicator(error_on_new=True, features='missing-only', missing_values=nan,
                    sparse='auto')],
  {'alias': 'atypical_ili_isna'}),
 (['atypical_ili_delta'],
  [SimpleImputer(add_indicator=False, copy=True, fill_value=0, missing_values=nan,
                 strategy='constant', verbose=0)]),
 (['atypical_ili_delta'],
  [MissingIndicator(error_on_new=True, features='missing-only', missing_values=nan,
                    sparse='auto')],
  {'alias': 'atypical_ili_delta_isna'})]

In [30]:
mapper=DataFrameMapper(steps, default=None, df_out=True)

In [31]:
cleaned_states_df=mapper.fit_transform(states_df)

In [32]:
cleaned_states_df.tail()

Unnamed: 0,atypical_ili,atypical_ili_isna,atypical_ili_delta,atypical_ili_delta_isna,anomaly_fevers,anomaly_fevers_isna,forecast_expected,forecast_expected_isna,forecast_lower,forecast_lower_isna,forecast_upper,forecast_upper_isna,region_id,region_name,region_type,state,observed_ili,date
539351,0.0,True,0.0,True,0.0,True,8.5e-05,False,0.0,False,0.831943,False,56001,Albany County,county,WY,0,2020-05-29 00:00:00
539352,0.0,True,0.0,True,0.0,True,0.000609,False,0.0,False,0.830845,False,56007,Carbon County,county,WY,0,2020-05-29 00:00:00
539353,0.0,True,0.0,True,0.0,True,0.11462,False,0.0,False,0.902359,False,56009,Converse County,county,WY,0,2020-05-29 00:00:00
539354,0.0,True,0.0,True,0.0,True,0.0,False,0.0,False,0.835832,False,56003,Big Horn County,county,WY,0,2020-05-29 00:00:00
539355,0.0,True,0.0,False,0.0,False,0.119358,False,0.0,False,0.933972,False,56005,Campbell County,county,WY,0,2020-05-29 00:00:00


In [33]:
cleaned_states_df['region_name']=cleaned_states_df['region_name'].apply(lambda x: x.split(' ')[0])

In [34]:
cleaned_states_df.head()

Unnamed: 0,atypical_ili,atypical_ili_isna,atypical_ili_delta,atypical_ili_delta_isna,anomaly_fevers,anomaly_fevers_isna,forecast_expected,forecast_expected_isna,forecast_lower,forecast_lower_isna,forecast_upper,forecast_upper_isna,region_id,region_name,region_type,state,observed_ili,date
0,0.0,True,0.0,False,0.0,False,0.0,True,0.0,True,0.0,True,1001,Autauga,county,AL,5.2885,2020-02-16 00:00:00
1,0.0,True,0.0,True,0.0,True,0.0,True,0.0,True,0.0,True,1133,Winston,county,AL,5.15846,2020-02-16 00:00:00
2,0.0,True,0.0,True,0.0,True,0.0,True,0.0,True,0.0,True,1131,Wilcox,county,AL,4.95475,2020-02-16 00:00:00
3,0.0,True,0.0,True,0.0,True,0.0,True,0.0,True,0.0,True,1129,Washington,county,AL,4.9841,2020-02-16 00:00:00
4,0.0,True,0.0,True,0.0,True,0.0,True,0.0,True,0.0,True,1127,Walker,county,AL,4.88093,2020-02-16 00:00:00


In [35]:
cleaned_states_df.tail()

Unnamed: 0,atypical_ili,atypical_ili_isna,atypical_ili_delta,atypical_ili_delta_isna,anomaly_fevers,anomaly_fevers_isna,forecast_expected,forecast_expected_isna,forecast_lower,forecast_lower_isna,forecast_upper,forecast_upper_isna,region_id,region_name,region_type,state,observed_ili,date
539351,0.0,True,0.0,True,0.0,True,8.5e-05,False,0.0,False,0.831943,False,56001,Albany,county,WY,0,2020-05-29 00:00:00
539352,0.0,True,0.0,True,0.0,True,0.000609,False,0.0,False,0.830845,False,56007,Carbon,county,WY,0,2020-05-29 00:00:00
539353,0.0,True,0.0,True,0.0,True,0.11462,False,0.0,False,0.902359,False,56009,Converse,county,WY,0,2020-05-29 00:00:00
539354,0.0,True,0.0,True,0.0,True,0.0,False,0.0,False,0.835832,False,56003,Big,county,WY,0,2020-05-29 00:00:00
539355,0.0,True,0.0,False,0.0,False,0.119358,False,0.0,False,0.933972,False,56005,Campbell,county,WY,0,2020-05-29 00:00:00


In [36]:
cleaned_states_df.dtypes

atypical_ili               float64
atypical_ili_isna             bool
atypical_ili_delta         float64
atypical_ili_delta_isna       bool
anomaly_fevers             float64
anomaly_fevers_isna           bool
forecast_expected          float64
forecast_expected_isna        bool
forecast_lower             float64
forecast_lower_isna           bool
forecast_upper             float64
forecast_upper_isna           bool
region_id                   object
region_name                 object
region_type                 object
state                       object
observed_ili                object
date                        object
dtype: object

In [37]:
cleaned_states_df.isna().sum()

atypical_ili               0
atypical_ili_isna          0
atypical_ili_delta         0
atypical_ili_delta_isna    0
anomaly_fevers             0
anomaly_fevers_isna        0
forecast_expected          0
forecast_expected_isna     0
forecast_lower             0
forecast_lower_isna        0
forecast_upper             0
forecast_upper_isna        0
region_id                  0
region_name                0
region_type                0
state                      0
observed_ili               0
date                       0
dtype: int64

In [38]:
cleaned_states_df['date']=cleaned_states_df['date'].astype('O')

In [39]:
cleaned_states_df['date']=cleaned_states_df['date'].apply(lambda x: pd.to_datetime(x))

In [40]:
cleaned_states_df=cleaned_states_df.rename(columns={'region_id':'FIPS'})

In [41]:
cleaned_states_df.head()

Unnamed: 0,atypical_ili,atypical_ili_isna,atypical_ili_delta,atypical_ili_delta_isna,anomaly_fevers,anomaly_fevers_isna,forecast_expected,forecast_expected_isna,forecast_lower,forecast_lower_isna,forecast_upper,forecast_upper_isna,FIPS,region_name,region_type,state,observed_ili,date
0,0.0,True,0.0,False,0.0,False,0.0,True,0.0,True,0.0,True,1001,Autauga,county,AL,5.2885,2020-02-16
1,0.0,True,0.0,True,0.0,True,0.0,True,0.0,True,0.0,True,1133,Winston,county,AL,5.15846,2020-02-16
2,0.0,True,0.0,True,0.0,True,0.0,True,0.0,True,0.0,True,1131,Wilcox,county,AL,4.95475,2020-02-16
3,0.0,True,0.0,True,0.0,True,0.0,True,0.0,True,0.0,True,1129,Washington,county,AL,4.9841,2020-02-16
4,0.0,True,0.0,True,0.0,True,0.0,True,0.0,True,0.0,True,1127,Walker,county,AL,4.88093,2020-02-16


In [42]:
cleaned_states_df['FIPS'] = cleaned_states_df['FIPS'].apply(lambda x: str(int(x)).zfill(5))

In [43]:
cleaned_states_df.to_sql('kinsa_fever', conn, index_label='id', if_exists='replace')

In [44]:
kinsa_df=pd.DataFrame(query_data('SELECT * FROM kinsa_fever'))

In [45]:
kinsa_df.tail()

Unnamed: 0,FIPS,anomaly_fevers,anomaly_fevers_isna,atypical_ili,atypical_ili_delta,atypical_ili_delta_isna,atypical_ili_isna,date,forecast_expected,forecast_expected_isna,forecast_lower,forecast_lower_isna,forecast_upper,forecast_upper_isna,id,observed_ili,region_name,region_type,state
323331,56001,0.0,1,0.0,0.0,1,1,2020-05-29 00:00:00,8.5e-05,0,0.0,0,0.831943,0,539351,0.0,Albany,county,WY
323332,56007,0.0,1,0.0,0.0,1,1,2020-05-29 00:00:00,0.000609,0,0.0,0,0.830845,0,539352,0.0,Carbon,county,WY
323333,56009,0.0,1,0.0,0.0,1,1,2020-05-29 00:00:00,0.11462,0,0.0,0,0.902359,0,539353,0.0,Converse,county,WY
323334,56003,0.0,1,0.0,0.0,1,1,2020-05-29 00:00:00,0.0,0,0.0,0,0.835832,0,539354,0.0,Big,county,WY
323335,56005,0.0,0,0.0,0.0,0,1,2020-05-29 00:00:00,0.119358,0,0.0,0,0.933972,0,539355,0.0,Campbell,county,WY
