# Merge County Files

Join the icu_capacity, county_area, county_health, hard_to_count, global_mobility, kinsa_fever, and weekly_confirmed_mortality_covid tables using SQL, and combine them into one dataframe: 'county_data_df'

In [17]:
import pandas as pd
import sqlite3

In [18]:
# Open the SQLite database that holds the county-level tables; `conn` is the
# connection the query helper and the `to_sql` cells below rely on.
DB_PATH = 'COVID19_county_data.db'
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()

In [19]:
#Create function to query SQL data
def query_data(sql_statement, params=None, connection=None):
    """Run a SQL query and return the result rows as a list of dicts.

    Parameters
    ----------
    sql_statement : str
        SQL text to execute.
    params : sequence or mapping, optional
        Values bound to placeholders in ``sql_statement``; forwarded
        unchanged to ``pd.read_sql``. Prefer this over formatting values
        into the SQL string.
    connection : optional
        Database connection to query. When omitted, the notebook-level
        ``conn`` is used, which is the original behaviour.

    Returns
    -------
    list of dict
        One ``{column_name: value}`` dict per result row, as produced by
        ``DataFrame.to_dict('records')``. An empty result gives ``[]``.

    Notes
    -----
    Dict keys are unique, so a query that returns several columns with the
    same name (for example ``SELECT a.*, b.*`` where both tables have a
    ``FIPS`` column) keeps only one value per column name; pandas emits a
    ``UserWarning`` when that happens.
    """
    # Fall back to the shared notebook connection so existing one-argument
    # calls keep working unchanged.
    if connection is None:
        connection = conn
    result_df = pd.read_sql(sql_statement, connection, params=params)
    return result_df.to_dict('records')

* Uncomment the cells below to rebuild the merged tables step by step. Each step reads the table written by the previous one, so run them in order. This may be a long process.
* First, merge non-time-dependent datasets on FIPS column and save to database as 'county_health_icu_area_census'

In [20]:
# Join the four non-time-dependent county tables on FIPS, using icu_capacity
# as the base table, and load the result into a DataFrame.
non_time_sql = '''SELECT icu_capacity.*, county_area.*, county_health.*, hard_to_count.* FROM icu_capacity 
                JOIN county_area ON county_area.FIPS= icu_capacity.FIPS
                JOIN county_health ON county_health.FIPS= icu_capacity.FIPS
                JOIN hard_to_count ON hard_to_count.FIPS= icu_capacity.FIPS
               '''
non_time_df = pd.DataFrame(query_data(non_time_sql))

  """


In [21]:
# Preview the first five rows of the joined non-time-dependent table.
non_time_df.head(n=5)

Unnamed: 0,# Alcohol-Impaired Driving Deaths,# Alcohol-Impaired Driving Deaths_isna,# American Indian & Alaska Native,# Asian,# Black,# Chlamydia Cases,# Chlamydia Cases_isna,# Deaths_Premature age-adjusted mortality,# Deaths_Premature age-adjusted mortality_isna,# Deaths_Suicides,...,YPLL Rate (Asian)_isna,YPLL Rate (Black),YPLL Rate (Black)_isna,YPLL Rate (Hispanic),YPLL Rate (Hispanic)_isna,YPLL Rate (White),YPLL Rate (White)_isna,Years of Potential Life Lost Rate,Years of Potential Life Lost Rate_isna,id
0,15.0,0,267,681,10755,226.0,0,791.0,0,53.0,...,1,10201.0,0,0.0,1,7886.0,0,8129.0,0,0
1,48.0,0,1684,2508,19151,691.0,0,2967.0,0,207.0,...,1,9891.0,0,3570.0,0,7436.0,0,7354.0,0,1
2,12.0,0,164,113,11951,181.0,0,472.0,0,18.0,...,1,12422.0,0,0.0,1,8140.0,0,10254.0,0,2
3,8.0,0,98,53,4731,77.0,0,471.0,0,23.0,...,1,13085.0,0,0.0,1,12241.0,0,11978.0,0,3
4,14.0,0,378,185,846,136.0,0,1085.0,0,50.0,...,1,0.0,1,0.0,1,0.0,1,11335.0,0,4


In [6]:
# non_time_df.drop(columns='id').to_sql('county_health_icu_area_census', conn, index_label='id', if_exists='replace')

* Merge global mobility report with non-time dependent features on FIPS

In [7]:
# all_mobility_df=pd.DataFrame(query_data('''SELECT global_mobility.*, county_health_icu_area_census.* FROM global_mobility
#                 JOIN county_health_icu_area_census ON county_health_icu_area_census.FIPS=global_mobility.FIPS
#                '''))

In [8]:
# all_mobility_df.drop(columns='id').to_sql('county_health_icu_area_census_mobility', conn, index_label='id', if_exists='replace')

* Merge kinsa data with global mobility and non-time features on FIPS and date

In [9]:
# all_mobility_kinsa_df=pd.DataFrame(query_data('''SELECT county_health_icu_area_census_mobility.*, kinsa_fever.* FROM county_health_icu_area_census_mobility 
#                 JOIN kinsa_fever ON county_health_icu_area_census_mobility.FIPS=kinsa_fever.FIPS AND kinsa_fever.date=county_health_icu_area_census_mobility.date
#                 '''))


In [10]:
# all_mobility_kinsa_df=all_mobility_kinsa_df.rename(columns={'state':'state_abbrv'})

In [11]:
# all_mobility_kinsa_df.drop(columns='id').to_sql('county_health_icu_area_census_mobility_kinsa', conn, index_label='id', if_exists='replace')

* Merge weekly COVID-19 confirmed and mortality data with kinsa, global mobility, and non-time features on FIPS and date

In [12]:
# all_mobility_kinsa_weekly_covid_mortality_df=pd.DataFrame(query_data('''SELECT county_health_icu_area_census_mobility_kinsa.*, weekly_confirmed_mortality_covid.* FROM county_health_icu_area_census_mobility_kinsa 
#                 JOIN weekly_confirmed_mortality_covid ON county_health_icu_area_census_mobility_kinsa.FIPS=weekly_confirmed_mortality_covid.FIPS AND weekly_confirmed_mortality_covid.Date=county_health_icu_area_census_mobility_kinsa.date
#                 '''))

In [13]:
# all_mobility_kinsa_weekly_covid_mortality_df.drop(columns=['id', 'date']).to_sql('county_health_icu_area_mobility_kinsa_weekly_confirmed_mortality_covid', conn, index_label='id', if_exists='replace')

* View final result

In [14]:
# Read every row of the final merged table back from the database.
final_table_sql = '''SELECT * FROM county_health_icu_area_mobility_kinsa_weekly_confirmed_mortality_covid
                                        '''
county_data_df = pd.DataFrame(query_data(final_table_sql))

In [15]:
#Choose parameters from each table to make sure data is present
county_data_df[['FIPS','Date','day_of_week', 'doy','# Food Insecure', 'Mortality_Count_Log', 'Land_Area_sq_mi_LND110200D', 'LowResponseScore', 'retail_and_recreation_percent_change_from_baseline', 'observed_ili']].head()

Unnamed: 0,FIPS,Date,day_of_week,doy,# Food Insecure,Mortality_Count_Log,Land_Area_sq_mi_LND110200D,LowResponseScore,retail_and_recreation_percent_change_from_baseline,observed_ili
0,1001,2020-02-21 00:00:00,Friday,52,7270,0.0,595.97,19.391667,-3.0,5.433901
1,1001,2020-02-28 00:00:00,Friday,59,7270,0.0,595.97,19.391667,12.0,3.424322
2,1001,2020-03-06 00:00:00,Friday,66,7270,0.0,595.97,19.391667,4.0,4.047624
3,1001,2020-03-13 00:00:00,Friday,73,7270,0.0,595.97,19.391667,8.0,4.448124
4,1001,2020-03-20 00:00:00,Friday,80,7270,0.0,595.97,19.391667,-24.0,4.55508
