* Data adapted from collaboration between the Robert Wood Johnson Foundation and the University of Wisconsin Population Health Institute and can be found at https://www.countyhealthrankings.org/explore-health-rankings/rankings-data-documentation

## Feature Exploration & Cleaning: County Health Data

In [2]:
import pandas as pd
import numpy as np
import sqlite3
from sklearn.impute import MissingIndicator, SimpleImputer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn_pandas import DataFrameMapper

In [3]:
conn=sqlite3.connect('COVID19_county_data.db')
cursor= conn.cursor()

In [4]:
#Create function to query SQL data
def query_data(sql_statement):
    df=pd.read_sql(sql_statement, conn)
    #cursor.execute(sql_statement)
    return df.to_dict('records')

In [5]:
df=pd.read_csv('County_Health_Rankings_2020.csv', skiprows=1)

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
df.shape

(3193, 520)

In [7]:
df.head()

Unnamed: 0,FIPS,State,County,# of Ranked Counties,Length of Life_Rank,Quartile,Quality of Life_Rank,Quartile.1,Health Behaviors_Rank,Quartile.2,...,% Hispanic,# Non-Hispanic White,% Non-Hispanic White,# Not Proficient in English,% Not Proficient in English,95% CI - Low.38,95% CI - High.38,% Female,# Rural,% Rural
0,1000,Alabama,,67,,,,,,,...,4.4,3197324,65.4,48517,1,1.0,1.0,51.6,1957932.0,41.0
1,1001,Alabama,Autauga,67,5.0,1.0,11.0,1.0,9.0,1.0,...,3.0,41316,74.3,426,1,0.0,1.0,51.4,22921.0,42.0
2,1003,Alabama,Baldwin,67,3.0,1.0,2.0,1.0,5.0,1.0,...,4.6,181201,83.1,1068,1,0.0,1.0,51.5,77060.0,42.3
3,1005,Alabama,Barbour,67,23.0,2.0,55.0,4.0,54.0,4.0,...,4.3,11356,45.6,398,2,1.0,2.0,47.2,18613.0,67.8
4,1007,Alabama,Bibb,67,49.0,3.0,14.0,1.0,32.0,2.0,...,2.6,16708,74.6,57,0,0.0,1.0,46.8,15663.0,68.4


In [8]:
df['County'].isna().sum()

51

In [9]:
df=df.dropna(subset=['County']) #drop whole state values, where County is 'NaN'

In [10]:
df.head()

Unnamed: 0,FIPS,State,County,# of Ranked Counties,Length of Life_Rank,Quartile,Quality of Life_Rank,Quartile.1,Health Behaviors_Rank,Quartile.2,...,% Hispanic,# Non-Hispanic White,% Non-Hispanic White,# Not Proficient in English,% Not Proficient in English,95% CI - Low.38,95% CI - High.38,% Female,# Rural,% Rural
1,1001,Alabama,Autauga,67,5,1,11,1,9,1,...,3.0,41316,74.3,426,1,0.0,1.0,51.4,22921.0,42.0
2,1003,Alabama,Baldwin,67,3,1,2,1,5,1,...,4.6,181201,83.1,1068,1,0.0,1.0,51.5,77060.0,42.3
3,1005,Alabama,Barbour,67,23,2,55,4,54,4,...,4.3,11356,45.6,398,2,1.0,2.0,47.2,18613.0,67.8
4,1007,Alabama,Bibb,67,49,3,14,1,32,2,...,2.6,16708,74.6,57,0,0.0,1.0,46.8,15663.0,68.4
5,1009,Alabama,Blount,67,42,3,13,1,15,1,...,9.6,50255,86.9,934,2,1.0,2.0,50.7,51562.0,90.0


In [11]:
df['Presence of Water Violation_Drinking water violations'][:3]

1    No
2    No
3    No
Name: Presence of Water Violation_Drinking water violations, dtype: object

In [12]:
col_names=df.columns
col_names[:10]

Index(['FIPS', 'State', 'County', '# of Ranked Counties',
       'Length of Life_Rank', 'Quartile', 'Quality of Life_Rank', 'Quartile.1',
       'Health Behaviors_Rank', 'Quartile.2'],
      dtype='object')

In [13]:
def column_cleaner(column_list):
    cleaned_col= [col for col in col_names if '95%' not in col if 'Quartile' not in col if 'Rank' not in col]
    return cleaned_col

In [14]:
clean_col_names=column_cleaner(col_names)
clean_col_names.remove('Unreliable')

In [15]:
clean_col_names

['FIPS',
 'State',
 'County',
 'Premature death_Deaths',
 'Years of Potential Life Lost Rate',
 'YPLL Rate (AIAN)',
 'YPLL Rate (Asian)',
 'YPLL Rate (Black)',
 'YPLL Rate (Hispanic)',
 'YPLL Rate (White)',
 '% Fair or Poor Health',
 'Average Number of Physically Unhealthy Days',
 'Average Number of Mentally Unhealthy Days',
 '% Low Birthweight',
 '% LBW (AIAN)',
 '% LBW (Asian)',
 '% LBW (Black)',
 '% LBW (Hispanic)',
 '% LBW (White)',
 '% Smokers',
 '% Adults with Obesity',
 'Food Environment Index',
 '% Physically Inactive',
 '% With Access to Exercise Opportunities',
 '% Excessive Drinking',
 '# Alcohol-Impaired Driving Deaths',
 '# Driving Deaths',
 '% Driving Deaths with Alcohol Involvement',
 '# Chlamydia Cases',
 'Chlamydia Rate',
 'Teen Birth Rate',
 'Teen Birth Rate (AIAN)',
 'Teen Birth Rate (Asian)',
 'Teen Birth Rate (Black)',
 'Teen Birth Rate (Hispanic)',
 'Teen Birth Rate (White)',
 '# Uninsured',
 '% Uninsured',
 '# Primary Care Physicians',
 'Primary Care Physicians R

In [16]:
df=df[clean_col_names]
df.head()

Unnamed: 0,FIPS,State,County,Premature death_Deaths,Years of Potential Life Lost Rate,YPLL Rate (AIAN),YPLL Rate (Asian),YPLL Rate (Black),YPLL Rate (Hispanic),YPLL Rate (White),...,% Native Hawaiian/Other Pacific Islander,# Hispanic,% Hispanic,# Non-Hispanic White,% Non-Hispanic White,# Not Proficient in English,% Not Proficient in English,% Female,# Rural,% Rural
1,1001,Alabama,Autauga,791.0,8129.0,,,10201.0,,7886.0,...,0.1,1649,3.0,41316,74.3,426,1,51.4,22921.0,42.0
2,1003,Alabama,Baldwin,2967.0,7354.0,,,9891.0,3570.0,7436.0,...,0.1,10131,4.6,181201,83.1,1068,1,51.5,77060.0,42.3
3,1005,Alabama,Barbour,472.0,10254.0,,,12422.0,,8140.0,...,0.2,1064,4.3,11356,45.6,398,2,47.2,18613.0,67.8
4,1007,Alabama,Bibb,471.0,11978.0,,,13085.0,,12241.0,...,0.1,588,2.6,16708,74.6,57,0,46.8,15663.0,68.4
5,1009,Alabama,Blount,1085.0,11335.0,,,,,,...,0.1,5536,9.6,50255,86.9,934,2,50.7,51562.0,90.0


In [17]:
df.isna().sum().sort_values(ascending=False)

Infant Mortality Rate (AIAN)                          3135
Homicide Rate (AIAN)                                  3120
Firearm Fatalities Rate (AIAN)                        3112
Child Mortality Rate (AIAN)                           3109
Drug Overdose Mortality Rate (Asian)                  3104
Drug Overdose Mortality Rate (AIAN)                   3103
Homicide Rate (Asian)                                 3095
Suicide Rate (AIAN)                                   3091
Firearm Fatalities Rate (Asian)                       3083
Infant Mortality Rate (Asian)                         3077
MV Mortality Rate (AIAN)                              3060
MV Mortality Rate (Asian)                             3031
Suicide Rate (Asian)                                  3023
Child Mortality Rate (Asian)                          3023
YPLL Rate (AIAN)                                      2967
Homicide Rate (Hispanic)                              2921
YPLL Rate (Asian)                                     29

In [18]:
df.dtypes.value_counts()

float64    201
int64       29
object       7
dtype: int64

In [19]:
obj_df=df.select_dtypes('O')

In [20]:
obj_df.dtypes

State                                                    object
County                                                   object
Primary Care Physicians Ratio                            object
Dentist Ratio                                            object
Mental Health Provider Ratio                             object
Presence of Water Violation_Drinking water violations    object
Other Primary Care Provider Ratio                        object
dtype: object

In [21]:
obj_df.head()

Unnamed: 0,State,County,Primary Care Physicians Ratio,Dentist Ratio,Mental Health Provider Ratio,Presence of Water Violation_Drinking water violations,Other Primary Care Provider Ratio
1,Alabama,Autauga,2220:01:00,3089:01:00,4277:01:00,No,2527:01:00
2,Alabama,Baldwin,1372:01:00,2019:01:00,1038:01:00,No,1787:01:00
3,Alabama,Barbour,3159:01:00,2765:01:00,12441:1,No,1914:01:00
4,Alabama,Bibb,2061:01:00,4480:01:00,4480:01:00,No,896:01:00
5,Alabama,Blount,4463:01:00,5258:01:00,6427:01:00,No,4449:01:00


In [22]:
num_df=df.select_dtypes('number')

In [23]:
#num_df.dtypes

In [24]:
obj_df.columns

Index(['State', 'County', 'Primary Care Physicians Ratio', 'Dentist Ratio',
       'Mental Health Provider Ratio',
       'Presence of Water Violation_Drinking water violations',
       'Other Primary Care Provider Ratio'],
      dtype='object')

In [25]:
val='2076:01:00'.split(':')[0]
val

'2076'

In [26]:
def convert_ratio(ratio_col):
    ratio_col=ratio_col.apply(lambda x: float(x.split(':')[0]) if type(x)==str else x)
    return ratio_col

In [27]:
Dentist_r= convert_ratio(obj_df['Dentist Ratio'])

In [28]:
Dentist_r[:3]

1    3089.0
2    2019.0
3    2765.0
Name: Dentist Ratio, dtype: float64

In [29]:
Primary_Care_Physicians_r=convert_ratio(obj_df['Primary Care Physicians Ratio'])

In [30]:
Primary_Care_Physicians_r[:3]

1    2220.0
2    1372.0
3    3159.0
Name: Primary Care Physicians Ratio, dtype: float64

In [31]:
Other_Primary_r=convert_ratio(obj_df['Other Primary Care Provider Ratio'])

In [32]:
Other_Primary_r[:3]


1    2527.0
2    1787.0
3    1914.0
Name: Other Primary Care Provider Ratio, dtype: float64

In [33]:
Mental_Health_Provider_r=convert_ratio(obj_df['Mental Health Provider Ratio'])

In [34]:
Mental_Health_Provider_r[:3]

1     4277.0
2     1038.0
3    12441.0
Name: Mental Health Provider Ratio, dtype: float64

In [35]:
obj_df['Dentist Ratio']=Dentist_r
obj_df['Primary Care Physicians Ratio']=Primary_Care_Physicians_r
obj_df['Other Primary Care Provider Ratio']=Other_Primary_r
obj_df['Mental Health Provider Ratio']=Mental_Health_Provider_r

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

In [36]:
obj_df.dtypes

State                                                     object
County                                                    object
Primary Care Physicians Ratio                            float64
Dentist Ratio                                            float64
Mental Health Provider Ratio                             float64
Presence of Water Violation_Drinking water violations     object
Other Primary Care Provider Ratio                        float64
dtype: object

In [37]:
obj_df.head()

Unnamed: 0,State,County,Primary Care Physicians Ratio,Dentist Ratio,Mental Health Provider Ratio,Presence of Water Violation_Drinking water violations,Other Primary Care Provider Ratio
1,Alabama,Autauga,2220.0,3089.0,4277.0,No,2527.0
2,Alabama,Baldwin,1372.0,2019.0,1038.0,No,1787.0
3,Alabama,Barbour,3159.0,2765.0,12441.0,No,1914.0
4,Alabama,Bibb,2061.0,4480.0,4480.0,No,896.0
5,Alabama,Blount,4463.0,5258.0,6427.0,No,4449.0


In [38]:
obj_df['Presence of Water Violation_Drinking water violations'].value_counts()

No     1948
Yes    1151
Name: Presence of Water Violation_Drinking water violations, dtype: int64

In [39]:
obj_df['Presence of Water Violation_Drinking water violations'].isna().sum()

43

In [40]:
yes_no_transformer= DataFrameMapper([
    (['Presence of Water Violation_Drinking water violations'], [SimpleImputer(strategy='constant', fill_value='No'), MissingIndicator(missing_values='Yes'), 
                                                        OneHotEncoder(drop='first')])
], df_out=True)

In [41]:
water_violations= yes_no_transformer.fit_transform(obj_df)['Presence of Water Violation_Drinking water violations']

In [42]:
water_violations.value_counts()

0.0    1991
1.0    1151
Name: Presence of Water Violation_Drinking water violations, dtype: int64

In [43]:
obj_df['Presence of Water Violation_Drinking water violations']=water_violations

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [44]:
obj_df.dtypes

State                                                     object
County                                                    object
Primary Care Physicians Ratio                            float64
Dentist Ratio                                            float64
Mental Health Provider Ratio                             float64
Presence of Water Violation_Drinking water violations    float64
Other Primary Care Provider Ratio                        float64
dtype: object

In [45]:
obj_df.count()

State                                                    3142
County                                                   3142
Primary Care Physicians Ratio                            2995
Dentist Ratio                                            3054
Mental Health Provider Ratio                             2912
Presence of Water Violation_Drinking water violations    3142
Other Primary Care Provider Ratio                        3120
dtype: int64

In [46]:
obj_df.dtypes

State                                                     object
County                                                    object
Primary Care Physicians Ratio                            float64
Dentist Ratio                                            float64
Mental Health Provider Ratio                             float64
Presence of Water Violation_Drinking water violations    float64
Other Primary Care Provider Ratio                        float64
dtype: object

In [47]:
obj_df.head()

Unnamed: 0,State,County,Primary Care Physicians Ratio,Dentist Ratio,Mental Health Provider Ratio,Presence of Water Violation_Drinking water violations,Other Primary Care Provider Ratio
1,Alabama,Autauga,2220.0,3089.0,4277.0,0.0,2527.0
2,Alabama,Baldwin,1372.0,2019.0,1038.0,0.0,1787.0
3,Alabama,Barbour,3159.0,2765.0,12441.0,0.0,1914.0
4,Alabama,Bibb,2061.0,4480.0,4480.0,0.0,896.0
5,Alabama,Blount,4463.0,5258.0,6427.0,0.0,4449.0


In [48]:
num_df.head()

Unnamed: 0,FIPS,Premature death_Deaths,Years of Potential Life Lost Rate,YPLL Rate (AIAN),YPLL Rate (Asian),YPLL Rate (Black),YPLL Rate (Hispanic),YPLL Rate (White),% Fair or Poor Health,Average Number of Physically Unhealthy Days,...,% Native Hawaiian/Other Pacific Islander,# Hispanic,% Hispanic,# Non-Hispanic White,% Non-Hispanic White,# Not Proficient in English,% Not Proficient in English,% Female,# Rural,% Rural
1,1001,791.0,8129.0,,,10201.0,,7886.0,21,4.7,...,0.1,1649,3.0,41316,74.3,426,1,51.4,22921.0,42.0
2,1003,2967.0,7354.0,,,9891.0,3570.0,7436.0,18,4.2,...,0.1,10131,4.6,181201,83.1,1068,1,51.5,77060.0,42.3
3,1005,472.0,10254.0,,,12422.0,,8140.0,30,5.4,...,0.2,1064,4.3,11356,45.6,398,2,47.2,18613.0,67.8
4,1007,471.0,11978.0,,,13085.0,,12241.0,19,4.6,...,0.1,588,2.6,16708,74.6,57,0,46.8,15663.0,68.4
5,1009,1085.0,11335.0,,,,,,22,4.9,...,0.1,5536,9.6,50255,86.9,934,2,50.7,51562.0,90.0


In [49]:
selected_df=pd.concat([num_df, obj_df], axis=1, sort=False)

In [50]:
selected_df.head()

Unnamed: 0,FIPS,Premature death_Deaths,Years of Potential Life Lost Rate,YPLL Rate (AIAN),YPLL Rate (Asian),YPLL Rate (Black),YPLL Rate (Hispanic),YPLL Rate (White),% Fair or Poor Health,Average Number of Physically Unhealthy Days,...,% Female,# Rural,% Rural,State,County,Primary Care Physicians Ratio,Dentist Ratio,Mental Health Provider Ratio,Presence of Water Violation_Drinking water violations,Other Primary Care Provider Ratio
1,1001,791.0,8129.0,,,10201.0,,7886.0,21,4.7,...,51.4,22921.0,42.0,Alabama,Autauga,2220.0,3089.0,4277.0,0.0,2527.0
2,1003,2967.0,7354.0,,,9891.0,3570.0,7436.0,18,4.2,...,51.5,77060.0,42.3,Alabama,Baldwin,1372.0,2019.0,1038.0,0.0,1787.0
3,1005,472.0,10254.0,,,12422.0,,8140.0,30,5.4,...,47.2,18613.0,67.8,Alabama,Barbour,3159.0,2765.0,12441.0,0.0,1914.0
4,1007,471.0,11978.0,,,13085.0,,12241.0,19,4.6,...,46.8,15663.0,68.4,Alabama,Bibb,2061.0,4480.0,4480.0,0.0,896.0
5,1009,1085.0,11335.0,,,,,,22,4.9,...,50.7,51562.0,90.0,Alabama,Blount,4463.0,5258.0,6427.0,0.0,4449.0


In [51]:
selected_df.head()

Unnamed: 0,FIPS,Premature death_Deaths,Years of Potential Life Lost Rate,YPLL Rate (AIAN),YPLL Rate (Asian),YPLL Rate (Black),YPLL Rate (Hispanic),YPLL Rate (White),% Fair or Poor Health,Average Number of Physically Unhealthy Days,...,% Female,# Rural,% Rural,State,County,Primary Care Physicians Ratio,Dentist Ratio,Mental Health Provider Ratio,Presence of Water Violation_Drinking water violations,Other Primary Care Provider Ratio
1,1001,791.0,8129.0,,,10201.0,,7886.0,21,4.7,...,51.4,22921.0,42.0,Alabama,Autauga,2220.0,3089.0,4277.0,0.0,2527.0
2,1003,2967.0,7354.0,,,9891.0,3570.0,7436.0,18,4.2,...,51.5,77060.0,42.3,Alabama,Baldwin,1372.0,2019.0,1038.0,0.0,1787.0
3,1005,472.0,10254.0,,,12422.0,,8140.0,30,5.4,...,47.2,18613.0,67.8,Alabama,Barbour,3159.0,2765.0,12441.0,0.0,1914.0
4,1007,471.0,11978.0,,,13085.0,,12241.0,19,4.6,...,46.8,15663.0,68.4,Alabama,Bibb,2061.0,4480.0,4480.0,0.0,896.0
5,1009,1085.0,11335.0,,,,,,22,4.9,...,50.7,51562.0,90.0,Alabama,Blount,4463.0,5258.0,6427.0,0.0,4449.0


In [52]:
selected_df.isna().sum().sort_values(ascending=False)[:10]

Infant Mortality Rate (AIAN)            3135
Homicide Rate (AIAN)                    3120
Firearm Fatalities Rate (AIAN)          3112
Child Mortality Rate (AIAN)             3109
Drug Overdose Mortality Rate (Asian)    3104
Drug Overdose Mortality Rate (AIAN)     3103
Homicide Rate (Asian)                   3095
Suicide Rate (AIAN)                     3091
Firearm Fatalities Rate (Asian)         3083
Infant Mortality Rate (Asian)           3077
dtype: int64

In [53]:
selected_df.dtypes.value_counts()

float64    206
int64       29
object       2
dtype: int64

In [54]:
num_selected_df=selected_df.select_dtypes('number')

In [55]:
num_selected_df=num_selected_df.astype('float64')

In [56]:
num_selected_df.dtypes.value_counts()

float64    235
dtype: int64

In [57]:
na_num_selected_df=num_selected_df.loc[:, num_selected_df.isna().any().values]

In [58]:
num_selected_df.shape

(3142, 235)

In [59]:
selected_df.shape

(3142, 237)

In [60]:
steps=[]
for col in na_num_selected_df.columns:
    steps.append(([col], [SimpleImputer(strategy='constant', fill_value=0)]))
    steps.append(([col], [MissingIndicator()],{'alias':f'{col}_isna'}))
steps[0:4]

[(['Premature death_Deaths'],
  [SimpleImputer(add_indicator=False, copy=True, fill_value=0, missing_values=nan,
                 strategy='constant', verbose=0)]),
 (['Premature death_Deaths'],
  [MissingIndicator(error_on_new=True, features='missing-only', missing_values=nan,
                    sparse='auto')],
  {'alias': 'Premature death_Deaths_isna'}),
 (['Years of Potential Life Lost Rate'],
  [SimpleImputer(add_indicator=False, copy=True, fill_value=0, missing_values=nan,
                 strategy='constant', verbose=0)]),
 (['Years of Potential Life Lost Rate'],
  [MissingIndicator(error_on_new=True, features='missing-only', missing_values=nan,
                    sparse='auto')],
  {'alias': 'Years of Potential Life Lost Rate_isna'})]

In [61]:
mapper=DataFrameMapper(steps, default=None, df_out=True)

In [62]:
cleaned_selected_df= mapper.fit_transform(selected_df)

In [63]:
cleaned_selected_df.head()

Unnamed: 0,Premature death_Deaths,Premature death_Deaths_isna,Years of Potential Life Lost Rate,Years of Potential Life Lost Rate_isna,YPLL Rate (AIAN),YPLL Rate (AIAN)_isna,YPLL Rate (Asian),YPLL Rate (Asian)_isna,YPLL Rate (Black),YPLL Rate (Black)_isna,...,# Hispanic,% Hispanic,# Non-Hispanic White,% Non-Hispanic White,# Not Proficient in English,% Not Proficient in English,% Female,State,County,Presence of Water Violation_Drinking water violations
1,791.0,False,8129.0,False,0.0,True,0.0,True,10201.0,False,...,1649,3.0,41316,74.3,426,1,51.4,Alabama,Autauga,0
2,2967.0,False,7354.0,False,0.0,True,0.0,True,9891.0,False,...,10131,4.6,181201,83.1,1068,1,51.5,Alabama,Baldwin,0
3,472.0,False,10254.0,False,0.0,True,0.0,True,12422.0,False,...,1064,4.3,11356,45.6,398,2,47.2,Alabama,Barbour,0
4,471.0,False,11978.0,False,0.0,True,0.0,True,13085.0,False,...,588,2.6,16708,74.6,57,0,46.8,Alabama,Bibb,0
5,1085.0,False,11335.0,False,0.0,True,0.0,True,0.0,True,...,5536,9.6,50255,86.9,934,2,50.7,Alabama,Blount,0


In [64]:
cleaned_selected_df.tail()

Unnamed: 0,Premature death_Deaths,Premature death_Deaths_isna,Years of Potential Life Lost Rate,Years of Potential Life Lost Rate_isna,YPLL Rate (AIAN),YPLL Rate (AIAN)_isna,YPLL Rate (Asian),YPLL Rate (Asian)_isna,YPLL Rate (Black),YPLL Rate (Black)_isna,...,# Hispanic,% Hispanic,# Non-Hispanic White,% Non-Hispanic White,# Not Proficient in English,% Not Proficient in English,% Female,State,County,Presence of Water Violation_Drinking water violations
3188,532.0,False,7832.0,False,0.0,True,0.0,True,0.0,True,...,6924,16.1,34145,79.3,669,2,48.5,Wyoming,Sweetwater,1
3189,109.0,False,2731.0,False,0.0,True,0.0,True,0.0,True,...,3434,14.9,18812,81.5,945,4,48.4,Wyoming,Teton,1
3190,256.0,False,7331.0,False,0.0,True,0.0,True,0.0,True,...,1875,9.2,17741,87.4,133,1,49.3,Wyoming,Uinta,1
3191,110.0,False,6586.0,False,0.0,True,0.0,True,0.0,True,...,1108,14.1,6498,82.4,25,0,49.4,Wyoming,Washakie,0
3192,89.0,False,5389.0,False,0.0,True,0.0,True,0.0,True,...,284,4.1,6267,90.0,58,1,47.1,Wyoming,Weston,0


In [65]:
cleaned_selected_df.shape

(3142, 427)

In [66]:
cleaned_selected_df.isna().sum().sort_values(ascending=False)[:10]

Presence of Water Violation_Drinking water violations    0
# Single-Parent Households_isna                          0
# Households_isna                                        0
% Single-Parent Households                               0
% Single-Parent Households_isna                          0
Annual Average Violent Crimes                            0
Annual Average Violent Crimes_isna                       0
Violent Crime Rate                                       0
Violent Crime Rate_isna                                  0
# Injury Deaths                                          0
dtype: int64

In [67]:
cleaned_selected_df.dtypes.value_counts()

float64    190
bool       190
object      47
dtype: int64

In [68]:
cleaned_selected_df['FIPS'] = cleaned_selected_df['FIPS'].apply(lambda x: str(int(x)).zfill(5))

In [79]:
cleaned_selected_df.to_sql('county_health', conn, index_label='id', if_exists='replace')

In [80]:
new_df=pd.DataFrame(query_data('SELECT * FROM county_health'))

In [81]:
new_df.head()

Unnamed: 0,# Alcohol-Impaired Driving Deaths,# Alcohol-Impaired Driving Deaths_isna,# American Indian & Alaska Native,# Asian,# Black,# Chlamydia Cases,# Chlamydia Cases_isna,# Deaths_Premature age-adjusted mortality,# Deaths_Premature age-adjusted mortality_isna,# Deaths_Suicides,...,YPLL Rate (Asian)_isna,YPLL Rate (Black),YPLL Rate (Black)_isna,YPLL Rate (Hispanic),YPLL Rate (Hispanic)_isna,YPLL Rate (White),YPLL Rate (White)_isna,Years of Potential Life Lost Rate,Years of Potential Life Lost Rate_isna,id
0,15.0,0,267,681,10755,226.0,0,791.0,0,53.0,...,1,10201.0,0,0.0,1,7886.0,0,8129.0,0,1
1,48.0,0,1684,2508,19151,691.0,0,2967.0,0,207.0,...,1,9891.0,0,3570.0,0,7436.0,0,7354.0,0,2
2,12.0,0,164,113,11951,181.0,0,472.0,0,18.0,...,1,12422.0,0,0.0,1,8140.0,0,10254.0,0,3
3,8.0,0,98,53,4731,77.0,0,471.0,0,23.0,...,1,13085.0,0,0.0,1,12241.0,0,11978.0,0,4
4,14.0,0,378,185,846,136.0,0,1085.0,0,50.0,...,1,0.0,1,0.0,1,0.0,1,11335.0,0,5
