# Columns to Keep/Delete
County Statistics:
    * county (primary key)
    * state
    * percentage20_Donald_Trump [percentage_votes_Donald_Trump]
    * percentage20_Joe_Biden [percentage_votes_Joe_Biden]
    * covid_cases [delete]
    * covid_deaths [delete]
    * TotalPop [delete]
    * NEW COLUMN: Percent of cases by population [percentage_covid_cases]
    * NEW COLUMN: Percent of deaths by population [percentage_covid_deaths]
    * Income per Capita
    * Poverty
    * Child Poverty
    * Unemployment

US County Sociohealth Data
    * fips (county code)
    * State
    * County
    * percent_fair_or_poor_health (skip)
    * percent_uninsured
    * percent_vaccinated
    * high_school_graduation_rate
    * percent_some_college
    * violent_crime_rate
    * life_expectancy    

In [49]:
import pandas as pd


In [50]:
# Relative paths of the two raw CSV inputs
county_filepath = "datasources/county_statistics.csv"
sociohealth_filepath = "datasources/us_county_sociohealth_data.csv"

# Load each CSV into its own dataframe
county_stats_df = pd.read_csv(county_filepath)
sociohealth_df = pd.read_csv(sociohealth_filepath)

In [51]:
# Keep only the listed columns; all other columns are dropped from each dataframe
county_keep_cols = [
    'county',
    'state',
    'percentage20_Donald_Trump',
    'percentage20_Joe_Biden',
    'cases',
    'deaths',
    'TotalPop',
    'IncomePerCap',
    'Poverty',
    'ChildPoverty',
    'Unemployment',
]
sociohealth_keep_cols = [
    'fips',
    'state',
    'county',
    'percent_uninsured',
    'percent_vaccinated',
    'high_school_graduation_rate',
    'percent_some_college',
    'violent_crime_rate',
    'life_expectancy',
]

county_stats_df = county_stats_df[county_keep_cols]
sociohealth_df = sociohealth_df[sociohealth_keep_cols]

In [52]:
# Preview the first five rows to confirm the county column selection
county_stats_df.head(n=5)

Unnamed: 0,county,state,percentage20_Donald_Trump,percentage20_Joe_Biden,cases,deaths,TotalPop,IncomePerCap,Poverty,ChildPoverty,Unemployment
0,Abbeville,SC,0.661,0.33,805.0,17.0,24788.0,19234.0,22.7,32.1,9.4
1,Acadia,LA,0.795,0.191,3182.0,102.0,62607.0,21591.0,21.5,27.6,8.9
2,Accomack,VA,0.542,0.447,1227.0,19.0,32840.0,24266.0,19.8,31.8,5.4
3,Ada,ID,0.504,0.465,17451.0,181.0,435117.0,31642.0,11.8,13.1,4.3
4,Adair,IA,0.697,0.286,222.0,1.0,7192.0,28861.0,9.5,12.1,3.0


In [53]:
# Preview the first five rows to confirm the sociohealth column selection
sociohealth_df.head(n=5)

Unnamed: 0,fips,state,county,percent_uninsured,percent_vaccinated,high_school_graduation_rate,percent_some_college,violent_crime_rate,life_expectancy
0,1001,Alabama,Autauga,8.721686,41.0,90.0,62.009974,272.28222,76.879477
1,1003,Alabama,Baldwin,11.333404,44.0,86.361577,67.37162,203.660396,78.450258
2,1005,Alabama,Barbour,12.242792,37.0,81.410256,34.857649,414.277861,75.341935
3,1007,Alabama,Bibb,10.206253,38.0,83.763838,44.137353,89.349126,73.57182
4,1009,Alabama,Blount,13.360759,39.0,93.468795,53.361073,482.690611,74.145826


In [54]:
# Report the row count of each dataframe before rows with missing values are removed
row_count_report = f'County Stats: {len(county_stats_df)} Sociohealth Stats: {len(sociohealth_df)}'
print(row_count_report)

County Stats: 4867 Sociohealth Stats: 3144


In [55]:
# Discard every row that has a missing value in any of the retained columns
county_stats_df = county_stats_df.dropna(axis=0, how='any')
sociohealth_df = sociohealth_df.dropna(axis=0, how='any')

In [56]:
# Report the row count of each dataframe now that rows with missing values are gone
row_count_report = f'County Stats: {len(county_stats_df)} Sociohealth Stats: {len(sociohealth_df)}'
print(row_count_report)

County Stats: 3048 Sociohealth Stats: 2826


In [57]:
# Derive covid cases and deaths relative to county population.
# The results are stored as fractions of TotalPop (cases / TotalPop), not multiplied by 100.
total_population = county_stats_df['TotalPop']
county_stats_df['percentage_covid_cases'] = county_stats_df['cases'].div(total_population)
county_stats_df['percentage_covid_deaths'] = county_stats_df['deaths'].div(total_population)

county_stats_df.head(n=5)

Unnamed: 0,county,state,percentage20_Donald_Trump,percentage20_Joe_Biden,cases,deaths,TotalPop,IncomePerCap,Poverty,ChildPoverty,Unemployment,percentage_covid_cases,percentage_covid_deaths
0,Abbeville,SC,0.661,0.33,805.0,17.0,24788.0,19234.0,22.7,32.1,9.4,0.032475,0.000686
1,Acadia,LA,0.795,0.191,3182.0,102.0,62607.0,21591.0,21.5,27.6,8.9,0.050825,0.001629
2,Accomack,VA,0.542,0.447,1227.0,19.0,32840.0,24266.0,19.8,31.8,5.4,0.037363,0.000579
3,Ada,ID,0.504,0.465,17451.0,181.0,435117.0,31642.0,11.8,13.1,4.3,0.040106,0.000416
4,Adair,IA,0.697,0.286,222.0,1.0,7192.0,28861.0,9.5,12.1,3.0,0.030868,0.000139


In [58]:
# Remove the raw count columns that were only needed to compute the covid ratios
county_stats_df = county_stats_df.drop(labels=['cases', 'deaths', 'TotalPop'], axis='columns')
county_stats_df.head(n=5)

Unnamed: 0,county,state,percentage20_Donald_Trump,percentage20_Joe_Biden,IncomePerCap,Poverty,ChildPoverty,Unemployment,percentage_covid_cases,percentage_covid_deaths
0,Abbeville,SC,0.661,0.33,19234.0,22.7,32.1,9.4,0.032475,0.000686
1,Acadia,LA,0.795,0.191,21591.0,21.5,27.6,8.9,0.050825,0.001629
2,Accomack,VA,0.542,0.447,24266.0,19.8,31.8,5.4,0.037363,0.000579
3,Ada,ID,0.504,0.465,31642.0,11.8,13.1,4.3,0.040106,0.000416
4,Adair,IA,0.697,0.286,28861.0,9.5,12.1,3.0,0.030868,0.000139


In [59]:
# Map the source column names onto the names used by the sql database
sql_column_names = {
    'percentage20_Donald_Trump': 'percentage_votes_Donald_Trump',
    'percentage20_Joe_Biden': 'percentage_votes_Joe_Biden',
    'IncomePerCap': 'income_per_capita',
    'Poverty': 'poverty_rate',
    'ChildPoverty': 'child_poverty_rate',
    'Unemployment': 'unemployment_rate',
}
county_stats_df = county_stats_df.rename(columns=sql_column_names)
county_stats_df.head(n=5)

Unnamed: 0,county,state,percentage_votes_Donald_Trump,percentage_votes_Joe_Biden,income_per_capita,poverty_rate,child_poverty_rate,unemployment_rate,percentage_covid_cases,percentage_covid_deaths
0,Abbeville,SC,0.661,0.33,19234.0,22.7,32.1,9.4,0.032475,0.000686
1,Acadia,LA,0.795,0.191,21591.0,21.5,27.6,8.9,0.050825,0.001629
2,Accomack,VA,0.542,0.447,24266.0,19.8,31.8,5.4,0.037363,0.000579
3,Ada,ID,0.504,0.465,31642.0,11.8,13.1,4.3,0.040106,0.000416
4,Adair,IA,0.697,0.286,28861.0,9.5,12.1,3.0,0.030868,0.000139


In [60]:
# Preview the sociohealth dataframe before its columns are renamed
sociohealth_df.head(n=5)

Unnamed: 0,fips,state,county,percent_uninsured,percent_vaccinated,high_school_graduation_rate,percent_some_college,violent_crime_rate,life_expectancy
0,1001,Alabama,Autauga,8.721686,41.0,90.0,62.009974,272.28222,76.879477
1,1003,Alabama,Baldwin,11.333404,44.0,86.361577,67.37162,203.660396,78.450258
2,1005,Alabama,Barbour,12.242792,37.0,81.410256,34.857649,414.277861,75.341935
3,1007,Alabama,Bibb,10.206253,38.0,83.763838,44.137353,89.349126,73.57182
4,1009,Alabama,Blount,13.360759,39.0,93.468795,53.361073,482.690611,74.145826


In [61]:
# Rename the sociohealth columns so the percentage fields use the same
# `percentage_` prefix as the renamed county statistics columns.
# Columns not listed in the mapping keep their existing names.
# NOTE(review): the 'percent_some_college' target is spelled
# 'percentage_some_college' here; confirm the sql table uses this spelling.
sociohealth_df = sociohealth_df.rename(columns = {
    'fips':'fips_code',
    'percent_uninsured':'percentage_uninsured',
    'percent_vaccinated':'percentage_vaccinated',
    'percent_some_college':'percentage_some_college'
})
# Preview the first rows to verify the new column names
sociohealth_df.head()

Unnamed: 0,fips_code,state,county,percentage_uninsured,percentage_vaccinated,high_school_graduation_rate,pcercentage_some_college,violent_crime_rate,life_expectancy
0,1001,Alabama,Autauga,8.721686,41.0,90.0,62.009974,272.28222,76.879477
1,1003,Alabama,Baldwin,11.333404,44.0,86.361577,67.37162,203.660396,78.450258
2,1005,Alabama,Barbour,12.242792,37.0,81.410256,34.857649,414.277861,75.341935
3,1007,Alabama,Bibb,10.206253,38.0,83.763838,44.137353,89.349126,73.57182
4,1009,Alabama,Blount,13.360759,39.0,93.468795,53.361073,482.690611,74.145826
