In [124]:
# Dependencies and Setup
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
from collections import Counter
import requests
import json
import warnings
warnings.filterwarnings("ignore")

In [244]:
#path for data sets
#vaccination data from cdc
cdc_vaccine=os.path.join('data','cdc_vaccine.csv')

#demographic and other data from census api
census_data=os.path.join('data','census_data.csv')

#election data 
candidate_data=os.path.join('data','president_candidate.csv' )
election_data=os.path.join('data','county_statistics.csv')

In [251]:
candidate_df=pd.read_csv(candidate_data)
candidate_df

Unnamed: 0,state,county,candidate,party,total_votes,won
0,Delaware,Kent County,Joe Biden,DEM,44552,True
1,Delaware,Kent County,Donald Trump,REP,41009,False
2,Delaware,Kent County,Jo Jorgensen,LIB,1044,False
3,Delaware,Kent County,Howie Hawkins,GRN,420,False
4,Delaware,New Castle County,Joe Biden,DEM,195034,True
...,...,...,...,...,...,...
32172,Arizona,Maricopa County,Write-ins,WRI,1331,False
32173,Arizona,Mohave County,Donald Trump,REP,78535,True
32174,Arizona,Mohave County,Joe Biden,DEM,24831,False
32175,Arizona,Mohave County,Jo Jorgensen,LIB,1302,False


In [250]:

candidate_df['county'].isin(['Joe Biden'])

0        False
1        False
2        False
3        False
4        False
         ...  
32172    False
32173    False
32174    False
32175    False
32176    False
Name: county, Length: 32177, dtype: bool

In [8]:
#reading csv file from kaggle source
cdc_vaccine_df=pd.read_csv(cdc_vaccine)

#reading csv file from census api
census_data_df=pd.read_csv(census_data)

#reading csv file of election data
election_data_df=pd.read_csv(election_data)

In [9]:
#shape of data
print(f'Shape of data from cdc source is {cdc_vaccine_df.shape}')
print(f'Shape of data from census api {census_data_df.shape}')
print(f'Shape of election data {election_data_df.shape}')

Shape of data from kaggle source is (18465, 14)
Shape of data from cdc source is (1050251, 32)
Shape of data from census api (33120, 23)
Shape of election data (4867, 51)


## Cleaning datasets

### Cleaning election data 

In [223]:
#getting a copy of election data
election_df=election_data_df.copy()

In [224]:
election_df.columns

Index(['Unnamed: 0', 'county', 'state', 'percentage16_Donald_Trump',
       'percentage16_Hillary_Clinton', 'total_votes16', 'votes16_Donald_Trump',
       'votes16_Hillary_Clinton', 'percentage20_Donald_Trump',
       'percentage20_Joe_Biden', 'total_votes20', 'votes20_Donald_Trump',
       'votes20_Joe_Biden', 'lat', 'long', 'cases', 'deaths', 'TotalPop',
       'Men', 'Women', 'Hispanic', 'White', 'Black', 'Native', 'Asian',
       'Pacific', 'VotingAgeCitizen', 'Income', 'IncomeErr', 'IncomePerCap',
       'IncomePerCapErr', 'Poverty', 'ChildPoverty', 'Professional', 'Service',
       'Office', 'Construction', 'Production', 'Drive', 'Carpool', 'Transit',
       'Walk', 'OtherTransp', 'WorkAtHome', 'MeanCommute', 'Employed',
       'PrivateWork', 'PublicWork', 'SelfEmployed', 'FamilyWork',
       'Unemployment'],
      dtype='object')

In [225]:
#selecting only certain columns
election_df=election_df[['county', 'state',  'percentage20_Donald_Trump',
       'percentage20_Joe_Biden']]

In [226]:
#rename columns 
election_df.rename(columns={'county':'County',
                            'state':'State',
                            'percentage20_Donald_Trump':'Republic', 
                           'percentage20_Joe_Biden':'Democrate'}, inplace=True)

In [227]:
#adding columns win to see whether democrate or repbublican win
election_df['Win']=election_df['Democrate'].apply(lambda x:'D' if x>0.5  else 'R')

In [228]:
election_df['County']=election_df['County'].apply(lambda x: x+ ' '+ "County")

In [229]:
#dropping rows with nan
election_df=election_df.dropna(how='any')

In [239]:
election_df=election_df.sort_values('County', ascending=True)
election_df.County.value_counts()

Washington County    33
Jefferson County     28
Lincoln County       28
Franklin County      27
Jackson County       26
                     ..
Hallowell County      1
Hamblen County        1
Hamden County         1
Hamlin County         1
Ziebach County        1
Name: County, Length: 2697, dtype: int64

In [240]:
census_df.County.value_counts()

Washington County       30
Jefferson County        25
Franklin County         24
Lincoln County          23
Jackson County          23
                        ..
Guernsey County          1
Guaynabo Municipio       1
Guayanilla Municipio     1
Guayama Municipio        1
Ziebach County           1
Name: County, Length: 1950, dtype: int64

In [242]:
vaccine_oct_df.County.value_counts()

Unknown County        59
Washington County     30
Jefferson County      25
Franklin County       24
Lincoln County        23
                      ..
Grand Isle County      1
Pickaway County        1
Hampton County         1
Rensselaer County      1
Walla Walla County     1
Name: County, Length: 1960, dtype: int64

In [238]:
#saving clean data
election_df.to_csv("cleaned_data/election_data.csv", encoding="utf-8", index=False)

In [232]:
election_df.columns

Index(['County', 'State', 'Republic', 'Democrate', 'Win'], dtype='object')

In [233]:
election_df

Unnamed: 0,County,State,Republic,Democrate,Win
0,Abbeville County,SC,0.661,0.330,R
3111,Abbot County,ME,0.691,0.290,R
3112,Abington County,MA,0.439,0.537,D
1,Acadia County,LA,0.795,0.191,R
2,Accomack County,VA,0.542,0.447,R
...,...,...,...,...,...
3107,Yuma County,CO,0.826,0.156,R
3106,Yuma County,AZ,0.523,0.461,R
3108,Zapata County,TX,0.525,0.471,R
3109,Zavala County,TX,0.340,0.654,D


### Cleaning census data 

In [72]:
#getting a copy of election data
census_df=census_data_df.copy()

In [73]:
census_df.columns

Index(['Zipcode', 'Population', 'Median Age', 'Household Income',
       'Per Capita Income', 'Poverty Rate', 'Unemployment Rate',
       'Public Transport Rate', 'Personal Transport Rate',
       'Commute Time Public', 'Commute Time Car', 'High School Rate',
       'College Rate', 'Uneducated Rate', 'White Population Rate',
       'Black Population Rate', 'Hispanic Population Rate',
       'Asian Population Rate', 'City', 'County', 'Lat', 'Lng', 'State'],
      dtype='object')

In [74]:
#selecting only certain columns
census_df=census_df[['City', 'County', 'State','Zipcode', 'Population', 'Median Age', 'Household Income',
       'Per Capita Income', 'Poverty Rate', 'Unemployment Rate',
       'Public Transport Rate', 'Personal Transport Rate',
       'Commute Time Public', 'Commute Time Car', 'High School Rate',
       'College Rate', 'Uneducated Rate', 'White Population Rate',
       'Black Population Rate', 'Hispanic Population Rate',
       'Asian Population Rate' ]]

In [149]:
#grouping by county
census_df=(census_df.groupby(['County','State']).agg({
                                            'Population':"mean",
                                            'Median Age':"mean", 
                                            'Household Income':"mean", 
                                            'Poverty Rate':"mean", 
                                            'Unemployment Rate':"mean",
                                            'High School Rate':"mean",
                                            'College Rate':"mean", 
                                            'Uneducated Rate':"mean",
                                            'White Population Rate':"mean",
                                            'Black Population Rate':"mean", 
                                            'Hispanic Population Rate':"mean",
                                            'Asian Population Rate':"mean"}).reset_index())

In [152]:
#dropping nan value
census_df=census_df.dropna(how='any')

In [188]:
census_df

Unnamed: 0,County,State,Population,Median Age,Household Income,Poverty Rate,Unemployment Rate,High School Rate,College Rate,Uneducated Rate,White Population Rate,Black Population Rate,Hispanic Population Rate,Asian Population Rate
0,Aasco Municipio,PR,26686.000000,42.700000,1.989800e+04,45.731844,3.855954,19.253541,10.488646,2.787979,82.597617,1.727498,95.061081,0.000000
1,Abbeville County,SC,4084.200000,39.900000,3.914320e+04,22.958482,2.146448,20.175639,6.307386,0.731876,65.861217,30.431803,1.097323,0.523793
2,Acadia Parish,LA,6208.700000,36.360000,4.355100e+04,20.894436,3.432778,23.095957,7.351045,0.985701,86.836943,10.600556,2.720125,0.061823
3,Accomack County,VA,990.090909,48.833333,-1.009644e+08,14.984460,1.492080,26.353785,8.400832,0.576869,73.265729,24.703698,5.307683,0.425414
4,Ada County,ID,30571.600000,37.760000,6.859527e+04,10.538547,1.974483,11.349687,17.948706,0.390195,90.713137,1.276617,7.991648,2.490008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3216,Yuma County,AZ,20955.000000,38.250000,4.233260e+04,18.990481,3.769920,13.993028,5.376302,2.378759,70.144058,0.966517,66.587098,0.678975
3217,Yuma County,CO,1402.142857,40.157143,5.246343e+04,13.854494,1.191867,19.333835,11.042061,0.314244,98.336295,0.346629,14.767461,0.019532
3218,Zapata County,TX,4667.666667,31.433333,6.493133e+04,27.146749,5.689971,15.544681,3.799101,1.059885,94.897257,0.000000,97.133219,0.286540
3219,Zavala County,TX,4017.000000,32.333333,3.260533e+04,39.085347,3.397647,11.545223,3.748947,1.901093,97.666404,0.148230,90.042087,0.040763


In [None]:
census_df[[]]

In [189]:
census_df.columns

Index(['County', 'State', 'Population', 'Median Age', 'Household Income',
       'Poverty Rate', 'Unemployment Rate', 'High School Rate', 'College Rate',
       'Uneducated Rate', 'White Population Rate', 'Black Population Rate',
       'Hispanic Population Rate', 'Asian Population Rate'],
      dtype='object')

In [212]:
census_df=census_df.sort_values('County', ascending=True)
census_df

Unnamed: 0,County,State,Population,Median Age,Household Income,Poverty Rate,Unemployment Rate,High School Rate,College Rate,Uneducated Rate,White Population Rate,Black Population Rate,Hispanic Population Rate,Asian Population Rate
0,Aasco Municipio,PR,26686.000000,42.700000,1.989800e+04,45.731844,3.855954,19.253541,10.488646,2.787979,82.597617,1.727498,95.061081,0.000000
1,Abbeville County,SC,4084.200000,39.900000,3.914320e+04,22.958482,2.146448,20.175639,6.307386,0.731876,65.861217,30.431803,1.097323,0.523793
2,Acadia Parish,LA,6208.700000,36.360000,4.355100e+04,20.894436,3.432778,23.095957,7.351045,0.985701,86.836943,10.600556,2.720125,0.061823
3,Accomack County,VA,990.090909,48.833333,-1.009644e+08,14.984460,1.492080,26.353785,8.400832,0.576869,73.265729,24.703698,5.307683,0.425414
4,Ada County,ID,30571.600000,37.760000,6.859527e+04,10.538547,1.974483,11.349687,17.948706,0.390195,90.713137,1.276617,7.991648,2.490008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3216,Yuma County,AZ,20955.000000,38.250000,4.233260e+04,18.990481,3.769920,13.993028,5.376302,2.378759,70.144058,0.966517,66.587098,0.678975
3217,Yuma County,CO,1402.142857,40.157143,5.246343e+04,13.854494,1.191867,19.333835,11.042061,0.314244,98.336295,0.346629,14.767461,0.019532
3218,Zapata County,TX,4667.666667,31.433333,6.493133e+04,27.146749,5.689971,15.544681,3.799101,1.059885,94.897257,0.000000,97.133219,0.286540
3219,Zavala County,TX,4017.000000,32.333333,3.260533e+04,39.085347,3.397647,11.545223,3.748947,1.901093,97.666404,0.148230,90.042087,0.040763


In [191]:
#saving clean data
census_df.to_csv("cleaned_data/census_data.csv", encoding="utf-8", index=False)

### Cleaning cdc vaccine data 

In [178]:
#getting a copy of election data
vaccine_df=cdc_vaccine_df.copy()

In [179]:
vaccine_df.columns

Index(['Date', 'FIPS', 'MMWR_week', 'Recip_County', 'Recip_State',
       'Series_Complete_Pop_Pct', 'Series_Complete_Yes',
       'Series_Complete_12Plus', 'Series_Complete_12PlusPop_Pct',
       'Series_Complete_18Plus', 'Series_Complete_18PlusPop_Pct',
       'Series_Complete_65Plus', 'Series_Complete_65PlusPop_Pct',
       'Completeness_pct', 'Administered_Dose1_Recip',
       'Administered_Dose1_Pop_Pct', 'Administered_Dose1_Recip_12Plus',
       'Administered_Dose1_Recip_12PlusPop_Pct',
       'Administered_Dose1_Recip_18Plus',
       'Administered_Dose1_Recip_18PlusPop_Pct',
       'Administered_Dose1_Recip_65Plus',
       'Administered_Dose1_Recip_65PlusPop_Pct', 'SVI_CTGY',
       'Series_Complete_Pop_Pct_SVI', 'Series_Complete_12PlusPop_Pct_SVI',
       'Series_Complete_18PlusPop_Pct_SVI',
       'Series_Complete_65PlusPop_Pct_SVI', 'Metro_status',
       'Series_Complete_Pop_Pct_UR_Equity',
       'Series_Complete_12PlusPop_Pct_UR_Equity',
       'Series_Complete_18PlusPop_P

In [180]:
vaccine_df=vaccine_df[['Date',  'Recip_County', 'Recip_State',
       'Series_Complete_Pop_Pct', 'Series_Complete_Yes',
        'Series_Complete_12PlusPop_Pct',
       'Series_Complete_18PlusPop_Pct',
        'Series_Complete_65PlusPop_Pct',
       'Completeness_pct', 'Administered_Dose1_Recip',
       'Administered_Dose1_Pop_Pct',
       'Administered_Dose1_Recip_12PlusPop_Pct',
       'Administered_Dose1_Recip_18PlusPop_Pct',
       'Administered_Dose1_Recip_65PlusPop_Pct']]

In [197]:
vaccine_df.rename(columns={'Recip_County': 'County',
                           'Recip_State': 'State',
                           'Series_Complete_Yes':'Fully_Vaccinated',
                            'Series_Complete_12PlusPop_Pct':'Fully_Vaccinated_12+',
                           'Series_Complete_18PlusPop_Pct':'Fully_Vaccinated_18+',
                            'Series_Complete_65PlusPop_Pct':'Fully_Vaccinated_65+',
                           'Completeness_pct':'Completeness_pct', 
                           'Administered_Dose1_Pop_Pct':'Partially_Vaccinated',
                           'Administered_Dose1_Recip_12PlusPop_Pct':'Partially_Vaccinated_12+',
                           'Administered_Dose1_Recip_18PlusPop_Pct':'Partially_Vaccinated_18+',
                           'Administered_Dose1_Recip_65PlusPop_Pct':'Partially_Vaccinated_65+'}, inplace=True)

In [198]:
#convering date tot datetime 
vaccine_df['Date']=pd.to_datetime(vaccine_df['Date'])

In [199]:
#latest vaccine data only
vaccine_oct_df=vaccine_df[vaccine_df.Date== "2021-10-28"]
vaccine_oct_df

Unnamed: 0,Date,County,State,Series_Complete_Pop_Pct,Fully_Vaccinated,Fully_Vaccinated_12+,Fully_Vaccinated_18+,Fully_Vaccinated_65+,Completeness_pct,Administered_Dose1_Recip,Partially_Vaccinated,Partially_Vaccinated_12+,Partially_Vaccinated_18+,Partially_Vaccinated_65+
0,2021-10-28,Marengo County,AL,49.2,9272,57.7,60.1,78.0,92.6,11422.0,60.6,71.0,73.5,90.0
1,2021-10-28,Taylor County,KY,48.6,12516,57.3,59.3,82.4,94.4,14120.0,54.8,64.7,66.8,89.5
2,2021-10-28,Jefferson County,AR,37.7,25225,44.0,45.0,61.7,90.3,32370.0,48.4,56.4,57.2,74.0
3,2021-10-28,Musselshell County,MT,31.1,1443,35.2,37.4,53.9,95.7,1574.0,34.0,38.4,40.7,56.4
4,2021-10-28,Powder River County,MT,19.6,330,21.9,23.3,37.8,95.7,362.0,21.5,24.0,25.5,39.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3278,2021-10-28,Unknown County,NJ,0.0,379004,0.0,0.0,0.0,93.6,,0.0,0.0,0.0,0.0
3279,2021-10-28,Silver Bow County,MT,56.4,19709,65.4,66.4,87.7,95.7,21103.0,60.4,70.0,70.9,92.9
3280,2021-10-28,Walla Walla County,WA,57.2,34753,66.1,67.6,83.4,96.1,38185.0,62.8,72.6,74.2,92.4
3281,2021-10-28,Franklin County,KS,45.6,11637,53.8,56.5,83.0,93.6,12893.0,50.5,59.6,62.5,90.0


In [200]:
#Replacing nan with 0
vaccine_oct_df=vaccine_oct_df.fillna('0.0')
vaccine_oct_df

Unnamed: 0,Date,County,State,Series_Complete_Pop_Pct,Fully_Vaccinated,Fully_Vaccinated_12+,Fully_Vaccinated_18+,Fully_Vaccinated_65+,Completeness_pct,Administered_Dose1_Recip,Partially_Vaccinated,Partially_Vaccinated_12+,Partially_Vaccinated_18+,Partially_Vaccinated_65+
0,2021-10-28,Marengo County,AL,49.2,9272,57.7,60.1,78.0,92.6,11422.0,60.6,71.0,73.5,90.0
1,2021-10-28,Taylor County,KY,48.6,12516,57.3,59.3,82.4,94.4,14120.0,54.8,64.7,66.8,89.5
2,2021-10-28,Jefferson County,AR,37.7,25225,44.0,45.0,61.7,90.3,32370.0,48.4,56.4,57.2,74.0
3,2021-10-28,Musselshell County,MT,31.1,1443,35.2,37.4,53.9,95.7,1574.0,34.0,38.4,40.7,56.4
4,2021-10-28,Powder River County,MT,19.6,330,21.9,23.3,37.8,95.7,362.0,21.5,24.0,25.5,39.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3278,2021-10-28,Unknown County,NJ,0.0,379004,0.0,0.0,0.0,93.6,0.0,0.0,0.0,0.0,0.0
3279,2021-10-28,Silver Bow County,MT,56.4,19709,65.4,66.4,87.7,95.7,21103.0,60.4,70.0,70.9,92.9
3280,2021-10-28,Walla Walla County,WA,57.2,34753,66.1,67.6,83.4,96.1,38185.0,62.8,72.6,74.2,92.4
3281,2021-10-28,Franklin County,KS,45.6,11637,53.8,56.5,83.0,93.6,12893.0,50.5,59.6,62.5,90.0


In [201]:
#saving clean data
vaccine_oct_df.to_csv("cleaned_data/vaccine_data.csv", encoding="utf-8", index=False)

In [202]:
vaccine_oct_df.columns

Index(['Date', 'County', 'State', 'Series_Complete_Pop_Pct',
       'Fully_Vaccinated', 'Fully_Vaccinated_12+', 'Fully_Vaccinated_18+',
       'Fully_Vaccinated_65+', 'Completeness_pct', 'Administered_Dose1_Recip',
       'Partially_Vaccinated', 'Partially_Vaccinated_12+',
       'Partially_Vaccinated_18+', 'Partially_Vaccinated_65+'],
      dtype='object')