In [21]:
# Dependencies and Setup
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
from collections import Counter
import requests
import json
import warnings
warnings.filterwarnings("ignore")

In [7]:
#path for data sets
#vaccination data of us from kaggle
kaggle_vaccine=os.path.join('data','us_state_vaccinations.csv')

#vaccination data from cdc
cdc_vaccine=os.path.join('data','cdc_vaccine.csv')

#demographic and other data from census api
census_data=os.path.join('data','census_data.csv')

#election data 
#candidate_data=os.path.join('data','president_candidate.csv' )
election_data=os.path.join('data','county_statistics.csv')

In [8]:
#reading csv file from kaggle source
kaggle_vaccine_df=pd.read_csv(kaggle_vaccine)

#reading csv file from kaggle source
cdc_vaccine_df=pd.read_csv(cdc_vaccine)

#reading csv file from census api
census_data_df=pd.read_csv(census_data)

#reading csv file of election data
#candidate_data_df=pd.read_csv(candidate_data)
election_data_df=pd.read_csv(election_data)

In [9]:
#shape of data
print(f'Shape of data from kaggle source is {kaggle_vaccine_df.shape}')
print(f'Shape of data from cdc source is {cdc_vaccine_df.shape}')
print(f'Shape of data from census api {census_data_df.shape}')
print(f'Shape of election data {election_data_df.shape}')

Shape of data from kaggle source is (18465, 14)
Shape of data from cdc source is (1050251, 32)
Shape of data from census api (33120, 23)
Shape of election data (4867, 51)


## Cleaning datasets

### Cleaning election data 

In [104]:
#getting a copy of election data
election_df=election_data_df.copy()

In [105]:
election_df.columns

Index(['Unnamed: 0', 'county', 'state', 'percentage16_Donald_Trump',
       'percentage16_Hillary_Clinton', 'total_votes16', 'votes16_Donald_Trump',
       'votes16_Hillary_Clinton', 'percentage20_Donald_Trump',
       'percentage20_Joe_Biden', 'total_votes20', 'votes20_Donald_Trump',
       'votes20_Joe_Biden', 'lat', 'long', 'cases', 'deaths', 'TotalPop',
       'Men', 'Women', 'Hispanic', 'White', 'Black', 'Native', 'Asian',
       'Pacific', 'VotingAgeCitizen', 'Income', 'IncomeErr', 'IncomePerCap',
       'IncomePerCapErr', 'Poverty', 'ChildPoverty', 'Professional', 'Service',
       'Office', 'Construction', 'Production', 'Drive', 'Carpool', 'Transit',
       'Walk', 'OtherTransp', 'WorkAtHome', 'MeanCommute', 'Employed',
       'PrivateWork', 'PublicWork', 'SelfEmployed', 'FamilyWork',
       'Unemployment'],
      dtype='object')

In [106]:
#selecting only certain columns
election_df=election_df[['county', 'state',  'percentage20_Donald_Trump',
       'percentage20_Joe_Biden']]

In [107]:
#rename columns 
election_df.rename(columns={'percentage20_Donald_Trump':'Republic', 
                           'percentage20_Joe_Biden':'Democrate'}, inplace=True)

In [108]:
election_df

Unnamed: 0,county,state,Republic,Democrate
0,Abbeville,SC,0.661,0.330
1,Acadia,LA,0.795,0.191
2,Accomack,VA,0.542,0.447
3,Ada,ID,0.504,0.465
4,Adair,IA,0.697,0.286
...,...,...,...,...
4862,Valdez-Cordova Census Area,AK,,
4863,Wrangell City and Borough,AK,,
4864,Yakutat City and Borough,AK,,
4865,Yukon-Koyukuk Census Area,AK,,


In [109]:
election_df['county']=election_df['county'].apply(lambda x: x +' County')
election_df['county']

0                        Abbeville County
1                           Acadia County
2                         Accomack County
3                              Ada County
4                            Adair County
                      ...                
4862    Valdez-Cordova Census Area County
4863     Wrangell City and Borough County
4864      Yakutat City and Borough County
4865     Yukon-Koyukuk Census Area County
4866                      Doña Ana County
Name: county, Length: 4867, dtype: object

### Cleaning census data 

In [72]:
#getting a copy of election data
census_df=census_data_df.copy()

In [73]:
census_df.columns

Index(['Zipcode', 'Population', 'Median Age', 'Household Income',
       'Per Capita Income', 'Poverty Rate', 'Unemployment Rate',
       'Public Transport Rate', 'Personal Transport Rate',
       'Commute Time Public', 'Commute Time Car', 'High School Rate',
       'College Rate', 'Uneducated Rate', 'White Population Rate',
       'Black Population Rate', 'Hispanic Population Rate',
       'Asian Population Rate', 'City', 'County', 'Lat', 'Lng', 'State'],
      dtype='object')

In [74]:
#selecting only certain columns
census_df=census_df[['City', 'County', 'State','Zipcode', 'Population', 'Median Age', 'Household Income',
       'Per Capita Income', 'Poverty Rate', 'Unemployment Rate',
       'Public Transport Rate', 'Personal Transport Rate',
       'Commute Time Public', 'Commute Time Car', 'High School Rate',
       'College Rate', 'Uneducated Rate', 'White Population Rate',
       'Black Population Rate', 'Hispanic Population Rate',
       'Asian Population Rate' ]]

In [75]:
#grouping by county
census_df=(census_df.groupby(['County','State']).agg({
                                            
                                            'Population':"mean",
                                            'Median Age':"mean", 
                                            'Household Income':"mean",
                                            'Per Capita Income':"mean", 
                                            'Poverty Rate':"mean", 
                                            'Unemployment Rate':"mean",
                                            'Public Transport Rate':"mean", 
                                            'Personal Transport Rate':"mean",
                                            'Commute Time Public':"mean", 
                                            'Commute Time Car':"mean",
                                            'High School Rate':"mean",
                                            'College Rate':"mean", 
                                            'Uneducated Rate':"mean",
                                            'White Population Rate':"mean",
                                            'Black Population Rate':"mean", 
                                            'Hispanic Population Rate':"mean",
                                            'Asian Population Rate':"mean"}).reset_index())

In [92]:
census_df

Unnamed: 0,County,State,Population,Median Age,Household Income,Per Capita Income,Poverty Rate,Unemployment Rate,Public Transport Rate,Personal Transport Rate,Commute Time Public,Commute Time Car,High School Rate,College Rate,Uneducated Rate,White Population Rate,Black Population Rate,Hispanic Population Rate,Asian Population Rate
0,Aasco Municipio,PR,26686.000000,42.700000,1.989800e+04,10572.000000,45.731844,3.855954,0.316919,74.134569,,,19.253541,10.488646,2.787979,82.597617,1.727498,95.061081,0.000000
1,Abbeville County,SC,4084.200000,39.900000,3.914320e+04,20615.800000,22.958482,2.146448,0.627321,81.398007,-666666666.0,1.140000e+03,20.175639,6.307386,0.731876,65.861217,30.431803,1.097323,0.523793
2,Acadia Parish,LA,6208.700000,36.360000,4.355100e+04,22921.500000,20.894436,3.432778,0.052825,89.523281,-666666666.0,1.173667e+04,23.095957,7.351045,0.985701,86.836943,10.600556,2.720125,0.061823
3,Accomack County,VA,990.090909,48.833333,-1.009644e+08,26556.696970,14.984460,1.492080,1.769139,76.888252,-666666666.0,-7.407153e+07,26.353785,8.400832,0.576869,73.265729,24.703698,5.307683,0.425414
4,Ada County,ID,30571.600000,37.760000,6.859527e+04,36599.933333,10.538547,1.974483,0.434869,79.577062,-166663264.0,2.272650e+05,11.349687,17.948706,0.390195,90.713137,1.276617,7.991648,2.490008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3216,Yuma County,AZ,20955.000000,38.250000,4.233260e+04,19678.500000,18.990481,3.769920,2.387656,80.651816,14165.0,2.727975e+05,13.993028,5.376302,2.378759,70.144058,0.966517,66.587098,0.678975
3217,Yuma County,CO,1402.142857,40.157143,5.246343e+04,27721.571429,13.854494,1.191867,0.000000,65.570358,,,19.333835,11.042061,0.314244,98.336295,0.346629,14.767461,0.019532
3218,Zapata County,TX,4667.666667,31.433333,6.493133e+04,36660.000000,27.146749,5.689971,0.000000,76.696688,-666666666.0,9.250000e+02,15.544681,3.799101,1.059885,94.897257,0.000000,97.133219,0.286540
3219,Zavala County,TX,4017.000000,32.333333,3.260533e+04,13198.666667,39.085347,3.397647,1.025641,86.417275,,,11.545223,3.748947,1.901093,97.666404,0.148230,90.042087,0.040763


### Cleaning cdc vaccine data 

In [49]:
#getting a copy of election data
vaccine_df=cdc_vaccine_df.copy()

In [50]:
vaccine_df.columns

Index(['Date', 'FIPS', 'MMWR_week', 'Recip_County', 'Recip_State',
       'Series_Complete_Pop_Pct', 'Series_Complete_Yes',
       'Series_Complete_12Plus', 'Series_Complete_12PlusPop_Pct',
       'Series_Complete_18Plus', 'Series_Complete_18PlusPop_Pct',
       'Series_Complete_65Plus', 'Series_Complete_65PlusPop_Pct',
       'Completeness_pct', 'Administered_Dose1_Recip',
       'Administered_Dose1_Pop_Pct', 'Administered_Dose1_Recip_12Plus',
       'Administered_Dose1_Recip_12PlusPop_Pct',
       'Administered_Dose1_Recip_18Plus',
       'Administered_Dose1_Recip_18PlusPop_Pct',
       'Administered_Dose1_Recip_65Plus',
       'Administered_Dose1_Recip_65PlusPop_Pct', 'SVI_CTGY',
       'Series_Complete_Pop_Pct_SVI', 'Series_Complete_12PlusPop_Pct_SVI',
       'Series_Complete_18PlusPop_Pct_SVI',
       'Series_Complete_65PlusPop_Pct_SVI', 'Metro_status',
       'Series_Complete_Pop_Pct_UR_Equity',
       'Series_Complete_12PlusPop_Pct_UR_Equity',
       'Series_Complete_18PlusPop_P

In [59]:
#convering date tot datetime 
vaccine_df['Date']=pd.to_datetime(vaccine_df['Date'])

In [64]:
#latest vaccine information
vaccine_oct_df=vaccine_df[vaccine_df.Date== "2021-10-28"]
vaccine_oct_df

Unnamed: 0,Date,FIPS,MMWR_week,Recip_County,Recip_State,Series_Complete_Pop_Pct,Series_Complete_Yes,Series_Complete_12Plus,Series_Complete_12PlusPop_Pct,Series_Complete_18Plus,...,SVI_CTGY,Series_Complete_Pop_Pct_SVI,Series_Complete_12PlusPop_Pct_SVI,Series_Complete_18PlusPop_Pct_SVI,Series_Complete_65PlusPop_Pct_SVI,Metro_status,Series_Complete_Pop_Pct_UR_Equity,Series_Complete_12PlusPop_Pct_UR_Equity,Series_Complete_18PlusPop_Pct_UR_Equity,Series_Complete_65PlusPop_Pct_UR_Equity
0,2021-10-28,01091,43,Marengo County,AL,49.2,9272,9272.0,57.7,8780,...,D,15.0,16.0,16.0,16.0,Non-metro,7.0,8.0,8.0,8.0
1,2021-10-28,21217,43,Taylor County,KY,48.6,12516,12516.0,57.3,11864,...,C,11.0,12.0,12.0,12.0,Non-metro,7.0,8.0,8.0,8.0
2,2021-10-28,05069,43,Jefferson County,AR,37.7,25225,25199.0,44.0,23518,...,D,14.0,15.0,15.0,15.0,Metro,2.0,3.0,3.0,3.0
3,2021-10-28,30065,43,Musselshell County,MT,31.1,1443,1443.0,35.2,1413,...,B,6.0,6.0,6.0,7.0,Non-metro,6.0,6.0,6.0,7.0
4,2021-10-28,30075,43,Powder River County,MT,19.6,330,330.0,21.9,326,...,A,1.0,1.0,1.0,2.0,Non-metro,5.0,5.0,5.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3278,2021-10-28,UNK,43,Unknown County,NJ,0.0,379004,378977.0,0.0,332818,...,,,,,,,,,,
3279,2021-10-28,30093,43,Silver Bow County,MT,56.4,19709,19708.0,65.4,18487,...,C,12.0,12.0,12.0,12.0,Non-metro,8.0,8.0,8.0,8.0
3280,2021-10-28,53071,43,Walla Walla County,WA,57.2,34753,34752.0,66.1,32592,...,C,12.0,12.0,12.0,12.0,Metro,4.0,4.0,4.0,4.0
3281,2021-10-28,20059,43,Franklin County,KS,45.6,11637,11637.0,53.8,11015,...,A,3.0,4.0,4.0,4.0,Non-metro,7.0,8.0,8.0,8.0


In [118]:
comb_df=census_df.merge(vaccine_oct_df, how='left', left_on=['County', 'State'], right_on=['Recip_County','Recip_State'])
comb_df

Unnamed: 0,County,State,Population,Median Age,Household Income,Per Capita Income,Poverty Rate,Unemployment Rate,Public Transport Rate,Personal Transport Rate,...,SVI_CTGY,Series_Complete_Pop_Pct_SVI,Series_Complete_12PlusPop_Pct_SVI,Series_Complete_18PlusPop_Pct_SVI,Series_Complete_65PlusPop_Pct_SVI,Metro_status,Series_Complete_Pop_Pct_UR_Equity,Series_Complete_12PlusPop_Pct_UR_Equity,Series_Complete_18PlusPop_Pct_UR_Equity,Series_Complete_65PlusPop_Pct_UR_Equity
0,Aasco Municipio,PR,26686.000000,42.700000,1.989800e+04,10572.000000,45.731844,3.855954,0.316919,74.134569,...,,,,,,,,,,
1,Abbeville County,SC,4084.200000,39.900000,3.914320e+04,20615.800000,22.958482,2.146448,0.627321,81.398007,...,D,14.0,15.0,15.0,15.0,Non-metro,6.0,7.0,7.0,7.0
2,Acadia Parish,LA,6208.700000,36.360000,4.355100e+04,22921.500000,20.894436,3.432778,0.052825,89.523281,...,D,15.0,16.0,16.0,16.0,Metro,3.0,4.0,4.0,4.0
3,Accomack County,VA,990.090909,48.833333,-1.009644e+08,26556.696970,14.984460,1.492080,1.769139,76.888252,...,D,16.0,16.0,16.0,16.0,Non-metro,8.0,8.0,8.0,8.0
4,Ada County,ID,30571.600000,37.760000,6.859527e+04,36599.933333,10.538547,1.974483,0.434869,79.577062,...,A,4.0,,4.0,4.0,Metro,4.0,,4.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3216,Yuma County,AZ,20955.000000,38.250000,4.233260e+04,19678.500000,18.990481,3.769920,2.387656,80.651816,...,D,16.0,16.0,16.0,16.0,Metro,4.0,4.0,4.0,4.0
3217,Yuma County,CO,1402.142857,40.157143,5.246343e+04,27721.571429,13.854494,1.191867,0.000000,65.570358,...,C,10.0,11.0,12.0,11.0,Non-metro,6.0,7.0,8.0,7.0
3218,Zapata County,TX,4667.666667,31.433333,6.493133e+04,36660.000000,27.146749,5.689971,0.000000,76.696688,...,D,16.0,16.0,16.0,16.0,Non-metro,8.0,8.0,8.0,8.0
3219,Zavala County,TX,4017.000000,32.333333,3.260533e+04,13198.666667,39.085347,3.397647,1.025641,86.417275,...,D,15.0,16.0,16.0,16.0,Non-metro,7.0,8.0,8.0,8.0


In [122]:
df=comb_df.merge(election_df, how='left', left_on=['County', 'State'], right_on=['county','state'])
df.columns

Index(['County', 'State', 'Population', 'Median Age', 'Household Income',
       'Per Capita Income', 'Poverty Rate', 'Unemployment Rate',
       'Public Transport Rate', 'Personal Transport Rate',
       'Commute Time Public', 'Commute Time Car', 'High School Rate',
       'College Rate', 'Uneducated Rate', 'White Population Rate',
       'Black Population Rate', 'Hispanic Population Rate',
       'Asian Population Rate', 'Date', 'FIPS', 'MMWR_week', 'Recip_County',
       'Recip_State', 'Series_Complete_Pop_Pct', 'Series_Complete_Yes',
       'Series_Complete_12Plus', 'Series_Complete_12PlusPop_Pct',
       'Series_Complete_18Plus', 'Series_Complete_18PlusPop_Pct',
       'Series_Complete_65Plus', 'Series_Complete_65PlusPop_Pct',
       'Completeness_pct', 'Administered_Dose1_Recip',
       'Administered_Dose1_Pop_Pct', 'Administered_Dose1_Recip_12Plus',
       'Administered_Dose1_Recip_12PlusPop_Pct',
       'Administered_Dose1_Recip_18Plus',
       'Administered_Dose1_Recip_18PlusP