# COVID-19 vaccination project: webscraping

In [98]:
from bs4 import BeautifulSoup
import requests
import time, os
import re
import pandas as pd

In [99]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

In [100]:
from webdriver_manager.chrome import ChromeDriverManager
driver = webdriver.Chrome(ChromeDriverManager().install())



Current google-chrome version is 92.0.4515
Get LATEST driver version for 92.0.4515
Get LATEST driver version for 92.0.4515
Trying to download new driver from https://chromedriver.storage.googleapis.com/92.0.4515.107/chromedriver_mac64.zip
Driver has been saved in cache [/Users/mayaremington/.wdm/drivers/chromedriver/mac64/92.0.4515.107]


## Web Scraping the CDC 

In [101]:
# using selenium to web scrape

driver.get('https://covid.cdc.gov/covid-data-tracker/#county-view')
time.sleep(1) # allow page to fully load

In [102]:
# creating object from state dropdown

state_dropdown = driver.find_element_by_xpath('//select[@id="list_select_state"]')

In [103]:
# navigating using that object

states = state_dropdown.find_elements_by_tag_name('option')

In [104]:
# converting to a workable format - a list of state names

state_list = [state.text for state in states][1:]
print(state_list)

['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'District Of Columbia', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Puerto Rico', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming']


In [105]:
len(state_list)

52

In [106]:
# helper functions  - will need these for the state/county loop

def get_vax_info(soup):
    '''
    Returns the % adults in county with at least 1 shot
    '''
    atleast_partial_vax = soup.find('div', id='people-vaccinated-information'). \
        find(text='% of Population ≥ 18 Years of Age'). \
        findNext().text.strip()
    try:
        return float(atleast_partial_vax.replace('%', ''))
    except:
        return None

def get_table_value(soup, id_name):
    ''' 
    Takes an id from a certain table on cdc website and returns the value
    as a float
    '''
    
    soup_obj = soup.find('table', id='community-characteristics-list'). \
        find('span', id = id_name)
    try:
        return float(soup_obj.text.replace(',', ''))
    except:
        return None

In [107]:
def get_cdc_data(state_list):
    '''
    Gets county-level data on vacinations
    and demographics from cdc website.
    '''

    headers = ['state', 'county', 'fips', '%_vax', 'pop_density',
              'household_size', '%_uninsured', '%_poverty_rate', 
               '%_age_65_plus']

    cdc_data = []

    # looping over states and their respective counties
    for state in state_list:
        state_dropdown.send_keys(state)
        time.sleep(1)
        county_dropdown = driver.find_element_by_xpath('//select[@id="list_select_county"]')
        counties = county_dropdown.find_elments_by_tag_name('option')
        county_list = [county.text for county in counties[1:]]
        for county in county_list:
            county_dropdown.send_keys(county)
            time.sleep(1)

            # grabbing the html so we can parse using BeautifulSoup
            soup = BeautifulSoup(driver.page_source)

            # getting the county fips
            fips = soup.find('option', text=re.compile(county)).get('value')
            fips = int(fips)

            # getting vaccination data - % adults w/ 1 or more doses 
            atleast_partial_vax = get_vax_info(soup)

            # getting additional features population density 2019 (people per sq. mile)
            pop_density = get_table_value(soup, 'population_density_2019')

            # getting average household size
            household_size = get_table_value(soup, 'avg_hh_size')

            # getting uninsured rate 2019
            uninsured = get_table_value(soup, 'percent_uninsured_2019')

            # getting poverty rate 2019
            poverty_rate = get_table_value(soup, 'poverty_rate_2019')

            # getting percent over age 65
            age_65_plus = get_table_value(soup, 'percent_65_plus')

            # putting scraped data into a dictionary
            cdc_dict = dict(zip(headers, [state, county, fips,
                                 atleast_partial_vax,
                                 pop_density, household_size,
                                 uninsured, poverty_rate,
                                 age_65_plus]))
            cdc_data.append(cdc_dict)
    return cdc_data

In [58]:
cdc_data = get_cdc_data(state_list(40:))

### The scraped data in 5 batches

In [12]:
# data for 1st 10 counties

cdc_data1 = pd.DataFrame(cdc_data)
cdc_data1.head()

Unnamed: 0,state,county,fips,%_vax,pop_density,household_size,%_uninsured,%_poverty_rate,%_age_65_plus
0,Alabama,Autauga County,1001,43.6,93.99,2.59,7.1,15.4,15.97
1,Alabama,Baldwin County,1003,51.9,140.41,2.61,10.2,10.6,20.98
2,Alabama,Barbour County,1005,43.9,27.89,2.49,11.2,28.9,19.69
3,Alabama,Bibb County,1007,37.0,35.98,2.99,7.9,14.0,16.67
4,Alabama,Blount County,1009,31.5,89.68,2.77,11.0,14.4,18.7


In [14]:
cdc_data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 386 entries, 0 to 385
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   state           386 non-null    object 
 1   county          386 non-null    object 
 2   fips            386 non-null    int64  
 3   %_vax           320 non-null    float64
 4   pop_density     386 non-null    float64
 5   household_size  386 non-null    float64
 6   %_uninsured     386 non-null    float64
 7   %_poverty_rate  386 non-null    float64
 8   %_age_65_plus   386 non-null    float64
dtypes: float64(6), int64(1), object(2)
memory usage: 27.3+ KB


In [22]:
# data for 2nd 10 counties

cdc_data2 = pd.DataFrame(cdc_data)
cdc_data2.head()

Unnamed: 0,state,county,fips,%_vax,pop_density,household_size,%_uninsured,%_poverty_rate,%_age_65_plus
0,Georgia,Appling County,13001,11.1,36.17,2.68,15.6,23.3,17.72
1,Georgia,Atkinson County,13003,10.1,23.82,2.95,24.9,30.0,14.49
2,Georgia,Bacon County,13005,,39.3,2.65,19.0,27.2,16.82
3,Georgia,Baker County,13007,34.2,8.88,2.3,14.0,21.3,23.67
4,Georgia,Baldwin County,13009,9.4,173.52,2.54,11.6,26.2,16.48


In [29]:
cdc_data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 806 entries, 0 to 805
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   state           806 non-null    object 
 1   county          806 non-null    object 
 2   fips            806 non-null    int64  
 3   %_vax           772 non-null    float64
 4   pop_density     806 non-null    float64
 5   household_size  806 non-null    float64
 6   %_uninsured     806 non-null    float64
 7   %_poverty_rate  806 non-null    float64
 8   %_age_65_plus   806 non-null    float64
dtypes: float64(6), int64(1), object(2)
memory usage: 56.8+ KB


In [46]:
# data for 3nd 10 counties

cdc_data3 = pd.DataFrame(cdc_data)
cdc_data3.head()

Unnamed: 0,state,county,fips,%_vax,pop_density,household_size,%_uninsured,%_poverty_rate,%_age_65_plus
0,Maryland,Allegany County,24001,55.2,166.79,2.3,4.8,16.4,20.55
1,Maryland,Anne Arundel County,24003,80.1,1396.38,2.65,4.7,6.0,15.02
2,Maryland,Baltimore City,24510,65.7,7331.86,2.48,7.2,21.8,14.52
3,Maryland,Baltimore County,24005,74.4,1382.73,2.58,5.6,9.2,17.56
4,Maryland,Calvert County,24009,70.3,434.0,2.85,4.2,5.1,15.48


In [47]:
cdc_data3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 581 entries, 0 to 580
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   state           581 non-null    object 
 1   county          581 non-null    object 
 2   fips            581 non-null    int64  
 3   %_vax           480 non-null    float64
 4   pop_density     581 non-null    float64
 5   household_size  581 non-null    float64
 6   %_uninsured     581 non-null    float64
 7   %_poverty_rate  581 non-null    float64
 8   %_age_65_plus   581 non-null    float64
dtypes: float64(6), int64(1), object(2)
memory usage: 41.0+ KB


In [54]:
# data for 4th 10 counties

cdc_data4 = pd.DataFrame(cdc_data)
cdc_data4.head()

Unnamed: 0,state,county,fips,%_vax,pop_density,household_size,%_uninsured,%_poverty_rate,%_age_65_plus
0,New Jersey,Atlantic County,34001,71.5,474.64,2.63,9.3,14.3,18.59
1,New Jersey,Bergen County,34003,82.2,4004.56,2.72,8.3,7.0,17.66
2,New Jersey,Burlington County,34005,76.7,557.18,2.63,4.6,6.5,17.37
3,New Jersey,Camden County,34007,71.6,2288.1,2.67,7.5,12.6,16.1
4,New Jersey,Cape May County,34009,76.6,365.98,2.28,6.0,11.1,27.34


In [55]:
cdc_data4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 615 entries, 0 to 614
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   state           615 non-null    object 
 1   county          615 non-null    object 
 2   fips            615 non-null    int64  
 3   %_vax           554 non-null    float64
 4   pop_density     615 non-null    float64
 5   household_size  615 non-null    float64
 6   %_uninsured     615 non-null    float64
 7   %_poverty_rate  614 non-null    float64
 8   %_age_65_plus   615 non-null    float64
dtypes: float64(6), int64(1), object(2)
memory usage: 43.4+ KB


In [59]:
# data for the last 12 counties

cdc_data5 = pd.DataFrame(cdc_data)
cdc_data5.head()

Unnamed: 0,state,county,fips,%_vax,pop_density,household_size,%_uninsured,%_poverty_rate,%_age_65_plus
0,Rhode Island,Bristol County,44001,,2008.88,2.31,2.5,7.6,20.21
1,Rhode Island,Kent County,44003,,974.65,2.36,3.7,8.0,19.37
2,Rhode Island,Newport County,44005,,801.29,2.25,4.1,8.6,22.77
3,Rhode Island,Providence County,44007,70.1,1560.32,2.55,6.4,16.2,15.62
4,Rhode Island,Washington County,44009,,381.37,2.44,3.2,9.4,21.47


In [60]:
cdc_data5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 831 entries, 0 to 830
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   state           831 non-null    object 
 1   county          831 non-null    object 
 2   fips            831 non-null    int64  
 3   %_vax           513 non-null    float64
 4   pop_density     831 non-null    float64
 5   household_size  831 non-null    float64
 6   %_uninsured     831 non-null    float64
 7   %_poverty_rate  831 non-null    float64
 8   %_age_65_plus   831 non-null    float64
dtypes: float64(6), int64(1), object(2)
memory usage: 58.6+ KB


In [92]:
driver.close

<bound method WebDriver.close of <selenium.webdriver.chrome.webdriver.WebDriver (session="5b1c86aafb45d704ffa96b6b603bcab3")>>

## Combining and saving data 

In [94]:
cdc_data_total = pd.concat([cdc_data1, cdc_data2, cdc_data3, cdc_data4, cdc_data5])
cdc_data_total.head()

Unnamed: 0,state,county,fips,%_vax,pop_density,household_size,%_uninsured,%_poverty_rate,%_age_65_plus
0,Alabama,Autauga County,1001,43.6,93.99,2.59,7.1,15.4,15.97
1,Alabama,Baldwin County,1003,51.9,140.41,2.61,10.2,10.6,20.98
2,Alabama,Barbour County,1005,43.9,27.89,2.49,11.2,28.9,19.69
3,Alabama,Bibb County,1007,37.0,35.98,2.99,7.9,14.0,16.67
4,Alabama,Blount County,1009,31.5,89.68,2.77,11.0,14.4,18.7


In [95]:
cdc_data_total.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3219 entries, 0 to 830
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   state           3219 non-null   object 
 1   county          3219 non-null   object 
 2   fips            3219 non-null   int64  
 3   %_vax           2639 non-null   float64
 4   pop_density     3219 non-null   float64
 5   household_size  3219 non-null   float64
 6   %_uninsured     3219 non-null   float64
 7   %_poverty_rate  3218 non-null   float64
 8   %_age_65_plus   3219 non-null   float64
dtypes: float64(6), int64(1), object(2)
memory usage: 251.5+ KB


In [97]:
cdc_data_total.to_csv('cdc_data.csv', index=False)