# US data scraping

This data covers the years 2000 to 2006.

In [1]:
# Import of libraries

import pandas as pd 
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import support_scraping as f
import re

In [2]:
# Driver configuration

# Start a Chrome session; every later cell talks to the page through `driver`.
# (The former `driver = 'chromedriver'` string was overwritten immediately and never used.)
driver = webdriver.Chrome()

In [3]:
# URL and call 

# Page on cdc.gov that holds the hearing tables scraped in the cells below
url = 'https://www.cdc.gov/nchs/data/hestat/hearing00-06/hearing00-06.htm'
# Load the page in the browser session opened in the driver-configuration cell
driver.get(url)

## Table 1

Age-adjusted percent distributions (with standard errors) of hearing status among all adults.

In [4]:
# Locate Table 1 (the first <table> under the content container) by absolute XPath

table_1_xpath = '/html/body/div[6]/main/div[3]/div/div[3]/table[1]'
hs_adults = driver.find_element(by=By.XPATH, value=table_1_xpath)
hs_adults

<selenium.webdriver.remote.webelement.WebElement (session="39a7be1d8b14fd4e45f0040e138740d4", element="EBC0780C5C38B587C5619647DDDFBAA4_element_52")>

In [5]:
# Extraction of columns and body

# Header: the visible text of <thead>, kept as one raw string (cleaned in a later cell)
cols = hs_adults.find_element(By.TAG_NAME, 'thead').text
rows = hs_adults.find_element(By.CSS_SELECTOR, 'tbody').find_elements(By.CSS_SELECTOR, 'tr')

# Body: one list of non-empty <td> texts per <tr>
row_data = []

for row in rows:
    # Read each cell's text exactly once; every `.text` access queries the browser
    cell_texts = [cell.text for cell in row.find_elements(By.CSS_SELECTOR, 'td')]
    # Skip cells whose text is empty
    row_data.append([text for text in cell_texts if text])

row_data[:2]


[['100.0', '83.7 (0.19)', '12.9 (0.16)', '3.3 (0.06)']]

In [6]:
# Show the raw header string and the extracted body rows, one per line

print(cols, row_data, sep='\n')


Total2 Good hearing A little trouble hearing Deaf or a lot of trouble hearing
[['100.0', '83.7 (0.19)', '12.9 (0.16)', '3.3 (0.06)']]


In [7]:
# Data cleaning for columns

# Always clean from the raw header text of Table 1 rather than from `cols` itself,
# so re-running this cell does not feed an already-cleaned value back into f.correction.
header_text = hs_adults.find_element(By.TAG_NAME, 'thead').text
cols = f.correction(header_text)

In [8]:
# Build the frame with the cleaned column names, then transpose it
# so that each hearing-status category becomes a row

adults_wide = pd.DataFrame(data=row_data, columns=cols)
total_adults = adults_wide.transpose()
total_adults

Unnamed: 0,0
total,100.0
good_hearing,83.7 (0.19)
little_trouble,12.9 (0.16)
lot_of_trouble,3.3 (0.06)


## Table 2

Age-adjusted percent distributions (with standard errors) of hearing status by sex.

In [9]:
# Locate Table 2 (the second <table> under the content container) by absolute XPath

table_2_xpath = '/html/body/div[6]/main/div[3]/div/div[3]/table[2]'
sex = driver.find_element(by=By.XPATH, value=table_2_xpath)
sex

<selenium.webdriver.remote.webelement.WebElement (session="39a7be1d8b14fd4e45f0040e138740d4", element="EBC0780C5C38B587C5619647DDDFBAA4_element_60")>

In [10]:
# Extraction of the body (the column names in `cols` are reused from Table 1)

rows = sex.find_element(By.CSS_SELECTOR, 'tbody').find_elements(By.CSS_SELECTOR, 'tr')

# One list of non-empty <td> texts per <tr>
row_data = []

for row in rows:
    # Read each cell's text exactly once; every `.text` access queries the browser
    cell_texts = [cell.text for cell in row.find_elements(By.CSS_SELECTOR, 'td')]
    # Skip cells whose text is empty
    row_data.append([text for text in cell_texts if text])

row_data[:2]

[['100.0', '80.0 (0.25)', '15.6 (0.21)', '4.3 (0.09)'],
 ['100.0', '86.9 (0.17)', '10.6 (0.15)', '2.4 (0.05)']]

In [11]:
# Show the column names and the extracted body rows, one per line

print(cols, row_data, sep='\n')

['total', 'good_hearing', 'little_trouble', 'lot_of_trouble']
[['100.0', '80.0 (0.25)', '15.6 (0.21)', '4.3 (0.09)'], ['100.0', '86.9 (0.17)', '10.6 (0.15)', '2.4 (0.05)']]


In [12]:
# Turn the extracted rows into a frame and label its columns with `cols`.
# Note: from here on `sex` is a DataFrame, no longer the WebElement located above.

sex = pd.DataFrame(row_data).set_axis(cols, axis=1)
sex

Unnamed: 0,total,good_hearing,little_trouble,lot_of_trouble
0,100.0,80.0 (0.25),15.6 (0.21),4.3 (0.09)
1,100.0,86.9 (0.17),10.6 (0.15),2.4 (0.05)


In [13]:
# Add a label column named '-' and fill it with the sex each row refers to

sex['-'] = ''

# Row 0 is labelled 'male' and row 1 'female'
for position, label in enumerate(('male', 'female')):
    sex.loc[position, '-'] = label

In [14]:
# Put the label column first, then transpose so each scraped row becomes a column

ordered_columns = ['-', 'total', 'good_hearing', 'little_trouble', 'lot_of_trouble']
sex = sex.loc[:, ordered_columns].transpose()
sex


Unnamed: 0,0,1
-,male,female
total,100.0,100.0
good_hearing,80.0 (0.25),86.9 (0.17)
little_trouble,15.6 (0.21),10.6 (0.15)
lot_of_trouble,4.3 (0.09),2.4 (0.05)


## Table 3

Percent distributions (with standard errors) of hearing status by age.

In [15]:
# Locate Table 3 (the third <table> under the content container) by absolute XPath

table_3_xpath = '/html/body/div[6]/main/div[3]/div/div[3]/table[3]'
age = driver.find_element(by=By.XPATH, value=table_3_xpath)
age

<selenium.webdriver.remote.webelement.WebElement (session="39a7be1d8b14fd4e45f0040e138740d4", element="EBC0780C5C38B587C5619647DDDFBAA4_element_72")>

In [16]:
# Extraction of the body (the column names in `cols` are reused from Table 1)

rows = age.find_element(By.CSS_SELECTOR, 'tbody').find_elements(By.CSS_SELECTOR, 'tr')

# One list of non-empty <td> texts per <tr>
row_data = []

for row in rows:
    # Read each cell's text exactly once; every `.text` access queries the browser
    cell_texts = [cell.text for cell in row.find_elements(By.CSS_SELECTOR, 'td')]
    # Skip cells whose text is empty
    row_data.append([text for text in cell_texts if text])

row_data[:3]

[['100.0', '92.4 (0.15)', '6.7 (0.14)', '0.9 (0.04)'],
 ['100.0', '81.3 (0.29)', '15.6 (0.25)', '3.1 (0.09)'],
 ['100.0', '60.8 (0.39)', '27.8 (0.32)', '11.1 (0.21)']]

In [17]:
# Show the column names and the extracted body rows, one per line

print(cols, row_data, sep='\n')

['total', 'good_hearing', 'little_trouble', 'lot_of_trouble']
[['100.0', '92.4 (0.15)', '6.7 (0.14)', '0.9 (0.04)'], ['100.0', '81.3 (0.29)', '15.6 (0.25)', '3.1 (0.09)'], ['100.0', '60.8 (0.39)', '27.8 (0.32)', '11.1 (0.21)']]


In [18]:
# Turn the extracted rows into a frame and label its columns with `cols`.
# Note: from here on `age` is a DataFrame, no longer the WebElement located above.

age = pd.DataFrame(row_data).set_axis(cols, axis=1)
age

Unnamed: 0,total,good_hearing,little_trouble,lot_of_trouble
0,100.0,92.4 (0.15),6.7 (0.14),0.9 (0.04)
1,100.0,81.3 (0.29),15.6 (0.25),3.1 (0.09)
2,100.0,60.8 (0.39),27.8 (0.32),11.1 (0.21)


In [19]:
# Add a label column named '-' and fill it with the age group each row refers to
# NOTE(review): the first label starts at 14, while Table 1 is described as covering
# adults -- confirm the lower age bound against the source table.

age['-'] = ''

# Rows 0, 1 and 2 receive the three age-group labels in order
for position, label in enumerate(('age:14-44', 'age:45-64', 'age:65-')):
    age.loc[position, '-'] = label

In [20]:
# Put the label column first, then transpose so each scraped row becomes a column

ordered_columns = ['-', 'total', 'good_hearing', 'little_trouble', 'lot_of_trouble']
age = age.loc[:, ordered_columns].transpose()
age


Unnamed: 0,0,1,2
-,age:14-44,age:45-64,age:65-
total,100.0,100.0,100.0
good_hearing,92.4 (0.15),81.3 (0.29),60.8 (0.39)
little_trouble,6.7 (0.14),15.6 (0.25),27.8 (0.32)
lot_of_trouble,0.9 (0.04),3.1 (0.09),11.1 (0.21)
