In [None]:
# GET CENSUS DATA FROM THE WEBSITE AND SAVE IT TO HTML FILES

In [1]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.firefox.options import Options as FirefoxOptions
import pathlib as pathlib

In [2]:
def get_census_data(geo_id, placetype):
    """Fetch the rendered Census Business Builder report page for one geography.

    A headless Firefox session loads the report URL and waits until an element
    with the CSS class ``css-vt8a42`` is present, then returns the page source.

    Args:
        geo_id: Integer GEOID; it is zero-padded to 7 digits before use.
        placetype: ``"county"`` selects the county report; any other value is
            requested as a ``place``.

    Returns:
        The page source as a string, or ``""`` when the awaited element did
        not appear within the timeout.
    """
    # Build the URL before launching the browser so that a formatting error
    # (e.g. a non-integer geo_id) cannot leave a browser process behind.
    geotype = "county" if placetype == "county" else "place"
    gid = f"{geo_id:07d}"
    # the way to get the county geoid is to use state_numeric and append the
    # county_numeric as a 3-digit value from the geo_fedcodes; here a county id
    # carrying "98" in positions 2-3 has those two digits removed.
    if placetype == "county" and gid[2:4] == "98":
        geoid = f"{gid[0:2]}{gid[4:]}"
    else:
        geoid = gid
    print(f"{gid} -> {geoid}")
    url = f"https://cbb.census.gov/cbb/#view=report&industries=00&geoType={geotype}&geoId={geoid}"
    print(url)

    opts = FirefoxOptions()
    opts.add_argument("--headless")
    browser = webdriver.Firefox(options=opts)

    delay = 10 # seconds
    html = ""
    try:
        browser.get(url)
        try:
            # The report is rendered client-side; wait for its marker element.
            WebDriverWait(browser, delay).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'css-vt8a42'))
            )
            html = browser.page_source
        except TimeoutException:
            print("Loading took too much time!")
    finally:
        # quit() (not close()) ends the whole driver session, and running it in
        # `finally` releases the browser even when navigation raises.
        browser.quit()

    return html

In [3]:
# get and save census html files
def save_census_data(geo_df):
    """Download and store the census report page for every row of ``geo_df``.

    Each row must provide ``FIPS_CODE``, ``PLACE_NAME``, ``STATE_ABBREVIATION``,
    ``GEOID`` and ``TYPE``. The page is written to
    ``./census-html/{FIPS_CODE}-{STATE_ABBREVIATION}-{PLACE_NAME}-{GEOID}.html``.
    Rows whose file already exists are skipped, so the function can be re-run
    to resume an interrupted download.

    Args:
        geo_df: DataFrame of geographies to download.
    """
    print(geo_df.shape[0])

    for pos, (_, row) in enumerate(geo_df.iterrows(), start=1):
        print(pos)

        # get census data from website
        fips = row['FIPS_CODE']
        place = row['PLACE_NAME']
        state = row['STATE_ABBREVIATION']
        geoid = row['GEOID']
        placetype = row['TYPE']

        # check if exists
        file_path = f'./census-html/{fips}-{state}-{place}-{geoid}.html'
        target = pathlib.Path(file_path)
        if target.is_file():
            print('File exists')
            continue

        # retrieve data
        html = get_census_data(geoid, placetype)
        if not html:
            # An empty result means the page did not load. Writing it would
            # create an empty file that the existence check above treats as
            # finished, so the row would never be retried.
            continue

        # save data; create the output folder on first use and close the file
        # even if the write fails
        target.parent.mkdir(parents=True, exist_ok=True)
        with open(target, 'w', encoding='utf-8') as f:
            f.write(html)

    print("done!")

In [4]:
# Load the geography table that drives the download loop.
# Columns listed for geo_data.csv:
#   ID, GEOID, PLACE_NAME, PLACE_ID,
#   FIPS_CODE, TYPE, STATE_NAME, STATE_ABBREVIATION
geo_data = pd.read_csv(filepath_or_buffer="geo_data.csv")

In [5]:
# Manual progress checklist: one line per numeric state code, with the state
# abbreviation after '#'. The codes match the FIPS_CODE values used to filter
# geo_data in the download cell.
# Lines marked "done" presumably mean that state's pages were already
# downloaded — TODO confirm.
# The string is only assigned to `a`; none of the visible cells read it.
a = '''
1 - done #AL
2 - done #AK
4 - done #AZ
5 - done #AR
6 - done #CA
8 - done #CO
9 - done #CT
10 - done #DE
11 - done #DC
12 - done #FL
13 - done #GA
15 - done #HI
16 - done #ID
17 - done #IL
18 - #IN
19 - #IA
20 - #KS
21 - #KY
22 - #LA
23 - #ME
24 - #MD
25 - done #MA
26 - #MI
27 - #MN
28 - #MS
29 - #MO
30 - #MT
31 - #NE
32 - #NV
33 - #NH
34 - #NJ
35 - #NM
36 - #NY
37 - #NC
38 - #ND
39 - #OH
40 - #OK
41 - #OR
42 - #PA
44 - #RI
45 - #SC
46 - #SD
47 - #TN
48 - done #TX
49 - #UT
50 - #VT
51 - #VA
53 - #WA
54 - #WV
55 - #WI
56 - #WY
'''

In [6]:
# DOWNLOAD CENSUS DATA ######################

In [22]:
# Download and store the census report pages for one state (FIPS_CODE 18).
# Census designated places (CDP) and counties are left out because finding
# the correct report URL for them is still a challenge.
# Retry #2 (earlier files were too small).
partial_data = geo_data.loc[
    geo_data.FIPS_CODE.eq(18) & ~geo_data.TYPE.isin(['cdp', 'county'])
]
save_census_data(partial_data)

187
1
1800820 -> 1800820
https://cbb.census.gov/cbb/#view=report&industries=00&geoType=place&geoId=1800820
Loading took too much time!
2
1800910 -> 1800910
https://cbb.census.gov/cbb/#view=report&industries=00&geoType=place&geoId=1800910
Loading took too much time!
3
1801468 -> 1801468
https://cbb.census.gov/cbb/#view=report&industries=00&geoType=place&geoId=1801468
4
1801666 -> 1801666
https://cbb.census.gov/cbb/#view=report&industries=00&geoType=place&geoId=1801666
5
1802620 -> 1802620
https://cbb.census.gov/cbb/#view=report&industries=00&geoType=place&geoId=1802620
6
1802674 -> 1802674
https://cbb.census.gov/cbb/#view=report&industries=00&geoType=place&geoId=1802674
7
1802782 -> 1802782
https://cbb.census.gov/cbb/#view=report&industries=00&geoType=place&geoId=1802782
Loading took too much time!
8
1802800 -> 1802800
https://cbb.census.gov/cbb/#view=report&industries=00&geoType=place&geoId=1802800
9
1802908 -> 1802908
https://cbb.census.gov/cbb/#view=report&industries=00&geoType=place

Loading took too much time!
71
1829358 -> 1829358
https://cbb.census.gov/cbb/#view=report&industries=00&geoType=place&geoId=1829358
Loading took too much time!
72


KeyboardInterrupt: 