In [None]:
# GET CENSUS DATA FROM THE WEBSITE AND SAVE IT TO HTML FILES

In [2]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.firefox.options import Options as FirefoxOptions
import pathlib as pathlib

In [24]:
def get_census_data(geo_id, placetype, delay=10):
    """Return the rendered HTML of the Census Business Builder report for one geography.

    A headless Firefox session loads the report URL, waits for the element
    with class ``css-vt8a42`` to be present, and hands back the page source.
    The browser session is always shut down, even when navigation or the
    wait raises.

    Args:
        geo_id: Integer geography id; it is zero-padded to 7 digits.
        placetype: ``"county"`` requests the county report; any other value
            is requested with ``geoType=place``.
        delay: Maximum number of seconds to wait for the report element.

    Returns:
        The page source as a string, or ``""`` when the wait timed out.

    Raises:
        ValueError: If ``geo_id`` cannot be formatted as an integer. This is
            raised before any browser is started.
    """
    # Build the URL first so a bad id fails before a browser process exists.
    geotype = "county" if placetype == "county" else "place"
    gid = f"{geo_id:07d}"
    # the way to get the county geoid is to use state_numeric and append the
    # county_numeric as a 3-digit value from the geo_fedcodes: county ids with
    # "98" in positions 2-3 are collapsed to <first 2 digits><last 3 digits>.
    if placetype == "county" and gid[2:4] == "98":
        geoid = f"{gid[0:2]}{gid[4:]}"
    else:
        geoid = gid
    print(f"{gid} -> {geoid}")
    url = f"https://cbb.census.gov/cbb/#view=report&industries=00&geoType={geotype}&geoId={geoid}"
    print(url)

    # get source code
    opts = FirefoxOptions()
    opts.add_argument("--headless")
    browser = webdriver.Firefox(options=opts)
    html = ""
    try:
        browser.get(url)
        try:
            WebDriverWait(browser, delay).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'css-vt8a42'))
            )
            html = browser.page_source
        except TimeoutException:
            print("Loading took too much time!")
    finally:
        # quit() ends the whole WebDriver session (not just the window), so
        # no headless browser/driver process is left behind on any code path.
        browser.quit()

    return html

In [45]:
# get and save census html files
def save_census_data(geo_df, output_dir='./census-html'):
    """Download the census report page for every row of ``geo_df`` and save it as HTML.

    Each row is saved to ``<output_dir>/<FIPS_CODE>-<STATE_ABBREVIATION>-<PLACE_NAME>-<GEOID>.html``.
    Rows whose file already exists with content are skipped. When the
    download yields no HTML (``get_census_data`` returned ``""``) nothing is
    written, so the row is retried on the next run instead of being masked by
    an empty file.

    Args:
        geo_df: DataFrame with the columns ``FIPS_CODE``, ``PLACE_NAME``,
            ``STATE_ABBREVIATION``, ``GEOID`` and ``TYPE``.
        output_dir: Directory that receives the HTML files; created if missing.

    Returns:
        None. Progress is reported with ``print``.
    """
    print(geo_df.shape[0])

    out_dir = pathlib.Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    for pos, (_, row) in enumerate(geo_df.iterrows(), start=1):
        print(pos)

        # fields describing the geography to fetch
        fips = row['FIPS_CODE']
        place = row['PLACE_NAME']
        state = row['STATE_ABBREVIATION']
        geoid = row['GEOID']
        placetype = row['TYPE']

        # check if exists; a zero-byte file is a leftover from a failed
        # download and is treated as missing so it gets retried
        file_path = out_dir / f'{fips}-{state}-{place}-{geoid}.html'
        if file_path.is_file() and file_path.stat().st_size > 0:
            print('File exists')
            continue

        # retrieve data; skip the write when nothing came back
        html = get_census_data(geoid, placetype)
        if not html:
            print('No data retrieved, file not saved')
            continue

        # explicit UTF-8 so non-ASCII page content saves on every platform
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(html)

    print("done!")

In [5]:
# Load the geography table from geo_data.csv (path is relative to the
# notebook's working directory). Rows of this frame are what
# save_census_data iterates over.
geo_data = pd.read_csv("geo_data.csv")
# Columns noted for this file:
# ID
# GEOID
# PLACE_NAME
# PLACE_ID
# FIPS_CODE
# TYPE
# STATE_NAME
# STATE_ABBREVIATION

In [71]:
# Manual progress checklist kept as a throwaway string; `a` is not referenced
# by any other cell shown here.
# NOTE(review): the numbers are presumably the FIPS_CODE values fed to the
# download cell (which filters geo_data on FIPS_CODE), with "done" marking
# the ones already downloaded — confirm.
a = '''
1 - done
2 - done
4 - done
5 - done
6 - done
8 - done
9 - done
10 - done
11 - done
12 - done
13 - done
15
16
17
18
19
20
21
22
23
24
25 - done
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
44
45
46
47
48 - done
49
50
51
53
54
55
56
'''

In [6]:
# DOWNLOAD CENSUS DATA ######################

In [103]:
# Download and store the census HTML pages for a single FIPS_CODE.
# Census designated places (TYPE == 'cdp') are left out because the correct
# report URL for them is hard to determine.
# Original note: "Retry (files too small) 2" — presumably a second retry pass
# for downloads whose saved files were too small; TODO confirm.
partial_data = geo_data.loc[
    geo_data["FIPS_CODE"].eq(13) & geo_data["TYPE"].ne('cdp')
]
save_census_data(partial_data)

378
1
File exists
2
File exists
3
File exists
4
File exists
5
File exists
6
File exists
7
File exists
8
File exists
9
File exists
10
File exists
11
File exists
12
File exists
13
File exists
14
File exists
15
File exists
16
File exists
17
File exists
18
File exists
19
File exists
20
File exists
21
File exists
22
File exists
23
File exists
24
File exists
25
File exists
26
File exists
27
File exists
28
File exists
29
File exists
30
File exists
31
File exists
32
File exists
33
File exists
34
File exists
35
File exists
36
File exists
37
File exists
38
File exists
39
File exists
40
File exists
41
File exists
42
File exists
43
File exists
44
File exists
45
File exists
46
File exists
47
File exists
48
File exists
49
File exists
50
File exists
51
File exists
52
File exists
53
File exists
54
File exists
55
File exists
56
File exists
57
File exists
58
File exists
59
File exists
60
File exists
61
File exists
62
File exists
63
File exists
64
File exists
65
File exists
66
File exists
67
File exists
