In [None]:
# GET CENSUS DATA FROM THE WEBSITE AND SAVE IT TO HTML FILES

In [42]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.firefox.options import Options as FirefoxOptions
import pathlib as pathlib

In [61]:
def get_census_data(geo_id, placetype, delay=10):
    """Fetch the rendered Census Business Builder report page for one geography.

    Builds the report URL, loads it in a headless Firefox session and waits
    for the report element to be present before capturing the page source.

    Args:
        geo_id: Integer geography id; it is zero-padded to 7 digits.
        placetype: Geography type. "county" requests the county report; any
            other value is requested as a "place".
        delay: Maximum number of seconds to wait for the report element.

    Returns:
        The page source as a string, or "" when the report element did not
        appear within ``delay`` seconds.
    """
    # Build the URL before starting the browser so that a malformed geo_id
    # fails fast without leaving a browser process behind.
    geotype = "county" if placetype == "county" else "place"
    gid = f"{geo_id:07d}"
    # the way to get the county geoid is to use state_numeric and append the
    # county_numeric as a 3-digit value from the geo_fedcodes; ids carrying
    # "98" in positions 2-3 are collapsed to that 2 + 3 digit form.
    if placetype == "county" and gid[2:4] == "98":
        geoid = f"{gid[0:2]}{gid[4:]}"
    else:
        geoid = gid
    print(f"{gid} -> {geoid}")
    url = f"https://cbb.census.gov/cbb/#view=report&industries=00&geoType={geotype}&geoId={geoid}"
    print(url)

    opts = FirefoxOptions()
    opts.add_argument("--headless")
    browser = webdriver.Firefox(options=opts)
    html = ""
    try:
        browser.get(url)
        try:
            # NOTE(review): 'css-vt8a42' looks like a generated class name; it
            # presumably needs updating whenever the site is rebuilt - confirm.
            WebDriverWait(browser, delay).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'css-vt8a42'))
            )
            html = browser.page_source
        except TimeoutException:
            print("Loading took too much time!")
    finally:
        # quit() ends the whole driver session (close() only closes the
        # current window), and the finally guarantees it runs on errors too.
        browser.quit()

    return html

In [69]:
# get and save census html files
def save_census_data(geo_df):
    """Download and save the census report HTML for every row of ``geo_df``.

    Rows whose output file already exists are skipped, so the function can
    be rerun until nothing is left to fetch.

    Args:
        geo_df: DataFrame with the columns FIPS_CODE, PLACE_NAME,
            STATE_ABBREVIATION, GEOID and TYPE.

    Returns:
        True if at least one lookup returned no HTML (the caller should
        rerun), False otherwise.
    """
    total = geo_df.shape[0]
    print(total)

    # Create the output directory up front so the first write cannot fail
    # with FileNotFoundError on a fresh checkout.
    out_dir = pathlib.Path('./census-html')
    out_dir.mkdir(parents=True, exist_ok=True)

    fail_count = 0
    for pos, (_, row) in enumerate(geo_df.iterrows(), start=1):
        print(pos)

        fips = row['FIPS_CODE']
        place = row['PLACE_NAME']
        state = row['STATE_ABBREVIATION']
        geoid = row['GEOID']
        placetype = row['TYPE']

        # An existing file means this place was fetched on an earlier run.
        target = out_dir / f'{fips}-{state}-{place}-{geoid}.html'
        if target.is_file():
            print('File exists')
            continue

        html = get_census_data(geoid, placetype)
        if not html:
            # Nothing is written for a failed lookup, so a rerun retries it.
            fail_count += 1
            continue

        # Explicit UTF-8 keeps the write independent of the platform's
        # default encoding; write_text also closes the file on error.
        target.write_text(html, encoding='utf-8')

    failed_lookup = fail_count > 0
    if failed_lookup:
        print(f"done with {fail_count} failures. Rate: {(fail_count/total)}. rerun lookup")
    else:
        print("done!")

    return failed_lookup

In [45]:
# Load the geography lookup table that drives the downloads below.
# Columns noted for this file:
#   ID, GEOID, PLACE_NAME, PLACE_ID, FIPS_CODE, TYPE,
#   STATE_NAME, STATE_ABBREVIATION
geo_data = pd.read_csv(filepath_or_buffer="geo_data.csv")

In [46]:
# Progress checklist kept as a string: one "<number> - #<state> done" line per
# state. The numbers match the FIPS_CODE values the download cell iterates
# over. `a` is not referenced by any other visible cell.
a = '''
1 - #AL done
2 - #AK done
4 - #AZ done
5 - #AR done
6 - #CA done
8 - #CO done
9 - #CT done
10 - #DE done
11 - #DC done
12 - #FL done
13 - #GA done
15 - #HI done
16 - #ID done
17 - #IL done
18 - #IN done
19 - #IA done
20 - #KS done
21 - #KY done
22 - #LA done
23 - #ME done
24 - #MD done
25 - #MA done
26 - #MI done
27 - #MN done
28 - #MS done
29 - #MO done
30 - #MT done
31 - #NE done
32 - #NV done
33 - #NH done
34 - #NJ done
35 - #NM done
36 - #NY done
37 - #NC done
38 - #ND done
39 - #OH done
40 - #OK done
41 - #OR done
42 - #PA done
44 - #RI done
45 - #SC done
46 - #SD done
47 - #TN done
48 - #TX done
49 - #UT done
50 - #VT done
51 - #VA done
53 - #WA done
54 - #WV done
55 - #WI done
56 - #WY done
'''

In [47]:
# DOWNLOAD CENSUS DATA ######################

In [74]:
# Get and save census html files, one state (FIPS code) at a time.
# Census designated places (CDP) and counties are skipped due to the
# challenge of finding the correct URL for them.
# Rerunning is cheap: places whose file already exists are skipped.
MAX_ATTEMPTS = 5  # upper bound on retries per state so a permanently failing place cannot loop forever
state_list = range(22, 57)
# To rerun a single state, use e.g.: state_list = [15]
for i in state_list:
    # 43 and 52 are not in the list of state codes processed by this notebook.
    if i in (43, 52):
        continue

    partial_data = geo_data[
        (geo_data.FIPS_CODE == i) & ~geo_data.TYPE.isin(['cdp', 'county'])
    ]

    # save_census_data returns True while at least one lookup failed.
    for attempt in range(1, MAX_ATTEMPTS + 1):
        if not save_census_data(partial_data):
            break
    else:
        print(f"giving up on FIPS {i} after {MAX_ATTEMPTS} attempts; rerun this cell later")

170
1
File exists
2
File exists
3
File exists
4
File exists
5
File exists
6
File exists
7
File exists
8
File exists
9
File exists
10
File exists
11
File exists
12
File exists
13
File exists
14
File exists
15
File exists
16
File exists
17
File exists
18
File exists
19
File exists
20
File exists
21
File exists
22
File exists
23
File exists
24
File exists
25
File exists
26
File exists
27
File exists
28
File exists
29
File exists
30
File exists
31
File exists
32
File exists
33
File exists
34
File exists
35
File exists
36
File exists
37
File exists
38
File exists
39
File exists
40
File exists
41
File exists
42
File exists
43
File exists
44
File exists
45
File exists
46
File exists
47
File exists
48
File exists
49
File exists
50
File exists
51
File exists
52
File exists
53
File exists
54
File exists
55
File exists
56
File exists
57
File exists
58
File exists
59
File exists
60
File exists
61
File exists
62
File exists
63
File exists
64
File exists
65
File exists
66
File exists
67
File exists


238
File exists
239
File exists
240
File exists
241
File exists
242
File exists
243
File exists
244
File exists
245
File exists
246
File exists
247
File exists
248
File exists
249
File exists
250
File exists
251
File exists
252
File exists
253
File exists
254
File exists
255
File exists
256
File exists
257
File exists
258
File exists
259
File exists
260
File exists
261
File exists
262
File exists
263
File exists
264
File exists
265
File exists
266
File exists
267
File exists
268
File exists
269
File exists
270
File exists
271
File exists
272
File exists
273
File exists
274
File exists
275
File exists
276
File exists
277
File exists
278
File exists
279
File exists
280
File exists
281
File exists
282
File exists
283
File exists
284
File exists
285
File exists
286
File exists
287
File exists
288
File exists
289
File exists
290
File exists
291
File exists
292
File exists
293
File exists
294
File exists
295
File exists
296
File exists
297
File exists
298
File exists
299
File exists
300
File

File exists
121
File exists
122
File exists
123
File exists
124
File exists
125
File exists
126
File exists
127
File exists
128
File exists
129
File exists
130
File exists
131
File exists
132
File exists
133
File exists
134
File exists
135
File exists
136
File exists
137
File exists
138
File exists
139
File exists
140
File exists
141
File exists
142
File exists
143
File exists
144
File exists
145
File exists
146
File exists
147
File exists
148
File exists
149
File exists
150
File exists
151
File exists
152
File exists
153
File exists
154
File exists
155
File exists
156
File exists
157
File exists
158
File exists
159
File exists
160
File exists
161
File exists
162
File exists
163
File exists
164
File exists
165
File exists
166
File exists
167
File exists
168
File exists
169
File exists
170
File exists
171
File exists
172
File exists
173
File exists
174
File exists
175
File exists
176
File exists
177
File exists
178
File exists
179
File exists
180
File exists
181
File exists
182
File exi

File exists
181
File exists
182
File exists
183
File exists
184
File exists
185
File exists
186
File exists
187
File exists
188
File exists
189
File exists
190
File exists
191
File exists
192
File exists
193
File exists
194
File exists
195
File exists
196
File exists
197
File exists
198
File exists
199
File exists
200
File exists
201
File exists
202
File exists
203
File exists
204
File exists
205
File exists
206
File exists
207
File exists
208
File exists
209
File exists
210
File exists
211
File exists
212
File exists
213
File exists
214
File exists
215
File exists
216
File exists
217
File exists
218
File exists
219
File exists
220
File exists
221
File exists
222
File exists
223
File exists
224
File exists
225
File exists
226
File exists
227
File exists
228
File exists
229
File exists
230
File exists
231
File exists
232
File exists
233
File exists
234
File exists
235
File exists
236
File exists
237
File exists
238
File exists
239
File exists
240
File exists
241
File exists
242
File exi

File exists
707
File exists
708
File exists
709
File exists
710
File exists
711
File exists
712
File exists
713
File exists
714
File exists
715
File exists
716
File exists
717
File exists
718
File exists
719
File exists
720
File exists
721
File exists
722
File exists
723
File exists
724
File exists
725
File exists
726
File exists
727
File exists
728
File exists
729
File exists
730
File exists
731
File exists
732
File exists
733
File exists
734
File exists
735
File exists
736
File exists
737
File exists
738
File exists
739
File exists
740
File exists
741
File exists
742
File exists
743
File exists
744
File exists
745
File exists
746
File exists
747
File exists
748
File exists
749
File exists
750
File exists
751
File exists
752
File exists
753
File exists
754
File exists
755
File exists
756
File exists
757
File exists
758
File exists
759
File exists
760
File exists
761
File exists
762
File exists
763
File exists
764
File exists
765
File exists
766
File exists
767
File exists
768
File exi

File exists
86
File exists
87
File exists
88
File exists
89
File exists
90
File exists
91
File exists
92
File exists
93
File exists
94
File exists
95
File exists
96
File exists
97
File exists
98
File exists
99
File exists
100
File exists
101
File exists
102
File exists
103
File exists
104
File exists
105
File exists
106
File exists
107
File exists
108
File exists
109
File exists
110
File exists
111
File exists
112
File exists
113
File exists
114
File exists
115
File exists
116
File exists
117
File exists
118
File exists
119
File exists
120
File exists
121
File exists
122
File exists
123
File exists
124
File exists
125
File exists
126
File exists
127
File exists
128
File exists
129
File exists
130
File exists
131
File exists
132
File exists
133
File exists
134
File exists
135
File exists
136
File exists
137
File exists
138
File exists
139
File exists
140
File exists
141
File exists
142
File exists
143
File exists
144
File exists
145
File exists
146
File exists
147
File exists
148
File e

File exists
629
File exists
630
File exists
631
File exists
632
File exists
633
File exists
634
File exists
635
File exists
636
File exists
637
File exists
638
File exists
639
File exists
640
File exists
641
File exists
642
File exists
643
File exists
644
File exists
645
File exists
646
File exists
647
File exists
648
File exists
649
File exists
650
File exists
651
File exists
652
File exists
653
File exists
654
File exists
655
File exists
656
File exists
657
File exists
658
File exists
659
File exists
660
File exists
661
File exists
662
File exists
663
File exists
664
File exists
665
File exists
666
File exists
667
File exists
668
File exists
669
File exists
670
File exists
671
File exists
672
File exists
673
File exists
674
File exists
675
File exists
676
File exists
677
File exists
678
File exists
679
File exists
680
File exists
681
File exists
682
File exists
683
File exists
684
File exists
685
File exists
686
File exists
687
File exists
688
File exists
689
File exists
690
File exi

File exists
12
File exists
13
File exists
14
File exists
15
File exists
16
File exists
17
File exists
18
File exists
19
File exists
20
File exists
21
File exists
22
File exists
23
File exists
24
File exists
25
File exists
26
File exists
27
File exists
28
File exists
29
File exists
30
File exists
done!
152
1
File exists
2
File exists
3
File exists
4
File exists
5
File exists
6
File exists
7
File exists
8
File exists
9
File exists
10
File exists
11
File exists
12
File exists
13
File exists
14
File exists
15
File exists
16
File exists
17
File exists
18
File exists
19
File exists
20
File exists
21
File exists
22
File exists
23
File exists
24
File exists
25
File exists
26
File exists
27
File exists
28
File exists
29
File exists
30
File exists
31
File exists
32
File exists
33
File exists
34
File exists
35
File exists
36
File exists
37
File exists
38
File exists
39
File exists
40
File exists
41
File exists
42
File exists
43
File exists
44
File exists
45
File exists
46
File exists
47
File exis

File exists
2
File exists
3
File exists
4
File exists
5
File exists
6
File exists
7
File exists
8
File exists
9
File exists
10
File exists
11
File exists
12
File exists
13
File exists
14
File exists
15
File exists
16
File exists
17
File exists
18
File exists
19
File exists
20
File exists
21
File exists
22
File exists
23
File exists
24
File exists
25
File exists
26
File exists
27
File exists
28
File exists
29
File exists
30
File exists
31
File exists
32
File exists
33
File exists
34
File exists
35
File exists
36
File exists
37
File exists
38
File exists
39
File exists
40
File exists
41
File exists
42
File exists
43
File exists
44
File exists
45
File exists
46
File exists
47
File exists
48
File exists
49
File exists
50
File exists
51
File exists
52
File exists
53
File exists
54
File exists
55
File exists
56
File exists
57
File exists
58
File exists
59
File exists
60
File exists
61
File exists
62
File exists
63
File exists
64
File exists
65
File exists
66
File exists
67
File exists
68
Fil

5684925 -> 5684925
https://cbb.census.gov/cbb/#view=report&industries=00&geoType=place&geoId=5684925
done!


In [None]:
#https://cbb.census.gov/cbb/#view=report&industries=00&geoType=place&geoId=2567000