-
Notifications
You must be signed in to change notification settings - Fork 0
/
fetchcfr.py
326 lines (290 loc) · 14.3 KB
/
fetchcfr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
# -*- coding: utf-8 -*-
# Copyright © 2016 Climate Pty Ltd
# This script is part of SharedSDS free open source GHS classification software
# SharedSDS is free software subject to the terms of the GNU GPL v3
# Generously repaired by Clinton Hall on 9 June 2016
import os
import requests # you need to "pip install requests"
# URL for the USA DG list website
cfr_url = 'http://www.ecfr.gov/cgi-bin/text-idx?SID=9bf53f6236da46a52884a2432a775a3e&mc=true&node=pt49.2.172&rgn=div5#se49.2.172_1101'
# Directory that receives cfr.html, cfr.csv and cfr.txt
cfrpath = "/var/www/data/ssds/us/"
# Just try to create the directory instead of checking first, so there is
# no window between the check and the creation. An OSError is only a real
# problem when the directory still does not exist afterwards.
try:
    os.makedirs(cfrpath)
except OSError:
    if not os.path.isdir(cfrpath):
        raise
# Seconds the server may stay silent before the request is abandoned.
_CFR_TIMEOUT = 120
# Spellings of the non-breaking space that may pad otherwise blank cells.
_NBSP_FORMS = (u"\xa0", "&nbsp;", "&#160;")
# Opening markup of the leading (symbol) cell that marks the first DG row.
_DG_FIRST_CELL = '<tr><td align="center" class="gpotbl_cell" scope="row">'
# Row inserted between the DG table and the marine pollutant table.
_MARPOL_SEPARATOR = "<tr><td> </td><td>Marine pollutants (First column PP = Severe marine pollutant)</td></tr>\n"
# Spelling tweaks, applied in this order.
_SPELLING = (
    ("ALUMINUM", "ALUMINIUM"),
    ("luminum", "luminium"),
    ("SULF", "SULPH"),
    ("sulf", "sulph"),
    ("poisonous", "toxic"),
    ("poison", "toxic"),
)
# Attribute clutter removed from every row, in this order.
_ROW_CLUTTER = (
    ' class="gpotbl_cell"',
    ' align="left"',
    ' align="center"',
    ' align="right"',
    ' scope="row"',
    ';font-weight:inherit',
)


def _strip_nbsp(text):
    """Return text with every spelling of the non-breaking space removed."""
    for form in _NBSP_FORMS:
        text = text.replace(form, "")
    return text


def _is_first_dg_row(line):
    """Tell whether line holds a row whose leading symbol cell is only padding."""
    _, opening, rest = line.partition(_DG_FIRST_CELL)
    if not opening:
        return False
    content, closing, _ = rest.partition("</td>")
    if not closing or not content:
        return False
    # padding may be a plain space or any spelling of the non-breaking space
    return not _strip_nbsp(content).strip()


def _clean_row(row):
    """Return one data row respelled, stripped of clutter and re-cased."""
    for found, wanted in _SPELLING:
        row = row.replace(found, wanted)
    # the font-weight clutter must go before the italic span can be matched
    for clutter in _ROW_CLUTTER:
        row = row.replace(clutter, "")
    # only the padding goes; ordinary spaces inside names and tags must stay
    row = _strip_nbsp(row)
    # swap the italic tags for <desc> tags
    row = row.replace('<span style="font-style:italic">', "<desc>")
    row = row.replace("</span>", "</desc>")
    # swap roman to upper-case and leave italic alone
    row = changecase(row)
    # put back the closing </tr> that the split removed and add \n
    return "%s</tr>\n" % row


def _extract_rows(html):
    """Return the cleaned rows of the DG table and the marine pollutant table."""
    current = ""
    started = False
    tables = 0
    lines = []
    # splitting on </tr> leaves at most one table row in each piece
    for line in html.split("</tr>"):
        line = line.strip()
        if not line:
            continue
        if not started:
            if '<p class="updated">' in line:
                current = line.split('<p class="updated">')[1].split("</p>")[0]
            if _is_first_dg_row(line):
                started = True
                print("Got the 172.101 list - %s" % current)
                print("Tweaking spelling")
            elif '<p class="gpotbl_title">List of Marine Pollutants</p>' in line:
                started = True
            else:
                # don't start until the beginning of a wanted table
                continue
        if '</table></div></div>' in line:
            tables += 1
            started = False
            # stop the loop at the end of the second table
            if tables == 2:
                break
            # separate the tables
            lines.append(_MARPOL_SEPARATOR)
            print("Got down to the MARPOL section")
            print("Tweaking MARPOL spelling as well")
        if 'class="gpotbl_cell" scope="row"' in line:
            lines.append(_clean_row(line))
    return lines


def _fill_blank_fields(lines):
    """Copy the first four cells down into rows where the name cell is blank."""
    savesymbol = savepsn = savehclass = saveunno = ""
    rows = []
    for line in lines:
        symbol, psn, hclass, unno = getfourfields(line)
        if psn:
            # remember the values for the blank rows that follow
            savesymbol, savepsn, savehclass, saveunno = symbol, psn, hclass, unno
        else:
            line = putfourfields(savesymbol, savepsn, savehclass, saveunno, line)
        rows.append(line)
    return rows


def _txt_record(line):
    """Return an html row as one ~ delimited record for database import."""
    for tag, replacement in (("<tr>", ""), ("</td>", "~"), ("<td/>", "~"),
                             ("<td>", ""), ("</tr>", "")):
        line = line.replace(tag, replacement)
    return line


def _csv_record(line):
    """Return an html row as one line of quoted, comma separated cells."""
    for tag, replacement in (("<tr>", ''), ("<td>", '"'), ("</td>", '", '),
                             ("<td/>", '"", '), ("</tr>", '')):
        line = line.replace(tag, replacement)
    return line


def fetchcfr(cfrurl=cfr_url, outfile=os.path.join(cfrpath, "cfr.html")):
    """ This function fetches the page and scans it for the DG list and
    writes that out in three local files being cfr.html, cfr.csv and
    cfr.txt which is in a format easy to import into a database. This
    all updates itself from the source whenever it is run.

    The USA table has roman font for PSN and italics for descriptions
    where the Orange book uses upper and lower case respectively. So we
    massage it accordingly.

    cfrurl is the address of the page to fetch.
    outfile is the path of the browser viewable html file. cfr.txt and
    cfr.csv are always written into the cfrpath directory.

    Returns True once all three files are written. An HTTP error status
    raises the exception of requests' raise_for_status() before any
    file is touched, so an error page cannot replace good data.
    """
    print("\nPlease wait while the USA psn list is retrieved")
    # requests is a very nice package
    response = requests.get(cfrurl, timeout=_CFR_TIMEOUT)
    response.raise_for_status()
    # response.text contains the page source html
    lines = _extract_rows(response.text)
    # The lines list is only DG content without column headers. But
    # inspection of output reveals multiple entries for the same
    # substance where the difference is the Packing Group and the
    # first four cells are blank. We need to fill those fields in.
    rows = _fill_blank_fields(lines)
    # First the importable file cfr.txt
    txtpath = os.path.join(cfrpath, "cfr.txt")
    print("Writing %s" % txtpath)
    with open(txtpath, 'w') as txtfile:
        for line in rows:
            record = _txt_record(line)
            bits = record.split("~")
            if len(bits) > 3 and bits[3]:  # skip line if no unno
                txtfile.write(record)
    # Next the csv file cfr.csv with a simple header
    csvheader = '"(1)", "(2)", "(3)", "(4)", "(5)", "(6)", "(7)", "(8A)", "(8B)", "(8C)", "(9A)", "(9B)", "(10A)", "(10B)", "",\n'
    csvpath = os.path.join(cfrpath, "cfr.csv")
    print("Writing %s" % csvpath)
    with open(csvpath, 'w') as csvfile:
        csvfile.write(csvheader)
        csvfile.writelines(_csv_record(line) for line in rows)
    # Finally the browser viewable page with column headers.
    # These table header rows are copied directly from the website page source
    header1 = '<tr><th rowspan="3">Symbols</th><th rowspan="3">Hazardous materials descriptions and proper shipping names</th><th rowspan="3">Hazard class or Division</th><th rowspan="3">Identification Numbers</th><th rowspan="3">PG</th><th rowspan="3">Label Codes</th><th rowspan="3">Special provisions<br/>(§172.102)</th><th colspan="3">(8)</th><th colspan="2">(9)</th><th colspan="2">(10)<br/>Vessel<br/>stowage</th></tr>\n'
    header2 = '<tr><th colspan="3">Packaging<br/>(§173.***)</th><th colspan="2">Quantity limitations<br/>(see §§173.27 and 175.75)</th><th rowspan="2">Location</th><th rowspan="2">Other</th></tr>\n'
    header3 = '<tr><th>Exceptions</th><th>Non-bulk</th><th>Bulk</th><th>Passenger aircraft/rail</th><th>Cargo aircraft only</th></tr>\n'
    header4 = '<tr><td>(1)</td><td>(2)</td><td>(3)</td><td>(4)</td><td>(5)</td><td>(6)</td><td>(7)</td><td>(8A)</td><td>(8B)</td><td>(8C)</td><td>(9A)</td><td>(9B)</td><td>(10A)</td><td>(10B)</td></tr>\n'
    document = ['<html><head></head><body><table border="1px" width="96%">\n',
                header1, header2, header3, header4]
    document.extend(rows)
    document.append("</table></body></html>\n")
    print("Writing %s" % outfile)
    # the page is opened only now so a failure above leaves the old one intact
    with open(outfile, "w") as page:
        page.writelines(document)
    print("Done")
    return True
def changecase(row):
    """
    row is an entire <tr> row which starts with <tr> but excludes the
    closing </tr> tag because we split on that in fetchcfr(). So now we
    split the row on the opening <td> cell tag due to a variety of
    closing cell tags such as </td> and <td/>. Counting from zero that
    puts the symbol cell at element #1 and the name cell at element #2.

    The name cell is cased by dedesc() when it holds a <desc> tag and
    by uppercase() otherwise. When the symbol cell contains a G the
    name gets " (...)" added before its closing tag. Cell #6, when
    present and not empty, has its subrisk labels put in parentheses
    by labelparens().

    A row with fewer than three pieces has no name cell. It is returned
    with only the closing tag clean-up applied instead of raising an
    IndexError.

    Returns the reassembled row.
    """
    # split the <tr> row into <td> cells
    cells = row.split("<td>")  # desc can only be in cells[2]
    if len(cells) > 2:
        if "<desc>" in cells[2]:
            cells[2] = dedesc(cells[2])
        else:
            cells[2] = uppercase(cells[2])
        # rows beginning G (mostly N.O.S.) require a technical chemical name
        # so we add (...) which will probably be manually replaced later.
        # The closing tag is upper case here because the casing above
        # upper-cased it together with the name.
        if "G" in cells[1]:
            cells[2] = cells[2].replace("</TD>", " (...)</td>")
    # put parens around the subrisk labels in cells[6]
    if len(cells) > 6:
        label = cells[6].replace("</td>", "")
        if label:
            cells[6] = "{0}</td>".format(labelparens(label))
    # reassemble the row and restore the tags that were upper-cased
    return "<td>".join(cells).replace("</TD>", "</td>").replace("<TD/>", "<td/>")
def dedesc(cell2):
    """ Case a name cell that holds one or more <desc> tags.

    The cell is consumed from the left in alternating fragments:
    Starts with <desc>? --> everything up to the first </desc> is a
                            description and is kept as written, minus
                            the tags
    No?                 --> everything up to the next <desc> is proper
                            shipping name and goes through uppercase()
    All spaces and punctuation are retained. Returns the fragments
    joined back together.
    """
    assert "<desc>" in cell2
    fragments = []
    remainder = cell2  # cell2 itself is never altered
    while remainder:
        if remainder.startswith("<desc>"):
            # take the description and drop its closing tag
            description, _, remainder = remainder.partition("</desc>")
            # now drop the opening tag as well
            fragments.append(description.replace("<desc>", ""))
        else:
            # the text ahead of the next <desc> is PSN
            psn, tag, tail = remainder.partition("<desc>")
            fragments.append(uppercase(psn))
            # keep the tag so the next pass starts with <desc>
            remainder = tag + tail
    if not fragments:
        return cell2
    return "".join(fragments)
def getfourfields(line):
    """ Return the text of the first four cells of an html row as the
    tuple (symbol, psn, hclass, unno).

    The row is split on <td> so element #0 is just <tr> and the wanted
    cells are elements #1 to #4. A cell which is missing or does not
    end with </td> yields an empty string.
    """
    fields = ["", "", "", ""]
    wanted = line.split("<td>")[1:5]
    for position, cell in enumerate(wanted):
        if cell.endswith("</td>"):
            fields[position] = cell.replace("</td>", "")
    return tuple(fields)
def labelparens(cell6):
    """ Put parentheses around every label after the first one.

    cell6 is a comma separated list of labels. The first label is kept
    as it is and each later one is stripped of surrounding spaces and
    wrapped in parentheses. A single label is returned unchanged.
    """
    assert cell6
    labels = cell6.split(",")
    if len(labels) < 2:
        return cell6
    wrapped = ["({0})".format(label.strip()) for label in labels[1:]]
    return "{0} {1}".format(labels[0], " ".join(wrapped))
def putfourfields(symbol, psn, hclass, unno, line):
    """ Return line with its first four cells replaced by the given
    values.

    The row is split on <td> so element #0 is just <tr>. Each value has
    its trailing </td> restored. A row with fewer than four cells is
    returned as it was.
    """
    pieces = line.split("<td>")
    if len(pieces) > 4:
        pieces[1:5] = ["%s</td>" % value for value in (symbol, psn, hclass, unno)]
    return "<td>".join(pieces)
def uppercase(psn):
    """ Upper case with a twist. Whatever stands before the first hyphen
    goes back to lower case when it is at most two characters long,
    which covers a prefix of n- or p- with or without a leading space.
    Only the first hyphen is looked at.
    """
    shouted = psn.upper()
    prefix, hyphen, remainder = shouted.partition("-")
    if hyphen and len(prefix) <= 2:
        # short prefix such as " n" or "p" - restore it as lower case
        return "".join((prefix.lower(), hyphen, remainder))
    return shouted
# When run as a script (not imported) fetch the list using the default
# URL and the default output file.
if __name__ == "__main__":
    fetchcfr()