-
Notifications
You must be signed in to change notification settings - Fork 2
/
codes.py
386 lines (309 loc) · 13.3 KB
/
codes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
"""Classes representing legislation."""
from __future__ import annotations
import datetime
import functools
import re
from typing import List, Optional, Sequence, Tuple, Union
from anchorpoint.textselectors import TextQuoteSelector, TextPositionSelector
from bs4 import BeautifulSoup
from authorityspoke.utils.roman import from_roman
class Code:
    r"""
    A code of legislation.

    Could be a constitution, code of statutes, code of regulations,
    or collection of court rules.

    Each instance of this class depends on an XML file containing
    the code, so every new XML format will require adding a method
    to this class to ingest it.

    :param xml:
        A BeautifulSoup object created by parsing the
        ``Code``\'s XML file

    :param title:
        A name for the :class:`Code` that may have been found in
        an XML ``title`` element. For larger codes like the United
        States Code, this should identify the title number, and the
        :class:`Code` object will contain text only from that title.

    :param uri:
        The `United States Legislative Markup (USLM)
        <https://github.com/usgpo/uslm>`_ identifier that
        describes the document as a whole, if available.
        Otherwise, should be a pseudo-USLM identifier.
    """

    # namespaces for legislative XML schemas
    ns = {
        "uslm": "http://xml.house.gov/schemas/uslm/1.0",
        "dc": "http://purl.org/dc/elements/1.1/",
        "xhtml": "http://www.w3.org/1999/xhtml",
    }

    def __init__(self, xml, title: str, uri: str):
        """Link an entire XML tree to the Code."""
        self.xml = xml
        self.title = title
        self.uri = uri

    @property
    def jurisdiction(self) -> str:
        """
        Get string representing the jurisdiction from within ``uri``.

        :returns:
            The abbreviation for the jurisdiction that
            enacted the ``Code``, in USLM-like format.

            e.g. ``us`` for U.S. federal laws,
            ``us-ca`` for California state laws.
        """
        # ``uri`` starts with a slash, so element 0 of the split is empty
        # and element 1 is the jurisdiction field.
        return self.uri.split("/")[1]

    @property
    def level(self) -> str:
        """
        Get level of legislation for this Code, e.g. "statute".

        The level is inferred from keywords in ``title``.

        :returns:
            "constitution", "statute", or "regulation"
        """
        if "Constitution" in self.title:
            return "constitution"
        if "Regulations" in self.title:
            return "regulation"
        return "statute"

    def _sections_or_raise(self, path: Optional[str]):
        """
        Get the sections at ``path``, raising if the lookup found no node.

        :param path:
            a path to a section, as accepted by :meth:`get_sections`

        :raises ValueError:
            if :meth:`get_sections` returns ``None``

        :returns:
            the XML elements found by :meth:`get_sections`
        """
        sections = self.get_sections(path)
        # Only ``None`` (node not found) is an error here; an empty
        # result set still yields an empty string downstream.
        if sections is None:
            raise ValueError(f"Section {path} does not exist in code {self.title}")
        return sections

    def text_interval(
        self, selector: Optional["TextQuoteSelector"] = None, path: str = ""
    ) -> Optional["TextPositionSelector"]:
        """
        Find integer indices for the quoted text.

        :param selector:
            a selector quoting a passage within the section at ``path``.
            If omitted, the interval covers the section's whole text.

        :param path:
            a path to the section containing the quoted text

        :raises ValueError:
            if no sections are found at ``path``

        :returns:
            the lower and upper bounds of the quoted passage within
            the text of the section identified by ``path``.
        """
        sections = self.get_sections(path)
        if not sections:
            raise ValueError(f"Section {path} does not exist in code {self.title}")
        text = self.section_text(sections)
        if not selector:
            # NOTE(review): this branch returns a plain tuple although the
            # annotation promises a TextPositionSelector -- confirm what
            # callers expect before changing it.
            return (0, len(text))
        return selector.as_position(text)

    def provision_effective_date(self, cite: str) -> datetime.date:
        """Give effective date for a provision within the Code."""
        raise NotImplementedError

    def get_exact_from_source(
        self, source: str, selector: "TextQuoteSelector"
    ) -> Optional[str]:
        """
        Use ``source`` to find text for ``exact`` parameter.

        :param source:
            path to a cited section or node, which may contain subsections

        :param selector:
            selector for the cited text passage within the cited node

        :raises ValueError:
            if ``source`` does not identify a node in this Code

        :returns:
            the result of applying ``selector`` to the section's text
        """
        sections = self._sections_or_raise(source)
        return selector.select_text(self.section_text(sections))

    def make_docpath(self, path: str = "") -> str:
        """
        Remove Code identifier from path to get a path relative to the document.

        Only a leading ``uri`` that ends on a path-segment boundary is
        removed, so a path into a different document whose identifier
        merely begins with the same characters is returned unchanged.

        :param path:
            a path that may begin with this Code's ``uri``. An empty
            path refers to the whole Code.

        :returns:
            the path without the leading ``uri``, or ``path`` unchanged
            if it does not start with ``uri`` (it could be relative to
            the Code already). An empty string means the whole Code.
        """
        docpath = path or self.uri
        if docpath == self.uri:
            return ""
        if self.uri and docpath.startswith(self.uri):
            # docpath is strictly longer than uri here, so the index is safe.
            on_boundary = self.uri.endswith("/") or docpath[len(self.uri)] == "/"
            if on_boundary:
                return docpath[len(self.uri):]
        return docpath  # path could be relative to the Code already

    def _get_sections_from_relative_path(
        self, docpath: str
    ) -> Optional["BeautifulSoup"]:
        """
        Get sections using relative path, assuming Code follows USLM standard.

        :param docpath:
            a path relative to the Code. If it ends with ``chapeau`` or
            ``continuation``, that final segment is split off and only
            elements with that name are collected.

        :returns:
            the matching passage elements under the node whose
            ``identifier`` equals ``docpath``, or ``None`` if there is
            no such node.
        """
        passage_nodes = ["chapeau", "paragraph", "content", "continuation"]
        if docpath.endswith(("chapeau", "continuation")):
            docpath, suffix = docpath.rsplit("/", maxsplit=1)
            passage_nodes = [suffix]
        sections = self.xml.find(identifier=docpath)
        if not sections:
            return None
        return sections.find_all(passage_nodes)

    def get_sections(self, path: str = "") -> Optional["BeautifulSoup"]:
        r"""
        Get sections identified by a path, if present in the :class:`Code`\.

        :param path:
            a path string, in the format used for :class:`.Enactment`
            objects, to the section with the text to be returned.

        :returns:
            the XML elements holding the text of the section, or
            ``None`` if the path does not match a node.
        """
        docpath = self.make_docpath(path)
        if not docpath:  # selecting the whole Code
            return self.xml.find_all(name="text")
        return self._get_sections_from_relative_path(docpath)

    @staticmethod
    def section_text(passages: Sequence["BeautifulSoup"]) -> str:
        """
        Get the text of legislative sections from XML elements.

        Runs of whitespace inside each element are collapsed to single
        spaces, and the elements' texts are joined with single spaces.

        :param passages:
            a sequence of XML elements with text to join

        :returns:
            the text of the XML elements.
        """
        return " ".join(" ".join(passage.text.split()) for passage in passages)

    def section_text_from_path(self, path: str = "") -> str:
        """
        Get the text of legislative sections from a path identifier.

        :param path:
            a path string, in the format used for :class:`.Enactment`
            objects, to the section with the text to be returned.

        :raises ValueError:
            if ``path`` does not identify a node in this Code

        :returns:
            the text of the XML elements.
        """
        return self.section_text(self._sections_or_raise(path))

    def select_text_from_interval(
        self, interval: "TextPositionSelector", path: Optional[str] = None,
    ) -> Optional[str]:
        """
        Select text as interval of section identified by path.

        If ``path`` parameter is not given, selects an interval from the
        whole :class:`Code`.

        :param interval:
            the indices of the slice of the section text to be selected.

        :param path:
            a path to the section that text should be selected from

        :raises ValueError:
            if no sections are found at ``path``

        :returns:
            a slice of text from the section identified by ``path``
        """
        sections = self.get_sections(path)
        if not sections:
            raise ValueError(f"Section {path} does not exist in {self}.")
        section_text = self.section_text(sections)
        return interval.passage(section_text)

    def select_text(
        self, path: Optional[str], selector: Optional["TextQuoteSelector"] = None
    ) -> Optional[str]:
        r"""
        Get text from the ``Code`` using a :class:`.TextQuoteSelector`.

        :param path:
            a path to the section that text should be selected from

        :param selector:
            a selector referencing a text passage in the ``Code``.
            If omitted, the whole text of the section is returned.

        .. note::
            When handling Code of Federal Regulation (CFR) :class:`.Enactment`\s,
            this can only select from the whole document or from Sections,
            not Subsections or any other level. Still hoping to be able
            to switch to a `United States Legislative Markup (USLM)
            <https://github.com/usgpo/uslm>`_-like XML format for CFR.

        :raises ValueError:
            if ``path`` does not identify a node in this Code

        :returns:
            the text referenced by the selector, or ``None`` if the text
            can't be found.
        """
        text = self.section_text(self._sections_or_raise(path))
        if not selector:
            return text
        return selector.select_text(text)

    def __repr__(self):
        return f"{self.__class__.__name__}({self.title})"

    def __str__(self):
        return self.title
class USConstCode(Code):
    """A :class:`Code` whose section identifiers follow the US Constitution's USLM file."""

    # Patterns locating the date inside the text of an amendment's ``note``.
    _MONTH_FIRST_DATE = re.compile(
        r"(?:Secretary of State|Administrator of General Services|certificate of the Archivist)"
        r"(?: accordingly issued a proclamation)?,? dated (\w+ \d\d?, \d{4}),"
    )
    # The day and the rest of the date are captured separately so that any
    # ordinal suffix ("1st", "2nd", "3rd", "8th") can be dropped before parsing.
    _DAY_FIRST_DATE = re.compile(
        r"(?:Congress|Secretary of State),? dated the (\d\d?)(?:st|nd|rd|th) of (\w+, \d{4}),"
    )

    def format_uri_for_const(self, uri: str) -> str:
        """
        Convert ``uri`` to identifier for constitution sections.

        Although the US Constitution is published in a format
        described as USLM, its section identifier strings differ from
        those in USC USLM documents in that they skip the jurisdiction
        and code fields, skip the initial slash character,
        start with the section field, and convert all remaining
        slashes to hyphens.

        This will only remove the path to the current code if
        the current code is the same one referenced in the URI.
        So the example below assumes the current ``Code`` object
        has ``self.uri == "/us/const"``

        :param uri:
            an identifier in a format consistent with USC USLM
            documents, e.g. ``/us/const/amendment/XIV/1``

        :returns:
            an identifier in a format found in the USLM version of
            the federal constitution, e.g. ``amendment-XIV-1``
        """
        return uri.replace(self.uri, "").lstrip("/").replace("/", "-")

    def get_fed_const_section(self, path: str) -> Optional[BeautifulSoup]:
        """
        Get a section from a USLM identifier if ``self`` is the US Constitution.

        :param path:
            a USLM path to a section of the US Constitution, with or
            without the part that identifies the :class:`Code`, but without
            a namespace declaration.

        :returns:
            the XML section of ``self`` matching the path, or ``None``
            if no element has the converted identifier as its ``id``.
        """
        # format_uri_for_const leaves no slashes behind, so the converted
        # identifier is looked up directly as a single ``id``.
        return self.xml.find(id=self.format_uri_for_const(path))

    def _get_sections_from_relative_path(self, docpath: str) -> Optional[BeautifulSoup]:
        """
        Get the ``text`` elements of the constitution section at ``docpath``.

        :param docpath:
            a path to a section, with or without this Code's ``uri``

        :returns:
            the ``text`` elements inside the cited section, or ``None``
            if the section is not found.
        """
        cited_section = self.get_fed_const_section(docpath)
        if cited_section is None:
            return None
        return cited_section.find_all(name="text")

    def provision_effective_date(self, cite: str) -> datetime.date:
        """
        Give effective date for a provision within the Code.

        So far this method only covers the US Constitution and it
        assumes that the XML format is `United States Legislative
        Markup (USLM) <https://github.com/usgpo/uslm>`_.

        Provisions that are not amendments get the date 1788-09-13 and
        amendments numbered below 11 get 1791-12-15. For later
        amendments the date is read from the text of the ``note``
        element accompanying the amendment.

        :param cite:
            a string or selector representing the XML element name for the
            the legislative provision within this ``Code``.

        :raises ValueError:
            if the cited amendment is not in the XML, or no date can be
            found in its note.

        :returns:
            the effective date of the cited provision
        """
        cite = self.format_uri_for_const(cite)
        if "amendment" not in cite.lower():
            return datetime.date(1788, 9, 13)

        # Identifier looks like ``amendment-XIV-1``; field 1 is the number.
        roman_numeral = cite.split("-")[1]
        amendment_number = from_roman(roman_numeral)
        if amendment_number < 11:
            return datetime.date(1791, 12, 15)

        section = self.xml.find(id=cite)
        if section is None:
            raise ValueError(f"Provision {cite} does not exist in {self}.")
        # The note is looked up on the element itself when it is a
        # ``level`` element, otherwise on its parent.
        note_holder = section if section.name == "level" else section.parent
        enactment_text = note_holder.find("note").p.text

        result = self._MONTH_FIRST_DATE.search(enactment_text)
        if result:
            return datetime.datetime.strptime(result.group(1), "%B %d, %Y").date()

        result = self._DAY_FIRST_DATE.search(enactment_text)
        if result is None:
            raise ValueError(f"No effective date found in the note for {cite}.")
        day, month_and_year = result.groups()
        return datetime.datetime.strptime(
            f"{day} {month_and_year}", "%d %B, %Y"
        ).date()
class USLMCode(Code):
    """A :class:`Code` whose XML identifiers are full USLM paths."""

    def make_docpath(self, path: str = "") -> str:
        """
        Don't remove Code uri from path because USC uses full paths as identifiers.

        :param path:
            a full path to a provision. An empty or ``None`` path
            refers to the whole Code.

        :returns:
            ``path`` without trailing slashes, or this Code's ``uri``
            when nothing is left of ``path``.
        """
        # Inherited methods may pass ``None`` for "no path", so guard
        # before calling a str method on it.
        return (path or "").rstrip("/") or self.uri
class USCCode(USLMCode):
    """A :class:`USLMCode` that labels itself with a ``USC`` prefix."""

    def __str__(self):
        """Give the title prefixed with ``USC``."""
        return "USC {}".format(self.title)
class CFRCode(Code):
    """A :class:`Code` whose sections are located by ``SECTNO`` elements."""

    def _get_sections_from_relative_path(self, docpath: str) -> Optional[BeautifulSoup]:
        """
        Get the paragraphs of the section named in ``docpath``.

        :param docpath:
            a path relative to the Code, whose second slash-separated
            field names the section, e.g. ``/s202.1``

        :returns:
            the ``P`` elements under the parent of the ``SECTNO`` element
            whose text is ``§`` followed by the section number, or
            ``None`` if there is no such ``SECTNO`` element.
        """
        # Field 1 is the section field; the "s" marker is stripped from it.
        section_number = docpath.split("/")[1].strip("s")
        heading = self.xml.find(name="SECTNO", text=f"§ {section_number}")
        return heading.parent.find_all(name="P") if heading else None
class CalCode(Code):
    """A :class:`Code` in the `California statute XML format <http://leginfo.legislature.ca.gov/>`_."""

    def _get_sections_from_relative_path(self, docpath: str) -> Optional[BeautifulSoup]:
        """
        Get the passages of the section named in ``docpath``.

        Uses `California statute XML format <http://leginfo.legislature.ca.gov/>`_.

        :param docpath:
            a path relative to the Code, whose second slash-separated
            field names the section, e.g. ``/s351``

        :returns:
            the following siblings, having the expected inline style, of
            the parent of the element whose ``href`` labels this section,
            or ``None`` if no element's ``href`` matches.
        """
        # Field 1 is the section field; the "s" marker is stripped from it.
        section = docpath.split("/")[1].strip("s")
        # Escape the section number so characters such as "." in it are
        # matched literally, and compile once instead of once per element.
        section_href = re.compile(
            r"^javascript:submitCodesValues\('" + re.escape(section)
        )

        def cal_href(href):
            """Test if XML element is labeled as the text of the section."""
            return href and section_href.search(href)

        label = self.xml.find(href=cal_href)
        if label is None:
            return None
        return label.parent.find_next_siblings(style="margin:0;display:inline;")