Skip to content

Commit

Permalink
Standardize processElement to only emit Paragraph tuples
Browse files (browse the repository at this point in the history)
  • Loading branch information
nossbigg committed Aug 24, 2019
1 parent 8034f8c commit 4018ef3
Show file tree
Hide file tree
Showing 3 changed files with 8 additions and 13 deletions.
2 changes: 1 addition & 1 deletion src/common/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,4 @@
ABBREVIATIONS_SAVE_PATH = os.path.join(DATA_SAVE_PATH, "abbreviations.html")

JSON_STORE_PATH = os.path.join(DATA_SAVE_PATH, "ccc.json")
JSON_STORE_VERSION = '0.0.1'
JSON_STORE_VERSION = '0.0.2'
9 changes: 4 additions & 5 deletions src/parsers/contentsParser.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def extractStructuredContents(raw_nodes):

def processElement(node):
if node.name == 'br':
return [createSpacerElement()]
return [createEmptyParagraph()]

if node.name != 'p':
return []
Expand Down Expand Up @@ -83,11 +83,7 @@ def transformCCCReferenceLine(paragraph):


def hasCCCReferenceLine(paragraph):
if not isinstance(paragraph, Paragraph):
return False

first_element = paragraph.elements[0]

if 'text' not in first_element:
return False

Expand Down Expand Up @@ -146,6 +142,9 @@ def createParagraph(node, children):

return Paragraph(children, attrs)

def createEmptyParagraph():
    """Return a Paragraph holding a single spacer element and an empty attrs dict."""
    spacer = createSpacerElement()
    return Paragraph([spacer], {})


def isIndentedParagraph(node):
style = node.get('style')
Expand Down
10 changes: 3 additions & 7 deletions src/validators/validators.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,11 @@
from parsers.contentsParser import Paragraph


def validate_has_all_ccc_refs(page_nodes_dict):
ccc_refs = {}

for page in page_nodes_dict.values():
for paragraph in page.paragraphs:
if isinstance(paragraph, Paragraph):
for element in paragraph.elements:
if element['type'] == 'ref-ccc':
ccc_refs[element['ref_number']] = ''
for element in paragraph.elements:
if element['type'] == 'ref-ccc':
ccc_refs[element['ref_number']] = ''

expected_num_ccc_refs = 2865
missing_refs = []
Expand Down

0 comments on commit 4018ef3

Please sign in to comment.