From 4018ef3d0d4fd8de9d578abf96060652456f799a Mon Sep 17 00:00:00 2001 From: nossbigg Date: Sat, 15 Jun 2019 16:09:43 +0800 Subject: [PATCH] Standardize processElement to only emit Paragraph tuples --- src/common/config.py | 2 +- src/parsers/contentsParser.py | 9 ++++----- src/validators/validators.py | 10 +++------- 3 files changed, 8 insertions(+), 13 deletions(-) diff --git a/src/common/config.py b/src/common/config.py index 3fef321..e5e305a 100644 --- a/src/common/config.py +++ b/src/common/config.py @@ -7,4 +7,4 @@ ABBREVIATIONS_SAVE_PATH = os.path.join(DATA_SAVE_PATH, "abbreviations.html") JSON_STORE_PATH = os.path.join(DATA_SAVE_PATH, "ccc.json") -JSON_STORE_VERSION = '0.0.1' \ No newline at end of file +JSON_STORE_VERSION = '0.0.2' \ No newline at end of file diff --git a/src/parsers/contentsParser.py b/src/parsers/contentsParser.py index 32cdcbb..a0961c1 100644 --- a/src/parsers/contentsParser.py +++ b/src/parsers/contentsParser.py @@ -17,7 +17,7 @@ def extractStructuredContents(raw_nodes): def processElement(node): if node.name == 'br': - return [createSpacerElement()] + return [createEmptyParagraph()] if node.name != 'p': return [] @@ -83,11 +83,7 @@ def transformCCCReferenceLine(paragraph): def hasCCCReferenceLine(paragraph): - if not isinstance(paragraph, Paragraph): - return False - first_element = paragraph.elements[0] - if 'text' not in first_element: return False @@ -146,6 +142,9 @@ def createParagraph(node, children): return Paragraph(children, attrs) +def createEmptyParagraph(): + return Paragraph([createSpacerElement()], {}) + def isIndentedParagraph(node): style = node.get('style') diff --git a/src/validators/validators.py b/src/validators/validators.py index 984423a..4def093 100644 --- a/src/validators/validators.py +++ b/src/validators/validators.py @@ -1,15 +1,11 @@ -from parsers.contentsParser import Paragraph - - def validate_has_all_ccc_refs(page_nodes_dict): ccc_refs = {} for page in page_nodes_dict.values(): for paragraph in page.paragraphs: - if isinstance(paragraph, Paragraph): - for element in paragraph.elements: - if element['type'] == 'ref-ccc': - ccc_refs[element['ref_number']] = '' + for element in paragraph.elements: + if element['type'] == 'ref-ccc': + ccc_refs[element['ref_number']] = '' expected_num_ccc_refs = 2865 missing_refs = []