From 4751253227ea010e12d06ffba87e7dfe4ebec02e Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Fri, 26 Jun 2020 11:01:16 -0400 Subject: [PATCH 1/2] Use pkgutil.get_data to get json schema files Signed-off-by: Yohanna Lisnichuk --- kingfisher_scrapy/item_schema/item.json | 1 + kingfisher_scrapy/pipelines.py | 21 +++++++++++---------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/kingfisher_scrapy/item_schema/item.json b/kingfisher_scrapy/item_schema/item.json index 7ded5a014..32021c7f4 100644 --- a/kingfisher_scrapy/item_schema/item.json +++ b/kingfisher_scrapy/item_schema/item.json @@ -1,5 +1,6 @@ { "$schema": "http://json-schema.org/draft-04/schema#", + "$id": "item.json", "definitions": { "KingfisherItem": { "type": "object", diff --git a/kingfisher_scrapy/pipelines.py b/kingfisher_scrapy/pipelines.py index 68172e696..f637b7419 100644 --- a/kingfisher_scrapy/pipelines.py +++ b/kingfisher_scrapy/pipelines.py @@ -1,27 +1,28 @@ # https://docs.scrapy.org/en/latest/topics/item-pipeline.html # https://docs.scrapy.org/en/latest/topics/signals.html#item-signals +import json +import pkgutil -import os -import pathlib - -import jsonref as jsonref from jsonschema import FormatChecker -from jsonschema.validators import Draft4Validator +from jsonschema.validators import Draft4Validator, RefResolver from kingfisher_scrapy.items import File, FileItem class Validate: def __init__(self): + package_name = 'kingfisher_scrapy' + schema_dir = 'item_schema' self.validators = {} self.files = set() self.file_items = set() - schema_path = pathlib.Path(os.path.dirname(os.path.abspath(__file__)), 'item_schema') + base_json = json.loads(pkgutil.get_data(package_name, f'{schema_dir}/item.json')) + resolver = RefResolver.from_schema(base_json) for item in ('File', 'FileError', 'FileItem'): - filename = os.path.join(schema_path, f'{item}.json') - with open(filename) as f: - schema = jsonref.load(f, base_uri=schema_path.as_uri() + '/') - self.validators[item] = Draft4Validator(schema, format_checker=FormatChecker()) + f = pkgutil.get_data(package_name, f'{schema_dir}/{item}.json') + relative_schema = json.loads(f) + self.validators[item] = Draft4Validator(relative_schema, + resolver=resolver, format_checker=FormatChecker()) def process_item(self, item, spider): if hasattr(item, 'validate'): From 6f2da050039bc7f1ff480a38a4e514874d1e06b5 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Fri, 26 Jun 2020 13:37:49 -0400 Subject: [PATCH 2/2] pipelines: Reduce number of local variables --- kingfisher_scrapy/item_schema/item.json | 2 +- kingfisher_scrapy/pipelines.py | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/kingfisher_scrapy/item_schema/item.json b/kingfisher_scrapy/item_schema/item.json index 32021c7f4..9f6fba66d 100644 --- a/kingfisher_scrapy/item_schema/item.json +++ b/kingfisher_scrapy/item_schema/item.json @@ -1,6 +1,6 @@ { - "$schema": "http://json-schema.org/draft-04/schema#", "$id": "item.json", + "$schema": "http://json-schema.org/draft-04/schema#", "definitions": { "KingfisherItem": { "type": "object", diff --git a/kingfisher_scrapy/pipelines.py b/kingfisher_scrapy/pipelines.py index f637b7419..b0935f8fb 100644 --- a/kingfisher_scrapy/pipelines.py +++ b/kingfisher_scrapy/pipelines.py @@ -9,20 +9,20 @@ from kingfisher_scrapy.items import File, FileItem +def _json_loads(basename): + return json.loads(pkgutil.get_data('kingfisher_scrapy', f'item_schema/{basename}.json')) + + class Validate: def __init__(self): - package_name = 'kingfisher_scrapy' - schema_dir = 'item_schema' self.validators = {} self.files = set() self.file_items = set() - base_json = json.loads(pkgutil.get_data(package_name, f'{schema_dir}/item.json')) - resolver = RefResolver.from_schema(base_json) + + resolver = RefResolver.from_schema(_json_loads('item')) + checker = FormatChecker() for item in ('File', 'FileError', 'FileItem'): - f = pkgutil.get_data(package_name, f'{schema_dir}/{item}.json') - relative_schema = json.loads(f) - self.validators[item] = Draft4Validator(relative_schema, - resolver=resolver, format_checker=FormatChecker()) + self.validators[item] = Draft4Validator(_json_loads(item), resolver=resolver, format_checker=checker) def process_item(self, item, spider): if hasattr(item, 'validate'):