diff --git a/kingfisher_scrapy/exceptions.py b/kingfisher_scrapy/exceptions.py
index f1f0e191..080eded2 100644
--- a/kingfisher_scrapy/exceptions.py
+++ b/kingfisher_scrapy/exceptions.py
@@ -10,9 +10,5 @@ class SpiderArgumentError(KingfisherScrapyError):
     """Raised when a spider argument's value is invalid"""


-class MissingRequiredFieldError(KingfisherScrapyError, KeyError):
-    """Raised when an item is missing a required field"""
-
-
 class MissingNextLinkError(KingfisherScrapyError):
     """Raised when a next link is not found on the first page of results"""
diff --git a/kingfisher_scrapy/item_schema/File.json b/kingfisher_scrapy/item_schema/File.json
new file mode 100644
index 00000000..c726798e
--- /dev/null
+++ b/kingfisher_scrapy/item_schema/File.json
@@ -0,0 +1,20 @@
+{
+  "$schema": "http://json-schema.org/draft-04/schema#",
+  "allOf": [
+    {
+      "$ref": "item.json#/definitions/KingfisherFileItem"
+    }
+  ],
+  "type": "object",
+  "properties": {
+    "post_to_api": {
+      "type": "boolean"
+    },
+    "path": {
+      "type": "string"
+    },
+    "files_store": {
+      "type": "string"
+    }
+  }
+}
diff --git a/kingfisher_scrapy/item_schema/FileError.json b/kingfisher_scrapy/item_schema/FileError.json
new file mode 100644
index 00000000..8f1b935d
--- /dev/null
+++ b/kingfisher_scrapy/item_schema/FileError.json
@@ -0,0 +1,18 @@
+{
+  "$schema": "http://json-schema.org/draft-04/schema#",
+  "allOf": [
+    {
+      "$ref": "item.json#/definitions/KingfisherItem"
+    }
+  ],
+  "type": "object",
+  "properties": {
+    "errors": {
+      "type": "string",
+      "minLength": 1
+    }
+  },
+  "required": [
+    "errors"
+  ]
+}
diff --git a/kingfisher_scrapy/item_schema/FileItem.json b/kingfisher_scrapy/item_schema/FileItem.json
new file mode 100644
index 00000000..3e49413f
--- /dev/null
+++ b/kingfisher_scrapy/item_schema/FileItem.json
@@ -0,0 +1,18 @@
+{
+  "$schema": "http://json-schema.org/draft-04/schema#",
+  "allOf": [
+    {
+      "$ref": "item.json#/definitions/KingfisherFileItem"
+    }
+  ],
+  "type": "object",
+  "properties": {
+    "number": {
+      "type": "integer",
+      "minimum": 1
+    }
+  },
+  "required": [
+    "number"
+  ]
+}
diff --git a/kingfisher_scrapy/item_schema/item.json b/kingfisher_scrapy/item_schema/item.json
new file mode 100644
index 00000000..9d96c7f1
--- /dev/null
+++ b/kingfisher_scrapy/item_schema/item.json
@@ -0,0 +1,63 @@
+{
+  "$schema": "http://json-schema.org/draft-04/schema#",
+  "definitions": {
+    "KingfisherItem": {
+      "type": "object",
+      "properties": {
+        "file_name": {
+          "type": "string",
+          "pattern": "^[^/]+$"
+        },
+        "url": {
+          "type": "string",
+          "format": "uri"
+        }
+      },
+      "required": [
+        "file_name",
+        "url"
+      ]
+    },
+    "KingfisherFileItem": {
+      "allOf": [
+        {
+          "$ref": "#/definitions/KingfisherItem"
+        }
+      ],
+      "type": "object",
+      "properties": {
+        "data_type": {
+          "type": "string",
+          "enum": [
+            "record",
+            "release",
+            "record_list",
+            "release_list",
+            "compiled_release",
+            "record_package",
+            "release_package",
+            "record_package_list",
+            "release_package_list",
+            "record_package_list_in_results",
+            "release_package_list_in_results",
+            "release_package_json_lines",
+            "record_package_json_lines",
+            "release_package_in_ocdsReleasePackage_in_list_in_results",
+            "release_in_Release"
+          ]
+        },
+        "encoding": {
+          "type": "string"
+        },
+        "data": {
+          "type": "string",
+          "minLength": 1
+        }
+      },
+      "required": [
+        "data",
+        "data_type"
+      ]
+    }
+  }
+}
diff --git a/kingfisher_scrapy/items.py b/kingfisher_scrapy/items.py
index a05d5c2b..7ce0e95e 100644
--- a/kingfisher_scrapy/items.py
+++ b/kingfisher_scrapy/items.py
@@ -1,23 +1,12 @@
 # https://docs.scrapy.org/en/latest/topics/items.html
-import scrapy

-from kingfisher_scrapy.exceptions import MissingRequiredFieldError
+import scrapy


 class KingfisherItem(scrapy.Item):
     file_name = scrapy.Field()
     url = scrapy.Field()
-
-    def validate(self):
-        """
-        Raises an error if any required field is missing.
-
-        :raises kingfisher_scrapy.extensions.MissingRequiredFieldError: if any required field is missing
-        """
-        if hasattr(self, 'required'):
-            for field in self.required:
-                if field not in self:
-                    raise MissingRequiredFieldError(field)
+    validate = True


 class File(KingfisherItem):
@@ -32,13 +21,6 @@ class File(KingfisherItem):
     path = scrapy.Field()
     files_store = scrapy.Field()

-    required = [
-        'file_name',
-        'url',
-        'data',
-        'data_type',
-    ]
-

 class FileItem(KingfisherItem):
     number = scrapy.Field()
@@ -46,20 +28,6 @@ class FileItem(KingfisherItem):
     data_type = scrapy.Field()
     encoding = scrapy.Field()

-    required = [
-        'number',
-        'file_name',
-        'url',
-        'data',
-        'data_type',
-    ]
-

 class FileError(KingfisherItem):
     errors = scrapy.Field()
-
-    required = [
-        'file_name',
-        'url',
-        'errors',
-    ]
diff --git a/kingfisher_scrapy/pipelines.py b/kingfisher_scrapy/pipelines.py
index 72255757..68172e69 100644
--- a/kingfisher_scrapy/pipelines.py
+++ b/kingfisher_scrapy/pipelines.py
@@ -1,18 +1,31 @@
 # https://docs.scrapy.org/en/latest/topics/item-pipeline.html
 # https://docs.scrapy.org/en/latest/topics/signals.html#item-signals
+
+import os
+import pathlib
+
+import jsonref as jsonref
+from jsonschema import FormatChecker
+from jsonschema.validators import Draft4Validator
+
 from kingfisher_scrapy.items import File, FileItem


 class Validate:
     def __init__(self):
+        self.validators = {}
         self.files = set()
         self.file_items = set()
+        schema_path = pathlib.Path(os.path.dirname(os.path.abspath(__file__)), 'item_schema')
+        for item in ('File', 'FileError', 'FileItem'):
+            filename = os.path.join(schema_path, f'{item}.json')
+            with open(filename) as f:
+                schema = jsonref.load(f, base_uri=schema_path.as_uri() + '/')
+            self.validators[item] = Draft4Validator(schema, format_checker=FormatChecker())

     def process_item(self, item, spider):
         if hasattr(item, 'validate'):
-            # We call this in the item pipeline to guarantee that all items are validated. However, its backtrace isn't
-            # as helpful for debugging, so we could also call it in ``BaseSpider`` if this becomes an issue.
-            item.validate()
+            self.validators.get(item.__class__.__name__).validate(dict(item))

         if isinstance(item, FileItem):
             key = (item['file_name'], item['number'])
diff --git a/requirements.in b/requirements.in
index cb4eca0d..41899567 100644
--- a/requirements.in
+++ b/requirements.in
@@ -3,8 +3,11 @@
 ijson>=3
 jsonpointer
+jsonref
+jsonschema
 rarfile
 requests
+rfc3987
 Scrapy
 scrapyd-client
 sentry-sdk
diff --git a/requirements.txt b/requirements.txt
index 0bea0b9d..65ebc155 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,7 +4,7 @@
 #
 #    pip-compile
 #
-attrs==19.3.0  # via automat, service-identity, twisted
+attrs==19.3.0  # via automat, jsonschema, service-identity, twisted
 automat==0.8.0  # via twisted
 certifi==2019.11.28  # via requests, sentry-sdk
 cffi==1.13.2  # via cryptography
@@ -15,8 +15,11 @@ cssselect==1.1.0  # via parsel, scrapy
 hyperlink==19.0.0  # via twisted
 idna==2.8  # via hyperlink, requests
 ijson==3.0.3
+importlib-metadata==1.6.1  # via jsonschema
 incremental==17.5.0  # via twisted
 jsonpointer==2.0
+jsonref==0.2
+jsonschema==3.2.0
 lxml==4.4.2  # via parsel, scrapy
 parsel==1.5.2  # via scrapy
 protego==0.1.16  # via scrapy
@@ -26,17 +29,20 @@ pycparser==2.19  # via cffi
 pydispatcher==2.0.5  # via scrapy
 pyhamcrest==1.9.0  # via twisted
 pyopenssl==19.1.0  # via scrapy
+pyrsistent==0.16.0  # via jsonschema
 queuelib==1.5.0  # via scrapy
 rarfile==3.1
 requests==2.22.0
+rfc3987==1.3.8
 scrapy==1.8.0
 scrapyd-client==1.1.0
 sentry-sdk==0.14.4
 service-identity==18.1.0  # via scrapy
-six==1.13.0  # via automat, cryptography, parsel, protego, pyhamcrest, pyopenssl, scrapy, scrapyd-client, w3lib
+six==1.13.0  # via automat, cryptography, jsonschema, parsel, protego, pyhamcrest, pyopenssl, pyrsistent, scrapy, scrapyd-client, w3lib
 twisted==20.3.0  # via scrapy
 urllib3==1.25.7  # via requests, sentry-sdk
 w3lib==1.21.0  # via parsel, scrapy
+zipp==3.1.0  # via importlib-metadata
 zope.interface==4.7.1  # via scrapy, twisted

 # The following packages are considered to be unsafe in a requirements file:
diff --git a/requirements_dev.txt b/requirements_dev.txt
index 9f58a35a..e010b745 100644
--- a/requirements_dev.txt
+++ b/requirements_dev.txt
@@ -21,13 +21,15 @@ flake8==3.7.9
 hyperlink==19.0.0
 idna==2.8
 ijson==3.0.3
-importlib-metadata==1.3.0  # via pluggy, pytest
+importlib-metadata==1.6.1
 incremental==17.5.0
 isort==4.3.21
 jsonpointer==2.0
+jsonref==0.2
+jsonschema==3.2.0
 lxml==4.4.2
 mccabe==0.6.1  # via flake8
-more-itertools==8.0.2  # via pytest, zipp
+more-itertools==8.0.2  # via pytest
 packaging==19.2  # via pytest
 parsel==1.5.2
 pip-tools==5.1.0
@@ -43,11 +45,13 @@ pyflakes==2.1.1  # via flake8
 pyhamcrest==1.9.0
 pyopenssl==19.1.0
 pyparsing==2.4.5  # via packaging
+pyrsistent==0.16.0
 pytest-cov==2.8.1
 pytest==5.3.2
 queuelib==1.5.0
 rarfile==3.1
 requests==2.22.0
+rfc3987==1.3.8
 scrapy==1.8.0
 scrapyd-client==1.1.0
 sentry-sdk==0.14.4
@@ -57,7 +61,7 @@ twisted==20.3.0
 urllib3==1.25.7
 w3lib==1.21.0
 wcwidth==0.1.7  # via pytest
-zipp==0.6.0  # via importlib-metadata
+zipp==3.1.0
 zope.interface==4.7.1

 # The following packages are considered to be unsafe in a requirements file:
diff --git a/tests/test_validate.py b/tests/test_validate.py
index a89fa4ba..9dcbca08 100644
--- a/tests/test_validate.py
+++ b/tests/test_validate.py
@@ -1,7 +1,7 @@
 import pytest
+from jsonschema import ValidationError

-from kingfisher_scrapy.exceptions import MissingRequiredFieldError
-from kingfisher_scrapy.items import File, FileItem
+from kingfisher_scrapy.items import File, FileError, FileItem
 from kingfisher_scrapy.pipelines import Validate
 from tests import spider_with_crawler

@@ -9,10 +9,10 @@
 def test_process_item():
     pipeline = Validate()
     item = File({
-        'file_name': '',
-        'data': '',
-        'data_type': '',
-        'url': '',
+        'file_name': 'test',
+        'data': 'data',
+        'data_type': 'release_package',
+        'url': 'http://test.com',
     })

     assert pipeline.process_item(item, None) == item
@@ -20,9 +20,69 @@

 def test_process_item_error():
     pipeline = Validate()
-    item = File()
+    item = File({
+        'data': 'data',
+        'data_type': 'release_package',
+        'url': 'http://test.com',
+    })
+
+    with pytest.raises(ValidationError):
+        pipeline.process_item(item, None)
+    item['file_name'] = 'test'
+    item['data_type'] = 'not a valid data type'
+    with pytest.raises(ValidationError):
+        pipeline.process_item(item, None)
+
+
+def test_process_file_item():
+    pipeline = Validate()
+    item = FileItem({
+        'file_name': 'test',
+        'data': 'data',
+        'data_type': 'release_package',
+        'url': 'http://test.com',
+        'number': 1
+    })
+    assert pipeline.process_item(item, None) == item
+
+
+def test_process_file_item_error():
+    pipeline = Validate()
+    item = FileItem({
+        'file_name': 'test',
+        'data': 'data',
+        'data_type': 'release_package',
+        'url': 'http://test.com',
+        'number': "2"
+    })
+    with pytest.raises(ValidationError):
+        pipeline.process_item(item, None)
+    item['number'] = None
+    with pytest.raises(ValidationError):
+        pipeline.process_item(item, None)
+
+
+def test_process_file_error():
+    pipeline = Validate()
+    item = FileError({
+        'file_name': 'test',
+        'url': 'http://test.com',
+        'errors': 'Error'
+    })
+    assert pipeline.process_item(item, None) == item
+

-    with pytest.raises(MissingRequiredFieldError):
+def test_process_file_item_error_error():
+    pipeline = Validate()
+    item = FileError({
+        'file_name': 'test',
+        'url': 'http://test.com'
+    })
+    with pytest.raises(ValidationError):
+        pipeline.process_item(item, None)
+    item['errors'] = 'Error'
+    item['url'] = 'not an url'
+    with pytest.raises(ValidationError):
         pipeline.process_item(item, None)


@@ -31,9 +91,9 @@
     spider = spider_with_crawler()
     item = File({
         'file_name': 'test1',
-        'data': '',
-        'data_type': '',
-        'url': '',
+        'data': 'data',
+        'data_type': 'release_package',
+        'url': 'http://example.com',
     })

     pipeline.process_item(item, spider)
@@ -51,9 +111,9 @@ def test_duplicate_file_item(caplog):
     spider = spider_with_crawler()
     item = FileItem({
         'file_name': 'test1',
-        'data': '',
-        'data_type': '',
-        'url': '',
+        'data': 'data',
+        'data_type': 'release_package',
+        'url': 'http://example.com',
         'number': 1
     })
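
Note (not part of the diff): a minimal standalone sketch of how the schema wiring above behaves, mirroring what `Validate.__init__` and `Validate.process_item` now do in `pipelines.py`. The sample item values, and the assumption that the script runs from the repository root next to `kingfisher_scrapy/item_schema/`, are illustrative only.

```python
import pathlib

import jsonref
from jsonschema import FormatChecker, ValidationError
from jsonschema.validators import Draft4Validator

# Assumption: run from the repository root so the schema directory resolves.
schema_dir = pathlib.Path('kingfisher_scrapy', 'item_schema').resolve()

# jsonref resolves "item.json#/definitions/KingfisherFileItem" against base_uri,
# so File.json is loaded with the shared definitions from item.json inlined.
with open(schema_dir / 'File.json') as f:
    schema = jsonref.load(f, base_uri=schema_dir.as_uri() + '/')

# FormatChecker enables the "format": "uri" check (which is why rfc3987 is
# added to requirements.in above).
validator = Draft4Validator(schema, format_checker=FormatChecker())

# A well-formed File item passes validation.
validator.validate({
    'file_name': 'example.json',
    'url': 'http://example.com/example.json',
    'data': '{"releases": []}',
    'data_type': 'release_package',
})

# A malformed item raises jsonschema.ValidationError, as the new tests expect.
try:
    validator.validate({
        'file_name': 'bad/name',               # violates the ^[^/]+$ pattern
        'url': 'not a url',                    # fails the uri format check
        'data': '',                            # violates minLength: 1
        'data_type': 'not a valid data type',  # not in the enum
    })
except ValidationError as e:
    print(e.message)
```

The pipeline keys one compiled `Draft4Validator` per item class (`File`, `FileError`, `FileItem`), so each class gets its own schema file while sharing the common `KingfisherItem`/`KingfisherFileItem` definitions in `item.json`.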