From 48fbf86f6692d36b2d9d658bb54bbce1d3e1012e Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Thu, 18 Jun 2020 15:01:58 -0400 Subject: [PATCH 01/19] Update validation pipeline to use a json schema Signed-off-by: Yohanna Lisnichuk --- kingfisher_scrapy/exceptions.py | 4 - kingfisher_scrapy/item_schema.json | 137 +++++++++++++++++++++++++++++ kingfisher_scrapy/items.py | 16 +--- kingfisher_scrapy/pipelines.py | 19 +++- 4 files changed, 157 insertions(+), 19 deletions(-) create mode 100644 kingfisher_scrapy/item_schema.json diff --git a/kingfisher_scrapy/exceptions.py b/kingfisher_scrapy/exceptions.py index da83a3a5..f69ab051 100644 --- a/kingfisher_scrapy/exceptions.py +++ b/kingfisher_scrapy/exceptions.py @@ -8,7 +8,3 @@ class AuthenticationError(KingfisherScrapyError): class SpiderArgumentError(KingfisherScrapyError): """Raised when a spider argument's value is invalid""" - - -class MissingRequiredFieldError(KingfisherScrapyError, KeyError): - """Raised when an item is missing a required field""" diff --git a/kingfisher_scrapy/item_schema.json b/kingfisher_scrapy/item_schema.json new file mode 100644 index 00000000..f0eb76e2 --- /dev/null +++ b/kingfisher_scrapy/item_schema.json @@ -0,0 +1,137 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "title": "Schema Kingfisher Collect File, FileItem and FileError", + "description": "", + "oneOf": [ + { + "$ref": "#/definitions/File" + }, + { + "$ref": "#/definitions/FileItem" + }, + { + "$ref": "#/definitions/FileError" + } + ], + "definitions": + { + "File": { + "title": "File", + "type": "object", + "properties": { + "file_name": { + "type": "string", + "pattern": "^[^/]*$", + "minLength": 1 + }, + "url": { + "type": "string", + "format": "uri", + "minLength": 1 + }, + "data": { + "type": "string", + "minLength": 1 + }, + "data_type": { + "type": "string", + "enum": [ + "release_package", + "record_package" + ], + "minLength": 1 + }, + "encoding": { + "type": "string" + }, + "post_to_api": { + "type": "boolean" + }, + "path": { + "type": "string", + "title": "For the KingfisherProcessAPI extension to read the file." + }, + "files_store": { + "type": "string" + } + }, + "required": [ + "file_name", + "url", + "data", + "data_type" + ] + }, + "FileItem": { + "title": "File Item", + "type": "object", + "properties": { + "file_name": { + "type": "string", + "pattern": "^[^/]*$", + "minLength": 1 + }, + "url": { + "type": "string", + "format": "uri", + "minLength": 1 + + }, + "data": { + "type": "string", + "minLength": 1 + + }, + "data_type": { + "type": "string", + "enum": [ + "release_package", + "record_package" + ], + "minLength": 1 + }, + "encoding": { + "type": "string" + }, + "number": { + "type": "integer", + "minimum": 1 + } + }, + "required": [ + "number", + "file_name", + "url", + "data", + "data_type" + ] + }, + "FileError": { + "title": "File Error", + "type": "object", + "properties": { + "file_name": { + "type": "string", + "pattern": "^[^/]*$", + "minLength": 1 + }, + "url": { + "type": "string", + "format": "uri", + "minLength": 1 + + }, + "errors": { + "type": "string", + "minLength": 1 + + } + }, + "required": [ + "file_name", + "url", + "errors" + ] + } + } + } diff --git a/kingfisher_scrapy/items.py b/kingfisher_scrapy/items.py index a05d5c2b..73da7550 100644 --- a/kingfisher_scrapy/items.py +++ b/kingfisher_scrapy/items.py @@ -1,23 +1,13 @@ # https://docs.scrapy.org/en/latest/topics/items.html -import scrapy -from kingfisher_scrapy.exceptions import MissingRequiredFieldError +import scrapy class KingfisherItem(scrapy.Item): file_name = scrapy.Field() url = scrapy.Field() - - def validate(self): - """ - Raises an error if any required field is missing. - - :raises kingfisher_scrapy.extensions.MissingRequiredFieldError: if any required field is missing - """ - if hasattr(self, 'required'): - for field in self.required: - if field not in self: - raise MissingRequiredFieldError(field) + # indicate that this item should be validated against a schema + validate = True class File(KingfisherItem): diff --git a/kingfisher_scrapy/pipelines.py b/kingfisher_scrapy/pipelines.py index 2fc03415..d6338d9f 100644 --- a/kingfisher_scrapy/pipelines.py +++ b/kingfisher_scrapy/pipelines.py @@ -1,12 +1,27 @@ # https://docs.scrapy.org/en/latest/topics/item-pipeline.html # https://docs.scrapy.org/en/latest/topics/signals.html#item-signals +import json +import os + +from jsonschema import FormatChecker +from jsonschema.validators import Draft4Validator class Validate: + def __init__(self): + here = os.path.dirname(os.path.abspath(__file__)) + filename = os.path.join(here, 'item_schema.json') + with open(filename) as f: + schema = json.load(f) + + self.validator = Draft4Validator(schema, format_checker=FormatChecker()) + def process_item(self, item, spider): if hasattr(item, 'validate'): # We call this in the item pipeline to guarantee that all items are validated. However, its backtrace isn't # as helpful for debugging, so we could also call it in ``BaseSpider`` if this becomes an issue. - item.validate() - + item_str = json.dumps(item.__dict__) + json_item = json.loads(item_str)['_values'] + print(json_item) + self.validator.validate(json_item) return item From 2b2be515c1f1e0c7f4639e2584edc8d91d4e47c3 Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Thu, 18 Jun 2020 15:36:59 -0400 Subject: [PATCH 02/19] Update test and json schema to include all data types Signed-off-by: Yohanna Lisnichuk --- kingfisher_scrapy/item_schema.json | 137 ---------------- kingfisher_scrapy/item_schema/dataType.csv | 16 ++ .../item_schema/item_schema.json | 147 ++++++++++++++++++ kingfisher_scrapy/pipelines.py | 2 +- tests/test_validate.py | 12 +- 5 files changed, 170 insertions(+), 144 deletions(-) delete mode 100644 kingfisher_scrapy/item_schema.json create mode 100644 kingfisher_scrapy/item_schema/dataType.csv create mode 100644 kingfisher_scrapy/item_schema/item_schema.json diff --git a/kingfisher_scrapy/item_schema.json b/kingfisher_scrapy/item_schema.json deleted file mode 100644 index f0eb76e2..00000000 --- a/kingfisher_scrapy/item_schema.json +++ /dev/null @@ -1,137 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-04/schema#", - "title": "Schema Kingfisher Collect File, FileItem and FileError", - "description": "", - "oneOf": [ - { - "$ref": "#/definitions/File" - }, - { - "$ref": "#/definitions/FileItem" - }, - { - "$ref": "#/definitions/FileError" - } - ], - "definitions": - { - "File": { - "title": "File", - "type": "object", - "properties": { - "file_name": { - "type": "string", - "pattern": "^[^/]*$", - "minLength": 1 - }, - "url": { - "type": "string", - "format": "uri", - "minLength": 1 - }, - "data": { - "type": "string", - "minLength": 1 - }, - "data_type": { - "type": "string", - "enum": [ - "release_package", - "record_package" - ], - "minLength": 1 - }, - "encoding": { - "type": "string" - }, - "post_to_api": { - "type": "boolean" - }, - "path": { - "type": "string", - "title": "For the KingfisherProcessAPI extension to read the file." - }, - "files_store": { - "type": "string" - } - }, - "required": [ - "file_name", - "url", - "data", - "data_type" - ] - }, - "FileItem": { - "title": "File Item", - "type": "object", - "properties": { - "file_name": { - "type": "string", - "pattern": "^[^/]*$", - "minLength": 1 - }, - "url": { - "type": "string", - "format": "uri", - "minLength": 1 - - }, - "data": { - "type": "string", - "minLength": 1 - - }, - "data_type": { - "type": "string", - "enum": [ - "release_package", - "record_package" - ], - "minLength": 1 - }, - "encoding": { - "type": "string" - }, - "number": { - "type": "integer", - "minimum": 1 - } - }, - "required": [ - "number", - "file_name", - "url", - "data", - "data_type" - ] - }, - "FileError": { - "title": "File Error", - "type": "object", - "properties": { - "file_name": { - "type": "string", - "pattern": "^[^/]*$", - "minLength": 1 - }, - "url": { - "type": "string", - "format": "uri", - "minLength": 1 - - }, - "errors": { - "type": "string", - "minLength": 1 - - } - }, - "required": [ - "file_name", - "url", - "errors" - ] - } - } - } diff --git a/kingfisher_scrapy/item_schema/dataType.csv b/kingfisher_scrapy/item_schema/dataType.csv new file mode 100644 index 00000000..0e276db0 --- /dev/null +++ b/kingfisher_scrapy/item_schema/dataType.csv @@ -0,0 +1,16 @@ +Code,Title,Description +record,Record, +release,Release, +record_list,Record List, +release_list,Release List, +compiled_release,Compiled Release, +record_package,Record Package, +release_package,Release Package, +record_package_list,Record Package List +release_package_list,Release Package List, +record_package_list_in_results,Record Package List in Results, +release_package_list_in_results,Release Package List in Results, +release_package_json_lines,Release Package Json Lines, +record_package_json_lines,Record Package Json Lines, +release_package_in_ocdsReleasePackage_in_list_in_results,Release Package in ocdsReleasePackage in List in Results, +release_in_Release, Release in Release, diff --git a/kingfisher_scrapy/item_schema/item_schema.json b/kingfisher_scrapy/item_schema/item_schema.json new file mode 100644 index 00000000..b8285ccb --- /dev/null +++ b/kingfisher_scrapy/item_schema/item_schema.json @@ -0,0 +1,147 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "title": "Schema Kingfisher Collect File, FileItem and FileError", + "description": "", + "oneOf": [ + { + "$ref": "#/definitions/File" + }, + { + "$ref": "#/definitions/FileItem" + }, + { + "$ref": "#/definitions/FileError" + } + ], + "definitions": { + "File": { + "title": "File", + "type": "object", + "properties": { + "file_name": { + "type": "string", + "pattern": "^[^/]*$", + "minLength": 1 + }, + "url": { + "type": "string", + "format": "uri", + "minLength": 1 + }, + "data": { + "type": "string", + "minLength": 1 + }, + "data_type": { + "type": "string", + "enum": [ + "record", + "release", + "record_list", + "release_list", + "compiled_release", + "record_package", + "release_package", + "record_package_list", + "release_package_list", + "record_package_list_in_results", + "release_package_list_in_results", + "release_package_json_lines", + "record_package_json_lines", + "release_package_in_ocdsReleasePackage_in_list_in_results", + "release_in_Release" + ], + "openCodelist": false, + "codelist": "dataType.csv", + "minLength": 1 + }, + "encoding": { + "type": "string" + }, + "post_to_api": { + "type": "boolean" + }, + "path": { + "type": "string", + "title": "For the KingfisherProcessAPI extension to read the file." + }, + "files_store": { + "type": "string" + } + }, + "required": [ + "file_name", + "url", + "data", + "data_type" + ] + }, + "FileItem": { + "title": "File Item", + "type": "object", + "properties": { + "file_name": { + "type": "string", + "pattern": "^[^/]*$", + "minLength": 1 + }, + "url": { + "type": "string", + "format": "uri", + "minLength": 1 + }, + "data": { + "type": "string", + "minLength": 1 + }, + "data_type": { + "type": "string", + "enum": [ + "release_package", + "record_package" + ], + "minLength": 1 + }, + "encoding": { + "type": "string" + }, + "number": { + "type": "integer", + "minimum": 1 + } + }, + "required": [ + "number", + "file_name", + "url", + "data", + "data_type" + ] + }, + "FileError": { + "title": "File Error", + "type": "object", + "properties": { + "file_name": { + "type": "string", + "pattern": "^[^/]*$", + "minLength": 1 + }, + "url": { + "type": "string", + "format": "uri", + "minLength": 1 + }, + "errors": { + "type": "string", + "minLength": 1 + } + }, + "required": [ + "file_name", + "url", + "errors" + ] + } + } +} diff --git a/kingfisher_scrapy/pipelines.py b/kingfisher_scrapy/pipelines.py index d6338d9f..e737481e 100644 --- a/kingfisher_scrapy/pipelines.py +++ b/kingfisher_scrapy/pipelines.py @@ -10,7 +10,7 @@ class Validate: def __init__(self): here = os.path.dirname(os.path.abspath(__file__)) - filename = os.path.join(here, 'item_schema.json') + filename = os.path.join(here, 'item_schema', 'item_schema.json') with open(filename) as f: schema = json.load(f) diff --git a/tests/test_validate.py b/tests/test_validate.py index 9ebac699..da800deb 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -1,6 +1,6 @@ import pytest +from jsonschema import ValidationError -from kingfisher_scrapy.exceptions import MissingRequiredFieldError from kingfisher_scrapy.items import File from kingfisher_scrapy.pipelines import Validate @@ -8,10 +8,10 @@ def test_process_item(): pipeline = Validate() item = File({ - 'file_name': '', - 'data': '', - 'data_type': '', - 'url': '', + 'file_name': 'test', + 'data': 'data', + 'data_type': 'release_package', + 'url': 'http://test.com', }) assert pipeline.process_item(item, None) == item @@ -21,5 +21,5 @@ def test_process_item_error(): pipeline = Validate() item = File() - with pytest.raises(MissingRequiredFieldError): + with pytest.raises(ValidationError): pipeline.process_item(item, None) From eb3482b5e5bb63e21daea206d2e02f2af7823a25 Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Thu, 18 Jun 2020 15:37:40 -0400 Subject: [PATCH 03/19] Add jsonschema to requirements Signed-off-by: Yohanna Lisnichuk --- requirements.in | 3 ++- requirements.txt | 8 ++++++-- requirements_dev.txt | 8 +++++--- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/requirements.in b/requirements.in index f791df77..0300c24d 100644 --- a/requirements.in +++ b/requirements.in @@ -7,4 +7,5 @@ requests Scrapy scrapyd-client ijson>=3 -sentry-sdk \ No newline at end of file +sentry-sdk +jsonschema \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 0bea0b9d..39e3a499 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ # # pip-compile # -attrs==19.3.0 # via automat, service-identity, twisted +attrs==19.3.0 # via automat, jsonschema, service-identity, twisted automat==0.8.0 # via twisted certifi==2019.11.28 # via requests, sentry-sdk cffi==1.13.2 # via cryptography @@ -15,8 +15,10 @@ cssselect==1.1.0 # via parsel, scrapy hyperlink==19.0.0 # via twisted idna==2.8 # via hyperlink, requests ijson==3.0.3 +importlib-metadata==1.6.1 # via jsonschema incremental==17.5.0 # via twisted jsonpointer==2.0 +jsonschema==3.2.0 lxml==4.4.2 # via parsel, scrapy parsel==1.5.2 # via scrapy protego==0.1.16 # via scrapy @@ -26,6 +28,7 @@ pycparser==2.19 # via cffi pydispatcher==2.0.5 # via scrapy pyhamcrest==1.9.0 # via twisted pyopenssl==19.1.0 # via scrapy +pyrsistent==0.16.0 # via jsonschema queuelib==1.5.0 # via scrapy rarfile==3.1 requests==2.22.0 @@ -33,10 +36,11 @@ scrapy==1.8.0 scrapyd-client==1.1.0 sentry-sdk==0.14.4 service-identity==18.1.0 # via scrapy -six==1.13.0 # via automat, cryptography, parsel, protego, pyhamcrest, pyopenssl, scrapy, scrapyd-client, w3lib +six==1.13.0 # via automat, cryptography, jsonschema, parsel, protego, pyhamcrest, pyopenssl, pyrsistent, scrapy, scrapyd-client, w3lib twisted==20.3.0 # via scrapy urllib3==1.25.7 # via requests, sentry-sdk w3lib==1.21.0 # via parsel, scrapy +zipp==3.1.0 # via importlib-metadata zope.interface==4.7.1 # via scrapy, twisted # The following packages are considered to be unsafe in a requirements file: diff --git a/requirements_dev.txt b/requirements_dev.txt index 9f58a35a..06b5256d 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -21,13 +21,14 @@ flake8==3.7.9 hyperlink==19.0.0 idna==2.8 ijson==3.0.3 -importlib-metadata==1.3.0 # via pluggy, pytest +importlib-metadata==1.6.1 incremental==17.5.0 isort==4.3.21 jsonpointer==2.0 +jsonschema==3.2.0 lxml==4.4.2 mccabe==0.6.1 # via flake8 -more-itertools==8.0.2 # via pytest, zipp +more-itertools==8.0.2 # via pytest packaging==19.2 # via pytest parsel==1.5.2 pip-tools==5.1.0 @@ -43,6 +44,7 @@ pyflakes==2.1.1 # via flake8 pyhamcrest==1.9.0 pyopenssl==19.1.0 pyparsing==2.4.5 # via packaging +pyrsistent==0.16.0 pytest-cov==2.8.1 pytest==5.3.2 queuelib==1.5.0 @@ -57,7 +59,7 @@ twisted==20.3.0 urllib3==1.25.7 w3lib==1.21.0 wcwidth==0.1.7 # via pytest -zipp==0.6.0 # via importlib-metadata +zipp==3.1.0 zope.interface==4.7.1 # The following packages are considered to be unsafe in a requirements file: From 69f951066693a83091ea795143fbbaa2b3c8bf44 Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Thu, 18 Jun 2020 15:47:11 -0400 Subject: [PATCH 04/19] Update schema to re use definitions Signed-off-by: Yohanna Lisnichuk --- .../item_schema/item_schema.json | 47 ++++++++++--------- kingfisher_scrapy/pipelines.py | 1 - 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/kingfisher_scrapy/item_schema/item_schema.json b/kingfisher_scrapy/item_schema/item_schema.json index b8285ccb..441b934d 100644 --- a/kingfisher_scrapy/item_schema/item_schema.json +++ b/kingfisher_scrapy/item_schema/item_schema.json @@ -14,8 +14,8 @@ } ], "definitions": { - "File": { - "title": "File", + "KingfisherItem": { + "title": "Kingfisher Item", "type": "object", "properties": { "file_name": { @@ -27,7 +27,18 @@ "type": "string", "format": "uri", "minLength": 1 - }, + } + } + }, + "File": { + "allOf": [ + { + "$ref": "#/definitions/KingfisherItem" + } + ], + "title": "File", + "type": "object", + "properties": { "data": { "type": "string", "minLength": 1 @@ -77,19 +88,14 @@ ] }, "FileItem": { + "allOf": [ + { + "$ref": "#/definitions/KingfisherItem" + } + ], "title": "File Item", "type": "object", "properties": { - "file_name": { - "type": "string", - "pattern": "^[^/]*$", - "minLength": 1 - }, - "url": { - "type": "string", - "format": "uri", - "minLength": 1 - }, "data": { "type": "string", "minLength": 1 @@ -119,19 +125,14 @@ ] }, "FileError": { + "allOf": [ + { + "$ref": "#/definitions/KingfisherItem" + } + ], "title": "File Error", "type": "object", "properties": { - "file_name": { - "type": "string", - "pattern": "^[^/]*$", - "minLength": 1 - }, - "url": { - "type": "string", - "format": "uri", - "minLength": 1 - }, "errors": { "type": "string", "minLength": 1 diff --git a/kingfisher_scrapy/pipelines.py b/kingfisher_scrapy/pipelines.py index e737481e..4fe2f587 100644 --- a/kingfisher_scrapy/pipelines.py +++ b/kingfisher_scrapy/pipelines.py @@ -22,6 +22,5 @@ def process_item(self, item, spider): # as helpful for debugging, so we could also call it in ``BaseSpider`` if this becomes an issue. item_str = json.dumps(item.__dict__) json_item = json.loads(item_str)['_values'] - print(json_item) self.validator.validate(json_item) return item From 87f832584e71f9ca1b77c82548a168852eda4859 Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Thu, 18 Jun 2020 16:10:57 -0400 Subject: [PATCH 05/19] Add descriptions to schema codelist and correct format Signed-off-by: Yohanna Lisnichuk --- kingfisher_scrapy/item_schema/dataType.csv | 30 +++++++++++----------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/kingfisher_scrapy/item_schema/dataType.csv b/kingfisher_scrapy/item_schema/dataType.csv index 0e276db0..7d8eac30 100644 --- a/kingfisher_scrapy/item_schema/dataType.csv +++ b/kingfisher_scrapy/item_schema/dataType.csv @@ -1,16 +1,16 @@ Code,Title,Description -record,Record, -release,Release, -record_list,Record List, -release_list,Release List, -compiled_release,Compiled Release, -record_package,Record Package, -release_package,Release Package, -record_package_list,Record Package List -release_package_list,Release Package List, -record_package_list_in_results,Record Package List in Results, -release_package_list_in_results,Release Package List in Results, -release_package_json_lines,Release Package Json Lines, -record_package_json_lines,Record Package Json Lines, -release_package_in_ocdsReleasePackage_in_list_in_results,Release Package in ocdsReleasePackage in List in Results, -release_in_Release, Release in Release, +record,Record,A record object +release,Release,A release object +record_list,Record List,An array of records +release_list,Release List,An array of releases +compiled_release,Compiled Release,A compiled release +record_package,Record Package,A record package +release_package,Release Package,A release package +record_package_list,Record Package List,An array of record packages +release_package_list,Release Package List,An array of release packages +record_package_list_in_results,Record Package List in Results,An array of record packages inside a results field +release_package_list_in_results,Release Package List in Results,An array of release packages inside a results field +release_package_json_lines,Release Package Json Lines,A json lines file with release packages +record_package_json_lines,Record Package Json Lines,A json lines file with record packages +release_package_in_ocdsReleasePackage_in_list_in_results,Release Package in ocdsReleasePackage in List in Results,A release package inside a ocdsReleasePackage object inside a results array +release_in_Release,Release in Release,A release in a Release object From f87d568f638e41741d5261d2c670e0ee4c5bd172 Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Thu, 18 Jun 2020 16:44:50 -0400 Subject: [PATCH 06/19] Update schema to refactor definitions and add titles and descriptions Signed-off-by: Yohanna Lisnichuk --- .../item_schema/item_schema.json | 85 ++++++++++++------- 1 file changed, 55 insertions(+), 30 deletions(-) diff --git a/kingfisher_scrapy/item_schema/item_schema.json b/kingfisher_scrapy/item_schema/item_schema.json index 441b934d..5b003a0e 100644 --- a/kingfisher_scrapy/item_schema/item_schema.json +++ b/kingfisher_scrapy/item_schema/item_schema.json @@ -19,31 +19,33 @@ "type": "object", "properties": { "file_name": { + "title": "File Name", + "description": "File Name", "type": "string", "pattern": "^[^/]*$", "minLength": 1 }, "url": { + "title": "URL", + "description": "URL", "type": "string", "format": "uri", "minLength": 1 } } }, - "File": { + "KingfisherFileItem": { + "title": "Kingfisher Item", + "type": "object", "allOf": [ { "$ref": "#/definitions/KingfisherItem" } ], - "title": "File", - "type": "object", "properties": { - "data": { - "type": "string", - "minLength": 1 - }, "data_type": { + "title": "Data Type", + "description": "Data Type", "type": "string", "enum": [ "record", @@ -67,17 +69,53 @@ "minLength": 1 }, "encoding": { - "type": "string" + "title": "Encoding", + "description": "Encoding", + "type": [ + "string", + "null" + ] }, + "data": { + "title": "Data", + "description": "Data", + "type": "string", + "minLength": 1 + } + } + }, + "File": { + "allOf": [ + { + "$ref": "#/definitions/KingfisherFileItem" + } + ], + "title": "File", + "type": "object", + "properties": { "post_to_api": { - "type": "boolean" + "title": "Post to Api?", + "description": "Post to Api?", + "type": [ + "boolean", + "null" + ] }, "path": { - "type": "string", - "title": "For the KingfisherProcessAPI extension to read the file." + "description": "For the KingfisherProcessAPI extension to read the file.", + "type": [ + "string", + "null" + ], + "title": "Path" }, "files_store": { - "type": "string" + "title": "Files Store", + "description": "Files Store", + "type": [ + "string", + "null" + ] } }, "required": [ @@ -90,27 +128,12 @@ "FileItem": { "allOf": [ { - "$ref": "#/definitions/KingfisherItem" + "$ref": "#/definitions/KingfisherFileItem" } ], "title": "File Item", "type": "object", "properties": { - "data": { - "type": "string", - "minLength": 1 - }, - "data_type": { - "type": "string", - "enum": [ - "release_package", - "record_package" - ], - "minLength": 1 - }, - "encoding": { - "type": "string" - }, "number": { "type": "integer", "minimum": 1 @@ -125,15 +148,17 @@ ] }, "FileError": { + "title": "File Error", + "type": "object", "allOf": [ { "$ref": "#/definitions/KingfisherItem" } ], - "title": "File Error", - "type": "object", "properties": { "errors": { + "title": "Errors", + "description": "Errors", "type": "string", "minLength": 1 } From 28a0613dd0b0c02a074a535afabc2c70b85f87f5 Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Thu, 18 Jun 2020 16:59:46 -0400 Subject: [PATCH 07/19] Update schema descriptions Signed-off-by: Yohanna Lisnichuk --- .../item_schema/item_schema.json | 31 +++++++++---------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/kingfisher_scrapy/item_schema/item_schema.json b/kingfisher_scrapy/item_schema/item_schema.json index 5b003a0e..d2536994 100644 --- a/kingfisher_scrapy/item_schema/item_schema.json +++ b/kingfisher_scrapy/item_schema/item_schema.json @@ -1,7 +1,7 @@ { "$schema": "http://json-schema.org/draft-04/schema#", - "title": "Schema Kingfisher Collect File, FileItem and FileError", - "description": "", + "title": "Schema for Kingfisher Collect File, FileItem and FileError", + "description": "Schema for Kingfisher Collect File, FileItem and FileError", "oneOf": [ { "$ref": "#/definitions/File" @@ -16,6 +16,7 @@ "definitions": { "KingfisherItem": { "title": "Kingfisher Item", + "description": "A generic item with file_name and url to be extended by other items", "type": "object", "properties": { "file_name": { @@ -32,10 +33,15 @@ "format": "uri", "minLength": 1 } - } + }, + "required": [ + "file_name", + "url" + ] }, "KingfisherFileItem": { "title": "Kingfisher Item", + "description": "A base object to be extended by other File type items", "type": "object", "allOf": [ { @@ -91,6 +97,7 @@ } ], "title": "File", + "description": "A file object to be send to an API and/or saved to the disk", "type": "object", "properties": { "post_to_api": { @@ -117,13 +124,7 @@ "null" ] } - }, - "required": [ - "file_name", - "url", - "data", - "data_type" - ] + } }, "FileItem": { "allOf": [ @@ -132,6 +133,7 @@ } ], "title": "File Item", + "description": "A file item to be send to an API and not saved to the disk", "type": "object", "properties": { "number": { @@ -140,15 +142,12 @@ } }, "required": [ - "number", - "file_name", - "url", - "data", - "data_type" + "number" ] }, "FileError": { "title": "File Error", + "description": "An item to report and error", "type": "object", "allOf": [ { @@ -164,8 +163,6 @@ } }, "required": [ - "file_name", - "url", "errors" ] } From 6c45dfa9e8e86e4354191dc0f3e109817190bd3c Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Mon, 22 Jun 2020 11:46:05 -0400 Subject: [PATCH 08/19] Update item schema with required fields Signed-off-by: Yohanna Lisnichuk --- kingfisher_scrapy/item_schema/item_schema.json | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kingfisher_scrapy/item_schema/item_schema.json b/kingfisher_scrapy/item_schema/item_schema.json index d2536994..050a6d0d 100644 --- a/kingfisher_scrapy/item_schema/item_schema.json +++ b/kingfisher_scrapy/item_schema/item_schema.json @@ -88,7 +88,11 @@ "type": "string", "minLength": 1 } - } + }, + "required": [ + "data", + "data_type" + ] }, "File": { "allOf": [ From b195529b20a557b3a9ff78f8188a198259cacafc Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Mon, 22 Jun 2020 11:51:27 -0400 Subject: [PATCH 09/19] Update item schema to include number field title and description Signed-off-by: Yohanna Lisnichuk --- kingfisher_scrapy/item_schema/item_schema.json | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kingfisher_scrapy/item_schema/item_schema.json b/kingfisher_scrapy/item_schema/item_schema.json index 050a6d0d..26f61cb8 100644 --- a/kingfisher_scrapy/item_schema/item_schema.json +++ b/kingfisher_scrapy/item_schema/item_schema.json @@ -141,6 +141,8 @@ "type": "object", "properties": { "number": { + "title": "Item number", + "description": "Item number", "type": "integer", "minimum": 1 } From d95c426f2d5507c59068e6f49bfe94b1377a9986 Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Tue, 23 Jun 2020 09:09:19 -0400 Subject: [PATCH 10/19] Add json schema validator requirements, rename schema Signed-off-by: Yohanna Lisnichuk --- kingfisher_scrapy/item_schema/dataType.csv | 16 --------- .../{item_schema.json => item.json} | 36 ++++--------------- requirements.in | 6 ++-- requirements.txt | 2 ++ requirements_dev.txt | 2 ++ 5 files changed, 15 insertions(+), 47 deletions(-) delete mode 100644 kingfisher_scrapy/item_schema/dataType.csv rename kingfisher_scrapy/item_schema/{item_schema.json => item.json} (83%) diff --git a/kingfisher_scrapy/item_schema/dataType.csv b/kingfisher_scrapy/item_schema/dataType.csv deleted file mode 100644 index 7d8eac30..00000000 --- a/kingfisher_scrapy/item_schema/dataType.csv +++ /dev/null @@ -1,16 +0,0 @@ -Code,Title,Description -record,Record,A record object -release,Release,A release object -record_list,Record List,An array of records -release_list,Release List,An array of releases -compiled_release,Compiled Release,A compiled release -record_package,Record Package,A record package -release_package,Release Package,A release package -record_package_list,Record Package List,An array of record packages -release_package_list,Release Package List,An array of release packages -record_package_list_in_results,Record Package List in Results,An array of record packages inside a results field -release_package_list_in_results,Release Package List in Results,An array of release packages inside a results field -release_package_json_lines,Release Package Json Lines,A json lines file with release packages -record_package_json_lines,Record Package Json Lines,A json lines file with record packages -release_package_in_ocdsReleasePackage_in_list_in_results,Release Package in ocdsReleasePackage in List in Results,A release package inside a ocdsReleasePackage object inside a results array -release_in_Release,Release in Release,A release in a Release object diff --git a/kingfisher_scrapy/item_schema/item_schema.json b/kingfisher_scrapy/item_schema/item.json similarity index 83% rename from kingfisher_scrapy/item_schema/item_schema.json rename to kingfisher_scrapy/item_schema/item.json index 26f61cb8..03dfcfc9 100644 --- a/kingfisher_scrapy/item_schema/item_schema.json +++ b/kingfisher_scrapy/item_schema/item.json @@ -1,18 +1,5 @@ { "$schema": "http://json-schema.org/draft-04/schema#", - "title": "Schema for Kingfisher Collect File, FileItem and FileError", - "description": "Schema for Kingfisher Collect File, FileItem and FileError", - "oneOf": [ - { - "$ref": "#/definitions/File" - }, - { - "$ref": "#/definitions/FileItem" - }, - { - "$ref": "#/definitions/FileError" - } - ], "definitions": { "KingfisherItem": { "title": "Kingfisher Item", @@ -23,15 +10,13 @@ "title": "File Name", "description": "File Name", "type": "string", - "pattern": "^[^/]*$", - "minLength": 1 + "pattern": "^[^/]+$" }, "url": { "title": "URL", "description": "URL", "type": "string", - "format": "uri", - "minLength": 1 + "format": "uri" } }, "required": [ @@ -69,17 +54,13 @@ "record_package_json_lines", "release_package_in_ocdsReleasePackage_in_list_in_results", "release_in_Release" - ], - "openCodelist": false, - "codelist": "dataType.csv", - "minLength": 1 + ] }, "encoding": { "title": "Encoding", "description": "Encoding", "type": [ - "string", - "null" + "string" ] }, "data": { @@ -108,15 +89,13 @@ "title": "Post to Api?", "description": "Post to Api?", "type": [ - "boolean", - "null" + "boolean" ] }, "path": { "description": "For the KingfisherProcessAPI extension to read the file.", "type": [ - "string", - "null" + "string" ], "title": "Path" }, @@ -124,8 +103,7 @@ "title": "Files Store", "description": "Files Store", "type": [ - "string", - "null" + "string" ] } } diff --git a/requirements.in b/requirements.in index 0300c24d..5a1de268 100644 --- a/requirements.in +++ b/requirements.in @@ -2,10 +2,12 @@ # https://github.com/open-contracting/deploy/blob/master/salt/ocdskingfishercollect/scrapyd-requirements.txt jsonpointer +jsonref +jsonschema rarfile requests +rfc3987 Scrapy scrapyd-client ijson>=3 -sentry-sdk -jsonschema \ No newline at end of file +sentry-sdk \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 39e3a499..65ebc155 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,6 +18,7 @@ ijson==3.0.3 importlib-metadata==1.6.1 # via jsonschema incremental==17.5.0 # via twisted jsonpointer==2.0 +jsonref==0.2 jsonschema==3.2.0 lxml==4.4.2 # via parsel, scrapy parsel==1.5.2 # via scrapy @@ -32,6 +33,7 @@ pyrsistent==0.16.0 # via jsonschema queuelib==1.5.0 # via scrapy rarfile==3.1 requests==2.22.0 +rfc3987==1.3.8 scrapy==1.8.0 scrapyd-client==1.1.0 sentry-sdk==0.14.4 diff --git a/requirements_dev.txt b/requirements_dev.txt index 06b5256d..e010b745 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -25,6 +25,7 @@ importlib-metadata==1.6.1 incremental==17.5.0 isort==4.3.21 jsonpointer==2.0 +jsonref==0.2 jsonschema==3.2.0 lxml==4.4.2 mccabe==0.6.1 # via flake8 @@ -50,6 +51,7 @@ pytest==5.3.2 queuelib==1.5.0 rarfile==3.1 requests==2.22.0 +rfc3987==1.3.8 scrapy==1.8.0 scrapyd-client==1.1.0 sentry-sdk==0.14.4 From a7b0ce18368c156aec1e53dadc48b8468b3c64af Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Tue, 23 Jun 2020 09:09:50 -0400 Subject: [PATCH 11/19] Add a schema file per item class Signed-off-by: Yohanna Lisnichuk --- kingfisher_scrapy/item_schema/file.json | 28 ++++++ kingfisher_scrapy/item_schema/file_error.json | 19 +++++ kingfisher_scrapy/item_schema/file_item.json | 19 +++++ kingfisher_scrapy/item_schema/item.json | 85 ------------------- 4 files changed, 66 insertions(+), 85 deletions(-) create mode 100644 kingfisher_scrapy/item_schema/file.json create mode 100644 kingfisher_scrapy/item_schema/file_error.json create mode 100644 kingfisher_scrapy/item_schema/file_item.json diff --git a/kingfisher_scrapy/item_schema/file.json b/kingfisher_scrapy/item_schema/file.json new file mode 100644 index 00000000..2f755d50 --- /dev/null +++ b/kingfisher_scrapy/item_schema/file.json @@ -0,0 +1,28 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "allOf": [ + { + "$ref": "item.json#/definitions/KingfisherFileItem" + } + ], + "title": "File", + "type": "object", + "properties": { + "post_to_api": { + "type": [ + "boolean" + ] + }, + "path": { + "description": "For the KingfisherProcessAPI extension to read the file.", + "type": [ + "string" + ] + }, + "files_store": { + "type": [ + "string" + ] + } + } +} \ No newline at end of file diff --git a/kingfisher_scrapy/item_schema/file_error.json b/kingfisher_scrapy/item_schema/file_error.json new file mode 100644 index 00000000..d0ba0ee3 --- /dev/null +++ b/kingfisher_scrapy/item_schema/file_error.json @@ -0,0 +1,19 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "title": "File Error", + "type": "object", + "allOf": [ + { + "$ref": "item.json#/definitions/KingfisherItem" + } + ], + "properties": { + "errors": { + "type": "string", + "minLength": 1 + } + }, + "required": [ + "errors" + ] +} diff --git a/kingfisher_scrapy/item_schema/file_item.json b/kingfisher_scrapy/item_schema/file_item.json new file mode 100644 index 00000000..4bbbc119 --- /dev/null +++ b/kingfisher_scrapy/item_schema/file_item.json @@ -0,0 +1,19 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "allOf": [ + { + "$ref": "item.json#/definitions/KingfisherFileItem" + } + ], + "title": "File Item", + "type": "object", + "properties": { + "number": { + "type": "integer", + "minimum": 1 + } + }, + "required": [ + "number" + ] +} \ No newline at end of file diff --git a/kingfisher_scrapy/item_schema/item.json b/kingfisher_scrapy/item_schema/item.json index 03dfcfc9..1f343907 100644 --- a/kingfisher_scrapy/item_schema/item.json +++ b/kingfisher_scrapy/item_schema/item.json @@ -7,14 +7,10 @@ "type": "object", "properties": { "file_name": { - "title": "File Name", - "description": "File Name", "type": "string", "pattern": "^[^/]+$" }, "url": { - "title": "URL", - "description": "URL", "type": "string", "format": "uri" } @@ -35,8 +31,6 @@ ], "properties": { "data_type": { - "title": "Data Type", - "description": "Data Type", "type": "string", "enum": [ "record", @@ -57,15 +51,11 @@ ] }, "encoding": { - "title": "Encoding", - "description": "Encoding", "type": [ "string" ] }, "data": { - "title": "Data", - "description": "Data", "type": "string", "minLength": 1 } @@ -74,81 +64,6 @@ "data", "data_type" ] - }, - "File": { - "allOf": [ - { - "$ref": "#/definitions/KingfisherFileItem" - } - ], - "title": "File", - "description": "A file object to be send to an API and/or saved to the disk", - "type": "object", - "properties": { - "post_to_api": { - "title": "Post to Api?", - "description": "Post to Api?", - "type": [ - "boolean" - ] - }, - "path": { - "description": "For the KingfisherProcessAPI extension to read the file.", - "type": [ - "string" - ], - "title": "Path" - }, - "files_store": { - "title": "Files Store", - "description": "Files Store", - "type": [ - "string" - ] - } - } - }, - "FileItem": { - "allOf": [ - { - "$ref": "#/definitions/KingfisherFileItem" - } - ], - "title": "File Item", - "description": "A file item to be send to an API and not saved to the disk", - "type": "object", - "properties": { - "number": { - "title": "Item number", - "description": "Item number", - "type": "integer", - "minimum": 1 - } - }, - "required": [ - "number" - ] - }, - "FileError": { - "title": "File Error", - "description": "An item to report and error", - "type": "object", - "allOf": [ - { - "$ref": "#/definitions/KingfisherItem" - } - ], - "properties": { - "errors": { - "title": "Errors", - "description": "Errors", - "type": "string", - "minLength": 1 - } - }, - "required": [ - "errors" - ] } } } From 1d95007dc4bd0309a6ae6537c1427d3563beb0e9 Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Tue, 23 Jun 2020 09:10:55 -0400 Subject: [PATCH 12/19] Update validation method to use a schema per item class Signed-off-by: Yohanna Lisnichuk --- kingfisher_scrapy/items.py | 22 ---------------------- kingfisher_scrapy/pipelines.py | 23 +++++++++++------------ 2 files changed, 11 insertions(+), 34 deletions(-) diff --git a/kingfisher_scrapy/items.py b/kingfisher_scrapy/items.py index 73da7550..7ce0e95e 100644 --- a/kingfisher_scrapy/items.py +++ b/kingfisher_scrapy/items.py @@ -6,7 +6,6 @@ class KingfisherItem(scrapy.Item): file_name = scrapy.Field() url = scrapy.Field() - # indicate that this item should be validated against a schema validate = True @@ -22,13 +21,6 @@ class File(KingfisherItem): path = scrapy.Field() files_store = scrapy.Field() - required = [ - 'file_name', - 'url', - 'data', - 'data_type', - ] - class FileItem(KingfisherItem): number = scrapy.Field() @@ -36,20 +28,6 @@ class FileItem(KingfisherItem): data_type = scrapy.Field() encoding = scrapy.Field() - required = [ - 'number', - 'file_name', - 'url', - 'data', - 'data_type', - ] - class FileError(KingfisherItem): errors = scrapy.Field() - - required = [ - 'file_name', - 'url', - 'errors', - ] diff --git a/kingfisher_scrapy/pipelines.py b/kingfisher_scrapy/pipelines.py index 4fe2f587..a477bf14 100644 --- a/kingfisher_scrapy/pipelines.py +++ b/kingfisher_scrapy/pipelines.py @@ -1,26 +1,25 @@ # https://docs.scrapy.org/en/latest/topics/item-pipeline.html # https://docs.scrapy.org/en/latest/topics/signals.html#item-signals -import json import os +import pathlib +import jsonref as jsonref from jsonschema import FormatChecker from jsonschema.validators import Draft4Validator class Validate: def __init__(self): - here = os.path.dirname(os.path.abspath(__file__)) - filename = os.path.join(here, 'item_schema', 'item_schema.json') - with open(filename) as f: - schema = json.load(f) - - self.validator = Draft4Validator(schema, format_checker=FormatChecker()) + self.validators = {} + schema_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'item_schema') + for item in ['file', 'file_error', 'file_item']: + filename = os.path.join(schema_path, f'{item}.json') + with open(filename) as f: + schema = jsonref.load(f, base_uri=pathlib.Path(os.path.join(schema_path), 'item_schema').as_uri()) + class_name = ''.join(word.title() for word in item.split('_')) + self.validators[class_name] = Draft4Validator(schema, format_checker=FormatChecker()) def process_item(self, item, spider): if hasattr(item, 'validate'): - # We call this in the item pipeline to guarantee that all items are validated. However, its backtrace isn't - # as helpful for debugging, so we could also call it in ``BaseSpider`` if this becomes an issue. - item_str = json.dumps(item.__dict__) - json_item = json.loads(item_str)['_values'] - self.validator.validate(json_item) + self.validators.get(item.__class__.__name__).validate(dict(item)) return item From d4079efc86c8c6cee870cfcd40d73b26b23422ac Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Tue, 23 Jun 2020 09:11:16 -0400 Subject: [PATCH 13/19] Add a test per item class Signed-off-by: Yohanna Lisnichuk --- tests/test_validate.py | 64 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 62 insertions(+), 2 deletions(-) diff --git a/tests/test_validate.py b/tests/test_validate.py index da800deb..2d90bb5a 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -1,7 +1,7 @@ import pytest from jsonschema import ValidationError -from kingfisher_scrapy.items import File +from kingfisher_scrapy.items import File, FileItem, FileError from kingfisher_scrapy.pipelines import Validate @@ -19,7 +19,67 @@ def test_process_item(): def test_process_item_error(): pipeline = Validate() - item = File() + item = File({ + 'data': 'data', + 'data_type': 'release_package', + 'url': 'http://test.com', + }) + + with pytest.raises(ValidationError): + pipeline.process_item(item, None) + item['file_name'] = 'test' + item['data_type'] = 'not a valid data type' + with pytest.raises(ValidationError): + pipeline.process_item(item, None) + + +def test_process_file_item(): + pipeline = Validate() + item = FileItem({ + 'file_name': 'test', + 'data': 'data', + 'data_type': 'release_package', + 'url': 'http://test.com', + 'number': 1 + }) + assert pipeline.process_item(item, None) == item + + +def test_process_file_item_error(): + pipeline = Validate() + item = FileItem({ + 'file_name': 'test', + 'data': 'data', + 'data_type': 'release_package', + 'url': 'http://test.com', + 'number': "2" + }) + with pytest.raises(ValidationError): + pipeline.process_item(item, None) + item['number'] = None + with pytest.raises(ValidationError): + pipeline.process_item(item, None) + + +def test_process_file_error(): + pipeline = Validate() + item = FileError({ + 'file_name': 'test', + 'url': 'http://test.com', + 'errors': 'Error' + }) + assert pipeline.process_item(item, None) == item + +def test_process_file_item_error_error(): + pipeline = Validate() + item = FileError({ + 'file_name': 'test', + 'url': 'http://test.com' + }) + with pytest.raises(ValidationError): + pipeline.process_item(item, None) + item['errors'] = 'Error' + item['url'] = 'not an url' with pytest.raises(ValidationError): pipeline.process_item(item, None) From d7213d3e6b672f951177e2cf5320f1b525623216 Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Tue, 23 Jun 2020 09:20:08 -0400 Subject: [PATCH 14/19] isort test_validate.py Signed-off-by: Yohanna Lisnichuk --- tests/test_validate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_validate.py b/tests/test_validate.py index 2d90bb5a..dd37f5b2 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -1,7 +1,7 @@ import pytest from jsonschema import ValidationError -from kingfisher_scrapy.items import File, FileItem, FileError +from kingfisher_scrapy.items import File, FileError, FileItem from kingfisher_scrapy.pipelines import Validate From 895e68ea69ff4db8fdde3c737f5547654e439d68 Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Tue, 23 Jun 2020 09:27:52 -0400 Subject: [PATCH 15/19] Json schemas correct indentation Signed-off-by: Yohanna Lisnichuk --- kingfisher_scrapy/item_schema/file.json | 2 +- kingfisher_scrapy/item_schema/file_item.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kingfisher_scrapy/item_schema/file.json b/kingfisher_scrapy/item_schema/file.json index 2f755d50..05918da3 100644 --- a/kingfisher_scrapy/item_schema/file.json +++ b/kingfisher_scrapy/item_schema/file.json @@ -25,4 +25,4 @@ ] } } -} \ No newline at end of file +} diff --git a/kingfisher_scrapy/item_schema/file_item.json b/kingfisher_scrapy/item_schema/file_item.json index 4bbbc119..ab1fcba8 100644 --- a/kingfisher_scrapy/item_schema/file_item.json +++ b/kingfisher_scrapy/item_schema/file_item.json @@ -16,4 +16,4 @@ "required": [ "number" ] -} \ No newline at end of file +} From 126c75458c10eba62e9dcd7bcaa901857473329b Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Wed, 24 Jun 2020 11:24:46 -0400 Subject: [PATCH 16/19] Update validate pipeline Signed-off-by: Yohanna Lisnichuk --- kingfisher_scrapy/item_schema/File.json | 1 - kingfisher_scrapy/pipelines.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/kingfisher_scrapy/item_schema/File.json b/kingfisher_scrapy/item_schema/File.json index 05918da3..f07c411d 100644 --- a/kingfisher_scrapy/item_schema/File.json +++ b/kingfisher_scrapy/item_schema/File.json @@ -14,7 +14,6 @@ ] }, "path": { - "description": "For the KingfisherProcessAPI extension to read the file.", "type": [ "string" ] diff --git a/kingfisher_scrapy/pipelines.py b/kingfisher_scrapy/pipelines.py index d0a3e912..98e34cae 100644 --- a/kingfisher_scrapy/pipelines.py +++ b/kingfisher_scrapy/pipelines.py @@ -16,11 +16,11 @@ def __init__(self): self.validators = {} self.files = set() self.file_items = set() - schema_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'item_schema') + schema_path = pathlib.Path(os.path.dirname(os.path.abspath(__file__)), 'item_schema') for item in ['File', 'FileError', 'FileItem']: filename = os.path.join(schema_path, f'{item}.json') with open(filename) as f: - schema = jsonref.load(f, base_uri=pathlib.Path(schema_path).as_uri()) + schema = jsonref.load(f, base_uri=pathlib.Path(schema_path, 'item_schema').as_uri()) self.validators[item] = Draft4Validator(schema, format_checker=FormatChecker()) def process_item(self, item, spider): From 633bc1cca0f3d363cc72b8a4ea714549eccf66da Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Wed, 24 Jun 2020 11:33:39 -0400 Subject: [PATCH 17/19] isort test_validate.py Signed-off-by: Yohanna Lisnichuk --- tests/test_validate.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_validate.py b/tests/test_validate.py index 8675b6aa..9dcbca08 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -1,8 +1,7 @@ import pytest from jsonschema import ValidationError -from kingfisher_scrapy.items import File, FileItem -from kingfisher_scrapy.items import FileError +from kingfisher_scrapy.items import File, FileError, FileItem from kingfisher_scrapy.pipelines import Validate from tests import spider_with_crawler From be2da56988e5e7d0c52da933581dd50bb2de12b0 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Wed, 24 Jun 2020 12:17:59 -0400 Subject: [PATCH 18/19] Remove remaining title/description metadata properties. Change "type" values from arrays to strings. --- kingfisher_scrapy/item_schema/File.json | 13 +++---------- kingfisher_scrapy/item_schema/FileError.json | 3 +-- kingfisher_scrapy/item_schema/FileItem.json | 1 - kingfisher_scrapy/item_schema/item.json | 10 ++-------- 4 files changed, 6 insertions(+), 21 deletions(-) diff --git a/kingfisher_scrapy/item_schema/File.json b/kingfisher_scrapy/item_schema/File.json index f07c411d..c726798e 100644 --- a/kingfisher_scrapy/item_schema/File.json +++ b/kingfisher_scrapy/item_schema/File.json @@ -5,23 +5,16 @@ "$ref": "item.json#/definitions/KingfisherFileItem" } ], - "title": "File", "type": "object", "properties": { "post_to_api": { - "type": [ - "boolean" - ] + "type": "boolean" }, "path": { - "type": [ - "string" - ] + "type": "string" }, "files_store": { - "type": [ - "string" - ] + "type": "string" } } } diff --git a/kingfisher_scrapy/item_schema/FileError.json b/kingfisher_scrapy/item_schema/FileError.json index d0ba0ee3..8f1b935d 100644 --- a/kingfisher_scrapy/item_schema/FileError.json +++ b/kingfisher_scrapy/item_schema/FileError.json @@ -1,12 +1,11 @@ { "$schema": "http://json-schema.org/draft-04/schema#", - "title": "File Error", - "type": "object", "allOf": [ { "$ref": "item.json#/definitions/KingfisherItem" } ], + "type": "object", "properties": { "errors": { "type": "string", diff --git a/kingfisher_scrapy/item_schema/FileItem.json b/kingfisher_scrapy/item_schema/FileItem.json index ab1fcba8..3e49413f 100644 --- a/kingfisher_scrapy/item_schema/FileItem.json +++ b/kingfisher_scrapy/item_schema/FileItem.json @@ -5,7 +5,6 @@ "$ref": "item.json#/definitions/KingfisherFileItem" } ], - "title": "File Item", "type": "object", "properties": { "number": { diff --git a/kingfisher_scrapy/item_schema/item.json b/kingfisher_scrapy/item_schema/item.json index 1f343907..9d96c7f1 100644 --- a/kingfisher_scrapy/item_schema/item.json +++ b/kingfisher_scrapy/item_schema/item.json @@ -2,8 +2,6 @@ "$schema": "http://json-schema.org/draft-04/schema#", "definitions": { "KingfisherItem": { - "title": "Kingfisher Item", - "description": "A generic item with file_name and url to be extended by other items", "type": "object", "properties": { "file_name": { @@ -21,14 +19,12 @@ ] }, "KingfisherFileItem": { - "title": "Kingfisher Item", - "description": "A base object to be extended by other File type items", - "type": "object", "allOf": [ { "$ref": "#/definitions/KingfisherItem" } ], + "type": "object", "properties": { "data_type": { "type": "string", @@ -51,9 +47,7 @@ ] }, "encoding": { - "type": [ - "string" - ] + "type": "string" }, "data": { "type": "string", From 74ac039f8b6cb91f9385ecfd9ec651cda2efa9e9 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Wed, 24 Jun 2020 12:24:45 -0400 Subject: [PATCH 19/19] pipelines: Add trailing slash to URI so that last component is not removed during dereferencing --- kingfisher_scrapy/pipelines.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kingfisher_scrapy/pipelines.py b/kingfisher_scrapy/pipelines.py index 98e34cae..68172e69 100644 --- a/kingfisher_scrapy/pipelines.py +++ b/kingfisher_scrapy/pipelines.py @@ -17,10 +17,10 @@ def __init__(self): self.files = set() self.file_items = set() schema_path = pathlib.Path(os.path.dirname(os.path.abspath(__file__)), 'item_schema') - for item in ['File', 'FileError', 'FileItem']: + for item in ('File', 'FileError', 'FileItem'): filename = os.path.join(schema_path, f'{item}.json') with open(filename) as f: - schema = jsonref.load(f, base_uri=pathlib.Path(schema_path, 'item_schema').as_uri()) + schema = jsonref.load(f, base_uri=schema_path.as_uri() + '/') self.validators[item] = Draft4Validator(schema, format_checker=FormatChecker()) def process_item(self, item, spider):