Update validation pipeline to use a json schema #420

Merged: 22 commits, merged on Jun 24, 2020. The diff below shows changes from 16 of the 22 commits.

Commits (22)
48fbf86 Update validation pipeline to use a json schema (yolile, Jun 18, 2020)
2b2be51 Update test and json schema to include all data types (yolile, Jun 18, 2020)
eb3482b Add jsonschema to requirements (yolile, Jun 18, 2020)
69f9510 Update schema to re use definitions (yolile, Jun 18, 2020)
87f8325 Add descriptions to schema codelist and correct format (yolile, Jun 18, 2020)
f87d568 Update schema to refactor definitions and add titles and descriptions (yolile, Jun 18, 2020)
28a0613 Update schema descriptions (yolile, Jun 18, 2020)
e3f20d3 Merge branch 'master' of github.com:open-contracting/kingfisher-scrap… (yolile, Jun 22, 2020)
6c45dfa Update item schema with required fields (yolile, Jun 22, 2020)
b195529 Update item schema to include number field title and description (yolile, Jun 22, 2020)
d95c426 Add json schema validator requirements, rename schema (yolile, Jun 23, 2020)
a7b0ce1 Add a schema file per item class (yolile, Jun 23, 2020)
1d95007 Update validation method to use a schema per item class (yolile, Jun 23, 2020)
d4079ef Add a test per item class (yolile, Jun 23, 2020)
d7213d3 isort test_validate.py (yolile, Jun 23, 2020)
895e68e Json schemas correct indentation (yolile, Jun 23, 2020)
ba8a118 Merge branch 'master' of github.com:open-contracting/kingfisher-scrap… (yolile, Jun 24, 2020)
126c754 Update validate pipeline (yolile, Jun 24, 2020)
84d9ae1 Merge branch 'master' of github.com:open-contracting/kingfisher-scrap… (yolile, Jun 24, 2020)
633bc1c isort test_validate.py (yolile, Jun 24, 2020)
be2da56 Remove remaining title/description metadata properties. Change "type"… (jpmckinney, Jun 24, 2020)
74ac039 pipelines: Add trailing slash to URI so that last component is not re… (jpmckinney, Jun 24, 2020)
4 changes: 0 additions & 4 deletions kingfisher_scrapy/exceptions.py
@@ -8,7 +8,3 @@ class AuthenticationError(KingfisherScrapyError):

class SpiderArgumentError(KingfisherScrapyError):
    """Raised when a spider argument's value is invalid"""


class MissingRequiredFieldError(KingfisherScrapyError, KeyError):
    """Raised when an item is missing a required field"""
28 changes: 28 additions & 0 deletions kingfisher_scrapy/item_schema/file.json
@@ -0,0 +1,28 @@
{
  "$schema": "http://json-schema.org/draft-04/schema#",
  "allOf": [
    {
      "$ref": "item.json#/definitions/KingfisherFileItem"
    }
  ],
  "title": "File",
  "type": "object",
  "properties": {
    "post_to_api": {
      "type": [
        "boolean"
      ]
    },
    "path": {
      "description": "For the KingfisherProcessAPI extension to read the file.",
      "type": [
        "string"
      ]
    },
    "files_store": {
      "type": [
        "string"
      ]
    }
  }
}
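
For orientation, here is a minimal sketch of validating a File-shaped dict against this schema, the same way the Validate pipeline below does. The repository-root-relative path is an assumption about where the snippet runs:

    import pathlib

    import jsonref
    from jsonschema import FormatChecker
    from jsonschema.validators import Draft4Validator

    # Assumption: run from the repository root, where this PR adds the schemas.
    schema_dir = pathlib.Path('kingfisher_scrapy/item_schema').resolve()
    with open(schema_dir / 'file.json') as f:
        # The trailing slash makes 'item.json#/definitions/...' resolve inside item_schema/.
        schema = jsonref.load(f, base_uri=schema_dir.as_uri() + '/')

    validator = Draft4Validator(schema, format_checker=FormatChecker())
    validator.validate({
        'file_name': 'releases.json',                # must match ^[^/]+$ (no slashes)
        'url': 'https://example.com/releases.json',  # checked as a URI via FormatChecker
        'data': '{"releases": []}',                  # non-empty string
        'data_type': 'release_package',              # must be one of the data_type enum values
    })  # raises jsonschema.exceptions.ValidationError if the item is invalid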
19 changes: 19 additions & 0 deletions kingfisher_scrapy/item_schema/file_error.json
@@ -0,0 +1,19 @@
{
  "$schema": "http://json-schema.org/draft-04/schema#",
  "title": "File Error",
  "type": "object",
  "allOf": [
    {
      "$ref": "item.json#/definitions/KingfisherItem"
    }
  ],
  "properties": {
    "errors": {
      "type": "string",
      "minLength": 1
    }
  },
  "required": [
    "errors"
  ]
}
19 changes: 19 additions & 0 deletions kingfisher_scrapy/item_schema/file_item.json
@@ -0,0 +1,19 @@
{
  "$schema": "http://json-schema.org/draft-04/schema#",
  "allOf": [
    {
      "$ref": "item.json#/definitions/KingfisherFileItem"
    }
  ],
  "title": "File Item",
  "type": "object",
  "properties": {
    "number": {
      "type": "integer",
      "minimum": 1
    }
  },
  "required": [
    "number"
  ]
}
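
As a quick illustration of the number constraint above, a self-contained sketch with the sub-schema inlined rather than loaded from disk:

    from jsonschema.validators import Draft4Validator

    validator = Draft4Validator({
        'type': 'object',
        'properties': {'number': {'type': 'integer', 'minimum': 1}},
        'required': ['number'],
    })

    print(validator.is_valid({'number': 1}))  # True
    print(validator.is_valid({'number': 0}))  # False: violates "minimum": 1
    print(validator.is_valid({}))             # False: "number" is required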
69 changes: 69 additions & 0 deletions kingfisher_scrapy/item_schema/item.json
@@ -0,0 +1,69 @@
{
  "$schema": "http://json-schema.org/draft-04/schema#",
  "definitions": {
    "KingfisherItem": {
      "title": "Kingfisher Item",
      "description": "A generic item with file_name and url to be extended by other items",
      "type": "object",
      "properties": {
        "file_name": {
          "type": "string",
          "pattern": "^[^/]+$"
        },
        "url": {
          "type": "string",
          "format": "uri"
        }
      },
      "required": [
        "file_name",
        "url"
      ]
    },
    "KingfisherFileItem": {
      "title": "Kingfisher Item",
      "description": "A base object to be extended by other File type items",
      "type": "object",
      "allOf": [
        {
          "$ref": "#/definitions/KingfisherItem"
        }
      ],
      "properties": {
        "data_type": {
          "type": "string",
          "enum": [
            "record",
            "release",
            "record_list",
            "release_list",
            "compiled_release",
            "record_package",
            "release_package",
            "record_package_list",
            "release_package_list",
            "record_package_list_in_results",
            "release_package_list_in_results",
            "release_package_json_lines",
            "record_package_json_lines",
            "release_package_in_ocdsReleasePackage_in_list_in_results",
            "release_in_Release"
          ]
        },
        "encoding": {
          "type": [
            "string"
          ]
        },
        "data": {
          "type": "string",
          "minLength": 1
        }
      },
      "required": [
        "data",
        "data_type"
      ]
    }
  }
}
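
To make the composition concrete: once jsonref inlines the $ref targets, allOf means an instance must satisfy both the base KingfisherItem requirements and the extending schema's own constraints. A self-contained sketch with the definitions inlined by hand, using file_error.json as the example:

    from jsonschema.validators import Draft4Validator

    kingfisher_item = {
        'type': 'object',
        'properties': {
            'file_name': {'type': 'string', 'pattern': '^[^/]+$'},
            'url': {'type': 'string', 'format': 'uri'},
        },
        'required': ['file_name', 'url'],
    }

    # Roughly what jsonref produces for "$ref": "item.json#/definitions/KingfisherItem".
    file_error = {
        'type': 'object',
        'allOf': [kingfisher_item],
        'properties': {'errors': {'type': 'string', 'minLength': 1}},
        'required': ['errors'],
    }

    validator = Draft4Validator(file_error)
    print(validator.is_valid({'file_name': 'a.json', 'url': 'http://example.com', 'errors': 'HTTP 500'}))
    # True
    print(validator.is_valid({'file_name': 'sub/a.json', 'url': 'http://example.com', 'errors': 'HTTP 500'}))
    # False: the '/' in file_name fails the ^[^/]+$ pattern from the base definition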
36 changes: 2 additions & 34 deletions kingfisher_scrapy/items.py
@@ -1,23 +1,12 @@
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy

from kingfisher_scrapy.exceptions import MissingRequiredFieldError
import scrapy


class KingfisherItem(scrapy.Item):
    file_name = scrapy.Field()
    url = scrapy.Field()

    def validate(self):
        """
        Raises an error if any required field is missing.

        :raises kingfisher_scrapy.extensions.MissingRequiredFieldError: if any required field is missing
        """
        if hasattr(self, 'required'):
            for field in self.required:
                if field not in self:
                    raise MissingRequiredFieldError(field)
    validate = True


class File(KingfisherItem):
@@ -32,34 +21,13 @@ class File(KingfisherItem):
    path = scrapy.Field()
    files_store = scrapy.Field()

    required = [
        'file_name',
        'url',
        'data',
        'data_type',
    ]


class FileItem(KingfisherItem):
    number = scrapy.Field()
    data = scrapy.Field()
    data_type = scrapy.Field()
    encoding = scrapy.Field()

    required = [
        'number',
        'file_name',
        'url',
        'data',
        'data_type',
    ]


class FileError(KingfisherItem):
    errors = scrapy.Field()

    required = [
        'file_name',
        'url',
        'errors',
    ]
21 changes: 17 additions & 4 deletions kingfisher_scrapy/pipelines.py
@@ -1,12 +1,25 @@
# https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# https://docs.scrapy.org/en/latest/topics/signals.html#item-signals
import os
import pathlib

import jsonref as jsonref
from jsonschema import FormatChecker
from jsonschema.validators import Draft4Validator


class Validate:
    def __init__(self):
        self.validators = {}
        schema_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'item_schema')
        for item in ['file', 'file_error', 'file_item']:
            filename = os.path.join(schema_path, f'{item}.json')
            with open(filename) as f:
                schema = jsonref.load(f, base_uri=pathlib.Path(os.path.join(schema_path), 'item_schema').as_uri())
jpmckinney (Member) commented on the base_uri line:

    Hmm, 'item_schema' is outside the os.path.join call (which we don't need if it's only one argument), and in any case schema_path already contains item_schema. I'm not sure how this works.

    Since we're re-using the base_uri parameter value, we can set schema_path to it above.

yolile (Member, Author) replied:

    @jpmckinney I changed it, but for some reason jsonref fails if I don't put 'item_schema' again in base_uri. I'm not sure why.

jpmckinney (Member) replied:

    Aha, I had to add a trailing slash to the URI.

(The trailing-slash behavior is demonstrated in a short sketch after this diff.)

            class_name = ''.join(word.title() for word in item.split('_'))
            self.validators[class_name] = Draft4Validator(schema, format_checker=FormatChecker())

    def process_item(self, item, spider):
        if hasattr(item, 'validate'):
            # We call this in the item pipeline to guarantee that all items are validated. However, its backtrace isn't
            # as helpful for debugging, so we could also call it in ``BaseSpider`` if this becomes an issue.
            item.validate()

        self.validators.get(item.__class__.__name__).validate(dict(item))
        return item
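
The trailing-slash fix from the review thread above (commit 74ac039) is standard URI resolution: without a trailing slash, the last path component of the base URI is treated as a document name and is replaced by the relative reference. A minimal demonstration, with an illustrative path:

    from urllib.parse import urljoin

    # Without a trailing slash, 'item_schema' is dropped during resolution:
    print(urljoin('file:///repo/kingfisher_scrapy/item_schema', 'item.json'))
    # file:///repo/kingfisher_scrapy/item.json

    # With a trailing slash, the reference resolves inside item_schema/:
    print(urljoin('file:///repo/kingfisher_scrapy/item_schema/', 'item.json'))
    # file:///repo/kingfisher_scrapy/item_schema/item.json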
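
And a hedged sketch of exercising the new pipeline directly, roughly what the per-item-class tests added in this PR would do. Validate takes no constructor arguments, and process_item does not use the spider argument, so a None stub suffices:

    from kingfisher_scrapy.items import File
    from kingfisher_scrapy.pipelines import Validate

    pipeline = Validate()
    item = File({
        'file_name': 'releases.json',
        'url': 'https://example.com/releases.json',
        'data': '{"releases": []}',
        'data_type': 'release_package',
    })
    # Validators are keyed by item class name ('File', 'FileItem', 'FileError');
    # an invalid item raises jsonschema.exceptions.ValidationError.
    pipeline.process_item(item, spider=None)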
3 changes: 3 additions & 0 deletions requirements.in
@@ -2,8 +2,11 @@
# https://github.com/open-contracting/deploy/blob/master/salt/ocdskingfishercollect/scrapyd-requirements.txt

jsonpointer
jsonref
jsonschema
rarfile
requests
rfc3987
Scrapy
scrapyd-client
ijson>=3
10 changes: 8 additions & 2 deletions requirements.txt
@@ -4,7 +4,7 @@
#
# pip-compile
#
attrs==19.3.0 # via automat, service-identity, twisted
attrs==19.3.0 # via automat, jsonschema, service-identity, twisted
automat==0.8.0 # via twisted
certifi==2019.11.28 # via requests, sentry-sdk
cffi==1.13.2 # via cryptography
@@ -15,8 +15,11 @@ cssselect==1.1.0 # via parsel, scrapy
hyperlink==19.0.0 # via twisted
idna==2.8 # via hyperlink, requests
ijson==3.0.3
importlib-metadata==1.6.1 # via jsonschema
incremental==17.5.0 # via twisted
jsonpointer==2.0
jsonref==0.2
jsonschema==3.2.0
lxml==4.4.2 # via parsel, scrapy
parsel==1.5.2 # via scrapy
protego==0.1.16 # via scrapy
@@ -26,17 +29,20 @@ pycparser==2.19 # via cffi
pydispatcher==2.0.5 # via scrapy
pyhamcrest==1.9.0 # via twisted
pyopenssl==19.1.0 # via scrapy
pyrsistent==0.16.0 # via jsonschema
queuelib==1.5.0 # via scrapy
rarfile==3.1
requests==2.22.0
rfc3987==1.3.8
scrapy==1.8.0
scrapyd-client==1.1.0
sentry-sdk==0.14.4
service-identity==18.1.0 # via scrapy
six==1.13.0 # via automat, cryptography, parsel, protego, pyhamcrest, pyopenssl, scrapy, scrapyd-client, w3lib
six==1.13.0 # via automat, cryptography, jsonschema, parsel, protego, pyhamcrest, pyopenssl, pyrsistent, scrapy, scrapyd-client, w3lib
twisted==20.3.0 # via scrapy
urllib3==1.25.7 # via requests, sentry-sdk
w3lib==1.21.0 # via parsel, scrapy
zipp==3.1.0 # via importlib-metadata
zope.interface==4.7.1 # via scrapy, twisted

# The following packages are considered to be unsafe in a requirements file:
10 changes: 7 additions & 3 deletions requirements_dev.txt
@@ -21,13 +21,15 @@ flake8==3.7.9
hyperlink==19.0.0
idna==2.8
ijson==3.0.3
importlib-metadata==1.3.0 # via pluggy, pytest
importlib-metadata==1.6.1
incremental==17.5.0
isort==4.3.21
jsonpointer==2.0
jsonref==0.2
jsonschema==3.2.0
lxml==4.4.2
mccabe==0.6.1 # via flake8
more-itertools==8.0.2 # via pytest, zipp
more-itertools==8.0.2 # via pytest
packaging==19.2 # via pytest
parsel==1.5.2
pip-tools==5.1.0
@@ -43,11 +45,13 @@ pyflakes==2.1.1 # via flake8
pyhamcrest==1.9.0
pyopenssl==19.1.0
pyparsing==2.4.5 # via packaging
pyrsistent==0.16.0
pytest-cov==2.8.1
pytest==5.3.2
queuelib==1.5.0
rarfile==3.1
requests==2.22.0
rfc3987==1.3.8
scrapy==1.8.0
scrapyd-client==1.1.0
sentry-sdk==0.14.4
@@ -57,7 +61,7 @@ twisted==20.3.0
urllib3==1.25.7
w3lib==1.21.0
wcwidth==0.1.7 # via pytest
zipp==0.6.0 # via importlib-metadata
zipp==3.1.0
zope.interface==4.7.1

# The following packages are considered to be unsafe in a requirements file: