Commit

Merge pull request #420 from open-contracting/403-json-schema-validate
Update validation pipeline to use a JSON Schema
yolile committed Jun 24, 2020
2 parents 929dee2 + 74ac039 commit ec8552e
Showing 11 changed files with 229 additions and 60 deletions.
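In short: items.py's hand-rolled validation (a validate() method driven by per-class required lists) is replaced by JSON Schema (draft 4) files under kingfisher_scrapy/item_schema/, which the Validate pipeline loads once with jsonref and checks with jsonschema's Draft4Validator.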
4 changes: 0 additions & 4 deletions kingfisher_scrapy/exceptions.py
@@ -10,9 +10,5 @@ class SpiderArgumentError(KingfisherScrapyError):
"""Raised when a spider argument's value is invalid"""


class MissingRequiredFieldError(KingfisherScrapyError, KeyError):
"""Raised when an item is missing a required field"""


class MissingNextLinkError(KingfisherScrapyError):
"""Raised when a next link is not found on the first page of results"""
20 changes: 20 additions & 0 deletions kingfisher_scrapy/item_schema/File.json
@@ -0,0 +1,20 @@
{
"$schema": "http://json-schema.org/draft-04/schema#",
"allOf": [
{
"$ref": "item.json#/definitions/KingfisherFileItem"
}
],
"type": "object",
"properties": {
"post_to_api": {
"type": "boolean"
},
"path": {
"type": "string"
},
"files_store": {
"type": "string"
}
}
}
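
The allOf entry points at a relative reference, item.json#/definitions/KingfisherFileItem, so this file is only usable once that reference is resolved against the directory it lives in; that is exactly why the pipeline change below loads the schemas with jsonref and an explicit base_uri. A minimal sketch of the resolution step, assuming it is run from the repository root:

    import pathlib

    import jsonref

    # Resolve the relative $ref in File.json against the schema directory,
    # mirroring the base_uri approach in the Validate pipeline.
    schema_dir = pathlib.Path('kingfisher_scrapy', 'item_schema').resolve()
    with open(schema_dir / 'File.json') as f:
        schema = jsonref.load(f, base_uri=schema_dir.as_uri() + '/')

    # The $ref under allOf now behaves like the KingfisherFileItem definition itself:
    print(schema['allOf'][0]['required'])  # ['data', 'data_type']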
18 changes: 18 additions & 0 deletions kingfisher_scrapy/item_schema/FileError.json
@@ -0,0 +1,18 @@
{
"$schema": "http://json-schema.org/draft-04/schema#",
"allOf": [
{
"$ref": "item.json#/definitions/KingfisherItem"
}
],
"type": "object",
"properties": {
"errors": {
"type": "string",
"minLength": 1
}
},
"required": [
"errors"
]
}
18 changes: 18 additions & 0 deletions kingfisher_scrapy/item_schema/FileItem.json
@@ -0,0 +1,18 @@
{
"$schema": "http://json-schema.org/draft-04/schema#",
"allOf": [
{
"$ref": "item.json#/definitions/KingfisherFileItem"
}
],
"type": "object",
"properties": {
"number": {
"type": "integer",
"minimum": 1
}
},
"required": [
"number"
]
}
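
A draft-4 detail worth noting: "minimum" is inclusive unless an accompanying "exclusiveMinimum": true says otherwise, so "number" here must be an integer of at least 1 (effectively 1-based numbering).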
63 changes: 63 additions & 0 deletions kingfisher_scrapy/item_schema/item.json
@@ -0,0 +1,63 @@
{
"$schema": "http://json-schema.org/draft-04/schema#",
"definitions": {
"KingfisherItem": {
"type": "object",
"properties": {
"file_name": {
"type": "string",
"pattern": "^[^/]+$"
},
"url": {
"type": "string",
"format": "uri"
}
},
"required": [
"file_name",
"url"
]
},
"KingfisherFileItem": {
"allOf": [
{
"$ref": "#/definitions/KingfisherItem"
}
],
"type": "object",
"properties": {
"data_type": {
"type": "string",
"enum": [
"record",
"release",
"record_list",
"release_list",
"compiled_release",
"record_package",
"release_package",
"record_package_list",
"release_package_list",
"record_package_list_in_results",
"release_package_list_in_results",
"release_package_json_lines",
"record_package_json_lines",
"release_package_in_ocdsReleasePackage_in_list_in_results",
"release_in_Release"
]
},
"encoding": {
"type": "string"
},
"data": {
"type": "string",
"minLength": 1
}
},
"required": [
"data",
"data_type"
]
}
}
}
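
Because draft-4 allOf requires an instance to satisfy every subschema, a KingfisherFileItem must carry file_name and url (from KingfisherItem) as well as its own data and data_type: the same four fields the old required lists in items.py enforced by hand. A small self-contained sketch of that composition, with the definitions inlined instead of $ref'd:

    from jsonschema import Draft4Validator

    # Inlined stand-ins for the two definitions above (no $ref resolution needed here).
    kingfisher_item = {
        'type': 'object',
        'properties': {
            'file_name': {'type': 'string', 'pattern': '^[^/]+$'},
            'url': {'type': 'string', 'format': 'uri'},
        },
        'required': ['file_name', 'url'],
    }
    kingfisher_file_item = {
        'allOf': [kingfisher_item],
        'type': 'object',
        'required': ['data', 'data_type'],
    }

    errors = Draft4Validator(kingfisher_file_item).iter_errors({'data': '{}'})
    print(sorted(e.message for e in errors))
    # ["'data_type' is a required property", "'file_name' is a required property",
    #  "'url' is a required property"]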
36 changes: 2 additions & 34 deletions kingfisher_scrapy/items.py
@@ -1,23 +1,12 @@
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy

from kingfisher_scrapy.exceptions import MissingRequiredFieldError
import scrapy


class KingfisherItem(scrapy.Item):
file_name = scrapy.Field()
url = scrapy.Field()

def validate(self):
"""
Raises an error if any required field is missing.
:raises kingfisher_scrapy.extensions.MissingRequiredFieldError: if any required field is missing
"""
if hasattr(self, 'required'):
for field in self.required:
if field not in self:
raise MissingRequiredFieldError(field)
validate = True


class File(KingfisherItem):
@@ -32,34 +21,13 @@ class File(KingfisherItem):
path = scrapy.Field()
files_store = scrapy.Field()

required = [
'file_name',
'url',
'data',
'data_type',
]


class FileItem(KingfisherItem):
number = scrapy.Field()
data = scrapy.Field()
data_type = scrapy.Field()
encoding = scrapy.Field()

required = [
'number',
'file_name',
'url',
'data',
'data_type',
]


class FileError(KingfisherItem):
errors = scrapy.Field()

required = [
'file_name',
'url',
'errors',
]
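
The required lists and the validate() method are gone; the schemas above now carry that information. All that remains is a bare validate = True class attribute, which exists so the pipeline's hasattr(item, 'validate') check (see pipelines.py below) still knows which items to validate; Scrapy's item metaclass leaves non-Field class attributes in place. A minimal sketch, with a hypothetical UnvalidatedItem added to show the opt-out:

    import scrapy

    class KingfisherItem(scrapy.Item):
        file_name = scrapy.Field()
        url = scrapy.Field()
        validate = True  # a plain class attribute, not a Field: it never becomes item data

    class UnvalidatedItem(scrapy.Item):  # hypothetical: omits the flag, so it skips validation
        data = scrapy.Field()

    assert hasattr(KingfisherItem(), 'validate')
    assert not hasattr(UnvalidatedItem(), 'validate')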
19 changes: 16 additions & 3 deletions kingfisher_scrapy/pipelines.py
@@ -1,18 +1,31 @@
# https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# https://docs.scrapy.org/en/latest/topics/signals.html#item-signals

import os
import pathlib

import jsonref as jsonref
from jsonschema import FormatChecker
from jsonschema.validators import Draft4Validator

from kingfisher_scrapy.items import File, FileItem


class Validate:
def __init__(self):
self.validators = {}
self.files = set()
self.file_items = set()
schema_path = pathlib.Path(os.path.dirname(os.path.abspath(__file__)), 'item_schema')
for item in ('File', 'FileError', 'FileItem'):
filename = os.path.join(schema_path, f'{item}.json')
with open(filename) as f:
schema = jsonref.load(f, base_uri=schema_path.as_uri() + '/')
self.validators[item] = Draft4Validator(schema, format_checker=FormatChecker())

def process_item(self, item, spider):
if hasattr(item, 'validate'):
# We call this in the item pipeline to guarantee that all items are validated. However, its backtrace isn't
# as helpful for debugging, so we could also call it in ``BaseSpider`` if this becomes an issue.
item.validate()
self.validators.get(item.__class__.__name__).validate(dict(item))

if isinstance(item, FileItem):
key = (item['file_name'], item['number'])
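
With the validators built once in __init__, process_item now fails loudly on the first schema violation: Draft4Validator.validate() raises jsonschema.exceptions.ValidationError where the old code raised MissingRequiredFieldError. A rough, self-contained sketch of that behaviour, using an inlined stand-in for the resolved FileError schema and hypothetical item values:

    from jsonschema import FormatChecker
    from jsonschema.exceptions import ValidationError
    from jsonschema.validators import Draft4Validator

    # Inlined stand-in for item_schema/FileError.json after $ref resolution.
    schema = {
        'type': 'object',
        'properties': {
            'file_name': {'type': 'string', 'pattern': '^[^/]+$'},
            'url': {'type': 'string', 'format': 'uri'},
            'errors': {'type': 'string', 'minLength': 1},
        },
        'required': ['file_name', 'url', 'errors'],
    }
    validator = Draft4Validator(schema, format_checker=FormatChecker())

    # Passes: every required field is present and well-formed.
    validator.validate({'file_name': 'a.json', 'url': 'http://example.com', 'errors': 'HTTP 500'})

    try:
        # Fails: file_name may not contain a slash, per the ^[^/]+$ pattern.
        validator.validate({'file_name': 'sub/a.json', 'url': 'http://example.com', 'errors': 'HTTP 500'})
    except ValidationError as e:
        print(e.message)  # 'sub/a.json' does not match '^[^/]+$'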
3 changes: 3 additions & 0 deletions requirements.in
@@ -3,8 +3,11 @@

ijson>=3
jsonpointer
jsonref
jsonschema
rarfile
requests
rfc3987
Scrapy
scrapyd-client
sentry-sdk
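
Of the new entries: jsonref resolves the $ref pointers between the schema files, jsonschema provides Draft4Validator and FormatChecker, and rfc3987 is the optional extra that lets FormatChecker actually verify "format": "uri" (without it installed, jsonschema silently skips that format check).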
10 changes: 8 additions & 2 deletions requirements.txt
@@ -4,7 +4,7 @@
#
# pip-compile
#
attrs==19.3.0 # via automat, service-identity, twisted
attrs==19.3.0 # via automat, jsonschema, service-identity, twisted
automat==0.8.0 # via twisted
certifi==2019.11.28 # via requests, sentry-sdk
cffi==1.13.2 # via cryptography
@@ -15,8 +15,11 @@ cssselect==1.1.0 # via parsel, scrapy
hyperlink==19.0.0 # via twisted
idna==2.8 # via hyperlink, requests
ijson==3.0.3
importlib-metadata==1.6.1 # via jsonschema
incremental==17.5.0 # via twisted
jsonpointer==2.0
jsonref==0.2
jsonschema==3.2.0
lxml==4.4.2 # via parsel, scrapy
parsel==1.5.2 # via scrapy
protego==0.1.16 # via scrapy
@@ -26,17 +29,20 @@ pycparser==2.19 # via cffi
pydispatcher==2.0.5 # via scrapy
pyhamcrest==1.9.0 # via twisted
pyopenssl==19.1.0 # via scrapy
pyrsistent==0.16.0 # via jsonschema
queuelib==1.5.0 # via scrapy
rarfile==3.1
requests==2.22.0
rfc3987==1.3.8
scrapy==1.8.0
scrapyd-client==1.1.0
sentry-sdk==0.14.4
service-identity==18.1.0 # via scrapy
six==1.13.0 # via automat, cryptography, parsel, protego, pyhamcrest, pyopenssl, scrapy, scrapyd-client, w3lib
six==1.13.0 # via automat, cryptography, jsonschema, parsel, protego, pyhamcrest, pyopenssl, pyrsistent, scrapy, scrapyd-client, w3lib
twisted==20.3.0 # via scrapy
urllib3==1.25.7 # via requests, sentry-sdk
w3lib==1.21.0 # via parsel, scrapy
zipp==3.1.0 # via importlib-metadata
zope.interface==4.7.1 # via scrapy, twisted

# The following packages are considered to be unsafe in a requirements file:
10 changes: 7 additions & 3 deletions requirements_dev.txt
@@ -21,13 +21,15 @@ flake8==3.7.9
hyperlink==19.0.0
idna==2.8
ijson==3.0.3
importlib-metadata==1.3.0 # via pluggy, pytest
importlib-metadata==1.6.1
incremental==17.5.0
isort==4.3.21
jsonpointer==2.0
jsonref==0.2
jsonschema==3.2.0
lxml==4.4.2
mccabe==0.6.1 # via flake8
more-itertools==8.0.2 # via pytest, zipp
more-itertools==8.0.2 # via pytest
packaging==19.2 # via pytest
parsel==1.5.2
pip-tools==5.1.0
@@ -43,11 +45,13 @@ pyflakes==2.1.1 # via flake8
pyhamcrest==1.9.0
pyopenssl==19.1.0
pyparsing==2.4.5 # via packaging
pyrsistent==0.16.0
pytest-cov==2.8.1
pytest==5.3.2
queuelib==1.5.0
rarfile==3.1
requests==2.22.0
rfc3987==1.3.8
scrapy==1.8.0
scrapyd-client==1.1.0
sentry-sdk==0.14.4
@@ -57,7 +61,7 @@ twisted==20.3.0
urllib3==1.25.7
w3lib==1.21.0
wcwidth==0.1.7 # via pytest
zipp==0.6.0 # via importlib-metadata
zipp==3.1.0
zope.interface==4.7.1

# The following packages are considered to be unsafe in a requirements file: