diff --git a/kingfisher_scrapy/exceptions.py b/kingfisher_scrapy/exceptions.py
index f1f0e1910..080eded2b 100644
--- a/kingfisher_scrapy/exceptions.py
+++ b/kingfisher_scrapy/exceptions.py
@@ -10,9 +10,5 @@ class SpiderArgumentError(KingfisherScrapyError):
     """Raised when a spider argument's value is invalid"""
 
 
-class MissingRequiredFieldError(KingfisherScrapyError, KeyError):
-    """Raised when an item is missing a required field"""
-
-
 class MissingNextLinkError(KingfisherScrapyError):
     """Raised when a next link is not found on the first page of results"""
diff --git a/kingfisher_scrapy/item_schema/File.json b/kingfisher_scrapy/item_schema/File.json
new file mode 100644
index 000000000..c726798e4
--- /dev/null
+++ b/kingfisher_scrapy/item_schema/File.json
@@ -0,0 +1,20 @@
+{
+  "$schema": "http://json-schema.org/draft-04/schema#",
+  "allOf": [
+    {
+      "$ref": "item.json#/definitions/KingfisherFileItem"
+    }
+  ],
+  "type": "object",
+  "properties": {
+    "post_to_api": {
+      "type": "boolean"
+    },
+    "path": {
+      "type": "string"
+    },
+    "files_store": {
+      "type": "string"
+    }
+  }
+}
diff --git a/kingfisher_scrapy/item_schema/FileError.json b/kingfisher_scrapy/item_schema/FileError.json
new file mode 100644
index 000000000..8f1b935d0
--- /dev/null
+++ b/kingfisher_scrapy/item_schema/FileError.json
@@ -0,0 +1,18 @@
+{
+  "$schema": "http://json-schema.org/draft-04/schema#",
+  "allOf": [
+    {
+      "$ref": "item.json#/definitions/KingfisherItem"
+    }
+  ],
+  "type": "object",
+  "properties": {
+    "errors": {
+      "type": "string",
+      "minLength": 1
+    }
+  },
+  "required": [
+    "errors"
+  ]
+}
diff --git a/kingfisher_scrapy/item_schema/FileItem.json b/kingfisher_scrapy/item_schema/FileItem.json
new file mode 100644
index 000000000..3e49413fb
--- /dev/null
+++ b/kingfisher_scrapy/item_schema/FileItem.json
@@ -0,0 +1,18 @@
+{
+  "$schema": "http://json-schema.org/draft-04/schema#",
+  "allOf": [
+    {
+      "$ref": "item.json#/definitions/KingfisherFileItem"
+    }
+  ],
+  "type": "object",
+  "properties": {
+    "number": {
+      "type": "integer",
+      "minimum": 1
+    }
+  },
+  "required": [
+    "number"
+  ]
+}
diff --git a/kingfisher_scrapy/item_schema/item.json b/kingfisher_scrapy/item_schema/item.json
new file mode 100644
index 000000000..7ded5a014
--- /dev/null
+++ b/kingfisher_scrapy/item_schema/item.json
@@ -0,0 +1,62 @@
+{
+  "$schema": "http://json-schema.org/draft-04/schema#",
+  "definitions": {
+    "KingfisherItem": {
+      "type": "object",
+      "properties": {
+        "file_name": {
+          "type": "string",
+          "pattern": "^[^/]+$"
+        },
+        "url": {
+          "type": "string",
+          "format": "uri"
+        }
+      },
+      "required": [
+        "file_name",
+        "url"
+      ]
+    },
+    "KingfisherFileItem": {
+      "allOf": [
+        {
+          "$ref": "#/definitions/KingfisherItem"
+        }
+      ],
+      "type": "object",
+      "properties": {
+        "data_type": {
+          "type": "string",
+          "enum": [
+            "record",
+            "release",
+            "record_list",
+            "release_list",
+            "compiled_release",
+            "record_package",
+            "release_package",
+            "record_package_list",
+            "release_package_list",
+            "record_package_list_in_results",
+            "release_package_list_in_results",
+            "release_package_json_lines",
+            "record_package_json_lines",
+            "release_package_in_ocdsReleasePackage_in_list_in_results",
+            "release_in_Release"
+          ]
+        },
+        "encoding": {
+          "type": "string"
+        },
+        "data": {
+          "minLength": 1
+        }
+      },
+      "required": [
+        "data",
+        "data_type"
+      ]
+    }
+  }
+}
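
The three schemas above do not stand alone: each pulls the shared definitions out of item.json through allOf/$ref, so a validator only ever sees one resolved schema per item type. A minimal sketch of how they can be loaded and exercised outside Scrapy, assuming a checkout where the files sit under kingfisher_scrapy/item_schema/ (the file name and item values here are made up; rfc3987, added to the requirements below, is what lets FormatChecker actually enforce "format": "uri"):

    import pathlib

    import jsonref
    from jsonschema import FormatChecker
    from jsonschema.validators import Draft4Validator

    schema_dir = pathlib.Path('kingfisher_scrapy/item_schema')  # assumed checkout layout
    with open(schema_dir / 'File.json') as f:
        # base_uri makes the relative "item.json#/definitions/..." reference resolvable
        schema = jsonref.load(f, base_uri=schema_dir.resolve().as_uri() + '/')

    validator = Draft4Validator(schema, format_checker=FormatChecker())
    validator.validate({
        'file_name': 'example.json',
        'url': 'http://example.com/example.json',
        'data': '{"releases": []}',
        'data_type': 'release_package',
    })  # raises jsonschema.exceptions.ValidationError if any constraint fails

jsonref resolves the relative item.json reference against base_uri, which is why the pipeline change below passes the schema directory as a file:// URI.
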
diff --git a/kingfisher_scrapy/items.py b/kingfisher_scrapy/items.py
index a05d5c2bf..7ce0e95e5 100644
--- a/kingfisher_scrapy/items.py
+++ b/kingfisher_scrapy/items.py
@@ -1,23 +1,12 @@
 # https://docs.scrapy.org/en/latest/topics/items.html
 
-import scrapy
-from kingfisher_scrapy.exceptions import MissingRequiredFieldError
+import scrapy
 
 
 class KingfisherItem(scrapy.Item):
     file_name = scrapy.Field()
     url = scrapy.Field()
-
-    def validate(self):
-        """
-        Raises an error if any required field is missing.
-
-        :raises kingfisher_scrapy.extensions.MissingRequiredFieldError: if any required field is missing
-        """
-        if hasattr(self, 'required'):
-            for field in self.required:
-                if field not in self:
-                    raise MissingRequiredFieldError(field)
+    validate = True
 
 
 class File(KingfisherItem):
@@ -32,13 +21,6 @@ class File(KingfisherItem):
     path = scrapy.Field()
     files_store = scrapy.Field()
 
-    required = [
-        'file_name',
-        'url',
-        'data',
-        'data_type',
-    ]
-
 
 class FileItem(KingfisherItem):
     number = scrapy.Field()
@@ -46,20 +28,6 @@ class FileItem(KingfisherItem):
     data_type = scrapy.Field()
     encoding = scrapy.Field()
 
-    required = [
-        'number',
-        'file_name',
-        'url',
-        'data',
-        'data_type',
-    ]
-
 
 class FileError(KingfisherItem):
     errors = scrapy.Field()
-
-    required = [
-        'file_name',
-        'url',
-        'errors',
-    ]
diff --git a/kingfisher_scrapy/pipelines.py b/kingfisher_scrapy/pipelines.py
index 72255757e..68172e696 100644
--- a/kingfisher_scrapy/pipelines.py
+++ b/kingfisher_scrapy/pipelines.py
@@ -1,18 +1,31 @@
 # https://docs.scrapy.org/en/latest/topics/item-pipeline.html
 # https://docs.scrapy.org/en/latest/topics/signals.html#item-signals
+
+import os
+import pathlib
+
+import jsonref as jsonref
+from jsonschema import FormatChecker
+from jsonschema.validators import Draft4Validator
+
 from kingfisher_scrapy.items import File, FileItem
 
 
 class Validate:
     def __init__(self):
+        self.validators = {}
         self.files = set()
         self.file_items = set()
+        schema_path = pathlib.Path(os.path.dirname(os.path.abspath(__file__)), 'item_schema')
+        for item in ('File', 'FileError', 'FileItem'):
+            filename = os.path.join(schema_path, f'{item}.json')
+            with open(filename) as f:
+                schema = jsonref.load(f, base_uri=schema_path.as_uri() + '/')
+            self.validators[item] = Draft4Validator(schema, format_checker=FormatChecker())
 
     def process_item(self, item, spider):
         if hasattr(item, 'validate'):
-            # We call this in the item pipeline to guarantee that all items are validated. However, its backtrace isn't
-            # as helpful for debugging, so we could also call it in ``BaseSpider`` if this becomes an issue.
-            item.validate()
+            self.validators.get(item.__class__.__name__).validate(dict(item))
 
         if isinstance(item, FileItem):
             key = (item['file_name'], item['number'])
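
A rough usage sketch of the new pipeline, with made-up values (it mirrors the tests at the end of this diff): an item whose class sets validate = True is checked against the Draft4Validator registered under its class name, and an invalid field now surfaces as jsonschema's ValidationError rather than the removed MissingRequiredFieldError.

    from jsonschema import ValidationError

    from kingfisher_scrapy.items import File
    from kingfisher_scrapy.pipelines import Validate

    pipeline = Validate()
    item = File({
        'file_name': 'example.json',
        'url': 'http://example.com/example.json',
        'data': '{"releases": []}',
        'data_type': 'release_package',
    })
    pipeline.process_item(item, None)  # returns the item unchanged when it is valid

    item['data_type'] = 'not a valid data type'
    try:
        pipeline.process_item(item, None)
    except ValidationError as e:
        print(e.message)  # e.g. "'not a valid data type' is not one of [...]"
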
+ """ name = 'afghanistan_releases' data_type = 'release' diff --git a/kingfisher_scrapy/spiders/argentina_buenos_aires.py b/kingfisher_scrapy/spiders/argentina_buenos_aires.py index 4b1461a65..d6d3bbf67 100644 --- a/kingfisher_scrapy/spiders/argentina_buenos_aires.py +++ b/kingfisher_scrapy/spiders/argentina_buenos_aires.py @@ -8,10 +8,10 @@ class ArgentinaBuenosAires(ZipSpider): """ - Bulk download documentation - https://data.buenosaires.gob.ar/dataset/buenos-aires-compras/archivo/2a3d077c-71b6-4ba7-8924-f3e38cf1b8fc API documentation https://data.buenosaires.gob.ar/acerca/ckan + Bulk download documentation + https://data.buenosaires.gob.ar/dataset/buenos-aires-compras/archivo/2a3d077c-71b6-4ba7-8924-f3e38cf1b8fc Spider arguments sample Downloads the zip file and sends 10 releases to kingfisher process. diff --git a/kingfisher_scrapy/spiders/argentina_vialidad.py b/kingfisher_scrapy/spiders/argentina_vialidad.py index 160ec7902..385ff6652 100644 --- a/kingfisher_scrapy/spiders/argentina_vialidad.py +++ b/kingfisher_scrapy/spiders/argentina_vialidad.py @@ -4,6 +4,13 @@ class ArgentinaVialidad(SimpleSpider): + """ + API documentation + https://datosabiertos.vialidad.gob.ar/ui/index.html#!/datos_abiertos + Spider arguments + sample + Ignored, data is downloaded from a single JSON file. + """ name = 'argentina_vialidad' data_type = 'release_package_list' diff --git a/kingfisher_scrapy/spiders/canada_buyandsell.py b/kingfisher_scrapy/spiders/canada_buyandsell.py index 546e7fcb8..416f78d61 100644 --- a/kingfisher_scrapy/spiders/canada_buyandsell.py +++ b/kingfisher_scrapy/spiders/canada_buyandsell.py @@ -3,6 +3,13 @@ class CanadaBuyAndSell(SimpleSpider): + """ + API documentation + https://buyandsell.gc.ca/procurement-data/open-contracting-data-standard-pilot/download-ocds-pilot-data + Spider arguments + sample + Downloads a release package with data for the oldest fiscal year available (2013-2014). + """ name = 'canada_buyandsell' data_type = 'release_package' diff --git a/kingfisher_scrapy/spiders/canada_montreal.py b/kingfisher_scrapy/spiders/canada_montreal.py index 25b5ea275..39778c882 100644 --- a/kingfisher_scrapy/spiders/canada_montreal.py +++ b/kingfisher_scrapy/spiders/canada_montreal.py @@ -7,6 +7,13 @@ class CanadaMontreal(SimpleSpider): + """ + API documentation + http://donnees.ville.montreal.qc.ca/dataset/contrats-et-subventions-api + Spider arguments + sample + Downloads the first page of releases returned by the main endpoint. + """ name = 'canada_montreal' data_type = 'release_package' step = 10000 diff --git a/kingfisher_scrapy/spiders/colombia.py b/kingfisher_scrapy/spiders/colombia.py index dbac11d97..e4eb357de 100644 --- a/kingfisher_scrapy/spiders/colombia.py +++ b/kingfisher_scrapy/spiders/colombia.py @@ -29,14 +29,22 @@ class Colombia(LinksSpider): If ``from_date`` is provided and ``until_date`` don't, defaults to today. 
""" name = 'colombia' - next_page_formatter = staticmethod(parameters('page')) + next_page_formatter = staticmethod(parameters('_id')) default_from_date = '2011-01-01' + @classmethod + def from_crawler(cls, crawler, *args, **kwargs): + spider = super().from_crawler(crawler, *args, **kwargs) + if (spider.from_date or spider.until_date) and hasattr(spider, 'year'): + raise scrapy.exceptions.CloseSpider('You cannot specify both a year spider argument and ' + 'from_date/until_date spider argument(s).') + return spider + def start_requests(self): base_url = 'https://apiocds.colombiacompra.gov.co:8443/apiCCE2.0/rest/releases' if hasattr(self, 'year'): base_url += f'/page/{int(self.year)}' - if self.from_date or self.until_date: + elif self.from_date or self.until_date: from_date = self.from_date.strftime(self.date_format) until_date = self.until_date.strftime(self.date_format) base_url += f'/dates/{from_date}/{until_date}' diff --git a/kingfisher_scrapy/spiders/dominican_republic.py b/kingfisher_scrapy/spiders/dominican_republic.py index 9d7121de7..85aa10b45 100644 --- a/kingfisher_scrapy/spiders/dominican_republic.py +++ b/kingfisher_scrapy/spiders/dominican_republic.py @@ -9,6 +9,13 @@ class DominicanRepublic(BaseSpider): + """ + Bulk download documentation + https://www.dgcp.gob.do/estandar-mundial-ocds/ + Spider arguments + sample + Downloads a release package for the oldest year (2018, first link in the downloads page). + """ name = 'dominican_republic' def start_requests(self): diff --git a/kingfisher_scrapy/spiders/france.py b/kingfisher_scrapy/spiders/france.py index 57423f6c9..bb3586589 100644 --- a/kingfisher_scrapy/spiders/france.py +++ b/kingfisher_scrapy/spiders/france.py @@ -7,6 +7,13 @@ class France(SimpleSpider): + """ + Swagger API documentation + https://doc.data.gouv.fr/api/reference/ + Spider arguments + sample + Downloads the first OCDS package found using the CKAN API. + """ name = 'france' data_type = 'release_package' diff --git a/kingfisher_scrapy/spiders/georgia_records.py b/kingfisher_scrapy/spiders/georgia_records.py index 66fe6d7a2..aa5d28c00 100644 --- a/kingfisher_scrapy/spiders/georgia_records.py +++ b/kingfisher_scrapy/spiders/georgia_records.py @@ -5,6 +5,13 @@ class GeorgiaRecords(LinksSpider): + """ + Swagger API documentation + https://odapi.spa.ge/api/swagger.ui + Spider arguments + sample + Downloads the first page of packages returned by the record list endpoint. + """ name = 'georgia_records' data_type = 'record_package' next_page_formatter = staticmethod(parameters('page')) diff --git a/kingfisher_scrapy/spiders/georgia_releases.py b/kingfisher_scrapy/spiders/georgia_releases.py index 51954dfbd..6f10324d8 100644 --- a/kingfisher_scrapy/spiders/georgia_releases.py +++ b/kingfisher_scrapy/spiders/georgia_releases.py @@ -5,6 +5,13 @@ class GeorgiaReleases(LinksSpider): + """ + Swagger API documentation + https://odapi.spa.ge/api/swagger.ui + Spider arguments + sample + Downloads the first page of packages returned by the release list endpoint. + """ name = 'georgia_releases' data_type = 'release_package' next_page_formatter = staticmethod(parameters('page')) diff --git a/kingfisher_scrapy/spiders/honduras_cost.py b/kingfisher_scrapy/spiders/honduras_cost.py index b0525e1f7..d58c9ad45 100644 --- a/kingfisher_scrapy/spiders/honduras_cost.py +++ b/kingfisher_scrapy/spiders/honduras_cost.py @@ -4,6 +4,11 @@ class HondurasCoST(SimpleSpider): + """ + Spider arguments + sample + Ignored, a single file is downloaded. 
+ """ name = 'honduras_cost' data_type = 'record_package' diff --git a/kingfisher_scrapy/spiders/honduras_oncae.py b/kingfisher_scrapy/spiders/honduras_oncae.py index 0d770660e..8ee680ac5 100644 --- a/kingfisher_scrapy/spiders/honduras_oncae.py +++ b/kingfisher_scrapy/spiders/honduras_oncae.py @@ -5,6 +5,13 @@ class HondurasONCAE(ZipSpider): + """ + Bulk download documentation + http://oncae.gob.hn/datosabiertos + Spider arguments + sample + Downloads the first package listed on the downloads page. + """ name = 'honduras_oncae' data_type = 'release_package' diff --git a/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py b/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py index f82b55236..2ba869669 100644 --- a/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py +++ b/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py @@ -7,6 +7,13 @@ class HondurasPortalBulkFiles(SimpleSpider): + """ + Bulk download documentation + http://www.contratacionesabiertas.gob.hn/descargas/ + Spider arguments + sample + Downloads the first package listed in http://www.contratacionesabiertas.gob.hn/api/v1/descargas/?format=json. + """ name = 'honduras_portal_bulk_files' data_type = 'release_package' diff --git a/kingfisher_scrapy/spiders/indonesia_bandung.py b/kingfisher_scrapy/spiders/indonesia_bandung.py index 535f4065d..32a42be8b 100644 --- a/kingfisher_scrapy/spiders/indonesia_bandung.py +++ b/kingfisher_scrapy/spiders/indonesia_bandung.py @@ -26,6 +26,9 @@ class IndonesiaBandung(BaseSpider): contract year number uniqid id number + Spider arguments + sample + Downloads the first release listed for 2013 """ name = 'indonesia_bandung' diff --git a/kingfisher_scrapy/spiders/mexico_administracion_publica_federal.py b/kingfisher_scrapy/spiders/mexico_administracion_publica_federal.py index 977990595..5183c5fe0 100644 --- a/kingfisher_scrapy/spiders/mexico_administracion_publica_federal.py +++ b/kingfisher_scrapy/spiders/mexico_administracion_publica_federal.py @@ -9,7 +9,11 @@ class MexicoAdministracionPublicaFederal(SimpleSpider): """ - Bulk downloads: https://datos.gob.mx/busca/dataset/concentrado-de-contrataciones-abiertas-de-la-apf + Bulk download documentation + https://datos.gob.mx/busca/dataset/concentrado-de-contrataciones-abiertas-de-la-apf + Spider arguments + sample + Downloads the records on the first page of the list endpoint. """ name = 'mexico_administracion_publica_federal' data_type = 'record_package_list_in_results' diff --git a/kingfisher_scrapy/spiders/moldova.py b/kingfisher_scrapy/spiders/moldova.py index fa9134a8a..04d9698a5 100644 --- a/kingfisher_scrapy/spiders/moldova.py +++ b/kingfisher_scrapy/spiders/moldova.py @@ -5,6 +5,11 @@ class Moldova(SimpleSpider): + """ + Spider arguments + sample + Downloads the first page of records for each available endpoint (budgets, tenders). + """ name = 'moldova' data_type = 'record_package' diff --git a/kingfisher_scrapy/spiders/moldova_old.py b/kingfisher_scrapy/spiders/moldova_old.py index db50f1d71..938b1113e 100644 --- a/kingfisher_scrapy/spiders/moldova_old.py +++ b/kingfisher_scrapy/spiders/moldova_old.py @@ -3,6 +3,13 @@ class MoldovaOld(SimpleSpider): + """ + Bulk download documentation + http://opencontracting.date.gov.md/downloads + Spider arguments + sample + Downloads a single JSON file containing data for 2017. 
+ """ name = 'moldova_old' data_type = 'release_package' diff --git a/kingfisher_scrapy/spiders/scotland.py b/kingfisher_scrapy/spiders/scotland.py index e8cb5973e..b35330105 100644 --- a/kingfisher_scrapy/spiders/scotland.py +++ b/kingfisher_scrapy/spiders/scotland.py @@ -5,6 +5,13 @@ class Scotland(SimpleSpider): + """ + API documentation + https://api.publiccontractsscotland.gov.uk/v1 + Spider arguments + sample + Downloads packages for releases dated one year ago, for each notice type available. + """ name = 'scotland' data_type = 'release_package' diff --git a/kingfisher_scrapy/spiders/uk_contracts_finder.py b/kingfisher_scrapy/spiders/uk_contracts_finder.py index 07cf63842..5fd8407c7 100644 --- a/kingfisher_scrapy/spiders/uk_contracts_finder.py +++ b/kingfisher_scrapy/spiders/uk_contracts_finder.py @@ -5,6 +5,11 @@ class UKContractsFinder(SimpleSpider): + """ + Spider arguments + sample + Downloads the first page of release packages returned by the main endpoint. + """ name = 'uk_contracts_finder' data_type = 'release_package_list_in_results' encoding = 'iso-8859-1' diff --git a/kingfisher_scrapy/spiders/uk_fts.py b/kingfisher_scrapy/spiders/uk_fts.py index ca15b7ebc..a26c32d49 100644 --- a/kingfisher_scrapy/spiders/uk_fts.py +++ b/kingfisher_scrapy/spiders/uk_fts.py @@ -5,6 +5,11 @@ class UKContractsFinder(LinksSpider): + """ + Spider arguments + sample + Downloads the first release package returned by the main endpoint. + """ name = 'uk_fts' data_type = 'release_package_in_ocdsReleasePackage_in_list_in_results' next_page_formatter = staticmethod(parameters('cursor')) diff --git a/requirements.in b/requirements.in index cb4eca0d0..41899567b 100644 --- a/requirements.in +++ b/requirements.in @@ -3,8 +3,11 @@ ijson>=3 jsonpointer +jsonref +jsonschema rarfile requests +rfc3987 Scrapy scrapyd-client sentry-sdk diff --git a/requirements.txt b/requirements.txt index 0bea0b9df..65ebc1554 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ # # pip-compile # -attrs==19.3.0 # via automat, service-identity, twisted +attrs==19.3.0 # via automat, jsonschema, service-identity, twisted automat==0.8.0 # via twisted certifi==2019.11.28 # via requests, sentry-sdk cffi==1.13.2 # via cryptography @@ -15,8 +15,11 @@ cssselect==1.1.0 # via parsel, scrapy hyperlink==19.0.0 # via twisted idna==2.8 # via hyperlink, requests ijson==3.0.3 +importlib-metadata==1.6.1 # via jsonschema incremental==17.5.0 # via twisted jsonpointer==2.0 +jsonref==0.2 +jsonschema==3.2.0 lxml==4.4.2 # via parsel, scrapy parsel==1.5.2 # via scrapy protego==0.1.16 # via scrapy @@ -26,17 +29,20 @@ pycparser==2.19 # via cffi pydispatcher==2.0.5 # via scrapy pyhamcrest==1.9.0 # via twisted pyopenssl==19.1.0 # via scrapy +pyrsistent==0.16.0 # via jsonschema queuelib==1.5.0 # via scrapy rarfile==3.1 requests==2.22.0 +rfc3987==1.3.8 scrapy==1.8.0 scrapyd-client==1.1.0 sentry-sdk==0.14.4 service-identity==18.1.0 # via scrapy -six==1.13.0 # via automat, cryptography, parsel, protego, pyhamcrest, pyopenssl, scrapy, scrapyd-client, w3lib +six==1.13.0 # via automat, cryptography, jsonschema, parsel, protego, pyhamcrest, pyopenssl, pyrsistent, scrapy, scrapyd-client, w3lib twisted==20.3.0 # via scrapy urllib3==1.25.7 # via requests, sentry-sdk w3lib==1.21.0 # via parsel, scrapy +zipp==3.1.0 # via importlib-metadata zope.interface==4.7.1 # via scrapy, twisted # The following packages are considered to be unsafe in a requirements file: diff --git a/requirements_dev.txt b/requirements_dev.txt index 9f58a35a3..e010b745f 100644 --- 
diff --git a/requirements.in b/requirements.in
index cb4eca0d0..41899567b 100644
--- a/requirements.in
+++ b/requirements.in
@@ -3,8 +3,11 @@
 ijson>=3
 jsonpointer
+jsonref
+jsonschema
 rarfile
 requests
+rfc3987
 Scrapy
 scrapyd-client
 sentry-sdk
diff --git a/requirements.txt b/requirements.txt
index 0bea0b9df..65ebc1554 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,7 +4,7 @@
 #
 #    pip-compile
 #
-attrs==19.3.0             # via automat, service-identity, twisted
+attrs==19.3.0             # via automat, jsonschema, service-identity, twisted
 automat==0.8.0            # via twisted
 certifi==2019.11.28       # via requests, sentry-sdk
 cffi==1.13.2              # via cryptography
@@ -15,8 +15,11 @@ cssselect==1.1.0          # via parsel, scrapy
 hyperlink==19.0.0         # via twisted
 idna==2.8                 # via hyperlink, requests
 ijson==3.0.3
+importlib-metadata==1.6.1  # via jsonschema
 incremental==17.5.0       # via twisted
 jsonpointer==2.0
+jsonref==0.2
+jsonschema==3.2.0
 lxml==4.4.2               # via parsel, scrapy
 parsel==1.5.2             # via scrapy
 protego==0.1.16           # via scrapy
@@ -26,17 +29,20 @@ pycparser==2.19           # via cffi
 pydispatcher==2.0.5       # via scrapy
 pyhamcrest==1.9.0         # via twisted
 pyopenssl==19.1.0         # via scrapy
+pyrsistent==0.16.0        # via jsonschema
 queuelib==1.5.0           # via scrapy
 rarfile==3.1
 requests==2.22.0
+rfc3987==1.3.8
 scrapy==1.8.0
 scrapyd-client==1.1.0
 sentry-sdk==0.14.4
 service-identity==18.1.0  # via scrapy
-six==1.13.0               # via automat, cryptography, parsel, protego, pyhamcrest, pyopenssl, scrapy, scrapyd-client, w3lib
+six==1.13.0               # via automat, cryptography, jsonschema, parsel, protego, pyhamcrest, pyopenssl, pyrsistent, scrapy, scrapyd-client, w3lib
 twisted==20.3.0           # via scrapy
 urllib3==1.25.7           # via requests, sentry-sdk
 w3lib==1.21.0             # via parsel, scrapy
+zipp==3.1.0               # via importlib-metadata
 zope.interface==4.7.1     # via scrapy, twisted
 
 # The following packages are considered to be unsafe in a requirements file:
diff --git a/requirements_dev.txt b/requirements_dev.txt
index 9f58a35a3..e010b745f 100644
--- a/requirements_dev.txt
+++ b/requirements_dev.txt
@@ -21,13 +21,15 @@ flake8==3.7.9
 hyperlink==19.0.0
 idna==2.8
 ijson==3.0.3
-importlib-metadata==1.3.0  # via pluggy, pytest
+importlib-metadata==1.6.1
 incremental==17.5.0
 isort==4.3.21
 jsonpointer==2.0
+jsonref==0.2
+jsonschema==3.2.0
 lxml==4.4.2
 mccabe==0.6.1             # via flake8
-more-itertools==8.0.2     # via pytest, zipp
+more-itertools==8.0.2     # via pytest
 packaging==19.2           # via pytest
 parsel==1.5.2
 pip-tools==5.1.0
@@ -43,11 +45,13 @@ pyflakes==2.1.1           # via flake8
 pyhamcrest==1.9.0
 pyopenssl==19.1.0
 pyparsing==2.4.5          # via packaging
+pyrsistent==0.16.0
 pytest-cov==2.8.1
 pytest==5.3.2
 queuelib==1.5.0
 rarfile==3.1
 requests==2.22.0
+rfc3987==1.3.8
 scrapy==1.8.0
 scrapyd-client==1.1.0
 sentry-sdk==0.14.4
@@ -57,7 +61,7 @@ twisted==20.3.0
 urllib3==1.25.7
 w3lib==1.21.0
 wcwidth==0.1.7            # via pytest
-zipp==0.6.0               # via importlib-metadata
+zipp==3.1.0
 zope.interface==4.7.1
 
 # The following packages are considered to be unsafe in a requirements file:
diff --git a/setup.py b/setup.py
index ad683d5b7..784a06960 100644
--- a/setup.py
+++ b/setup.py
@@ -8,6 +8,10 @@
         'kingfisher_scrapy',
         'kingfisher_scrapy.spiders',
     ],
+    package_data={
+        'kingfisher_scrapy': ['item_schema/*.json'],
+    },
+    include_package_data=True,
     entry_points={
         'scrapy': [
             'settings = kingfisher_scrapy.settings',
diff --git a/tests/test_validate.py b/tests/test_validate.py
index a89fa4ba3..6233a1799 100644
--- a/tests/test_validate.py
+++ b/tests/test_validate.py
@@ -1,7 +1,7 @@
 import pytest
+from jsonschema import ValidationError
 
-from kingfisher_scrapy.exceptions import MissingRequiredFieldError
-from kingfisher_scrapy.items import File, FileItem
+from kingfisher_scrapy.items import File, FileError, FileItem
 from kingfisher_scrapy.pipelines import Validate
 from tests import spider_with_crawler
 
@@ -9,20 +9,85 @@ def test_process_item():
     pipeline = Validate()
     item = File({
-        'file_name': '',
-        'data': '',
-        'data_type': '',
-        'url': '',
+        'file_name': 'test',
+        'data': 'data',
+        'data_type': 'release_package',
+        'url': 'http://test.com',
     })
 
     assert pipeline.process_item(item, None) == item
 
+    item['data'] = item['data'].encode('ascii')
+    item['file_name'] = 'test2'
+
+    assert pipeline.process_item(item, None) == item
+
 
 def test_process_item_error():
     pipeline = Validate()
-    item = File()
+    item = File({
+        'data': 'data',
+        'data_type': 'release_package',
+        'url': 'http://test.com',
+    })
+
+    with pytest.raises(ValidationError):
+        pipeline.process_item(item, None)
+    item['file_name'] = 'test'
+    item['data_type'] = 'not a valid data type'
+    with pytest.raises(ValidationError):
+        pipeline.process_item(item, None)
+
+
+def test_process_file_item():
+    pipeline = Validate()
+    item = FileItem({
+        'file_name': 'test',
+        'data': 'data',
+        'data_type': 'release_package',
+        'url': 'http://test.com',
+        'number': 1
+    })
+    assert pipeline.process_item(item, None) == item
+
 
-    with pytest.raises(MissingRequiredFieldError):
+def test_process_file_item_error():
+    pipeline = Validate()
+    item = FileItem({
+        'file_name': 'test',
+        'data': 'data',
+        'data_type': 'release_package',
+        'url': 'http://test.com',
+        'number': "2"
+    })
+    with pytest.raises(ValidationError):
+        pipeline.process_item(item, None)
+    item['number'] = None
+    with pytest.raises(ValidationError):
+        pipeline.process_item(item, None)
+
+
+def test_process_file_error():
+    pipeline = Validate()
+    item = FileError({
+        'file_name': 'test',
+        'url': 'http://test.com',
+        'errors': 'Error'
+    })
+    assert pipeline.process_item(item, None) == item
+
+
+def test_process_file_item_error_error():
+    pipeline = Validate()
+    item = FileError({
+        'file_name': 'test',
+        'url': 'http://test.com'
+    })
+    with pytest.raises(ValidationError):
+        pipeline.process_item(item, None)
+    item['errors'] = 'Error'
+    item['url'] = 'not an url'
+    with pytest.raises(ValidationError):
         pipeline.process_item(item, None)
 
 
@@ -31,9 +96,9 @@ def test_duplicate_file(caplog):
     spider = spider_with_crawler()
     item = File({
         'file_name': 'test1',
-        'data': '',
-        'data_type': '',
-        'url': '',
+        'data': 'data',
+        'data_type': 'release_package',
+        'url': 'http://example.com',
     })
 
     pipeline.process_item(item, spider)
@@ -51,9 +116,9 @@ def test_duplicate_file_item(caplog):
     spider = spider_with_crawler()
     item = FileItem({
         'file_name': 'test1',
-        'data': '',
-        'data_type': '',
-        'url': '',
+        'data': 'data',
+        'data_type': 'release_package',
+        'url': 'http://example.com',
         'number': 1
     })
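
For orientation only, a hypothetical spider callback that yields a File accepted by the Validate pipeline; the inline comments restate the constraints from File.json and item.json earlier in this diff (the file name and data are invented):

    from kingfisher_scrapy.items import File


    def parse(self, response):  # hypothetical callback on a spider
        yield File({
            'file_name': 'all.json',          # "pattern": "^[^/]+$" — no slashes allowed
            'url': response.request.url,      # "format": "uri", checked via rfc3987
            'data': response.text,            # "minLength": 1
            'data_type': 'release_package',   # must be one of the enumerated data types
        })
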