From 9e9e6e32d0f763102fbb0099d24849bbf19a0967 Mon Sep 17 00:00:00 2001 From: Andres Aguilera Date: Fri, 29 May 2020 14:33:54 -0400 Subject: [PATCH 01/43] Add Afghanistan docstrings --- kingfisher_scrapy/spiders/afghanistan_records.py | 7 +++++++ kingfisher_scrapy/spiders/afghanistan_releases.py | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/kingfisher_scrapy/spiders/afghanistan_records.py b/kingfisher_scrapy/spiders/afghanistan_records.py index d2827f0b5..6596b9e51 100644 --- a/kingfisher_scrapy/spiders/afghanistan_records.py +++ b/kingfisher_scrapy/spiders/afghanistan_records.py @@ -7,6 +7,13 @@ class AfghanistanRecords(BaseSpider): + """ + API documentation + https://ocds.ageops.net/ + Spider arguments + sample + Download only 1 record. + """ name = 'afghanistan_records' download_delay = 1 diff --git a/kingfisher_scrapy/spiders/afghanistan_releases.py b/kingfisher_scrapy/spiders/afghanistan_releases.py index ec197c4fe..2983e84e3 100644 --- a/kingfisher_scrapy/spiders/afghanistan_releases.py +++ b/kingfisher_scrapy/spiders/afghanistan_releases.py @@ -7,6 +7,13 @@ class AfghanistanReleases(BaseSpider): + """ + API documentation + https://ocds.ageops.net/ + Spider arguments + sample + Download only 1 release. 
+ """ name = 'afghanistan_releases' download_delay = 1.5 From 9d5c2f0f5544961379822280e080ce0d28c5e6f8 Mon Sep 17 00:00:00 2001 From: Andres Aguilera Date: Fri, 29 May 2020 14:37:19 -0400 Subject: [PATCH 02/43] Add Argentina docstrings --- kingfisher_scrapy/spiders/argentina_buenos_aires.py | 4 ++-- kingfisher_scrapy/spiders/argentina_vialidad.py | 7 +++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/kingfisher_scrapy/spiders/argentina_buenos_aires.py b/kingfisher_scrapy/spiders/argentina_buenos_aires.py index a434a0164..a1e257eac 100644 --- a/kingfisher_scrapy/spiders/argentina_buenos_aires.py +++ b/kingfisher_scrapy/spiders/argentina_buenos_aires.py @@ -8,10 +8,10 @@ class ArgentinaBuenosAires(ZipSpider): """ - Bulk download documentation - https://data.buenosaires.gob.ar/dataset/buenos-aires-compras/archivo/2a3d077c-71b6-4ba7-8924-f3e38cf1b8fc API documentation https://data.buenosaires.gob.ar/acerca/ckan + Bulk download documentation + https://data.buenosaires.gob.ar/dataset/buenos-aires-compras/archivo/2a3d077c-71b6-4ba7-8924-f3e38cf1b8fc Spider arguments sample Downloads the zip file and sends 10 releases to kingfisher process. diff --git a/kingfisher_scrapy/spiders/argentina_vialidad.py b/kingfisher_scrapy/spiders/argentina_vialidad.py index 898b366dc..5b39ce66a 100644 --- a/kingfisher_scrapy/spiders/argentina_vialidad.py +++ b/kingfisher_scrapy/spiders/argentina_vialidad.py @@ -5,6 +5,13 @@ class ArgentinaVialidad(BaseSpider): + """ + API documentation + https://datosabiertos.vialidad.gob.ar/ui/index.html#!/datos_abiertos + Spider arguments + sample + Download one set of releases. 
+ """ name = 'argentina_vialidad' def start_requests(self): From cff3c1b180c05bb9bf7e2394c75f770542389daa Mon Sep 17 00:00:00 2001 From: Andres Aguilera Date: Fri, 29 May 2020 14:37:46 -0400 Subject: [PATCH 03/43] Add Canada docstrings --- kingfisher_scrapy/spiders/canada_buyandsell.py | 7 +++++++ kingfisher_scrapy/spiders/canada_montreal.py | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/kingfisher_scrapy/spiders/canada_buyandsell.py b/kingfisher_scrapy/spiders/canada_buyandsell.py index e63d2a93e..8583e9142 100644 --- a/kingfisher_scrapy/spiders/canada_buyandsell.py +++ b/kingfisher_scrapy/spiders/canada_buyandsell.py @@ -5,6 +5,13 @@ class CanadaBuyAndSell(BaseSpider): + """ + API documentation + https://buyandsell.gc.ca/procurement-data/open-contracting-data-standard-pilot/download-ocds-pilot-data + Spider arguments + sample + Download only one set of releases. + """ name = "canada_buyandsell" def start_requests(self): diff --git a/kingfisher_scrapy/spiders/canada_montreal.py b/kingfisher_scrapy/spiders/canada_montreal.py index 5953656ca..aaf464080 100644 --- a/kingfisher_scrapy/spiders/canada_montreal.py +++ b/kingfisher_scrapy/spiders/canada_montreal.py @@ -7,6 +7,13 @@ class CanadaMontreal(BaseSpider): + """ + API documentation + http://donnees.ville.montreal.qc.ca/dataset/contrats-et-subventions-api + Spider arguments + sample + Download only the first page. 
+ """ name = 'canada_montreal' page_limit = 10000 From c2fb9da86f560759997830a5c160e55c8cd55021 Mon Sep 17 00:00:00 2001 From: Andres Aguilera Date: Fri, 29 May 2020 14:38:09 -0400 Subject: [PATCH 04/43] Add Dominican Republic docstrings --- kingfisher_scrapy/spiders/dominican_republic.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kingfisher_scrapy/spiders/dominican_republic.py b/kingfisher_scrapy/spiders/dominican_republic.py index ac4a14e80..726076dce 100644 --- a/kingfisher_scrapy/spiders/dominican_republic.py +++ b/kingfisher_scrapy/spiders/dominican_republic.py @@ -9,6 +9,13 @@ class DominicanRepublic(BaseSpider): + """ + Bulk download documentation + https://www.dgcp.gob.do/estandar-mundial-ocds/ + Spider arguments + sample + Download only one set of releases. + """ name = 'dominican_republic' custom_settings = { 'DOWNLOAD_TIMEOUT': 360 From 9a5bcb27f5ef0692ef126bce1ccfc08135d3b5dc Mon Sep 17 00:00:00 2001 From: Andres Aguilera Date: Fri, 29 May 2020 14:38:28 -0400 Subject: [PATCH 05/43] Add France docstrings --- kingfisher_scrapy/spiders/france.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kingfisher_scrapy/spiders/france.py b/kingfisher_scrapy/spiders/france.py index 4d7fcc13f..2849036ac 100644 --- a/kingfisher_scrapy/spiders/france.py +++ b/kingfisher_scrapy/spiders/france.py @@ -8,6 +8,13 @@ class France(BaseSpider): + """ + Swagger API documentation + https://doc.data.gouv.fr/api/reference/ + Spider arguments + sample + Download one set of releases. 
+ """ name = "france" def start_requests(self): From 324278c178bfc1ea015dd5919f38eb02e969418c Mon Sep 17 00:00:00 2001 From: Andres Aguilera Date: Fri, 29 May 2020 14:38:56 -0400 Subject: [PATCH 06/43] Add Georgia docstrings --- kingfisher_scrapy/spiders/georgia_records.py | 7 +++++++ kingfisher_scrapy/spiders/georgia_releases.py | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/kingfisher_scrapy/spiders/georgia_records.py b/kingfisher_scrapy/spiders/georgia_records.py index fc257370c..543bade1a 100644 --- a/kingfisher_scrapy/spiders/georgia_records.py +++ b/kingfisher_scrapy/spiders/georgia_records.py @@ -4,6 +4,13 @@ class GeorgiaRecords(LinksSpider): + """ + Swagger API documentation + https://odapi.spa.ge/api/swagger.ui + Spider arguments + sample + Download one set of releases. + """ name = 'georgia_records' data_type = 'record_package' diff --git a/kingfisher_scrapy/spiders/georgia_releases.py b/kingfisher_scrapy/spiders/georgia_releases.py index fd0ed606a..b640acfde 100644 --- a/kingfisher_scrapy/spiders/georgia_releases.py +++ b/kingfisher_scrapy/spiders/georgia_releases.py @@ -4,6 +4,13 @@ class GeorgiaReleases(LinksSpider): + """ + Swagger API documentation + https://odapi.spa.ge/api/swagger.ui + Spider arguments + sample + Download one set of releases. 
+ """ name = 'georgia_releases' data_type = 'release_package' From a3078058e943ca795ead1c661b3103d0beb3840b Mon Sep 17 00:00:00 2001 From: Andres Aguilera Date: Fri, 29 May 2020 14:39:15 -0400 Subject: [PATCH 07/43] Add Honduras docstrings --- kingfisher_scrapy/spiders/honduras_cost.py | 5 +++++ kingfisher_scrapy/spiders/honduras_oncae.py | 7 +++++++ kingfisher_scrapy/spiders/honduras_portal_bulk_files.py | 7 +++++++ 3 files changed, 19 insertions(+) diff --git a/kingfisher_scrapy/spiders/honduras_cost.py b/kingfisher_scrapy/spiders/honduras_cost.py index 950c2e141..736922d26 100644 --- a/kingfisher_scrapy/spiders/honduras_cost.py +++ b/kingfisher_scrapy/spiders/honduras_cost.py @@ -7,6 +7,11 @@ class HondurasCoST(BaseSpider): + """ + Spider arguments + sample + Download only 20 records. + """ name = 'honduras_cost' start_urls = ['http://app.sisocs.org/protected/ocdsShow/'] diff --git a/kingfisher_scrapy/spiders/honduras_oncae.py b/kingfisher_scrapy/spiders/honduras_oncae.py index 7912b3728..2738592e5 100644 --- a/kingfisher_scrapy/spiders/honduras_oncae.py +++ b/kingfisher_scrapy/spiders/honduras_oncae.py @@ -7,6 +7,13 @@ class HondurasONCAE(ZipSpider): + """ + Bulk download documentation + http://oncae.gob.hn/datosabiertos + Spider arguments + sample + Download one set of releases. + """ name = 'honduras_oncae' start_urls = ['http://oncae.gob.hn/datosabiertos'] diff --git a/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py b/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py index 01efbed89..8f91c25e8 100644 --- a/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py +++ b/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py @@ -8,6 +8,13 @@ class HondurasPortalBulkFiles(BaseSpider): + """ + Bulk download documentation + http://www.contratacionesabiertas.gob.hn/descargas/ + Spider arguments + sample + Download one set of releases. 
+ """ name = 'honduras_portal_bulk_files' def start_requests(self): From 6256a144b13cbfef5c046ea57966d5a2270cda96 Mon Sep 17 00:00:00 2001 From: Andres Aguilera Date: Fri, 29 May 2020 14:40:31 -0400 Subject: [PATCH 08/43] Add Indonesia Bandung docstrings --- kingfisher_scrapy/spiders/indonesia_bandung.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kingfisher_scrapy/spiders/indonesia_bandung.py b/kingfisher_scrapy/spiders/indonesia_bandung.py index cec93b1e1..f2b3689ba 100644 --- a/kingfisher_scrapy/spiders/indonesia_bandung.py +++ b/kingfisher_scrapy/spiders/indonesia_bandung.py @@ -9,6 +9,11 @@ class IndonesiaBandung(BaseSpider): + """ + Spider arguments + sample + Download only 10 releases. + """ name = 'indonesia_bandung' def start_requests(self): From 8d2e64b34865cd5f142e455c17c8df1cae932c38 Mon Sep 17 00:00:00 2001 From: Andres Aguilera Date: Fri, 29 May 2020 14:41:14 -0400 Subject: [PATCH 09/43] Add Mexico Administracion Publica Federal docstrings --- .../spiders/mexico_administracion_publica_federal.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kingfisher_scrapy/spiders/mexico_administracion_publica_federal.py b/kingfisher_scrapy/spiders/mexico_administracion_publica_federal.py index 00536da39..428b859df 100644 --- a/kingfisher_scrapy/spiders/mexico_administracion_publica_federal.py +++ b/kingfisher_scrapy/spiders/mexico_administracion_publica_federal.py @@ -8,7 +8,11 @@ class MexicoAdministracionPublicaFederal(BaseSpider): """ - Bulk downloads: https://datos.gob.mx/busca/dataset/concentrado-de-contrataciones-abiertas-de-la-apf + Bulk download documentation + https://datos.gob.mx/busca/dataset/concentrado-de-contrataciones-abiertas-de-la-apf + Spider arguments + sample + Download only 100 records. 
""" name = 'mexico_administracion_publica_federal' From 2ab2460615448e320fe4d9a0529dd21163b4fbee Mon Sep 17 00:00:00 2001 From: Andres Aguilera Date: Fri, 29 May 2020 14:41:48 -0400 Subject: [PATCH 10/43] Add Moldova docstrings --- kingfisher_scrapy/spiders/moldova.py | 5 +++++ kingfisher_scrapy/spiders/moldova_old.py | 7 +++++++ 2 files changed, 12 insertions(+) diff --git a/kingfisher_scrapy/spiders/moldova.py b/kingfisher_scrapy/spiders/moldova.py index e87e9a88c..ae26bbea2 100644 --- a/kingfisher_scrapy/spiders/moldova.py +++ b/kingfisher_scrapy/spiders/moldova.py @@ -7,6 +7,11 @@ class Moldova(BaseSpider): + """ + Spider arguments + sample + Download only one set of records. + """ name = 'moldova' endpoints = {"budgets": "https://public.mtender.gov.md/budgets/", diff --git a/kingfisher_scrapy/spiders/moldova_old.py b/kingfisher_scrapy/spiders/moldova_old.py index 267536ed0..b2544fd85 100644 --- a/kingfisher_scrapy/spiders/moldova_old.py +++ b/kingfisher_scrapy/spiders/moldova_old.py @@ -5,6 +5,13 @@ class MoldovaOld(BaseSpider): + """ + Bulk download documentation + http://opencontracting.date.gov.md/downloads + Spider arguments + sample + Download only data released on 2017. + """ name = 'moldova_old' def start_requests(self): From 2c1833682aace4d3c8231d72c64666620cf67c54 Mon Sep 17 00:00:00 2001 From: Andres Aguilera Date: Fri, 29 May 2020 14:42:08 -0400 Subject: [PATCH 11/43] Add Scotland docstrings --- kingfisher_scrapy/spiders/scotland.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kingfisher_scrapy/spiders/scotland.py b/kingfisher_scrapy/spiders/scotland.py index 49eb7597c..08c53789a 100644 --- a/kingfisher_scrapy/spiders/scotland.py +++ b/kingfisher_scrapy/spiders/scotland.py @@ -7,6 +7,13 @@ class Scotland(BaseSpider): + """ + API documentation + https://api.publiccontractsscotland.gov.uk/v1 + Spider arguments + sample + Download only 21 releases. 
+ """ name = 'scotland' notice_types = [ From 2131641feee1d31e65019671c54d1acfe1e00e6c Mon Sep 17 00:00:00 2001 From: Andres Aguilera Date: Fri, 29 May 2020 14:42:25 -0400 Subject: [PATCH 12/43] Add UK docstrings --- kingfisher_scrapy/spiders/uk_contracts_finder.py | 5 +++++ kingfisher_scrapy/spiders/uk_fts.py | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/kingfisher_scrapy/spiders/uk_contracts_finder.py b/kingfisher_scrapy/spiders/uk_contracts_finder.py index d1629a754..cc16c00bb 100644 --- a/kingfisher_scrapy/spiders/uk_contracts_finder.py +++ b/kingfisher_scrapy/spiders/uk_contracts_finder.py @@ -7,6 +7,11 @@ class UKContractsFinder(BaseSpider): + """ + Spider arguments + sample + Download only 100 release packages. + """ name = 'uk_contracts_finder' base_url = 'https://www.contractsfinder.service.gov.uk/Published/Notices/OCDS/Search?order=asc&page=%d' diff --git a/kingfisher_scrapy/spiders/uk_fts.py b/kingfisher_scrapy/spiders/uk_fts.py index a876784ec..0f23b0a19 100644 --- a/kingfisher_scrapy/spiders/uk_fts.py +++ b/kingfisher_scrapy/spiders/uk_fts.py @@ -4,6 +4,11 @@ class UKContractsFinder(LinksSpider): + """ + Spider arguments + sample + Download only 1 release package. 
+ """ name = 'uk_fts' data_type = 'release_package_in_ocdsReleasePackage_in_list_in_results' From 48fbf86f6692d36b2d9d658bb54bbce1d3e1012e Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Thu, 18 Jun 2020 15:01:58 -0400 Subject: [PATCH 13/43] Update validation pipeline to use a json schema Signed-off-by: Yohanna Lisnichuk --- kingfisher_scrapy/exceptions.py | 4 - kingfisher_scrapy/item_schema.json | 137 +++++++++++++++++++++++++++++ kingfisher_scrapy/items.py | 16 +--- kingfisher_scrapy/pipelines.py | 19 +++- 4 files changed, 157 insertions(+), 19 deletions(-) create mode 100644 kingfisher_scrapy/item_schema.json diff --git a/kingfisher_scrapy/exceptions.py b/kingfisher_scrapy/exceptions.py index da83a3a5f..f69ab051e 100644 --- a/kingfisher_scrapy/exceptions.py +++ b/kingfisher_scrapy/exceptions.py @@ -8,7 +8,3 @@ class AuthenticationError(KingfisherScrapyError): class SpiderArgumentError(KingfisherScrapyError): """Raised when a spider argument's value is invalid""" - - -class MissingRequiredFieldError(KingfisherScrapyError, KeyError): - """Raised when an item is missing a required field""" diff --git a/kingfisher_scrapy/item_schema.json b/kingfisher_scrapy/item_schema.json new file mode 100644 index 000000000..f0eb76e2f --- /dev/null +++ b/kingfisher_scrapy/item_schema.json @@ -0,0 +1,137 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "title": "Schema Kingfisher Collect File, FileItem and FileError", + "description": "", + "oneOf": [ + { + "$ref": "#/definitions/File" + }, + { + "$ref": "#/definitions/FileItem" + }, + { + "$ref": "#/definitions/FileError" + } + ], + "definitions": + { + "File": { + "title": "File", + "type": "object", + "properties": { + "file_name": { + "type": "string", + "pattern": "^[^/]*$", + "minLength": 1 + }, + "url": { + "type": "string", + "format": "uri", + "minLength": 1 + }, + "data": { + "type": "string", + "minLength": 1 + }, + "data_type": { + "type": "string", + "enum": [ + "release_package", + 
"record_package" + ], + "minLength": 1 + }, + "encoding": { + "type": "string" + }, + "post_to_api": { + "type": "boolean" + }, + "path": { + "type": "string", + "title": "For the KingfisherProcessAPI extension to read the file." + }, + "files_store": { + "type": "string" + } + }, + "required": [ + "file_name", + "url", + "data", + "data_type" + ] + }, + "FileItem": { + "title": "File Item", + "type": "object", + "properties": { + "file_name": { + "type": "string", + "pattern": "^[^/]*$", + "minLength": 1 + }, + "url": { + "type": "string", + "format": "uri", + "minLength": 1 + + }, + "data": { + "type": "string", + "minLength": 1 + + }, + "data_type": { + "type": "string", + "enum": [ + "release_package", + "record_package" + ], + "minLength": 1 + }, + "encoding": { + "type": "string" + }, + "number": { + "type": "integer", + "minimum": 1 + } + }, + "required": [ + "number", + "file_name", + "url", + "data", + "data_type" + ] + }, + "FileError": { + "title": "File Error", + "type": "object", + "properties": { + "file_name": { + "type": "string", + "pattern": "^[^/]*$", + "minLength": 1 + }, + "url": { + "type": "string", + "format": "uri", + "minLength": 1 + + }, + "errors": { + "type": "string", + "minLength": 1 + + } + }, + "required": [ + "file_name", + "url", + "errors" + ] + } + } + } diff --git a/kingfisher_scrapy/items.py b/kingfisher_scrapy/items.py index a05d5c2bf..73da7550e 100644 --- a/kingfisher_scrapy/items.py +++ b/kingfisher_scrapy/items.py @@ -1,23 +1,13 @@ # https://docs.scrapy.org/en/latest/topics/items.html -import scrapy -from kingfisher_scrapy.exceptions import MissingRequiredFieldError +import scrapy class KingfisherItem(scrapy.Item): file_name = scrapy.Field() url = scrapy.Field() - - def validate(self): - """ - Raises an error if any required field is missing. 
- - :raises kingfisher_scrapy.extensions.MissingRequiredFieldError: if any required field is missing - """ - if hasattr(self, 'required'): - for field in self.required: - if field not in self: - raise MissingRequiredFieldError(field) + # indicate that this item should be validated against a schema + validate = True class File(KingfisherItem): diff --git a/kingfisher_scrapy/pipelines.py b/kingfisher_scrapy/pipelines.py index 2fc034159..d6338d9f1 100644 --- a/kingfisher_scrapy/pipelines.py +++ b/kingfisher_scrapy/pipelines.py @@ -1,12 +1,27 @@ # https://docs.scrapy.org/en/latest/topics/item-pipeline.html # https://docs.scrapy.org/en/latest/topics/signals.html#item-signals +import json +import os + +from jsonschema import FormatChecker +from jsonschema.validators import Draft4Validator class Validate: + def __init__(self): + here = os.path.dirname(os.path.abspath(__file__)) + filename = os.path.join(here, 'item_schema.json') + with open(filename) as f: + schema = json.load(f) + + self.validator = Draft4Validator(schema, format_checker=FormatChecker()) + def process_item(self, item, spider): if hasattr(item, 'validate'): # We call this in the item pipeline to guarantee that all items are validated. However, its backtrace isn't # as helpful for debugging, so we could also call it in ``BaseSpider`` if this becomes an issue. 
- item.validate() - + item_str = json.dumps(item.__dict__) + json_item = json.loads(item_str)['_values'] + print(json_item) + self.validator.validate(json_item) return item From 2b2be515c1f1e0c7f4639e2584edc8d91d4e47c3 Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Thu, 18 Jun 2020 15:36:59 -0400 Subject: [PATCH 14/43] Update test and json schema to include all data types Signed-off-by: Yohanna Lisnichuk --- kingfisher_scrapy/item_schema.json | 137 ---------------- kingfisher_scrapy/item_schema/dataType.csv | 16 ++ .../item_schema/item_schema.json | 147 ++++++++++++++++++ kingfisher_scrapy/pipelines.py | 2 +- tests/test_validate.py | 12 +- 5 files changed, 170 insertions(+), 144 deletions(-) delete mode 100644 kingfisher_scrapy/item_schema.json create mode 100644 kingfisher_scrapy/item_schema/dataType.csv create mode 100644 kingfisher_scrapy/item_schema/item_schema.json diff --git a/kingfisher_scrapy/item_schema.json b/kingfisher_scrapy/item_schema.json deleted file mode 100644 index f0eb76e2f..000000000 --- a/kingfisher_scrapy/item_schema.json +++ /dev/null @@ -1,137 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-04/schema#", - "title": "Schema Kingfisher Collect File, FileItem and FileError", - "description": "", - "oneOf": [ - { - "$ref": "#/definitions/File" - }, - { - "$ref": "#/definitions/FileItem" - }, - { - "$ref": "#/definitions/FileError" - } - ], - "definitions": - { - "File": { - "title": "File", - "type": "object", - "properties": { - "file_name": { - "type": "string", - "pattern": "^[^/]*$", - "minLength": 1 - }, - "url": { - "type": "string", - "format": "uri", - "minLength": 1 - }, - "data": { - "type": "string", - "minLength": 1 - }, - "data_type": { - "type": "string", - "enum": [ - "release_package", - "record_package" - ], - "minLength": 1 - }, - "encoding": { - "type": "string" - }, - "post_to_api": { - "type": "boolean" - }, - "path": { - "type": "string", - "title": "For the KingfisherProcessAPI extension to read the file." 
- }, - "files_store": { - "type": "string" - } - }, - "required": [ - "file_name", - "url", - "data", - "data_type" - ] - }, - "FileItem": { - "title": "File Item", - "type": "object", - "properties": { - "file_name": { - "type": "string", - "pattern": "^[^/]*$", - "minLength": 1 - }, - "url": { - "type": "string", - "format": "uri", - "minLength": 1 - - }, - "data": { - "type": "string", - "minLength": 1 - - }, - "data_type": { - "type": "string", - "enum": [ - "release_package", - "record_package" - ], - "minLength": 1 - }, - "encoding": { - "type": "string" - }, - "number": { - "type": "integer", - "minimum": 1 - } - }, - "required": [ - "number", - "file_name", - "url", - "data", - "data_type" - ] - }, - "FileError": { - "title": "File Error", - "type": "object", - "properties": { - "file_name": { - "type": "string", - "pattern": "^[^/]*$", - "minLength": 1 - }, - "url": { - "type": "string", - "format": "uri", - "minLength": 1 - - }, - "errors": { - "type": "string", - "minLength": 1 - - } - }, - "required": [ - "file_name", - "url", - "errors" - ] - } - } - } diff --git a/kingfisher_scrapy/item_schema/dataType.csv b/kingfisher_scrapy/item_schema/dataType.csv new file mode 100644 index 000000000..0e276db01 --- /dev/null +++ b/kingfisher_scrapy/item_schema/dataType.csv @@ -0,0 +1,16 @@ +Code,Title,Description +record,Record, +release,Release, +record_list,Record List, +release_list,Release List, +compiled_release,Compiled Release, +record_package,Record Package, +release_package,Release Package, +record_package_list,Record Package List +release_package_list,Release Package List, +record_package_list_in_results,Record Package List in Results, +release_package_list_in_results,Release Package List in Results, +release_package_json_lines,Release Package Json Lines, +record_package_json_lines,Record Package Json Lines, +release_package_in_ocdsReleasePackage_in_list_in_results,Release Package in ocdsReleasePackage in List in Results, +release_in_Release, Release in 
Release, diff --git a/kingfisher_scrapy/item_schema/item_schema.json b/kingfisher_scrapy/item_schema/item_schema.json new file mode 100644 index 000000000..b8285ccb9 --- /dev/null +++ b/kingfisher_scrapy/item_schema/item_schema.json @@ -0,0 +1,147 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "title": "Schema Kingfisher Collect File, FileItem and FileError", + "description": "", + "oneOf": [ + { + "$ref": "#/definitions/File" + }, + { + "$ref": "#/definitions/FileItem" + }, + { + "$ref": "#/definitions/FileError" + } + ], + "definitions": { + "File": { + "title": "File", + "type": "object", + "properties": { + "file_name": { + "type": "string", + "pattern": "^[^/]*$", + "minLength": 1 + }, + "url": { + "type": "string", + "format": "uri", + "minLength": 1 + }, + "data": { + "type": "string", + "minLength": 1 + }, + "data_type": { + "type": "string", + "enum": [ + "record", + "release", + "record_list", + "release_list", + "compiled_release", + "record_package", + "release_package", + "record_package_list", + "release_package_list", + "record_package_list_in_results", + "release_package_list_in_results", + "release_package_json_lines", + "record_package_json_lines", + "release_package_in_ocdsReleasePackage_in_list_in_results", + "release_in_Release" + ], + "openCodelist": false, + "codelist": "dataType.csv", + "minLength": 1 + }, + "encoding": { + "type": "string" + }, + "post_to_api": { + "type": "boolean" + }, + "path": { + "type": "string", + "title": "For the KingfisherProcessAPI extension to read the file." 
+ }, + "files_store": { + "type": "string" + } + }, + "required": [ + "file_name", + "url", + "data", + "data_type" + ] + }, + "FileItem": { + "title": "File Item", + "type": "object", + "properties": { + "file_name": { + "type": "string", + "pattern": "^[^/]*$", + "minLength": 1 + }, + "url": { + "type": "string", + "format": "uri", + "minLength": 1 + }, + "data": { + "type": "string", + "minLength": 1 + }, + "data_type": { + "type": "string", + "enum": [ + "release_package", + "record_package" + ], + "minLength": 1 + }, + "encoding": { + "type": "string" + }, + "number": { + "type": "integer", + "minimum": 1 + } + }, + "required": [ + "number", + "file_name", + "url", + "data", + "data_type" + ] + }, + "FileError": { + "title": "File Error", + "type": "object", + "properties": { + "file_name": { + "type": "string", + "pattern": "^[^/]*$", + "minLength": 1 + }, + "url": { + "type": "string", + "format": "uri", + "minLength": 1 + }, + "errors": { + "type": "string", + "minLength": 1 + } + }, + "required": [ + "file_name", + "url", + "errors" + ] + } + } +} diff --git a/kingfisher_scrapy/pipelines.py b/kingfisher_scrapy/pipelines.py index d6338d9f1..e737481e3 100644 --- a/kingfisher_scrapy/pipelines.py +++ b/kingfisher_scrapy/pipelines.py @@ -10,7 +10,7 @@ class Validate: def __init__(self): here = os.path.dirname(os.path.abspath(__file__)) - filename = os.path.join(here, 'item_schema.json') + filename = os.path.join(here, 'item_schema', 'item_schema.json') with open(filename) as f: schema = json.load(f) diff --git a/tests/test_validate.py b/tests/test_validate.py index 9ebac699c..da800debc 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -1,6 +1,6 @@ import pytest +from jsonschema import ValidationError -from kingfisher_scrapy.exceptions import MissingRequiredFieldError from kingfisher_scrapy.items import File from kingfisher_scrapy.pipelines import Validate @@ -8,10 +8,10 @@ def test_process_item(): pipeline = Validate() item = File({ - 
'file_name': '', - 'data': '', - 'data_type': '', - 'url': '', + 'file_name': 'test', + 'data': 'data', + 'data_type': 'release_package', + 'url': 'http://test.com', }) assert pipeline.process_item(item, None) == item @@ -21,5 +21,5 @@ def test_process_item_error(): pipeline = Validate() item = File() - with pytest.raises(MissingRequiredFieldError): + with pytest.raises(ValidationError): pipeline.process_item(item, None) From eb3482b5e5bb63e21daea206d2e02f2af7823a25 Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Thu, 18 Jun 2020 15:37:40 -0400 Subject: [PATCH 15/43] Add jsonschema to requirements Signed-off-by: Yohanna Lisnichuk --- requirements.in | 3 ++- requirements.txt | 8 ++++++-- requirements_dev.txt | 8 +++++--- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/requirements.in b/requirements.in index f791df779..0300c24d6 100644 --- a/requirements.in +++ b/requirements.in @@ -7,4 +7,5 @@ requests Scrapy scrapyd-client ijson>=3 -sentry-sdk \ No newline at end of file +sentry-sdk +jsonschema \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 0bea0b9df..39e3a499b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ # # pip-compile # -attrs==19.3.0 # via automat, service-identity, twisted +attrs==19.3.0 # via automat, jsonschema, service-identity, twisted automat==0.8.0 # via twisted certifi==2019.11.28 # via requests, sentry-sdk cffi==1.13.2 # via cryptography @@ -15,8 +15,10 @@ cssselect==1.1.0 # via parsel, scrapy hyperlink==19.0.0 # via twisted idna==2.8 # via hyperlink, requests ijson==3.0.3 +importlib-metadata==1.6.1 # via jsonschema incremental==17.5.0 # via twisted jsonpointer==2.0 +jsonschema==3.2.0 lxml==4.4.2 # via parsel, scrapy parsel==1.5.2 # via scrapy protego==0.1.16 # via scrapy @@ -26,6 +28,7 @@ pycparser==2.19 # via cffi pydispatcher==2.0.5 # via scrapy pyhamcrest==1.9.0 # via twisted pyopenssl==19.1.0 # via scrapy +pyrsistent==0.16.0 # via jsonschema queuelib==1.5.0 # via 
scrapy rarfile==3.1 requests==2.22.0 @@ -33,10 +36,11 @@ scrapy==1.8.0 scrapyd-client==1.1.0 sentry-sdk==0.14.4 service-identity==18.1.0 # via scrapy -six==1.13.0 # via automat, cryptography, parsel, protego, pyhamcrest, pyopenssl, scrapy, scrapyd-client, w3lib +six==1.13.0 # via automat, cryptography, jsonschema, parsel, protego, pyhamcrest, pyopenssl, pyrsistent, scrapy, scrapyd-client, w3lib twisted==20.3.0 # via scrapy urllib3==1.25.7 # via requests, sentry-sdk w3lib==1.21.0 # via parsel, scrapy +zipp==3.1.0 # via importlib-metadata zope.interface==4.7.1 # via scrapy, twisted # The following packages are considered to be unsafe in a requirements file: diff --git a/requirements_dev.txt b/requirements_dev.txt index 9f58a35a3..06b5256d0 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -21,13 +21,14 @@ flake8==3.7.9 hyperlink==19.0.0 idna==2.8 ijson==3.0.3 -importlib-metadata==1.3.0 # via pluggy, pytest +importlib-metadata==1.6.1 incremental==17.5.0 isort==4.3.21 jsonpointer==2.0 +jsonschema==3.2.0 lxml==4.4.2 mccabe==0.6.1 # via flake8 -more-itertools==8.0.2 # via pytest, zipp +more-itertools==8.0.2 # via pytest packaging==19.2 # via pytest parsel==1.5.2 pip-tools==5.1.0 @@ -43,6 +44,7 @@ pyflakes==2.1.1 # via flake8 pyhamcrest==1.9.0 pyopenssl==19.1.0 pyparsing==2.4.5 # via packaging +pyrsistent==0.16.0 pytest-cov==2.8.1 pytest==5.3.2 queuelib==1.5.0 @@ -57,7 +59,7 @@ twisted==20.3.0 urllib3==1.25.7 w3lib==1.21.0 wcwidth==0.1.7 # via pytest -zipp==0.6.0 # via importlib-metadata +zipp==3.1.0 zope.interface==4.7.1 # The following packages are considered to be unsafe in a requirements file: From 69f951066693a83091ea795143fbbaa2b3c8bf44 Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Thu, 18 Jun 2020 15:47:11 -0400 Subject: [PATCH 16/43] Update schema to re use definitions Signed-off-by: Yohanna Lisnichuk --- .../item_schema/item_schema.json | 47 ++++++++++--------- kingfisher_scrapy/pipelines.py | 1 - 2 files changed, 24 insertions(+), 24 
deletions(-) diff --git a/kingfisher_scrapy/item_schema/item_schema.json b/kingfisher_scrapy/item_schema/item_schema.json index b8285ccb9..441b934df 100644 --- a/kingfisher_scrapy/item_schema/item_schema.json +++ b/kingfisher_scrapy/item_schema/item_schema.json @@ -14,8 +14,8 @@ } ], "definitions": { - "File": { - "title": "File", + "KingfisherItem": { + "title": "Kingfisher Item", "type": "object", "properties": { "file_name": { @@ -27,7 +27,18 @@ "type": "string", "format": "uri", "minLength": 1 - }, + } + } + }, + "File": { + "allOf": [ + { + "$ref": "#/definitions/KingfisherItem" + } + ], + "title": "File", + "type": "object", + "properties": { "data": { "type": "string", "minLength": 1 @@ -77,19 +88,14 @@ ] }, "FileItem": { + "allOf": [ + { + "$ref": "#/definitions/KingfisherItem" + } + ], "title": "File Item", "type": "object", "properties": { - "file_name": { - "type": "string", - "pattern": "^[^/]*$", - "minLength": 1 - }, - "url": { - "type": "string", - "format": "uri", - "minLength": 1 - }, "data": { "type": "string", "minLength": 1 @@ -119,19 +125,14 @@ ] }, "FileError": { + "allOf": [ + { + "$ref": "#/definitions/KingfisherItem" + } + ], "title": "File Error", "type": "object", "properties": { - "file_name": { - "type": "string", - "pattern": "^[^/]*$", - "minLength": 1 - }, - "url": { - "type": "string", - "format": "uri", - "minLength": 1 - }, "errors": { "type": "string", "minLength": 1 diff --git a/kingfisher_scrapy/pipelines.py b/kingfisher_scrapy/pipelines.py index e737481e3..4fe2f5879 100644 --- a/kingfisher_scrapy/pipelines.py +++ b/kingfisher_scrapy/pipelines.py @@ -22,6 +22,5 @@ def process_item(self, item, spider): # as helpful for debugging, so we could also call it in ``BaseSpider`` if this becomes an issue. 
item_str = json.dumps(item.__dict__) json_item = json.loads(item_str)['_values'] - print(json_item) self.validator.validate(json_item) return item From 87f832584e71f9ca1b77c82548a168852eda4859 Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Thu, 18 Jun 2020 16:10:57 -0400 Subject: [PATCH 17/43] Add descriptions to schema codelist and correct format Signed-off-by: Yohanna Lisnichuk --- kingfisher_scrapy/item_schema/dataType.csv | 30 +++++++++++----------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/kingfisher_scrapy/item_schema/dataType.csv b/kingfisher_scrapy/item_schema/dataType.csv index 0e276db01..7d8eac30f 100644 --- a/kingfisher_scrapy/item_schema/dataType.csv +++ b/kingfisher_scrapy/item_schema/dataType.csv @@ -1,16 +1,16 @@ Code,Title,Description -record,Record, -release,Release, -record_list,Record List, -release_list,Release List, -compiled_release,Compiled Release, -record_package,Record Package, -release_package,Release Package, -record_package_list,Record Package List -release_package_list,Release Package List, -record_package_list_in_results,Record Package List in Results, -release_package_list_in_results,Release Package List in Results, -release_package_json_lines,Release Package Json Lines, -record_package_json_lines,Record Package Json Lines, -release_package_in_ocdsReleasePackage_in_list_in_results,Release Package in ocdsReleasePackage in List in Results, -release_in_Release, Release in Release, +record,Record,A record object +release,Release,A release object +record_list,Record List,An array of records +release_list,Release List,An array of releases +compiled_release,Compiled Release,A compiled release +record_package,Record Package,A record package +release_package,Release Package,A release package +record_package_list,Record Package List,An array of record packages +release_package_list,Release Package List,An array of release packages +record_package_list_in_results,Record Package List in Results,An array of record 
packages inside a results field +release_package_list_in_results,Release Package List in Results,An array of release packages inside a results field +release_package_json_lines,Release Package Json Lines,A json lines file with release packages +record_package_json_lines,Record Package Json Lines,A json lines file with record packages +release_package_in_ocdsReleasePackage_in_list_in_results,Release Package in ocdsReleasePackage in List in Results,A release package inside a ocdsReleasePackage object inside a results array +release_in_Release,Release in Release,A release in a Release object From f87d568f638e41741d5261d2c670e0ee4c5bd172 Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Thu, 18 Jun 2020 16:44:50 -0400 Subject: [PATCH 18/43] Update schema to refactor definitions and add titles and descriptions Signed-off-by: Yohanna Lisnichuk --- .../item_schema/item_schema.json | 85 ++++++++++++------- 1 file changed, 55 insertions(+), 30 deletions(-) diff --git a/kingfisher_scrapy/item_schema/item_schema.json b/kingfisher_scrapy/item_schema/item_schema.json index 441b934df..5b003a0ef 100644 --- a/kingfisher_scrapy/item_schema/item_schema.json +++ b/kingfisher_scrapy/item_schema/item_schema.json @@ -19,31 +19,33 @@ "type": "object", "properties": { "file_name": { + "title": "File Name", + "description": "File Name", "type": "string", "pattern": "^[^/]*$", "minLength": 1 }, "url": { + "title": "URL", + "description": "URL", "type": "string", "format": "uri", "minLength": 1 } } }, - "File": { + "KingfisherFileItem": { + "title": "Kingfisher Item", + "type": "object", "allOf": [ { "$ref": "#/definitions/KingfisherItem" } ], - "title": "File", - "type": "object", "properties": { - "data": { - "type": "string", - "minLength": 1 - }, "data_type": { + "title": "Data Type", + "description": "Data Type", "type": "string", "enum": [ "record", @@ -67,17 +69,53 @@ "minLength": 1 }, "encoding": { - "type": "string" + "title": "Encoding", + "description": "Encoding", + "type": 
[ + "string", + "null" + ] }, + "data": { + "title": "Data", + "description": "Data", + "type": "string", + "minLength": 1 + } + } + }, + "File": { + "allOf": [ + { + "$ref": "#/definitions/KingfisherFileItem" + } + ], + "title": "File", + "type": "object", + "properties": { "post_to_api": { - "type": "boolean" + "title": "Post to Api?", + "description": "Post to Api?", + "type": [ + "boolean", + "null" + ] }, "path": { - "type": "string", - "title": "For the KingfisherProcessAPI extension to read the file." + "description": "For the KingfisherProcessAPI extension to read the file.", + "type": [ + "string", + "null" + ], + "title": "Path" }, "files_store": { - "type": "string" + "title": "Files Store", + "description": "Files Store", + "type": [ + "string", + "null" + ] } }, "required": [ @@ -90,27 +128,12 @@ "FileItem": { "allOf": [ { - "$ref": "#/definitions/KingfisherItem" + "$ref": "#/definitions/KingfisherFileItem" } ], "title": "File Item", "type": "object", "properties": { - "data": { - "type": "string", - "minLength": 1 - }, - "data_type": { - "type": "string", - "enum": [ - "release_package", - "record_package" - ], - "minLength": 1 - }, - "encoding": { - "type": "string" - }, "number": { "type": "integer", "minimum": 1 @@ -125,15 +148,17 @@ ] }, "FileError": { + "title": "File Error", + "type": "object", "allOf": [ { "$ref": "#/definitions/KingfisherItem" } ], - "title": "File Error", - "type": "object", "properties": { "errors": { + "title": "Errors", + "description": "Errors", "type": "string", "minLength": 1 } From 28a0613dd0b0c02a074a535afabc2c70b85f87f5 Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Thu, 18 Jun 2020 16:59:46 -0400 Subject: [PATCH 19/43] Update schema descriptions Signed-off-by: Yohanna Lisnichuk --- .../item_schema/item_schema.json | 31 +++++++++---------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/kingfisher_scrapy/item_schema/item_schema.json b/kingfisher_scrapy/item_schema/item_schema.json index 
5b003a0ef..d25369946 100644 --- a/kingfisher_scrapy/item_schema/item_schema.json +++ b/kingfisher_scrapy/item_schema/item_schema.json @@ -1,7 +1,7 @@ { "$schema": "http://json-schema.org/draft-04/schema#", - "title": "Schema Kingfisher Collect File, FileItem and FileError", - "description": "", + "title": "Schema for Kingfisher Collect File, FileItem and FileError", + "description": "Schema for Kingfisher Collect File, FileItem and FileError", "oneOf": [ { "$ref": "#/definitions/File" @@ -16,6 +16,7 @@ "definitions": { "KingfisherItem": { "title": "Kingfisher Item", + "description": "A generic item with file_name and url to be extended by other items", "type": "object", "properties": { "file_name": { @@ -32,10 +33,15 @@ "format": "uri", "minLength": 1 } - } + }, + "required": [ + "file_name", + "url" + ] }, "KingfisherFileItem": { "title": "Kingfisher Item", + "description": "A base object to be extended by other File type items", "type": "object", "allOf": [ { @@ -91,6 +97,7 @@ } ], "title": "File", + "description": "A file object to be send to an API and/or saved to the disk", "type": "object", "properties": { "post_to_api": { @@ -117,13 +124,7 @@ "null" ] } - }, - "required": [ - "file_name", - "url", - "data", - "data_type" - ] + } }, "FileItem": { "allOf": [ @@ -132,6 +133,7 @@ } ], "title": "File Item", + "description": "A file item to be send to an API and not saved to the disk", "type": "object", "properties": { "number": { @@ -140,15 +142,12 @@ } }, "required": [ - "number", - "file_name", - "url", - "data", - "data_type" + "number" ] }, "FileError": { "title": "File Error", + "description": "An item to report and error", "type": "object", "allOf": [ { @@ -164,8 +163,6 @@ } }, "required": [ - "file_name", - "url", "errors" ] } From 6c45dfa9e8e86e4354191dc0f3e109817190bd3c Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Mon, 22 Jun 2020 11:46:05 -0400 Subject: [PATCH 20/43] Update item schema with required fields Signed-off-by: Yohanna Lisnichuk --- 
kingfisher_scrapy/item_schema/item_schema.json | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kingfisher_scrapy/item_schema/item_schema.json b/kingfisher_scrapy/item_schema/item_schema.json index d25369946..050a6d0d5 100644 --- a/kingfisher_scrapy/item_schema/item_schema.json +++ b/kingfisher_scrapy/item_schema/item_schema.json @@ -88,7 +88,11 @@ "type": "string", "minLength": 1 } - } + }, + "required": [ + "data", + "data_type" + ] }, "File": { "allOf": [ From b195529b20a557b3a9ff78f8188a198259cacafc Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Mon, 22 Jun 2020 11:51:27 -0400 Subject: [PATCH 21/43] Update item schema to include number field title and description Signed-off-by: Yohanna Lisnichuk --- kingfisher_scrapy/item_schema/item_schema.json | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kingfisher_scrapy/item_schema/item_schema.json b/kingfisher_scrapy/item_schema/item_schema.json index 050a6d0d5..26f61cb8a 100644 --- a/kingfisher_scrapy/item_schema/item_schema.json +++ b/kingfisher_scrapy/item_schema/item_schema.json @@ -141,6 +141,8 @@ "type": "object", "properties": { "number": { + "title": "Item number", + "description": "Item number", "type": "integer", "minimum": 1 } From d95c426f2d5507c59068e6f49bfe94b1377a9986 Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Tue, 23 Jun 2020 09:09:19 -0400 Subject: [PATCH 22/43] Add json schema validator requirements, rename schema Signed-off-by: Yohanna Lisnichuk --- kingfisher_scrapy/item_schema/dataType.csv | 16 --------- .../{item_schema.json => item.json} | 36 ++++--------------- requirements.in | 6 ++-- requirements.txt | 2 ++ requirements_dev.txt | 2 ++ 5 files changed, 15 insertions(+), 47 deletions(-) delete mode 100644 kingfisher_scrapy/item_schema/dataType.csv rename kingfisher_scrapy/item_schema/{item_schema.json => item.json} (83%) diff --git a/kingfisher_scrapy/item_schema/dataType.csv b/kingfisher_scrapy/item_schema/dataType.csv deleted file mode 100644 index 
7d8eac30f..000000000 --- a/kingfisher_scrapy/item_schema/dataType.csv +++ /dev/null @@ -1,16 +0,0 @@ -Code,Title,Description -record,Record,A record object -release,Release,A release object -record_list,Record List,An array of records -release_list,Release List,An array of releases -compiled_release,Compiled Release,A compiled release -record_package,Record Package,A record package -release_package,Release Package,A release package -record_package_list,Record Package List,An array of record packages -release_package_list,Release Package List,An array of release packages -record_package_list_in_results,Record Package List in Results,An array of record packages inside a results field -release_package_list_in_results,Release Package List in Results,An array of release packages inside a results field -release_package_json_lines,Release Package Json Lines,A json lines file with release packages -record_package_json_lines,Record Package Json Lines,A json lines file with record packages -release_package_in_ocdsReleasePackage_in_list_in_results,Release Package in ocdsReleasePackage in List in Results,A release package inside a ocdsReleasePackage object inside a results array -release_in_Release,Release in Release,A release in a Release object diff --git a/kingfisher_scrapy/item_schema/item_schema.json b/kingfisher_scrapy/item_schema/item.json similarity index 83% rename from kingfisher_scrapy/item_schema/item_schema.json rename to kingfisher_scrapy/item_schema/item.json index 26f61cb8a..03dfcfc98 100644 --- a/kingfisher_scrapy/item_schema/item_schema.json +++ b/kingfisher_scrapy/item_schema/item.json @@ -1,18 +1,5 @@ { "$schema": "http://json-schema.org/draft-04/schema#", - "title": "Schema for Kingfisher Collect File, FileItem and FileError", - "description": "Schema for Kingfisher Collect File, FileItem and FileError", - "oneOf": [ - { - "$ref": "#/definitions/File" - }, - { - "$ref": "#/definitions/FileItem" - }, - { - "$ref": "#/definitions/FileError" - } - ], 
"definitions": { "KingfisherItem": { "title": "Kingfisher Item", @@ -23,15 +10,13 @@ "title": "File Name", "description": "File Name", "type": "string", - "pattern": "^[^/]*$", - "minLength": 1 + "pattern": "^[^/]+$" }, "url": { "title": "URL", "description": "URL", "type": "string", - "format": "uri", - "minLength": 1 + "format": "uri" } }, "required": [ @@ -69,17 +54,13 @@ "record_package_json_lines", "release_package_in_ocdsReleasePackage_in_list_in_results", "release_in_Release" - ], - "openCodelist": false, - "codelist": "dataType.csv", - "minLength": 1 + ] }, "encoding": { "title": "Encoding", "description": "Encoding", "type": [ - "string", - "null" + "string" ] }, "data": { @@ -108,15 +89,13 @@ "title": "Post to Api?", "description": "Post to Api?", "type": [ - "boolean", - "null" + "boolean" ] }, "path": { "description": "For the KingfisherProcessAPI extension to read the file.", "type": [ - "string", - "null" + "string" ], "title": "Path" }, @@ -124,8 +103,7 @@ "title": "Files Store", "description": "Files Store", "type": [ - "string", - "null" + "string" ] } } diff --git a/requirements.in b/requirements.in index 0300c24d6..5a1de268a 100644 --- a/requirements.in +++ b/requirements.in @@ -2,10 +2,12 @@ # https://github.com/open-contracting/deploy/blob/master/salt/ocdskingfishercollect/scrapyd-requirements.txt jsonpointer +jsonref +jsonschema rarfile requests +rfc3987 Scrapy scrapyd-client ijson>=3 -sentry-sdk -jsonschema \ No newline at end of file +sentry-sdk \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 39e3a499b..65ebc1554 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,6 +18,7 @@ ijson==3.0.3 importlib-metadata==1.6.1 # via jsonschema incremental==17.5.0 # via twisted jsonpointer==2.0 +jsonref==0.2 jsonschema==3.2.0 lxml==4.4.2 # via parsel, scrapy parsel==1.5.2 # via scrapy @@ -32,6 +33,7 @@ pyrsistent==0.16.0 # via jsonschema queuelib==1.5.0 # via scrapy rarfile==3.1 requests==2.22.0 +rfc3987==1.3.8 
scrapy==1.8.0 scrapyd-client==1.1.0 sentry-sdk==0.14.4 diff --git a/requirements_dev.txt b/requirements_dev.txt index 06b5256d0..e010b745f 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -25,6 +25,7 @@ importlib-metadata==1.6.1 incremental==17.5.0 isort==4.3.21 jsonpointer==2.0 +jsonref==0.2 jsonschema==3.2.0 lxml==4.4.2 mccabe==0.6.1 # via flake8 @@ -50,6 +51,7 @@ pytest==5.3.2 queuelib==1.5.0 rarfile==3.1 requests==2.22.0 +rfc3987==1.3.8 scrapy==1.8.0 scrapyd-client==1.1.0 sentry-sdk==0.14.4 From a7b0ce18368c156aec1e53dadc48b8468b3c64af Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Tue, 23 Jun 2020 09:09:50 -0400 Subject: [PATCH 23/43] Add a schema file per item class Signed-off-by: Yohanna Lisnichuk --- kingfisher_scrapy/item_schema/file.json | 28 ++++++ kingfisher_scrapy/item_schema/file_error.json | 19 +++++ kingfisher_scrapy/item_schema/file_item.json | 19 +++++ kingfisher_scrapy/item_schema/item.json | 85 ------------------- 4 files changed, 66 insertions(+), 85 deletions(-) create mode 100644 kingfisher_scrapy/item_schema/file.json create mode 100644 kingfisher_scrapy/item_schema/file_error.json create mode 100644 kingfisher_scrapy/item_schema/file_item.json diff --git a/kingfisher_scrapy/item_schema/file.json b/kingfisher_scrapy/item_schema/file.json new file mode 100644 index 000000000..2f755d508 --- /dev/null +++ b/kingfisher_scrapy/item_schema/file.json @@ -0,0 +1,28 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "allOf": [ + { + "$ref": "item.json#/definitions/KingfisherFileItem" + } + ], + "title": "File", + "type": "object", + "properties": { + "post_to_api": { + "type": [ + "boolean" + ] + }, + "path": { + "description": "For the KingfisherProcessAPI extension to read the file.", + "type": [ + "string" + ] + }, + "files_store": { + "type": [ + "string" + ] + } + } +} \ No newline at end of file diff --git a/kingfisher_scrapy/item_schema/file_error.json b/kingfisher_scrapy/item_schema/file_error.json new file 
mode 100644 index 000000000..d0ba0ee35 --- /dev/null +++ b/kingfisher_scrapy/item_schema/file_error.json @@ -0,0 +1,19 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "title": "File Error", + "type": "object", + "allOf": [ + { + "$ref": "item.json#/definitions/KingfisherItem" + } + ], + "properties": { + "errors": { + "type": "string", + "minLength": 1 + } + }, + "required": [ + "errors" + ] +} diff --git a/kingfisher_scrapy/item_schema/file_item.json b/kingfisher_scrapy/item_schema/file_item.json new file mode 100644 index 000000000..4bbbc1199 --- /dev/null +++ b/kingfisher_scrapy/item_schema/file_item.json @@ -0,0 +1,19 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "allOf": [ + { + "$ref": "item.json#/definitions/KingfisherFileItem" + } + ], + "title": "File Item", + "type": "object", + "properties": { + "number": { + "type": "integer", + "minimum": 1 + } + }, + "required": [ + "number" + ] +} \ No newline at end of file diff --git a/kingfisher_scrapy/item_schema/item.json b/kingfisher_scrapy/item_schema/item.json index 03dfcfc98..1f3439078 100644 --- a/kingfisher_scrapy/item_schema/item.json +++ b/kingfisher_scrapy/item_schema/item.json @@ -7,14 +7,10 @@ "type": "object", "properties": { "file_name": { - "title": "File Name", - "description": "File Name", "type": "string", "pattern": "^[^/]+$" }, "url": { - "title": "URL", - "description": "URL", "type": "string", "format": "uri" } @@ -35,8 +31,6 @@ ], "properties": { "data_type": { - "title": "Data Type", - "description": "Data Type", "type": "string", "enum": [ "record", @@ -57,15 +51,11 @@ ] }, "encoding": { - "title": "Encoding", - "description": "Encoding", "type": [ "string" ] }, "data": { - "title": "Data", - "description": "Data", "type": "string", "minLength": 1 } @@ -74,81 +64,6 @@ "data", "data_type" ] - }, - "File": { - "allOf": [ - { - "$ref": "#/definitions/KingfisherFileItem" - } - ], - "title": "File", - "description": "A file object to be send to an API and/or 
saved to the disk", - "type": "object", - "properties": { - "post_to_api": { - "title": "Post to Api?", - "description": "Post to Api?", - "type": [ - "boolean" - ] - }, - "path": { - "description": "For the KingfisherProcessAPI extension to read the file.", - "type": [ - "string" - ], - "title": "Path" - }, - "files_store": { - "title": "Files Store", - "description": "Files Store", - "type": [ - "string" - ] - } - } - }, - "FileItem": { - "allOf": [ - { - "$ref": "#/definitions/KingfisherFileItem" - } - ], - "title": "File Item", - "description": "A file item to be send to an API and not saved to the disk", - "type": "object", - "properties": { - "number": { - "title": "Item number", - "description": "Item number", - "type": "integer", - "minimum": 1 - } - }, - "required": [ - "number" - ] - }, - "FileError": { - "title": "File Error", - "description": "An item to report and error", - "type": "object", - "allOf": [ - { - "$ref": "#/definitions/KingfisherItem" - } - ], - "properties": { - "errors": { - "title": "Errors", - "description": "Errors", - "type": "string", - "minLength": 1 - } - }, - "required": [ - "errors" - ] } } } From 1d95007dc4bd0309a6ae6537c1427d3563beb0e9 Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Tue, 23 Jun 2020 09:10:55 -0400 Subject: [PATCH 24/43] Update validation method to use a schema per item class Signed-off-by: Yohanna Lisnichuk --- kingfisher_scrapy/items.py | 22 ---------------------- kingfisher_scrapy/pipelines.py | 23 +++++++++++------------ 2 files changed, 11 insertions(+), 34 deletions(-) diff --git a/kingfisher_scrapy/items.py b/kingfisher_scrapy/items.py index 73da7550e..7ce0e95e5 100644 --- a/kingfisher_scrapy/items.py +++ b/kingfisher_scrapy/items.py @@ -6,7 +6,6 @@ class KingfisherItem(scrapy.Item): file_name = scrapy.Field() url = scrapy.Field() - # indicate that this item should be validated against a schema validate = True @@ -22,13 +21,6 @@ class File(KingfisherItem): path = scrapy.Field() files_store = 
scrapy.Field() - required = [ - 'file_name', - 'url', - 'data', - 'data_type', - ] - class FileItem(KingfisherItem): number = scrapy.Field() @@ -36,20 +28,6 @@ class FileItem(KingfisherItem): data_type = scrapy.Field() encoding = scrapy.Field() - required = [ - 'number', - 'file_name', - 'url', - 'data', - 'data_type', - ] - class FileError(KingfisherItem): errors = scrapy.Field() - - required = [ - 'file_name', - 'url', - 'errors', - ] diff --git a/kingfisher_scrapy/pipelines.py b/kingfisher_scrapy/pipelines.py index 4fe2f5879..a477bf14d 100644 --- a/kingfisher_scrapy/pipelines.py +++ b/kingfisher_scrapy/pipelines.py @@ -1,26 +1,25 @@ # https://docs.scrapy.org/en/latest/topics/item-pipeline.html # https://docs.scrapy.org/en/latest/topics/signals.html#item-signals -import json import os +import pathlib +import jsonref as jsonref from jsonschema import FormatChecker from jsonschema.validators import Draft4Validator class Validate: def __init__(self): - here = os.path.dirname(os.path.abspath(__file__)) - filename = os.path.join(here, 'item_schema', 'item_schema.json') - with open(filename) as f: - schema = json.load(f) - - self.validator = Draft4Validator(schema, format_checker=FormatChecker()) + self.validators = {} + schema_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'item_schema') + for item in ['file', 'file_error', 'file_item']: + filename = os.path.join(schema_path, f'{item}.json') + with open(filename) as f: + schema = jsonref.load(f, base_uri=pathlib.Path(os.path.join(schema_path), 'item_schema').as_uri()) + class_name = ''.join(word.title() for word in item.split('_')) + self.validators[class_name] = Draft4Validator(schema, format_checker=FormatChecker()) def process_item(self, item, spider): if hasattr(item, 'validate'): - # We call this in the item pipeline to guarantee that all items are validated. However, its backtrace isn't - # as helpful for debugging, so we could also call it in ``BaseSpider`` if this becomes an issue. 
- item_str = json.dumps(item.__dict__) - json_item = json.loads(item_str)['_values'] - self.validator.validate(json_item) + self.validators.get(item.__class__.__name__).validate(dict(item)) return item From d4079efc86c8c6cee870cfcd40d73b26b23422ac Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Tue, 23 Jun 2020 09:11:16 -0400 Subject: [PATCH 25/43] Add a test per item class Signed-off-by: Yohanna Lisnichuk --- tests/test_validate.py | 64 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 62 insertions(+), 2 deletions(-) diff --git a/tests/test_validate.py b/tests/test_validate.py index da800debc..2d90bb5a0 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -1,7 +1,7 @@ import pytest from jsonschema import ValidationError -from kingfisher_scrapy.items import File +from kingfisher_scrapy.items import File, FileItem, FileError from kingfisher_scrapy.pipelines import Validate @@ -19,7 +19,67 @@ def test_process_item(): def test_process_item_error(): pipeline = Validate() - item = File() + item = File({ + 'data': 'data', + 'data_type': 'release_package', + 'url': 'http://test.com', + }) + + with pytest.raises(ValidationError): + pipeline.process_item(item, None) + item['file_name'] = 'test' + item['data_type'] = 'not a valid data type' + with pytest.raises(ValidationError): + pipeline.process_item(item, None) + + +def test_process_file_item(): + pipeline = Validate() + item = FileItem({ + 'file_name': 'test', + 'data': 'data', + 'data_type': 'release_package', + 'url': 'http://test.com', + 'number': 1 + }) + assert pipeline.process_item(item, None) == item + + +def test_process_file_item_error(): + pipeline = Validate() + item = FileItem({ + 'file_name': 'test', + 'data': 'data', + 'data_type': 'release_package', + 'url': 'http://test.com', + 'number': "2" + }) + with pytest.raises(ValidationError): + pipeline.process_item(item, None) + item['number'] = None + with pytest.raises(ValidationError): + pipeline.process_item(item, None) + + 
+def test_process_file_error(): + pipeline = Validate() + item = FileError({ + 'file_name': 'test', + 'url': 'http://test.com', + 'errors': 'Error' + }) + assert pipeline.process_item(item, None) == item + +def test_process_file_item_error_error(): + pipeline = Validate() + item = FileError({ + 'file_name': 'test', + 'url': 'http://test.com' + }) + with pytest.raises(ValidationError): + pipeline.process_item(item, None) + item['errors'] = 'Error' + item['url'] = 'not an url' with pytest.raises(ValidationError): pipeline.process_item(item, None) From d7213d3e6b672f951177e2cf5320f1b525623216 Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Tue, 23 Jun 2020 09:20:08 -0400 Subject: [PATCH 26/43] isort test_validate.py Signed-off-by: Yohanna Lisnichuk --- tests/test_validate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_validate.py b/tests/test_validate.py index 2d90bb5a0..dd37f5b23 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -1,7 +1,7 @@ import pytest from jsonschema import ValidationError -from kingfisher_scrapy.items import File, FileItem, FileError +from kingfisher_scrapy.items import File, FileError, FileItem from kingfisher_scrapy.pipelines import Validate From 895e68ea69ff4db8fdde3c737f5547654e439d68 Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Tue, 23 Jun 2020 09:27:52 -0400 Subject: [PATCH 27/43] Json schemas correct indentation Signed-off-by: Yohanna Lisnichuk --- kingfisher_scrapy/item_schema/file.json | 2 +- kingfisher_scrapy/item_schema/file_item.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kingfisher_scrapy/item_schema/file.json b/kingfisher_scrapy/item_schema/file.json index 2f755d508..05918da3d 100644 --- a/kingfisher_scrapy/item_schema/file.json +++ b/kingfisher_scrapy/item_schema/file.json @@ -25,4 +25,4 @@ ] } } -} \ No newline at end of file +} diff --git a/kingfisher_scrapy/item_schema/file_item.json b/kingfisher_scrapy/item_schema/file_item.json 
index 4bbbc1199..ab1fcba8d 100644 --- a/kingfisher_scrapy/item_schema/file_item.json +++ b/kingfisher_scrapy/item_schema/file_item.json @@ -16,4 +16,4 @@ "required": [ "number" ] -} \ No newline at end of file +} From d64531135f1621966db625f4ddeadfeb02bdf17d Mon Sep 17 00:00:00 2001 From: aguilerapy <48607824+aguilerapy@users.noreply.github.com> Date: Tue, 23 Jun 2020 18:02:59 -0400 Subject: [PATCH 28/43] Update kingfisher_scrapy/spiders/afghanistan_records.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Romina Fernández --- kingfisher_scrapy/spiders/afghanistan_records.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kingfisher_scrapy/spiders/afghanistan_records.py b/kingfisher_scrapy/spiders/afghanistan_records.py index 728dba98c..8e15ef5af 100644 --- a/kingfisher_scrapy/spiders/afghanistan_records.py +++ b/kingfisher_scrapy/spiders/afghanistan_records.py @@ -12,7 +12,7 @@ class AfghanistanRecords(SimpleSpider): https://ocds.ageops.net/ Spider arguments sample - Download only 1 record. 
+ Downloads the first record returned by the record list endpoint """ name = 'afghanistan_records' data_type = 'record' From 0014f9b449be7e7d44696e71c0a7d5ce8e6243af Mon Sep 17 00:00:00 2001 From: aguilerapy <48607824+aguilerapy@users.noreply.github.com> Date: Tue, 23 Jun 2020 18:03:34 -0400 Subject: [PATCH 29/43] Update kingfisher_scrapy/spiders/canada_buyandsell.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Romina Fernández --- kingfisher_scrapy/spiders/canada_buyandsell.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kingfisher_scrapy/spiders/canada_buyandsell.py b/kingfisher_scrapy/spiders/canada_buyandsell.py index e6a4564b8..196ed8cbe 100644 --- a/kingfisher_scrapy/spiders/canada_buyandsell.py +++ b/kingfisher_scrapy/spiders/canada_buyandsell.py @@ -8,7 +8,7 @@ class CanadaBuyAndSell(SimpleSpider): https://buyandsell.gc.ca/procurement-data/open-contracting-data-standard-pilot/download-ocds-pilot-data Spider arguments sample - Download only one set of releases. 
+ Downloads a release package with data for the oldest fiscal year available (2013-2014) """ name = 'canada_buyandsell' data_type = 'release_package' From b6df6d491e616120340e76249a035efaf970d4c7 Mon Sep 17 00:00:00 2001 From: aguilerapy <48607824+aguilerapy@users.noreply.github.com> Date: Tue, 23 Jun 2020 18:03:51 -0400 Subject: [PATCH 30/43] Update kingfisher_scrapy/spiders/canada_montreal.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Romina Fernández --- kingfisher_scrapy/spiders/canada_montreal.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kingfisher_scrapy/spiders/canada_montreal.py b/kingfisher_scrapy/spiders/canada_montreal.py index ea9335eaf..61e6aaa1e 100644 --- a/kingfisher_scrapy/spiders/canada_montreal.py +++ b/kingfisher_scrapy/spiders/canada_montreal.py @@ -12,7 +12,7 @@ class CanadaMontreal(SimpleSpider): http://donnees.ville.montreal.qc.ca/dataset/contrats-et-subventions-api Spider arguments sample - Download only the first page. 
+ Downloads the first page of releases returned by the main endpoint """ name = 'canada_montreal' data_type = 'release_package' From 126c75458c10eba62e9dcd7bcaa901857473329b Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Wed, 24 Jun 2020 11:24:46 -0400 Subject: [PATCH 31/43] Update validate pipeline Signed-off-by: Yohanna Lisnichuk --- kingfisher_scrapy/item_schema/File.json | 1 - kingfisher_scrapy/pipelines.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/kingfisher_scrapy/item_schema/File.json b/kingfisher_scrapy/item_schema/File.json index 05918da3d..f07c411d3 100644 --- a/kingfisher_scrapy/item_schema/File.json +++ b/kingfisher_scrapy/item_schema/File.json @@ -14,7 +14,6 @@ ] }, "path": { - "description": "For the KingfisherProcessAPI extension to read the file.", "type": [ "string" ] diff --git a/kingfisher_scrapy/pipelines.py b/kingfisher_scrapy/pipelines.py index d0a3e912c..98e34cae7 100644 --- a/kingfisher_scrapy/pipelines.py +++ b/kingfisher_scrapy/pipelines.py @@ -16,11 +16,11 @@ def __init__(self): self.validators = {} self.files = set() self.file_items = set() - schema_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'item_schema') + schema_path = pathlib.Path(os.path.dirname(os.path.abspath(__file__)), 'item_schema') for item in ['File', 'FileError', 'FileItem']: filename = os.path.join(schema_path, f'{item}.json') with open(filename) as f: - schema = jsonref.load(f, base_uri=pathlib.Path(schema_path).as_uri()) + schema = jsonref.load(f, base_uri=pathlib.Path(schema_path, 'item_schema').as_uri()) self.validators[item] = Draft4Validator(schema, format_checker=FormatChecker()) def process_item(self, item, spider): From ef7ed84da0966ef36aec1b2e38a6c1603d4059f1 Mon Sep 17 00:00:00 2001 From: Andres Aguilera Date: Wed, 24 Jun 2020 11:25:29 -0400 Subject: [PATCH 32/43] Update changes --- kingfisher_scrapy/spiders/afghanistan_records.py | 2 +- kingfisher_scrapy/spiders/afghanistan_releases.py | 2 +- 
kingfisher_scrapy/spiders/argentina_vialidad.py | 2 +- kingfisher_scrapy/spiders/canada_buyandsell.py | 2 +- kingfisher_scrapy/spiders/canada_montreal.py | 2 +- kingfisher_scrapy/spiders/dominican_republic.py | 2 +- kingfisher_scrapy/spiders/france.py | 2 +- kingfisher_scrapy/spiders/georgia_records.py | 2 +- kingfisher_scrapy/spiders/georgia_releases.py | 2 +- kingfisher_scrapy/spiders/honduras_cost.py | 2 +- kingfisher_scrapy/spiders/honduras_oncae.py | 2 +- kingfisher_scrapy/spiders/honduras_portal_bulk_files.py | 2 +- kingfisher_scrapy/spiders/indonesia_bandung.py | 2 +- .../spiders/mexico_administracion_publica_federal.py | 2 +- kingfisher_scrapy/spiders/moldova.py | 2 +- kingfisher_scrapy/spiders/moldova_old.py | 2 +- kingfisher_scrapy/spiders/scotland.py | 2 +- kingfisher_scrapy/spiders/uk_contracts_finder.py | 2 +- kingfisher_scrapy/spiders/uk_fts.py | 2 +- 19 files changed, 19 insertions(+), 19 deletions(-) diff --git a/kingfisher_scrapy/spiders/afghanistan_records.py b/kingfisher_scrapy/spiders/afghanistan_records.py index 84878c56b..9e5060e7d 100644 --- a/kingfisher_scrapy/spiders/afghanistan_records.py +++ b/kingfisher_scrapy/spiders/afghanistan_records.py @@ -12,7 +12,7 @@ class AfghanistanRecords(SimpleSpider): https://ocds.ageops.net/ Spider arguments sample - Downloads the first record returned by the record list endpoint + Downloads the first record returned by the record list endpoint. """ name = 'afghanistan_records' data_type = 'record' diff --git a/kingfisher_scrapy/spiders/afghanistan_releases.py b/kingfisher_scrapy/spiders/afghanistan_releases.py index 1717983a1..7257cacaf 100644 --- a/kingfisher_scrapy/spiders/afghanistan_releases.py +++ b/kingfisher_scrapy/spiders/afghanistan_releases.py @@ -12,7 +12,7 @@ class AfghanistanReleases(SimpleSpider): https://ocds.ageops.net/ Spider arguments sample - Download only 1 release. + Downloads the first release returned by the release endpoint of the API. 
""" name = 'afghanistan_releases' data_type = 'release' diff --git a/kingfisher_scrapy/spiders/argentina_vialidad.py b/kingfisher_scrapy/spiders/argentina_vialidad.py index 743e67086..385ff6652 100644 --- a/kingfisher_scrapy/spiders/argentina_vialidad.py +++ b/kingfisher_scrapy/spiders/argentina_vialidad.py @@ -9,7 +9,7 @@ class ArgentinaVialidad(SimpleSpider): https://datosabiertos.vialidad.gob.ar/ui/index.html#!/datos_abiertos Spider arguments sample - Download one set of releases. + Ignored, data is downloaded from a single JSON file. """ name = 'argentina_vialidad' data_type = 'release_package_list' diff --git a/kingfisher_scrapy/spiders/canada_buyandsell.py b/kingfisher_scrapy/spiders/canada_buyandsell.py index 196ed8cbe..416f78d61 100644 --- a/kingfisher_scrapy/spiders/canada_buyandsell.py +++ b/kingfisher_scrapy/spiders/canada_buyandsell.py @@ -8,7 +8,7 @@ class CanadaBuyAndSell(SimpleSpider): https://buyandsell.gc.ca/procurement-data/open-contracting-data-standard-pilot/download-ocds-pilot-data Spider arguments sample - Downloads a release package with data for the oldest fiscal year available (2013-2014) + Downloads a release package with data for the oldest fiscal year available (2013-2014). """ name = 'canada_buyandsell' data_type = 'release_package' diff --git a/kingfisher_scrapy/spiders/canada_montreal.py b/kingfisher_scrapy/spiders/canada_montreal.py index cddfc87d7..39778c882 100644 --- a/kingfisher_scrapy/spiders/canada_montreal.py +++ b/kingfisher_scrapy/spiders/canada_montreal.py @@ -12,7 +12,7 @@ class CanadaMontreal(SimpleSpider): http://donnees.ville.montreal.qc.ca/dataset/contrats-et-subventions-api Spider arguments sample - Downloads the first page of releases returned by the main endpoint + Downloads the first page of releases returned by the main endpoint. 
""" name = 'canada_montreal' data_type = 'release_package' diff --git a/kingfisher_scrapy/spiders/dominican_republic.py b/kingfisher_scrapy/spiders/dominican_republic.py index deaedc633..85aa10b45 100644 --- a/kingfisher_scrapy/spiders/dominican_republic.py +++ b/kingfisher_scrapy/spiders/dominican_republic.py @@ -14,7 +14,7 @@ class DominicanRepublic(BaseSpider): https://www.dgcp.gob.do/estandar-mundial-ocds/ Spider arguments sample - Download only one set of releases. + Downloads a release package for the oldest year (2018, first link in the downloads page). """ name = 'dominican_republic' diff --git a/kingfisher_scrapy/spiders/france.py b/kingfisher_scrapy/spiders/france.py index e7875a3b8..bb3586589 100644 --- a/kingfisher_scrapy/spiders/france.py +++ b/kingfisher_scrapy/spiders/france.py @@ -12,7 +12,7 @@ class France(SimpleSpider): https://doc.data.gouv.fr/api/reference/ Spider arguments sample - Download one set of releases. + Downloads the first OCDS package found using the CKAN API. """ name = 'france' data_type = 'release_package' diff --git a/kingfisher_scrapy/spiders/georgia_records.py b/kingfisher_scrapy/spiders/georgia_records.py index 52595b629..aa5d28c00 100644 --- a/kingfisher_scrapy/spiders/georgia_records.py +++ b/kingfisher_scrapy/spiders/georgia_records.py @@ -10,7 +10,7 @@ class GeorgiaRecords(LinksSpider): https://odapi.spa.ge/api/swagger.ui Spider arguments sample - Download one set of releases. + Downloads the first page of packages returned by the record list endpoint. """ name = 'georgia_records' data_type = 'record_package' diff --git a/kingfisher_scrapy/spiders/georgia_releases.py b/kingfisher_scrapy/spiders/georgia_releases.py index c7da12b36..6f10324d8 100644 --- a/kingfisher_scrapy/spiders/georgia_releases.py +++ b/kingfisher_scrapy/spiders/georgia_releases.py @@ -10,7 +10,7 @@ class GeorgiaReleases(LinksSpider): https://odapi.spa.ge/api/swagger.ui Spider arguments sample - Download one set of releases. 
+ Downloads the first page of packages returned by the release list endpoint. """ name = 'georgia_releases' data_type = 'release_package' diff --git a/kingfisher_scrapy/spiders/honduras_cost.py b/kingfisher_scrapy/spiders/honduras_cost.py index af8c62cd5..d58c9ad45 100644 --- a/kingfisher_scrapy/spiders/honduras_cost.py +++ b/kingfisher_scrapy/spiders/honduras_cost.py @@ -7,7 +7,7 @@ class HondurasCoST(SimpleSpider): """ Spider arguments sample - Download only 20 records. + Ignored, a single file is downloaded. """ name = 'honduras_cost' data_type = 'record_package' diff --git a/kingfisher_scrapy/spiders/honduras_oncae.py b/kingfisher_scrapy/spiders/honduras_oncae.py index b8b4addfc..8ee680ac5 100644 --- a/kingfisher_scrapy/spiders/honduras_oncae.py +++ b/kingfisher_scrapy/spiders/honduras_oncae.py @@ -10,7 +10,7 @@ class HondurasONCAE(ZipSpider): http://oncae.gob.hn/datosabiertos Spider arguments sample - Download one set of releases. + Downloads the first package listed on the downloads page. """ name = 'honduras_oncae' data_type = 'release_package' diff --git a/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py b/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py index 8304993e0..2ba869669 100644 --- a/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py +++ b/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py @@ -12,7 +12,7 @@ class HondurasPortalBulkFiles(SimpleSpider): http://www.contratacionesabiertas.gob.hn/descargas/ Spider arguments sample - Download one set of releases. + Downloads the first package listed in http://www.contratacionesabiertas.gob.hn/api/v1/descargas/?format=json. 
""" name = 'honduras_portal_bulk_files' data_type = 'release_package' diff --git a/kingfisher_scrapy/spiders/indonesia_bandung.py b/kingfisher_scrapy/spiders/indonesia_bandung.py index 430266e31..736549691 100644 --- a/kingfisher_scrapy/spiders/indonesia_bandung.py +++ b/kingfisher_scrapy/spiders/indonesia_bandung.py @@ -9,7 +9,7 @@ class IndonesiaBandung(BaseSpider): """ Spider arguments sample - Download only 10 releases. + Downloads the first release listed for 2013. """ name = 'indonesia_bandung' diff --git a/kingfisher_scrapy/spiders/mexico_administracion_publica_federal.py b/kingfisher_scrapy/spiders/mexico_administracion_publica_federal.py index 59054cb5c..5183c5fe0 100644 --- a/kingfisher_scrapy/spiders/mexico_administracion_publica_federal.py +++ b/kingfisher_scrapy/spiders/mexico_administracion_publica_federal.py @@ -13,7 +13,7 @@ class MexicoAdministracionPublicaFederal(SimpleSpider): https://datos.gob.mx/busca/dataset/concentrado-de-contrataciones-abiertas-de-la-apf Spider arguments sample - Download only 100 records. + Downloads the records on the first page of the list endpoint. """ name = 'mexico_administracion_publica_federal' data_type = 'record_package_list_in_results' diff --git a/kingfisher_scrapy/spiders/moldova.py b/kingfisher_scrapy/spiders/moldova.py index 467755dd7..04d9698a5 100644 --- a/kingfisher_scrapy/spiders/moldova.py +++ b/kingfisher_scrapy/spiders/moldova.py @@ -8,7 +8,7 @@ class Moldova(SimpleSpider): """ Spider arguments sample - Download only one set of records. + Downloads the first page of records for each available endpoint (budgets, tenders). 
""" name = 'moldova' data_type = 'record_package' diff --git a/kingfisher_scrapy/spiders/moldova_old.py b/kingfisher_scrapy/spiders/moldova_old.py index db6ac689a..938b1113e 100644 --- a/kingfisher_scrapy/spiders/moldova_old.py +++ b/kingfisher_scrapy/spiders/moldova_old.py @@ -8,7 +8,7 @@ class MoldovaOld(SimpleSpider): http://opencontracting.date.gov.md/downloads Spider arguments sample - Download only data released on 2017. + Downloads a single JSON file containing data for 2017. """ name = 'moldova_old' data_type = 'release_package' diff --git a/kingfisher_scrapy/spiders/scotland.py b/kingfisher_scrapy/spiders/scotland.py index e054f5213..b35330105 100644 --- a/kingfisher_scrapy/spiders/scotland.py +++ b/kingfisher_scrapy/spiders/scotland.py @@ -10,7 +10,7 @@ class Scotland(SimpleSpider): https://api.publiccontractsscotland.gov.uk/v1 Spider arguments sample - Download only 21 releases. + Downloads packages for releases dated one year ago, for each notice type available. """ name = 'scotland' data_type = 'release_package' diff --git a/kingfisher_scrapy/spiders/uk_contracts_finder.py b/kingfisher_scrapy/spiders/uk_contracts_finder.py index e8632ba8f..5fd8407c7 100644 --- a/kingfisher_scrapy/spiders/uk_contracts_finder.py +++ b/kingfisher_scrapy/spiders/uk_contracts_finder.py @@ -8,7 +8,7 @@ class UKContractsFinder(SimpleSpider): """ Spider arguments sample - Download only 100 release packages. + Downloads the first page of release packages returned by the main endpoint. """ name = 'uk_contracts_finder' data_type = 'release_package_list_in_results' diff --git a/kingfisher_scrapy/spiders/uk_fts.py b/kingfisher_scrapy/spiders/uk_fts.py index d5073e49f..a26c32d49 100644 --- a/kingfisher_scrapy/spiders/uk_fts.py +++ b/kingfisher_scrapy/spiders/uk_fts.py @@ -8,7 +8,7 @@ class UKContractsFinder(LinksSpider): """ Spider arguments sample - Download only 1 release package. + Downloads the first release package returned by the main endpoint. 
""" name = 'uk_fts' data_type = 'release_package_in_ocdsReleasePackage_in_list_in_results' From 633bc1cca0f3d363cc72b8a4ea714549eccf66da Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Wed, 24 Jun 2020 11:33:39 -0400 Subject: [PATCH 33/43] isort test_validate.py Signed-off-by: Yohanna Lisnichuk --- tests/test_validate.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_validate.py b/tests/test_validate.py index 8675b6aae..9dcbca083 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -1,8 +1,7 @@ import pytest from jsonschema import ValidationError -from kingfisher_scrapy.items import File, FileItem -from kingfisher_scrapy.items import FileError +from kingfisher_scrapy.items import File, FileError, FileItem from kingfisher_scrapy.pipelines import Validate from tests import spider_with_crawler From be2da56988e5e7d0c52da933581dd50bb2de12b0 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Wed, 24 Jun 2020 12:17:59 -0400 Subject: [PATCH 34/43] Remove remaining title/description metadata properties. Change "type" values from arrays to strings. 
--- kingfisher_scrapy/item_schema/File.json | 13 +++---------- kingfisher_scrapy/item_schema/FileError.json | 3 +-- kingfisher_scrapy/item_schema/FileItem.json | 1 - kingfisher_scrapy/item_schema/item.json | 10 ++-------- 4 files changed, 6 insertions(+), 21 deletions(-) diff --git a/kingfisher_scrapy/item_schema/File.json b/kingfisher_scrapy/item_schema/File.json index f07c411d3..c726798e4 100644 --- a/kingfisher_scrapy/item_schema/File.json +++ b/kingfisher_scrapy/item_schema/File.json @@ -5,23 +5,16 @@ "$ref": "item.json#/definitions/KingfisherFileItem" } ], - "title": "File", "type": "object", "properties": { "post_to_api": { - "type": [ - "boolean" - ] + "type": "boolean" }, "path": { - "type": [ - "string" - ] + "type": "string" }, "files_store": { - "type": [ - "string" - ] + "type": "string" } } } diff --git a/kingfisher_scrapy/item_schema/FileError.json b/kingfisher_scrapy/item_schema/FileError.json index d0ba0ee35..8f1b935d0 100644 --- a/kingfisher_scrapy/item_schema/FileError.json +++ b/kingfisher_scrapy/item_schema/FileError.json @@ -1,12 +1,11 @@ { "$schema": "http://json-schema.org/draft-04/schema#", - "title": "File Error", - "type": "object", "allOf": [ { "$ref": "item.json#/definitions/KingfisherItem" } ], + "type": "object", "properties": { "errors": { "type": "string", diff --git a/kingfisher_scrapy/item_schema/FileItem.json b/kingfisher_scrapy/item_schema/FileItem.json index ab1fcba8d..3e49413fb 100644 --- a/kingfisher_scrapy/item_schema/FileItem.json +++ b/kingfisher_scrapy/item_schema/FileItem.json @@ -5,7 +5,6 @@ "$ref": "item.json#/definitions/KingfisherFileItem" } ], - "title": "File Item", "type": "object", "properties": { "number": { diff --git a/kingfisher_scrapy/item_schema/item.json b/kingfisher_scrapy/item_schema/item.json index 1f3439078..9d96c7f10 100644 --- a/kingfisher_scrapy/item_schema/item.json +++ b/kingfisher_scrapy/item_schema/item.json @@ -2,8 +2,6 @@ "$schema": "http://json-schema.org/draft-04/schema#", "definitions": { 
"KingfisherItem": { - "title": "Kingfisher Item", - "description": "A generic item with file_name and url to be extended by other items", "type": "object", "properties": { "file_name": { @@ -21,14 +19,12 @@ ] }, "KingfisherFileItem": { - "title": "Kingfisher Item", - "description": "A base object to be extended by other File type items", - "type": "object", "allOf": [ { "$ref": "#/definitions/KingfisherItem" } ], + "type": "object", "properties": { "data_type": { "type": "string", @@ -51,9 +47,7 @@ ] }, "encoding": { - "type": [ - "string" - ] + "type": "string" }, "data": { "type": "string", From 74ac039f8b6cb91f9385ecfd9ec651cda2efa9e9 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Wed, 24 Jun 2020 12:24:45 -0400 Subject: [PATCH 35/43] pipelines: Add trailing slash to URI so that last component is not removed during dereferencing --- kingfisher_scrapy/pipelines.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kingfisher_scrapy/pipelines.py b/kingfisher_scrapy/pipelines.py index 98e34cae7..68172e696 100644 --- a/kingfisher_scrapy/pipelines.py +++ b/kingfisher_scrapy/pipelines.py @@ -17,10 +17,10 @@ def __init__(self): self.files = set() self.file_items = set() schema_path = pathlib.Path(os.path.dirname(os.path.abspath(__file__)), 'item_schema') - for item in ['File', 'FileError', 'FileItem']: + for item in ('File', 'FileError', 'FileItem'): filename = os.path.join(schema_path, f'{item}.json') with open(filename) as f: - schema = jsonref.load(f, base_uri=pathlib.Path(schema_path, 'item_schema').as_uri()) + schema = jsonref.load(f, base_uri=schema_path.as_uri() + '/') self.validators[item] = Draft4Validator(schema, format_checker=FormatChecker()) def process_item(self, item, spider): From f690412b8909bbeba5e28c3e7a9941ef0b9daeb8 Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Thu, 25 Jun 2020 11:40:07 -0400 Subject: [PATCH 36/43] Remove string type from schema Signed-off-by: Yohanna 
Lisnichuk --- kingfisher_scrapy/item_schema/item.json | 2 -- 1 file changed, 2 deletions(-) diff --git a/kingfisher_scrapy/item_schema/item.json b/kingfisher_scrapy/item_schema/item.json index 9d96c7f10..f2d953276 100644 --- a/kingfisher_scrapy/item_schema/item.json +++ b/kingfisher_scrapy/item_schema/item.json @@ -50,8 +50,6 @@ "type": "string" }, "data": { - "type": "string", - "minLength": 1 } }, "required": [ From 726196012d43d8c1c4e43d967ffceffc1301c2b1 Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Thu, 25 Jun 2020 11:40:25 -0400 Subject: [PATCH 37/43] Fix colombia file name and check parameters Signed-off-by: Yohanna Lisnichuk --- kingfisher_scrapy/spiders/colombia.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/kingfisher_scrapy/spiders/colombia.py b/kingfisher_scrapy/spiders/colombia.py index dbac11d97..cda2d640a 100644 --- a/kingfisher_scrapy/spiders/colombia.py +++ b/kingfisher_scrapy/spiders/colombia.py @@ -29,14 +29,21 @@ class Colombia(LinksSpider): If ``from_date`` is provided and ``until_date`` don't, defaults to today. 
""" name = 'colombia' - next_page_formatter = staticmethod(parameters('page')) + next_page_formatter = staticmethod(parameters('_id')) default_from_date = '2011-01-01' + @classmethod + def from_crawler(cls, crawler, *args, **kwargs): + spider = super(Colombia, cls).from_crawler(crawler, date_format='date', *args, **kwargs) + if (spider.from_date or spider.until_date) and hasattr(spider, 'year'): + raise scrapy.exceptions.CloseSpider('The use of from and/or until with year parameter is not supported') + return spider + def start_requests(self): base_url = 'https://apiocds.colombiacompra.gov.co:8443/apiCCE2.0/rest/releases' if hasattr(self, 'year'): base_url += f'/page/{int(self.year)}' - if self.from_date or self.until_date: + elif self.from_date or self.until_date: from_date = self.from_date.strftime(self.date_format) until_date = self.until_date.strftime(self.date_format) base_url += f'/dates/{from_date}/{until_date}' From cb3b85abe9f39e5d9cdd30ec3792a8fc70ce4ca2 Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Thu, 25 Jun 2020 11:42:37 -0400 Subject: [PATCH 38/43] Correct item schema indent Signed-off-by: Yohanna Lisnichuk --- kingfisher_scrapy/item_schema/item.json | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kingfisher_scrapy/item_schema/item.json b/kingfisher_scrapy/item_schema/item.json index f2d953276..3ace3c218 100644 --- a/kingfisher_scrapy/item_schema/item.json +++ b/kingfisher_scrapy/item_schema/item.json @@ -49,8 +49,7 @@ "encoding": { "type": "string" }, - "data": { - } + "data": {} }, "required": [ "data", From bd3ec1707873074c91f369c244b2b2f8a2dd341b Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Thu, 25 Jun 2020 12:48:03 -0400 Subject: [PATCH 39/43] Update colombia parameters validation message Signed-off-by: Yohanna Lisnichuk --- kingfisher_scrapy/spiders/colombia.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kingfisher_scrapy/spiders/colombia.py 
b/kingfisher_scrapy/spiders/colombia.py index cda2d640a..e4eb357de 100644 --- a/kingfisher_scrapy/spiders/colombia.py +++ b/kingfisher_scrapy/spiders/colombia.py @@ -34,9 +34,10 @@ class Colombia(LinksSpider): @classmethod def from_crawler(cls, crawler, *args, **kwargs): - spider = super(Colombia, cls).from_crawler(crawler, date_format='date', *args, **kwargs) + spider = super().from_crawler(crawler, *args, **kwargs) if (spider.from_date or spider.until_date) and hasattr(spider, 'year'): - raise scrapy.exceptions.CloseSpider('The use of from and/or until with year parameter is not supported') + raise scrapy.exceptions.CloseSpider('You cannot specify both a year spider argument and ' + 'from_date/until_date spider argument(s).') return spider def start_requests(self): From d5c3a6ebe257cfc03bac7ef5449dd397db708184 Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Thu, 25 Jun 2020 12:48:34 -0400 Subject: [PATCH 40/43] Add minLength to data field in item schema Signed-off-by: Yohanna Lisnichuk --- kingfisher_scrapy/item_schema/item.json | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kingfisher_scrapy/item_schema/item.json b/kingfisher_scrapy/item_schema/item.json index 3ace3c218..7ded5a014 100644 --- a/kingfisher_scrapy/item_schema/item.json +++ b/kingfisher_scrapy/item_schema/item.json @@ -49,7 +49,9 @@ "encoding": { "type": "string" }, - "data": {} + "data": { + "minLength": 1 + } }, "required": [ "data", From 8248c3dedb4a7eae05c861772fc18aee48cc346f Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Thu, 25 Jun 2020 12:49:05 -0400 Subject: [PATCH 41/43] Add test for items with bytes Signed-off-by: Yohanna Lisnichuk --- tests/test_validate.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/test_validate.py b/tests/test_validate.py index 9dcbca083..6233a1799 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -17,6 +17,11 @@ def test_process_item(): assert pipeline.process_item(item, None) == item + 
item['data'] = item['data'].encode('ascii') + item['file_name'] = 'test2' + + assert pipeline.process_item(item, None) == item + def test_process_item_error(): pipeline = Validate() From 9cfd3176952ce92d331963174ec30b5971920943 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Thu, 25 Jun 2020 16:57:44 -0400 Subject: [PATCH 42/43] Add minimal MANIFEST.in to include JSON files --- MANIFEST.in | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 MANIFEST.in diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 000000000..973bfe0d9 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,3 @@ +include scrapy.cfg +recursive-include kingfisher_scrapy *.json +recursive-include kingfisher_scrapy *.py From e9b2c0c720e780dcf9e914e910740770ff1fc0aa Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Thu, 25 Jun 2020 17:16:16 -0400 Subject: [PATCH 43/43] Use setup.py instead of MANIFEST.in --- MANIFEST.in | 3 --- setup.py | 4 ++++ 2 files changed, 4 insertions(+), 3 deletions(-) delete mode 100644 MANIFEST.in diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 973bfe0d9..000000000 --- a/MANIFEST.in +++ /dev/null @@ -1,3 +0,0 @@ -include scrapy.cfg -recursive-include kingfisher_scrapy *.json -recursive-include kingfisher_scrapy *.py diff --git a/setup.py b/setup.py index ad683d5b7..784a06960 100644 --- a/setup.py +++ b/setup.py @@ -8,6 +8,10 @@ 'kingfisher_scrapy', 'kingfisher_scrapy.spiders', ], + package_data={ + 'kingfisher_scrapy': ['item_schema/*.json'], + }, + include_package_data=True, entry_points={ 'scrapy': [ 'settings = kingfisher_scrapy.settings',