Merge branch 'master' into 423-update-indonesia-doc
# Conflicts:
#	kingfisher_scrapy/spiders/indonesia_bandung.py
aguilerapy committed Jun 25, 2020
2 parents f5eb709 + e9b2c0c commit 3afeb58
Showing 33 changed files with 368 additions and 65 deletions.
4 changes: 0 additions & 4 deletions kingfisher_scrapy/exceptions.py
@@ -10,9 +10,5 @@ class SpiderArgumentError(KingfisherScrapyError):
"""Raised when a spider argument's value is invalid"""


class MissingRequiredFieldError(KingfisherScrapyError, KeyError):
"""Raised when an item is missing a required field"""


class MissingNextLinkError(KingfisherScrapyError):
"""Raised when a next link is not found on the first page of results"""
20 changes: 20 additions & 0 deletions kingfisher_scrapy/item_schema/File.json
@@ -0,0 +1,20 @@
{
"$schema": "http://json-schema.org/draft-04/schema#",
"allOf": [
{
"$ref": "item.json#/definitions/KingfisherFileItem"
}
],
"type": "object",
"properties": {
"post_to_api": {
"type": "boolean"
},
"path": {
"type": "string"
},
"files_store": {
"type": "string"
}
}
}
18 changes: 18 additions & 0 deletions kingfisher_scrapy/item_schema/FileError.json
@@ -0,0 +1,18 @@
{
"$schema": "http://json-schema.org/draft-04/schema#",
"allOf": [
{
"$ref": "item.json#/definitions/KingfisherItem"
}
],
"type": "object",
"properties": {
"errors": {
"type": "string",
"minLength": 1
}
},
"required": [
"errors"
]
}
18 changes: 18 additions & 0 deletions kingfisher_scrapy/item_schema/FileItem.json
@@ -0,0 +1,18 @@
{
"$schema": "http://json-schema.org/draft-04/schema#",
"allOf": [
{
"$ref": "item.json#/definitions/KingfisherFileItem"
}
],
"type": "object",
"properties": {
"number": {
"type": "integer",
"minimum": 1
}
},
"required": [
"number"
]
}
62 changes: 62 additions & 0 deletions kingfisher_scrapy/item_schema/item.json
@@ -0,0 +1,62 @@
{
"$schema": "http://json-schema.org/draft-04/schema#",
"definitions": {
"KingfisherItem": {
"type": "object",
"properties": {
"file_name": {
"type": "string",
"pattern": "^[^/]+$"
},
"url": {
"type": "string",
"format": "uri"
}
},
"required": [
"file_name",
"url"
]
},
"KingfisherFileItem": {
"allOf": [
{
"$ref": "#/definitions/KingfisherItem"
}
],
"type": "object",
"properties": {
"data_type": {
"type": "string",
"enum": [
"record",
"release",
"record_list",
"release_list",
"compiled_release",
"record_package",
"release_package",
"record_package_list",
"release_package_list",
"record_package_list_in_results",
"release_package_list_in_results",
"release_package_json_lines",
"record_package_json_lines",
"release_package_in_ocdsReleasePackage_in_list_in_results",
"release_in_Release"
]
},
"encoding": {
"type": "string"
},
"data": {
"minLength": 1
}
},
"required": [
"data",
"data_type"
]
}
}
}
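
Taken together, the three item schemas delegate shared structure to item.json via relative $refs. Below is a minimal sketch of how they resolve and validate — hypothetical driver code, assuming the repository root as the working directory and the jsonref and jsonschema packages used in the pipeline change further down:

import pathlib

import jsonref
from jsonschema import FormatChecker
from jsonschema.validators import Draft4Validator

# The schemas added by this commit live in kingfisher_scrapy/item_schema/.
schema_dir = pathlib.Path('kingfisher_scrapy', 'item_schema').resolve()
with open(schema_dir / 'File.json') as f:
    # base_uri lets jsonref resolve the relative "item.json#/definitions/..." $ref.
    schema = jsonref.load(f, base_uri=schema_dir.as_uri() + '/')

validator = Draft4Validator(schema, format_checker=FormatChecker())

# Passes: file_name has no slash, url is a URI, data is non-empty, and
# data_type is one of the enumerated values.
validator.validate({
    'file_name': 'sample.json',
    'url': 'https://example.com/sample.json',
    'data': '{"releases": []}',
    'data_type': 'release_package',
})

# Omitting data or data_type raises jsonschema.exceptions.ValidationError.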
36 changes: 2 additions & 34 deletions kingfisher_scrapy/items.py
@@ -1,23 +1,12 @@
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy

from kingfisher_scrapy.exceptions import MissingRequiredFieldError
import scrapy


class KingfisherItem(scrapy.Item):
file_name = scrapy.Field()
url = scrapy.Field()

def validate(self):
"""
Raises an error if any required field is missing.
:raises kingfisher_scrapy.extensions.MissingRequiredFieldError: if any required field is missing
"""
if hasattr(self, 'required'):
for field in self.required:
if field not in self:
raise MissingRequiredFieldError(field)
validate = True


class File(KingfisherItem):
@@ -32,34 +21,13 @@ class File(KingfisherItem):
path = scrapy.Field()
files_store = scrapy.Field()

required = [
'file_name',
'url',
'data',
'data_type',
]


class FileItem(KingfisherItem):
number = scrapy.Field()
data = scrapy.Field()
data_type = scrapy.Field()
encoding = scrapy.Field()

required = [
'number',
'file_name',
'url',
'data',
'data_type',
]


class FileError(KingfisherItem):
errors = scrapy.Field()

required = [
'file_name',
'url',
'errors',
]
19 changes: 16 additions & 3 deletions kingfisher_scrapy/pipelines.py
@@ -1,18 +1,31 @@
# https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# https://docs.scrapy.org/en/latest/topics/signals.html#item-signals

import os
import pathlib

import jsonref
from jsonschema import FormatChecker
from jsonschema.validators import Draft4Validator

from kingfisher_scrapy.items import File, FileItem


class Validate:
def __init__(self):
self.validators = {}
self.files = set()
self.file_items = set()
schema_path = pathlib.Path(os.path.dirname(os.path.abspath(__file__)), 'item_schema')
for item in ('File', 'FileError', 'FileItem'):
filename = os.path.join(schema_path, f'{item}.json')
with open(filename) as f:
schema = jsonref.load(f, base_uri=schema_path.as_uri() + '/')
self.validators[item] = Draft4Validator(schema, format_checker=FormatChecker())

def process_item(self, item, spider):
if hasattr(item, 'validate'):
# We call this in the item pipeline to guarantee that all items are validated. However, its backtrace isn't
# as helpful for debugging, so we could also call it in ``BaseSpider`` if this becomes an issue.
item.validate()
self.validators.get(item.__class__.__name__).validate(dict(item))

if isinstance(item, FileItem):
key = (item['file_name'], item['number'])
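
As the comment above notes, validation now happens in the item pipeline. A minimal usage sketch — hypothetical driver code, assuming the elided remainder of process_item only records duplicates and returns the item, and that the spider argument is unused at this point, so None stands in:

from kingfisher_scrapy.items import File
from kingfisher_scrapy.pipelines import Validate

pipeline = Validate()
item = File({
    'file_name': 'sample.json',
    'url': 'https://example.com/sample.json',
    'data': '{"releases": []}',
    'data_type': 'release_package',
})
# Looks up the Draft4Validator keyed by the item's class name ('File') and
# validates it; an invalid item now raises jsonschema.exceptions.ValidationError
# instead of the deleted MissingRequiredFieldError.
pipeline.process_item(item, None)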
7 changes: 7 additions & 0 deletions kingfisher_scrapy/spiders/afghanistan_records.py
@@ -7,6 +7,13 @@


class AfghanistanRecords(SimpleSpider):
"""
API documentation
https://ocds.ageops.net/
Spider arguments
sample
Downloads the first record returned by the record list endpoint.
"""
name = 'afghanistan_records'
data_type = 'record'

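
The docstring pattern added here (and in the spiders below) documents the sample spider argument; it is passed at crawl time with Scrapy's standard -a flag, roughly as follows (the value format is an assumption, not shown in this diff):

scrapy crawl afghanistan_records -a sample=true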
7 changes: 7 additions & 0 deletions kingfisher_scrapy/spiders/afghanistan_releases.py
@@ -7,6 +7,13 @@


class AfghanistanReleases(SimpleSpider):
"""
API documentation
https://ocds.ageops.net/
Spider arguments
sample
Downloads the first release returned by the release endpoint of the API.
"""
name = 'afghanistan_releases'
data_type = 'release'

4 changes: 2 additions & 2 deletions kingfisher_scrapy/spiders/argentina_buenos_aires.py
@@ -8,10 +8,10 @@

class ArgentinaBuenosAires(ZipSpider):
"""
Bulk download documentation
https://data.buenosaires.gob.ar/dataset/buenos-aires-compras/archivo/2a3d077c-71b6-4ba7-8924-f3e38cf1b8fc
API documentation
https://data.buenosaires.gob.ar/acerca/ckan
Bulk download documentation
https://data.buenosaires.gob.ar/dataset/buenos-aires-compras/archivo/2a3d077c-71b6-4ba7-8924-f3e38cf1b8fc
Spider arguments
sample
Downloads the zip file and sends 10 releases to Kingfisher Process.
7 changes: 7 additions & 0 deletions kingfisher_scrapy/spiders/argentina_vialidad.py
@@ -4,6 +4,13 @@


class ArgentinaVialidad(SimpleSpider):
"""
API documentation
https://datosabiertos.vialidad.gob.ar/ui/index.html#!/datos_abiertos
Spider arguments
sample
Ignored, data is downloaded from a single JSON file.
"""
name = 'argentina_vialidad'
data_type = 'release_package_list'

7 changes: 7 additions & 0 deletions kingfisher_scrapy/spiders/canada_buyandsell.py
@@ -3,6 +3,13 @@


class CanadaBuyAndSell(SimpleSpider):
"""
API documentation
https://buyandsell.gc.ca/procurement-data/open-contracting-data-standard-pilot/download-ocds-pilot-data
Spider arguments
sample
Downloads a release package with data for the oldest fiscal year available (2013-2014).
"""
name = 'canada_buyandsell'
data_type = 'release_package'

7 changes: 7 additions & 0 deletions kingfisher_scrapy/spiders/canada_montreal.py
@@ -7,6 +7,13 @@


class CanadaMontreal(SimpleSpider):
"""
API documentation
http://donnees.ville.montreal.qc.ca/dataset/contrats-et-subventions-api
Spider arguments
sample
Downloads the first page of releases returned by the main endpoint.
"""
name = 'canada_montreal'
data_type = 'release_package'
step = 10000
12 changes: 10 additions & 2 deletions kingfisher_scrapy/spiders/colombia.py
@@ -29,14 +29,22 @@ class Colombia(LinksSpider):
If ``from_date`` is provided and ``until_date`` isn't, ``until_date`` defaults to today.
"""
name = 'colombia'
next_page_formatter = staticmethod(parameters('page'))
next_page_formatter = staticmethod(parameters('_id'))
default_from_date = '2011-01-01'

@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
spider = super().from_crawler(crawler, *args, **kwargs)
if (spider.from_date or spider.until_date) and hasattr(spider, 'year'):
raise scrapy.exceptions.CloseSpider('You cannot specify both a year spider argument and '
'from_date/until_date spider argument(s).')
return spider

def start_requests(self):
base_url = 'https://apiocds.colombiacompra.gov.co:8443/apiCCE2.0/rest/releases'
if hasattr(self, 'year'):
base_url += f'/page/{int(self.year)}'
if self.from_date or self.until_date:
elif self.from_date or self.until_date:
from_date = self.from_date.strftime(self.date_format)
until_date = self.until_date.strftime(self.date_format)
base_url += f'/dates/{from_date}/{until_date}'
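
With the new from_crawler guard, year and from_date/until_date are mutually exclusive. Hypothetical invocations (argument names from the diff; the -a syntax is standard Scrapy):

scrapy crawl colombia -a year=2019
scrapy crawl colombia -a from_date=2019-01-01 -a until_date=2019-06-30
scrapy crawl colombia -a year=2019 -a from_date=2019-01-01  # raises CloseSpider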
7 changes: 7 additions & 0 deletions kingfisher_scrapy/spiders/dominican_republic.py
@@ -9,6 +9,13 @@


class DominicanRepublic(BaseSpider):
"""
Bulk download documentation
https://www.dgcp.gob.do/estandar-mundial-ocds/
Spider arguments
sample
Downloads a release package for the oldest year (2018, first link in the downloads page).
"""
name = 'dominican_republic'

def start_requests(self):
7 changes: 7 additions & 0 deletions kingfisher_scrapy/spiders/france.py
@@ -7,6 +7,13 @@


class France(SimpleSpider):
"""
Swagger API documentation
https://doc.data.gouv.fr/api/reference/
Spider arguments
sample
Downloads the first OCDS package found using the CKAN API.
"""
name = 'france'
data_type = 'release_package'

7 changes: 7 additions & 0 deletions kingfisher_scrapy/spiders/georgia_records.py
@@ -5,6 +5,13 @@


class GeorgiaRecords(LinksSpider):
"""
Swagger API documentation
https://odapi.spa.ge/api/swagger.ui
Spider arguments
sample
Downloads the first page of packages returned by the record list endpoint.
"""
name = 'georgia_records'
data_type = 'record_package'
next_page_formatter = staticmethod(parameters('page'))
7 changes: 7 additions & 0 deletions kingfisher_scrapy/spiders/georgia_releases.py
@@ -5,6 +5,13 @@


class GeorgiaReleases(LinksSpider):
"""
Swagger API documentation
https://odapi.spa.ge/api/swagger.ui
Spider arguments
sample
Downloads the first page of packages returned by the release list endpoint.
"""
name = 'georgia_releases'
data_type = 'release_package'
next_page_formatter = staticmethod(parameters('page'))