Merge branch 'master' into 423-update-indonesia-doc
# Conflicts:
#	kingfisher_scrapy/spiders/indonesia_bandung.py
aguilerapy committed Jun 25, 2020
2 parents f5eb709 + e9b2c0c commit 3afeb58
Showing 33 changed files with 368 additions and 65 deletions.
4 changes: 0 additions & 4 deletions kingfisher_scrapy/exceptions.py
@@ -10,9 +10,5 @@ class SpiderArgumentError(KingfisherScrapyError):
"""Raised when a spider argument's value is invalid"""


class MissingRequiredFieldError(KingfisherScrapyError, KeyError):
"""Raised when an item is missing a required field"""


class MissingNextLinkError(KingfisherScrapyError):
"""Raised when a next link is not found on the first page of results"""
20 changes: 20 additions & 0 deletions kingfisher_scrapy/item_schema/File.json
@@ -0,0 +1,20 @@
{
"$schema": "http://json-schema.org/draft-04/schema#",
"allOf": [
{
"$ref": "item.json#/definitions/KingfisherFileItem"
}
],
"type": "object",
"properties": {
"post_to_api": {
"type": "boolean"
},
"path": {
"type": "string"
},
"files_store": {
"type": "string"
}
}
}
18 changes: 18 additions & 0 deletions kingfisher_scrapy/item_schema/FileError.json
@@ -0,0 +1,18 @@
{
"$schema": "http://json-schema.org/draft-04/schema#",
"allOf": [
{
"$ref": "item.json#/definitions/KingfisherItem"
}
],
"type": "object",
"properties": {
"errors": {
"type": "string",
"minLength": 1
}
},
"required": [
"errors"
]
}
18 changes: 18 additions & 0 deletions kingfisher_scrapy/item_schema/FileItem.json
@@ -0,0 +1,18 @@
{
"$schema": "http://json-schema.org/draft-04/schema#",
"allOf": [
{
"$ref": "item.json#/definitions/KingfisherFileItem"
}
],
"type": "object",
"properties": {
"number": {
"type": "integer",
"minimum": 1
}
},
"required": [
"number"
]
}
62 changes: 62 additions & 0 deletions kingfisher_scrapy/item_schema/item.json
@@ -0,0 +1,62 @@
{
"$schema": "http://json-schema.org/draft-04/schema#",
"definitions": {
"KingfisherItem": {
"type": "object",
"properties": {
"file_name": {
"type": "string",
"pattern": "^[^/]+$"
},
"url": {
"type": "string",
"format": "uri"
}
},
"required": [
"file_name",
"url"
]
},
"KingfisherFileItem": {
"allOf": [
{
"$ref": "#/definitions/KingfisherItem"
}
],
"type": "object",
"properties": {
"data_type": {
"type": "string",
"enum": [
"record",
"release",
"record_list",
"release_list",
"compiled_release",
"record_package",
"release_package",
"record_package_list",
"release_package_list",
"record_package_list_in_results",
"release_package_list_in_results",
"release_package_json_lines",
"record_package_json_lines",
"release_package_in_ocdsReleasePackage_in_list_in_results",
"release_in_Release"
]
},
"encoding": {
"type": "string"
},
"data": {
"minLength": 1
}
},
"required": [
"data",
"data_type"
]
}
}
}
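
Taken together, the three item schemas delegate shared structure to item.json via relative $refs. Below is a minimal sketch of how they resolve and validate — hypothetical driver code, assuming the repository root as the working directory and the jsonref and jsonschema packages used in the pipeline change further down:

import pathlib

import jsonref
from jsonschema import FormatChecker
from jsonschema.validators import Draft4Validator

# The schemas added by this commit live in kingfisher_scrapy/item_schema/.
schema_dir = pathlib.Path('kingfisher_scrapy', 'item_schema').resolve()
with open(schema_dir / 'File.json') as f:
    # base_uri lets jsonref resolve the relative "item.json#/definitions/..." $ref.
    schema = jsonref.load(f, base_uri=schema_dir.as_uri() + '/')

validator = Draft4Validator(schema, format_checker=FormatChecker())

# Passes: file_name has no slash, url is a URI, data is non-empty, and
# data_type is one of the enumerated values.
validator.validate({
    'file_name': 'sample.json',
    'url': 'https://example.com/sample.json',
    'data': '{"releases": []}',
    'data_type': 'release_package',
})

# Omitting data or data_type raises jsonschema.exceptions.ValidationError.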
36 changes: 2 additions & 34 deletions kingfisher_scrapy/items.py
@@ -1,23 +1,12 @@
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy

from kingfisher_scrapy.exceptions import MissingRequiredFieldError
import scrapy


class KingfisherItem(scrapy.Item):
file_name = scrapy.Field()
url = scrapy.Field()

def validate(self):
"""
Raises an error if any required field is missing.
:raises kingfisher_scrapy.extensions.MissingRequiredFieldError: if any required field is missing
"""
if hasattr(self, 'required'):
for field in self.required:
if field not in self:
raise MissingRequiredFieldError(field)
validate = True


class File(KingfisherItem):
@@ -32,34 +21,13 @@ class File(KingfisherItem):
path = scrapy.Field()
files_store = scrapy.Field()

required = [
'file_name',
'url',
'data',
'data_type',
]


class FileItem(KingfisherItem):
number = scrapy.Field()
data = scrapy.Field()
data_type = scrapy.Field()
encoding = scrapy.Field()

required = [
'number',
'file_name',
'url',
'data',
'data_type',
]


class FileError(KingfisherItem):
errors = scrapy.Field()

required = [
'file_name',
'url',
'errors',
]
19 changes: 16 additions & 3 deletions kingfisher_scrapy/pipelines.py
@@ -1,18 +1,31 @@
# https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# https://docs.scrapy.org/en/latest/topics/signals.html#item-signals

import os
import pathlib

import jsonref
from jsonschema import FormatChecker
from jsonschema.validators import Draft4Validator

from kingfisher_scrapy.items import File, FileItem


class Validate:
def __init__(self):
self.validators = {}
self.files = set()
self.file_items = set()
schema_path = pathlib.Path(os.path.dirname(os.path.abspath(__file__)), 'item_schema')
for item in ('File', 'FileError', 'FileItem'):
filename = os.path.join(schema_path, f'{item}.json')
with open(filename) as f:
schema = jsonref.load(f, base_uri=schema_path.as_uri() + '/')
self.validators[item] = Draft4Validator(schema, format_checker=FormatChecker())

def process_item(self, item, spider):
if hasattr(item, 'validate'):
# We call this in the item pipeline to guarantee that all items are validated. However, its backtrace isn't
# as helpful for debugging, so we could also call it in ``BaseSpider`` if this becomes an issue.
item.validate()
self.validators.get(item.__class__.__name__).validate(dict(item))

if isinstance(item, FileItem):
key = (item['file_name'], item['number'])
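
As the comment above notes, validation now happens in the item pipeline. A minimal usage sketch — hypothetical driver code, assuming the elided remainder of process_item only records duplicates and returns the item, and that the spider argument is unused at this point, so None stands in:

from kingfisher_scrapy.items import File
from kingfisher_scrapy.pipelines import Validate

pipeline = Validate()
item = File({
    'file_name': 'sample.json',
    'url': 'https://example.com/sample.json',
    'data': '{"releases": []}',
    'data_type': 'release_package',
})
# Looks up the Draft4Validator keyed by the item's class name ('File') and
# validates it; an invalid item now raises jsonschema.exceptions.ValidationError
# instead of the deleted MissingRequiredFieldError.
pipeline.process_item(item, None)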
7 changes: 7 additions & 0 deletions kingfisher_scrapy/spiders/afghanistan_records.py
@@ -7,6 +7,13 @@


class AfghanistanRecords(SimpleSpider):
"""
API documentation
https://ocds.ageops.net/
Spider arguments
sample
Downloads the first record returned by the record list endpoint.
"""
name = 'afghanistan_records'
data_type = 'record'

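
The docstring pattern added here (and in the spiders below) documents the sample spider argument; it is passed at crawl time with Scrapy's standard -a flag, roughly as follows (the value format is an assumption, not shown in this diff):

scrapy crawl afghanistan_records -a sample=true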
7 changes: 7 additions & 0 deletions kingfisher_scrapy/spiders/afghanistan_releases.py
@@ -7,6 +7,13 @@


class AfghanistanReleases(SimpleSpider):
"""
API documentation
https://ocds.ageops.net/
Spider arguments
sample
Downloads the first release returned by the release endpoint of the API.
"""
name = 'afghanistan_releases'
data_type = 'release'

4 changes: 2 additions & 2 deletions kingfisher_scrapy/spiders/argentina_buenos_aires.py
@@ -8,10 +8,10 @@

class ArgentinaBuenosAires(ZipSpider):
"""
Bulk download documentation
https://data.buenosaires.gob.ar/dataset/buenos-aires-compras/archivo/2a3d077c-71b6-4ba7-8924-f3e38cf1b8fc
API documentation
https://data.buenosaires.gob.ar/acerca/ckan
Bulk download documentation
https://data.buenosaires.gob.ar/dataset/buenos-aires-compras/archivo/2a3d077c-71b6-4ba7-8924-f3e38cf1b8fc
Spider arguments
sample
Downloads the zip file and sends 10 releases to Kingfisher Process.
7 changes: 7 additions & 0 deletions kingfisher_scrapy/spiders/argentina_vialidad.py
@@ -4,6 +4,13 @@


class ArgentinaVialidad(SimpleSpider):
"""
API documentation
https://datosabiertos.vialidad.gob.ar/ui/index.html#!/datos_abiertos
Spider arguments
sample
Ignored, data is downloaded from a single JSON file.
"""
name = 'argentina_vialidad'
data_type = 'release_package_list'

7 changes: 7 additions & 0 deletions kingfisher_scrapy/spiders/canada_buyandsell.py
@@ -3,6 +3,13 @@


class CanadaBuyAndSell(SimpleSpider):
"""
API documentation
https://buyandsell.gc.ca/procurement-data/open-contracting-data-standard-pilot/download-ocds-pilot-data
Spider arguments
sample
Downloads a release package with data for the oldest fiscal year available (2013-2014).
"""
name = 'canada_buyandsell'
data_type = 'release_package'

7 changes: 7 additions & 0 deletions kingfisher_scrapy/spiders/canada_montreal.py
@@ -7,6 +7,13 @@


class CanadaMontreal(SimpleSpider):
"""
API documentation
http://donnees.ville.montreal.qc.ca/dataset/contrats-et-subventions-api
Spider arguments
sample
Downloads the first page of releases returned by the main endpoint.
"""
name = 'canada_montreal'
data_type = 'release_package'
step = 10000
12 changes: 10 additions & 2 deletions kingfisher_scrapy/spiders/colombia.py
@@ -29,14 +29,22 @@ class Colombia(LinksSpider):
If ``from_date`` is provided and ``until_date`` isn't, ``until_date`` defaults to today.
"""
name = 'colombia'
next_page_formatter = staticmethod(parameters('page'))
next_page_formatter = staticmethod(parameters('_id'))
default_from_date = '2011-01-01'

@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
spider = super().from_crawler(crawler, *args, **kwargs)
if (spider.from_date or spider.until_date) and hasattr(spider, 'year'):
raise scrapy.exceptions.CloseSpider('You cannot specify both a year spider argument and '
'from_date/until_date spider argument(s).')
return spider

def start_requests(self):
base_url = 'https://apiocds.colombiacompra.gov.co:8443/apiCCE2.0/rest/releases'
if hasattr(self, 'year'):
base_url += f'/page/{int(self.year)}'
if self.from_date or self.until_date:
elif self.from_date or self.until_date:
from_date = self.from_date.strftime(self.date_format)
until_date = self.until_date.strftime(self.date_format)
base_url += f'/dates/{from_date}/{until_date}'
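
With the new from_crawler guard, year and from_date/until_date are mutually exclusive. Hypothetical invocations (argument names from the diff; the -a syntax is standard Scrapy):

scrapy crawl colombia -a year=2019
scrapy crawl colombia -a from_date=2019-01-01 -a until_date=2019-06-30
scrapy crawl colombia -a year=2019 -a from_date=2019-01-01  # raises CloseSpider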
7 changes: 7 additions & 0 deletions kingfisher_scrapy/spiders/dominican_republic.py
@@ -9,6 +9,13 @@


class DominicanRepublic(BaseSpider):
"""
Bulk download documentation
https://www.dgcp.gob.do/estandar-mundial-ocds/
Spider arguments
sample
Downloads a release package for the oldest year (2018, first link in the downloads page).
"""
name = 'dominican_republic'

def start_requests(self):
7 changes: 7 additions & 0 deletions kingfisher_scrapy/spiders/france.py
@@ -7,6 +7,13 @@


class France(SimpleSpider):
"""
Swagger API documentation
https://doc.data.gouv.fr/api/reference/
Spider arguments
sample
Downloads the first OCDS package found using the CKAN API.
"""
name = 'france'
data_type = 'release_package'

7 changes: 7 additions & 0 deletions kingfisher_scrapy/spiders/georgia_records.py
@@ -5,6 +5,13 @@


class GeorgiaRecords(LinksSpider):
"""
Swagger API documentation
https://odapi.spa.ge/api/swagger.ui
Spider arguments
sample
Downloads the first page of packages returned by the record list endpoint.
"""
name = 'georgia_records'
data_type = 'record_package'
next_page_formatter = staticmethod(parameters('page'))
7 changes: 7 additions & 0 deletions kingfisher_scrapy/spiders/georgia_releases.py
@@ -5,6 +5,13 @@


class GeorgiaReleases(LinksSpider):
"""
Swagger API documentation
https://odapi.spa.ge/api/swagger.ui
Spider arguments
sample
Downloads the first page of packages returned by the release list endpoint.
"""
name = 'georgia_releases'
data_type = 'release_package'
next_page_formatter = staticmethod(parameters('page'))