Merge pull request #524 from open-contracting/490-csv-bolivia
Add FlattenSpider and Bolivia Agetic spider
yolile committed Nov 11, 2020
2 parents 843f77f + 05b318e commit 8754249
Showing 10 changed files with 206 additions and 11 deletions.
10 changes: 10 additions & 0 deletions docs/spiders.rst
@@ -176,6 +176,16 @@ Australia
   scrapy crawl australia_nsw

Bolivia
-------

.. autoclass:: kingfisher_scrapy.spiders.bolivia_agetic.BoliviaAgetic
   :no-members:

.. code-block:: bash

   scrapy crawl bolivia_agetic

Canada
------

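As an aside, the sample behavior documented in the new spider's docstring (download only the first file) would presumably be triggered the standard Scrapy way, for example:

scrapy crawl bolivia_agetic -a sample=true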
4 changes: 3 additions & 1 deletion kingfisher_scrapy/base_spider.py
@@ -28,7 +28,8 @@ class BaseSpider(scrapy.Spider):
- If a spider requires date parameters to be set, add a ``date_required = True`` class attribute, and set the
``default_from_date`` class attribute to a date string.
- If the spider doesn't work with the ``pluck`` command, set a ``skip_pluck`` class attribute to the reason.
- If a spider collects data from CSV or XLSX files, add an ``unflatten = True`` class attribute to process each item
  in the ``Unflatten`` pipeline class using the ``unflatten`` function from Flatten Tool.
If ``date_required`` is ``True``, or if either the ``from_date`` or ``until_date`` spider arguments are set, then
``from_date`` defaults to the ``default_from_date`` class attribute, and ``until_date`` defaults to the
``get_default_until_date()`` return value (which is the current time, by default).
@@ -39,6 +40,7 @@
    ocds_version = '1.1'
    date_format = 'date'
    date_required = False
    unflatten = False

    def __init__(self, sample=None, note=None, from_date=None, until_date=None, crawl_time=None,
                 keep_collection_open=None, package_pointer=None, release_pointer=None, truncate=None, *args,
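To illustrate how these class attributes combine on a concrete spider, here is a minimal hypothetical sketch; the name, date, and data source are invented for illustration and are not part of this commit:

from kingfisher_scrapy.base_spider import SimpleSpider


class ExampleCsvSpider(SimpleSpider):
    name = 'example_csv'
    data_type = 'release_list'
    date_required = True                # date parameters must be set or defaulted
    default_from_date = '2019-01-01'    # fallback for the from_date spider argument
    unflatten = True                    # route CSV/XLSX items through the Unflatten pipeline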
54 changes: 54 additions & 0 deletions kingfisher_scrapy/pipelines.py
@@ -1,11 +1,15 @@
# https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# https://docs.scrapy.org/en/latest/topics/signals.html#item-signals
import json
import os
import pkgutil
import tempfile

import jsonpointer
from flattentool import unflatten
from jsonschema import FormatChecker
from jsonschema.validators import Draft4Validator, RefResolver
from ocdsmerge.util import get_release_schema_url, get_tags
from scrapy.exceptions import DropItem

from kingfisher_scrapy.items import File, FileItem, PluckedItem
@@ -106,6 +110,56 @@ def process_item(self, item, spider):
        return PluckedItem({'value': value})


class Unflatten:
    def process_item(self, item, spider):
        if not spider.unflatten or not isinstance(item, (File, FileItem)):
            return item

        input_name = item['file_name']
        if input_name.endswith('.csv'):
            item['file_name'] = item['file_name'][:-4] + '.json'
            input_format = 'csv'
        elif input_name.endswith('.xlsx'):
            item['file_name'] = item['file_name'][:-5] + '.json'
            input_format = 'xlsx'
        else:
            raise NotImplementedError(f"the file '{input_name}' has no extension or is not CSV or XLSX, "
                                      f"obtained from: {item['url']}")

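        # Choose the release schema URL for the newest tag matching the spider's
        # OCDS version: an ocds_version of '1.1' becomes the tag prefix '1__1'.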
        spider_ocds_version = spider.ocds_version.replace('.', '__')
        for tag in reversed(get_tags()):
            if tag.startswith(spider_ocds_version):
                schema = get_release_schema_url(tag)
                break
        else:
            raise NotImplementedError(f"no schema found for '{spider_ocds_version}'")

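        # Flatten Tool's unflatten() expects a directory of CSV files for CSV input,
        # but a single workbook file for XLSX input, hence the input_name switch below.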
        with tempfile.TemporaryDirectory() as directory:
            input_path = os.path.join(directory, input_name)
            output_name = os.path.join(directory, item['file_name'])
            if input_format == 'csv':
                input_name = directory
            elif input_format == 'xlsx':
                input_name = input_path

            with open(input_path, 'wb') as f:
                f.write(item['data'])

            unflatten(
                input_name,
                root_list_path='releases',
                root_id='ocid',
                schema=schema,
                input_format=input_format,
                output_name=output_name
            )

            with open(output_name, 'r') as f:
                item['data'] = f.read()

        return item


def _resolve_pointer(data, pointer):
    try:
        return jsonpointer.resolve_pointer(data, pointer)
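For context, here is a standalone sketch of the same Flatten Tool call outside the pipeline; the file name and CSV row are invented for illustration, and the optional schema argument is omitted:

import os
import tempfile

from flattentool import unflatten

with tempfile.TemporaryDirectory() as directory:
    # For CSV input, Flatten Tool reads every CSV file in the given directory.
    with open(os.path.join(directory, 'releases.csv'), 'w') as f:
        f.write('ocid,id,date\nocds-aaaaaa-1,1,2019-01-01T00:00:00Z\n')

    output_name = os.path.join(directory, 'releases.json')
    unflatten(
        directory,
        root_list_path='releases',
        root_id='ocid',
        input_format='csv',
        output_name=output_name,
    )

    with open(output_name) as f:
        print(f.read())  # a JSON object with a "releases" array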
5 changes: 3 additions & 2 deletions kingfisher_scrapy/settings.py
@@ -82,8 +82,9 @@
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'kingfisher_scrapy.pipelines.Sample': 200,
-    'kingfisher_scrapy.pipelines.Validate': 300,
-    'kingfisher_scrapy.pipelines.Pluck': 301,
+    'kingfisher_scrapy.pipelines.Unflatten': 300,
+    'kingfisher_scrapy.pipelines.Validate': 301,
+    'kingfisher_scrapy.pipelines.Pluck': 302,
}
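In Scrapy, pipelines with lower ITEM_PIPELINES values run first, so Unflatten (300) converts CSV and XLSX items to JSON before Validate (301) and Pluck (302) process them; this ordering is presumably why Validate and Pluck were renumbered.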


42 changes: 42 additions & 0 deletions kingfisher_scrapy/spiders/bolivia_agetic.py
@@ -0,0 +1,42 @@
import json

import scrapy

from kingfisher_scrapy.base_spider import SimpleSpider
from kingfisher_scrapy.util import components, handle_http_error


class BoliviaAgetic(SimpleSpider):
    """
    Domain
      Agencia de Gobierno Electrónico y Tecnologías de Información y Comunicación (AGETIC)
    Spider arguments
      sample
        Downloads the first file from the downloads page.
    Bulk download documentation
      https://datos.gob.bo/id/dataset/contrataciones-agetic-2019-estandar-ocp
    """
    name = 'bolivia_agetic'
    data_type = 'release_list'
    unflatten = True

    def start_requests(self):
        # A CKAN API JSON response.
        url = 'https://datos.gob.bo/api/3/action/package_show?id=contrataciones-agetic-2019-estandar-ocp'
        yield scrapy.Request(url, meta={'file_name': 'list.json'}, callback=self.parse_list)

    @handle_http_error
    def parse_list(self, response):
        data = json.loads(response.text)
        for resource in data['result']['resources']:
            if 'ocds' in resource['description']:
                # Presently, only one URL matches.
                yield scrapy.Request(
                    resource['url'],
                    meta={'file_name': components(-1)(resource['url'])},
                    callback=self.parse_data
                )

    @handle_http_error
    def parse_data(self, response):
        yield self.build_file_from_response(response, data_type=self.data_type)
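A note on components(-1) above: it returns a function that takes a URL and returns its last path segment, which the spider uses as the file name. A rough, hypothetical sketch of that behavior (the real helper lives in kingfisher_scrapy/util.py and may differ in detail):

from urllib.parse import urlparse


def components(start, stop=None):
    def wrapper(url):
        path = urlparse(url).path.rstrip('/')
        return '/'.join(path.split('/')[start:stop])
    return wrapper


assert components(-1)('https://datos.gob.bo/files/agetic-2019.csv') == 'agetic-2019.csv'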
2 changes: 2 additions & 0 deletions requirements.in
@@ -1,7 +1,9 @@
flattentool
ijson>=3.1.1
jsonpointer
jsonref
jsonschema
ocdsmerge
rarfile
requests
rfc3987
16 changes: 13 additions & 3 deletions requirements.txt
@@ -12,18 +12,25 @@ chardet==3.0.4 # via requests
constantly==15.1.0 # via twisted
cryptography==3.2.1 # via pyopenssl, scrapy, service-identity
cssselect==1.1.0 # via parsel, scrapy
defusedxml==0.6.0 # via odfpy
et-xmlfile==1.0.1 # via openpyxl
flattentool==0.14.0 # via -r requirements.in
hyperlink==19.0.0 # via twisted
idna==2.8 # via hyperlink, requests
ijson==3.1.1 # via -r requirements.in
importlib-metadata==1.6.1 # via jsonschema
incremental==17.5.0 # via twisted
itemadapter==0.1.1 # via itemloaders, scrapy
itemloaders==1.0.3 # via scrapy
jdcal==1.4.1 # via openpyxl
jmespath==0.10.0 # via itemloaders
jsonpointer==2.0 # via -r requirements.in
-jsonref==0.2 # via -r requirements.in
+jsonref==0.2 # via -r requirements.in, flattentool, ocdsmerge
jsonschema==3.2.0 # via -r requirements.in
-lxml==4.4.2 # via parsel, scrapy
+lxml==4.4.2 # via flattentool, parsel, scrapy
ocdsmerge==0.6.4 # via -r requirements.in
odfpy==1.4.1 # via flattentool
openpyxl==3.0.5 # via flattentool
parsel==1.5.2 # via itemloaders, scrapy
protego==0.1.16 # via scrapy
pyasn1-modules==0.2.7 # via service-identity
@@ -33,10 +40,12 @@ pydispatcher==2.0.5 # via scrapy
pyhamcrest==1.9.0 # via twisted
pyopenssl==19.1.0 # via scrapy
pyrsistent==0.16.0 # via jsonschema
pytz==2020.1 # via flattentool
queuelib==1.5.0 # via scrapy
rarfile==3.1 # via -r requirements.in
-requests==2.22.0 # via -r requirements.in
+requests==2.22.0 # via -r requirements.in, ocdsmerge
rfc3987==1.3.8 # via -r requirements.in
schema==0.7.2 # via flattentool
scrapy==2.3.0 # via -r requirements.in, scrapyd, scrapyd-client
scrapyd-client==1.1.0 # via -r requirements.in
scrapyd==1.2.1 # via -r requirements.in
@@ -46,6 +55,7 @@ six==1.13.0 # via automat, cryptography, jsonschema, parsel, prote
twisted==20.3.0 # via scrapy, scrapyd
urllib3==1.25.7 # via requests, sentry-sdk
w3lib==1.21.0 # via itemloaders, parsel, scrapy
xmltodict==0.12.0 # via flattentool
zipp==3.1.0 # via importlib-metadata
zope.interface==4.7.1 # via scrapy, twisted

17 changes: 14 additions & 3 deletions requirements_dev.txt
@@ -11,13 +11,17 @@ cffi==1.13.2 # via -r requirements.txt, cryptography
chardet==3.0.4 # via -r requirements.txt, requests
click==7.1.2 # via pip-tools
constantly==15.1.0 # via -r requirements.txt, twisted
contextlib2==0.6.0.post1 # via -r requirements.txt, schema
coverage==5.0.3 # via coveralls, pytest-cov
coveralls==2.0.0 # via -r requirements_dev.in
cryptography==3.2.1 # via -r requirements.txt, pyopenssl, scrapy, service-identity
cssselect==1.1.0 # via -r requirements.txt, parsel, scrapy
defusedxml==0.6.0 # via -r requirements.txt, odfpy
docopt==0.6.2 # via coveralls
entrypoints==0.3 # via flake8
et-xmlfile==1.0.1 # via -r requirements.txt, openpyxl
flake8==3.7.9 # via -r requirements_dev.in
flattentool==0.14.0 # via -r requirements.txt
hyperlink==19.0.0 # via -r requirements.txt, twisted
idna==2.8 # via -r requirements.txt, hyperlink, requests
ijson==3.1.1 # via -r requirements.txt
@@ -26,13 +30,17 @@ incremental==17.5.0 # via -r requirements.txt, twisted
isort==4.3.21 # via -r requirements_dev.in
itemadapter==0.1.1 # via -r requirements.txt, itemloaders, scrapy
itemloaders==1.0.3 # via -r requirements.txt, scrapy
jdcal==1.4.1 # via -r requirements.txt, openpyxl
jmespath==0.10.0 # via -r requirements.txt, itemloaders
jsonpointer==2.0 # via -r requirements.txt
-jsonref==0.2 # via -r requirements.txt
+jsonref==0.2 # via -r requirements.txt, flattentool, ocdsmerge
jsonschema==3.2.0 # via -r requirements.txt
-lxml==4.4.2 # via -r requirements.txt, parsel, scrapy
+lxml==4.4.2 # via -r requirements.txt, flattentool, parsel, scrapy
mccabe==0.6.1 # via flake8
more-itertools==8.0.2 # via pytest
ocdsmerge==0.6.4 # via -r requirements.txt
odfpy==1.4.1 # via -r requirements.txt, flattentool
openpyxl==3.0.5 # via -r requirements.txt, flattentool
packaging==19.2 # via pytest
parsel==1.5.2 # via -r requirements.txt, itemloaders, scrapy
pip-tools==5.1.0 # via -r requirements_dev.in
@@ -51,10 +59,12 @@ pyparsing==2.4.5 # via packaging
pyrsistent==0.16.0 # via -r requirements.txt, jsonschema
pytest-cov==2.8.1 # via -r requirements_dev.in
pytest==5.3.2 # via -r requirements_dev.in, pytest-cov
pytz==2020.1 # via -r requirements.txt, flattentool
queuelib==1.5.0 # via -r requirements.txt, scrapy
rarfile==3.1 # via -r requirements.txt
-requests==2.22.0 # via -r requirements.txt, coveralls
+requests==2.22.0 # via -r requirements.txt, coveralls, ocdsmerge
rfc3987==1.3.8 # via -r requirements.txt
schema==0.7.2 # via -r requirements.txt, flattentool
scrapy==2.3.0 # via -r requirements.txt, scrapyd, scrapyd-client
scrapyd-client==1.1.0 # via -r requirements.txt
scrapyd==1.2.1 # via -r requirements.txt
@@ -65,6 +75,7 @@ twisted==20.3.0 # via -r requirements.txt, scrapy, scrapyd
urllib3==1.25.7 # via -r requirements.txt, requests, sentry-sdk
w3lib==1.21.0 # via -r requirements.txt, itemloaders, parsel, scrapy
wcwidth==0.1.7 # via pytest
xmltodict==0.12.0 # via -r requirements.txt, flattentool
zipp==3.1.0 # via -r requirements.txt, importlib-metadata
zope.interface==4.7.1 # via -r requirements.txt, scrapy, twisted

4 changes: 2 additions & 2 deletions tests/__init__.py
@@ -7,10 +7,10 @@
from kingfisher_scrapy.base_spider import BaseSpider


-def response_fixture(meta=None, **kwargs):
+def response_fixture(meta=None, url_path='', **kwargs):
    if meta is None:
        meta = {'file_name': 'test'}
-    request = Request('http://example.com', meta=meta)
+    request = Request(f'http://example.com{url_path}', meta=meta)
    kwargs.setdefault('status', 200)
    return TextResponse(request.url, encoding='utf-8', request=request, **kwargs)
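The new url_path parameter lets a test control the path of the fixture's request URL; a hypothetical call (body is a standard TextResponse keyword argument):

response = response_fixture(url_path='/test.csv', body=b'ocid\n')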

63 changes: 63 additions & 0 deletions tests/pipelines/test_unflatten.py
@@ -0,0 +1,63 @@
from openpyxl import Workbook
from openpyxl.writer.excel import save_virtual_workbook

import pytest
from flattentool.input import BadXLSXZipFile

from kingfisher_scrapy.items import File
from kingfisher_scrapy.pipelines import Unflatten
from tests import spider_with_crawler


def test_process_item_csv():
    spider = spider_with_crawler(unflatten=True)
    pipeline = Unflatten()
    item = File({
        'file_name': 'test.csv',
        'data': b'data',
        'data_type': 'release_list',
        'url': 'http://test.com/test.csv',
    })

    assert pipeline.process_item(item, spider) == item


def test_process_item_xlsx():
    spider = spider_with_crawler(unflatten=True)
    pipeline = Unflatten()
    item = File({
        'file_name': 'test.xlsx',
        'data': save_virtual_workbook(Workbook()),
        'data_type': 'release_list',
        'url': 'http://test.com/test.xlsx',
    })

    assert pipeline.process_item(item, spider) == item


def test_process_item_extension_error():
    spider = spider_with_crawler(unflatten=True)
    pipeline = Unflatten()
    item = File({
        'file_name': 'file',
        'data': b'data',
        'data_type': 'release_list',
        'url': 'http://test.com/file',
    })

    with pytest.raises(NotImplementedError):
        pipeline.process_item(item, spider)


def test_process_item_xlsx_error():
    spider = spider_with_crawler(unflatten=True)
    pipeline = Unflatten()
    item = File({
        'file_name': 'test.xlsx',
        'data': b'data',
        'data_type': 'release_list',
        'url': 'http://test.com/test.xlsx',
    })

    with pytest.raises(BadXLSXZipFile):
        pipeline.process_item(item, spider)
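Assuming a standard pytest setup, the new tests would be run with something like:

pytest tests/pipelines/test_unflatten.py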
