Merge c9f3f24 into 802d822
aguilerapy committed Nov 3, 2020
2 parents 802d822 + c9f3f24 commit 94f5144
Showing 12 changed files with 198 additions and 12 deletions.
10 changes: 10 additions & 0 deletions docs/spiders.rst
@@ -176,6 +176,16 @@ Australia
   scrapy crawl australia_nsw
Bolivia
-------

.. autoclass:: kingfisher_scrapy.spiders.bolivia_agetic.BoliviaAgetic
   :no-members:

.. code-block:: bash

   scrapy crawl bolivia_agetic
Canada
------

4 changes: 3 additions & 1 deletion kingfisher_scrapy/base_spider.py
@@ -28,7 +28,8 @@ class BaseSpider(scrapy.Spider):
    - If a spider requires date parameters to be set, add a ``date_required = True`` class attribute, and set the
      ``default_from_date`` class attribute to a date string.
    - If the spider doesn't work with the ``pluck`` command, set a ``skip_pluck`` class attribute to the reason.
    - If a spider collects data from CSV or XLSX files, add an ``unflatten = True`` class attribute, so that each item
      is processed by the Unflatten pipeline class, using Flatten Tool's ``unflatten`` function.

    If ``date_required`` is ``True``, or if either the ``from_date`` or ``until_date`` spider arguments are set, then
    ``from_date`` defaults to the ``default_from_date`` class attribute, and ``until_date`` defaults to the
    ``get_default_until_date()`` return value (which is the current time, by default).
@@ -39,6 +40,7 @@
    ocds_version = '1.1'
    date_format = 'date'
    date_required = False
    unflatten = False

    def __init__(self, sample=None, note=None, from_date=None, until_date=None, crawl_time=None,
                 keep_collection_open=None, package_pointer=None, release_pointer=None, truncate=None, *args,
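To make the new ``unflatten`` attribute concrete, here is a minimal sketch of a spider that opts into the Unflatten pipeline. The spider name is hypothetical; the real bolivia_agetic spider added later in this diff follows the same pattern.

from kingfisher_scrapy.base_spider import SimpleSpider


class ExampleFlatSpider(SimpleSpider):
    """
    A hypothetical spider whose publisher releases flat CSV or XLSX files.
    ``unflatten = True`` opts the spider into the new Unflatten pipeline, which
    converts each downloaded file to JSON with Flatten Tool before validation.
    """
    name = 'example_flat'
    data_type = 'release_list'
    unflatten = True

    # Requests are defined as usual; see kingfisher_scrapy/spiders/bolivia_agetic.py
    # below for a complete spider that uses this attribute.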
55 changes: 55 additions & 0 deletions kingfisher_scrapy/pipelines.py
@@ -1,14 +1,20 @@
# https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# https://docs.scrapy.org/en/latest/topics/signals.html#item-signals
import json
import os
import pkgutil
import tempfile
from urllib.parse import urlsplit

import jsonpointer
from flattentool import unflatten
from jsonschema import FormatChecker
from jsonschema.validators import Draft4Validator, RefResolver
from ocdsmerge.util import get_release_schema_url, get_tags
from scrapy.exceptions import DropItem

from kingfisher_scrapy.items import File, FileItem, PluckedItem
from kingfisher_scrapy.util import components


def _json_loads(basename):
@@ -106,6 +112,55 @@ def process_item(self, item, spider):
        return PluckedItem({'value': value})


class Unflatten:
    def process_item(self, item, spider):
        if not spider.unflatten:
            return item

        if not item['file_name']:
            item['file_name'] = urlsplit(item['url']).path.rsplit('/', 1)[-1]

        if item['file_name'].endswith('.csv'):
            input_format = 'csv'
        elif item['file_name'].endswith('.xlsx'):
            input_format = 'xlsx'
        else:
            raise NotImplementedError(f"the file '{item['file_name']}' has no extension or is not CSV or XLSX, "
                                      f"obtained from: {item['url']}")

        with tempfile.TemporaryDirectory() as directory:
            file_path = os.path.join(directory, item['file_name'])
            with open(file_path, 'wb') as f:
                f.write(item['data'])

            if input_format == 'csv':
                input_name = directory
            elif input_format == 'xlsx':
                input_name = file_path

            tags = get_tags()
            for i in range(1, len(tags)):
                tag = tags[-i].replace('__', '.')
                if spider.ocds_version in tag:
                    schema = get_release_schema_url(tags[-i])
                    break

            unflatten(
                input_name,
                root_list_path='releases',
                root_id='ocid',
                schema=schema,
                input_format=input_format,
                output_name=file_path
            )

            with open(file_path, 'r') as f:
                item['data'] = f.read()
            item['file_name'] = components(-1)(item['file_name']) + '.json'

        return item


def _resolve_pointer(data, pointer):
    try:
        return jsonpointer.resolve_pointer(data, pointer)
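As a rough illustration of what the new pipeline does to an item, the sketch below is modelled on tests/pipelines/test_unflatten.py. It assumes it runs inside this repository (spider_with_crawler is the existing test helper) and that Flatten Tool can fetch the OCDS release schema over the network; the URL and CSV content are hypothetical.

from kingfisher_scrapy.items import File
from kingfisher_scrapy.pipelines import Unflatten
from tests import spider_with_crawler  # existing test helper

spider = spider_with_crawler(unflatten=True)  # any spider with unflatten = True
item = File({
    'file_name': 'releases.csv',
    'url': 'http://example.com/releases.csv',  # hypothetical URL
    'data': b'ocid,id\nocds-213czf-000-00001,1\n',  # a minimal flat release for illustration
    'data_type': 'release_list',
})

result = Unflatten().process_item(item, spider)
# The pipeline rewrites the item in place: 'data' should now hold the JSON text
# produced by flattentool.unflatten(), and 'file_name' becomes 'releases.json'.
print(result['file_name'])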
5 changes: 3 additions & 2 deletions kingfisher_scrapy/settings.py
@@ -82,8 +82,9 @@
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'kingfisher_scrapy.pipelines.Sample': 200,
    'kingfisher_scrapy.pipelines.Validate': 300,
    'kingfisher_scrapy.pipelines.Pluck': 301,
    'kingfisher_scrapy.pipelines.Unflatten': 300,
    'kingfisher_scrapy.pipelines.Validate': 301,
    'kingfisher_scrapy.pipelines.Pluck': 302,
}


38 changes: 38 additions & 0 deletions kingfisher_scrapy/spiders/bolivia_agetic.py
@@ -0,0 +1,38 @@
import json

import scrapy

from kingfisher_scrapy.base_spider import SimpleSpider
from kingfisher_scrapy.util import handle_http_error


class BoliviaAgetic(SimpleSpider):
    """
    Domain
      Agencia de Gobierno Electrónico y Tecnologías de Información y Comunicación (AGETIC)
    Spider arguments
      sample
        Downloads the first file in the downloads page.
    Bulk download documentation
      https://datos.gob.bo/id/dataset/contrataciones-agetic-2019-estandar-ocp
    """
    name = 'bolivia_agetic'
    data_type = 'release_list'
    unflatten = True

    def start_requests(self):
        # A CKAN API JSON response.
        url = 'https://datos.gob.bo/api/3/action/package_show?id=contrataciones-agetic-2019-estandar-ocp'
        yield scrapy.Request(url, meta={'file_name': 'list.json'}, callback=self.parse_list)

    @handle_http_error
    def parse_list(self, response):
        data = json.loads(response.text)
        for resource in data['result']['resources']:
            if 'ocds' in resource['description']:
                # Presently, only one URL matches.
                yield scrapy.Request(resource['url'], meta={'file_name': resource['url']}, callback=self.parse_data)

    @handle_http_error
    def parse_data(self, response):
        yield self.build_file(url=response.request.url, data_type=self.data_type, data=response.body)
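For context on what ``parse_list`` iterates over, the abridged sketch below shows the general shape of a CKAN ``package_show`` response. The field values are hypothetical, not real AGETIC data; only ``result`` → ``resources`` → ``description``/``url`` are read by the spider.

# Illustrative only: the general shape of a CKAN package_show response.
# Values are hypothetical placeholders.
ckan_package_show = {
    'success': True,
    'result': {
        'resources': [
            {
                'description': 'Contrataciones 2019 en formato ocds',  # matches the 'ocds' check
                'url': 'https://datos.gob.bo/dataset/example/contrataciones-ocds.xlsx',
            },
            {
                'description': 'Documentación del conjunto de datos',  # skipped
                'url': 'https://datos.gob.bo/dataset/example/documentacion.pdf',
            },
        ],
    },
}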
7 changes: 5 additions & 2 deletions kingfisher_scrapy/util.py
@@ -19,7 +19,8 @@ def _pluck_filename(opts):

def components(start, stop=None):
"""
Returns a function that returns the selected non-empty path components, excluding the ``.json`` extension.
Returns a function that returns the selected non-empty path components, excluding the ``.json``, ``.csv`` or
``.xlsx`` extension.
>>> components(-1)('http://example.com/api/planning.json')
'planning'
@@ -29,8 +30,10 @@
"""
def wrapper(url):
value = '-'.join(list(filter(None, urlsplit(url).path.split('/')))[start:stop])
if value.endswith('.json'):
if value.endswith(('.json', '.xlsx')):
return value[:-5]
if value.endswith('.csv'):
return value[:-4]
return value
return wrapper

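A quick check of the extended extension handling in ``components``; this mirrors the new test in tests/test_util.py and assumes it runs inside this repository.

from kingfisher_scrapy.util import components

last = components(-1)
# .json and .xlsx are five characters, .csv is four; all are stripped.
assert last('http://example.com/api/planning.json') == 'planning'
assert last('http://example.com/api/planning.xlsx') == 'planning'
assert last('http://example.com/api/planning.csv') == 'planning'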
1 change: 1 addition & 0 deletions requirements.in
@@ -1,3 +1,4 @@
flattentool
ijson>=3.1.1
jsonpointer
jsonref
14 changes: 12 additions & 2 deletions requirements.txt
@@ -10,20 +10,27 @@ certifi==2019.11.28 # via requests, sentry-sdk
cffi==1.13.2 # via cryptography
chardet==3.0.4 # via requests
constantly==15.1.0 # via twisted
contextlib2==0.6.0.post1 # via schema
cryptography==2.8 # via pyopenssl, scrapy, service-identity
cssselect==1.1.0 # via parsel, scrapy
defusedxml==0.6.0 # via odfpy
et-xmlfile==1.0.1 # via openpyxl
flattentool==0.14.0 # via -r requirements.in
hyperlink==19.0.0 # via twisted
idna==2.8 # via hyperlink, requests
ijson==3.1.1 # via -r requirements.in
importlib-metadata==1.6.1 # via jsonschema
incremental==17.5.0 # via twisted
itemadapter==0.1.1 # via itemloaders, scrapy
itemloaders==1.0.3 # via scrapy
jdcal==1.4.1 # via openpyxl
jmespath==0.10.0 # via itemloaders
jsonpointer==2.0 # via -r requirements.in
jsonref==0.2 # via -r requirements.in
jsonref==0.2 # via -r requirements.in, flattentool
jsonschema==3.2.0 # via -r requirements.in
lxml==4.4.2 # via parsel, scrapy
lxml==4.4.2 # via flattentool, parsel, scrapy
odfpy==1.4.1 # via flattentool
openpyxl==3.0.5 # via flattentool
parsel==1.5.2 # via itemloaders, scrapy
protego==0.1.16 # via scrapy
pyasn1-modules==0.2.7 # via service-identity
@@ -33,10 +40,12 @@ pydispatcher==2.0.5 # via scrapy
pyhamcrest==1.9.0 # via twisted
pyopenssl==19.1.0 # via scrapy
pyrsistent==0.16.0 # via jsonschema
pytz==2020.1 # via flattentool
queuelib==1.5.0 # via scrapy
rarfile==3.1 # via -r requirements.in
requests==2.22.0 # via -r requirements.in
rfc3987==1.3.8 # via -r requirements.in
schema==0.7.2 # via flattentool
scrapy==2.3.0 # via -r requirements.in, scrapyd, scrapyd-client
scrapyd-client==1.1.0 # via -r requirements.in
scrapyd==1.2.1 # via -r requirements.in
@@ -46,6 +55,7 @@ six==1.13.0 # via automat, cryptography, jsonschema, parsel, prote
twisted==20.3.0 # via scrapy, scrapyd
urllib3==1.25.7 # via requests, sentry-sdk
w3lib==1.21.0 # via itemloaders, parsel, scrapy
xmltodict==0.12.0 # via flattentool
zipp==3.1.0 # via importlib-metadata
zope.interface==4.7.1 # via scrapy, twisted

14 changes: 12 additions & 2 deletions requirements_dev.txt
@@ -11,13 +11,17 @@ cffi==1.13.2 # via -r requirements.txt, cryptography
chardet==3.0.4 # via -r requirements.txt, requests
click==7.1.2 # via pip-tools
constantly==15.1.0 # via -r requirements.txt, twisted
contextlib2==0.6.0.post1 # via -r requirements.txt, schema
coverage==5.0.3 # via coveralls, pytest-cov
coveralls==2.0.0 # via -r requirements_dev.in
cryptography==2.8 # via -r requirements.txt, pyopenssl, scrapy, service-identity
cssselect==1.1.0 # via -r requirements.txt, parsel, scrapy
defusedxml==0.6.0 # via -r requirements.txt, odfpy
docopt==0.6.2 # via coveralls
entrypoints==0.3 # via flake8
et-xmlfile==1.0.1 # via -r requirements.txt, openpyxl
flake8==3.7.9 # via -r requirements_dev.in
flattentool==0.14.0 # via -r requirements.txt
hyperlink==19.0.0 # via -r requirements.txt, twisted
idna==2.8 # via -r requirements.txt, hyperlink, requests
ijson==3.1.1 # via -r requirements.txt
@@ -26,13 +30,16 @@ incremental==17.5.0 # via -r requirements.txt, twisted
isort==4.3.21 # via -r requirements_dev.in
itemadapter==0.1.1 # via -r requirements.txt, itemloaders, scrapy
itemloaders==1.0.3 # via -r requirements.txt, scrapy
jdcal==1.4.1 # via -r requirements.txt, openpyxl
jmespath==0.10.0 # via -r requirements.txt, itemloaders
jsonpointer==2.0 # via -r requirements.txt
jsonref==0.2 # via -r requirements.txt
jsonref==0.2 # via -r requirements.txt, flattentool
jsonschema==3.2.0 # via -r requirements.txt
lxml==4.4.2 # via -r requirements.txt, parsel, scrapy
lxml==4.4.2 # via -r requirements.txt, flattentool, parsel, scrapy
mccabe==0.6.1 # via flake8
more-itertools==8.0.2 # via pytest
odfpy==1.4.1 # via -r requirements.txt, flattentool
openpyxl==3.0.5 # via -r requirements.txt, flattentool
packaging==19.2 # via pytest
parsel==1.5.2 # via -r requirements.txt, itemloaders, scrapy
pip-tools==5.1.0 # via -r requirements_dev.in
@@ -51,10 +58,12 @@ pyparsing==2.4.5 # via packaging
pyrsistent==0.16.0 # via -r requirements.txt, jsonschema
pytest-cov==2.8.1 # via -r requirements_dev.in
pytest==5.3.2 # via -r requirements_dev.in, pytest-cov
pytz==2020.1 # via -r requirements.txt, flattentool
queuelib==1.5.0 # via -r requirements.txt, scrapy
rarfile==3.1 # via -r requirements.txt
requests==2.22.0 # via -r requirements.txt, coveralls
rfc3987==1.3.8 # via -r requirements.txt
schema==0.7.2 # via -r requirements.txt, flattentool
scrapy==2.3.0 # via -r requirements.txt, scrapyd, scrapyd-client
scrapyd-client==1.1.0 # via -r requirements.txt
scrapyd==1.2.1 # via -r requirements.txt
@@ -65,6 +74,7 @@ twisted==20.3.0 # via -r requirements.txt, scrapy, scrapyd
urllib3==1.25.7 # via -r requirements.txt, requests, sentry-sdk
w3lib==1.21.0 # via -r requirements.txt, itemloaders, parsel, scrapy
wcwidth==0.1.7 # via pytest
xmltodict==0.12.0 # via -r requirements.txt, flattentool
zipp==3.1.0 # via -r requirements.txt, importlib-metadata
zope.interface==4.7.1 # via -r requirements.txt, scrapy, twisted

4 changes: 2 additions & 2 deletions tests/__init__.py
@@ -7,10 +7,10 @@
from kingfisher_scrapy.base_spider import BaseSpider


def response_fixture(meta=None, **kwargs):
def response_fixture(meta=None, url_path='', **kwargs):
    if meta is None:
        meta = {'file_name': 'test'}
    request = Request('http://example.com', meta=meta)
    request = Request('http://example.com' + url_path, meta=meta)
    kwargs.setdefault('status', 200)
    kwargs.setdefault('body', b'{"links": {"next": "http://example.com/next"}}')
    return TextResponse(request.url, encoding='utf-8', request=request, **kwargs)
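The new ``url_path`` argument lets tests build fixture responses whose URL ends in a particular file name, which the Unflatten pipeline and ``components`` care about. A small usage sketch, assumed to run inside this repository's test suite:

from tests import response_fixture

response = response_fixture(url_path='/api/planning.csv')
print(response.url)  # http://example.com/api/planning.csv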
47 changes: 47 additions & 0 deletions tests/pipelines/test_unflatten.py
@@ -0,0 +1,47 @@
import pytest
from flattentool.input import BadXLSXZipFile

from kingfisher_scrapy.items import File
from kingfisher_scrapy.pipelines import Unflatten
from tests import spider_with_crawler


def test_process_item():
    spider = spider_with_crawler(unflatten=True)
    pipeline = Unflatten()
    item = File({
        'file_name': 'test.csv',
        'data': b'data',
        'data_type': 'release_list',
        'url': 'http://test.com/test.csv',
    })

    assert pipeline.process_item(item, spider) == item


def test_process_item_error():
    spider = spider_with_crawler(unflatten=True)
    pipeline = Unflatten()
    item = File({
        'file_name': 'file',
        'data': b'data',
        'data_type': 'release_list',
        'url': 'http://test.com/file',
    })

    with pytest.raises(NotImplementedError):
        pipeline.process_item(item, spider)


def test_process_item_xlsx_error():
    spider = spider_with_crawler(unflatten=True)
    pipeline = Unflatten()
    item = File({
        'file_name': 'test.xlsx',
        'data': b'data',
        'data_type': 'release_list',
        'url': 'http://test.com/test.xlsx',
    })

    with pytest.raises(BadXLSXZipFile):
        pipeline.process_item(item, spider)
11 changes: 10 additions & 1 deletion tests/test_util.py
@@ -1,6 +1,6 @@
import pytest

from kingfisher_scrapy.util import get_parameter_value, replace_parameters
from kingfisher_scrapy.util import components, get_parameter_value, replace_parameters


@pytest.mark.parametrize('url,value,expected', [
@@ -20,3 +20,12 @@ def test_replace_parameters(url, value, expected):
])
def test_get_parameter_value(url, expected):
    assert get_parameter_value(url, 'page') == expected


@pytest.mark.parametrize('url,expected', [
    ('http://example.com/example/file.json', 'file'),
    ('http://example.com/example/file.xlsx', 'file'),
    ('http://example.com/example/file.csv', 'file'),
])
def test_components(url, expected):
    assert components(-1)(url) == expected
