Merge c9f3f24 into 802d822
aguilerapy committed Nov 3, 2020
2 parents 802d822 + c9f3f24 commit 94f5144
Showing 12 changed files with 198 additions and 12 deletions.
10 changes: 10 additions & 0 deletions docs/spiders.rst
@@ -176,6 +176,16 @@ Australia
   scrapy crawl australia_nsw
Bolivia
-------

.. autoclass:: kingfisher_scrapy.spiders.bolivia_agetic.BoliviaAgetic
   :no-members:

.. code-block:: bash

   scrapy crawl bolivia_agetic
Canada
------

4 changes: 3 additions & 1 deletion kingfisher_scrapy/base_spider.py
@@ -28,7 +28,8 @@ class BaseSpider(scrapy.Spider):
    - If a spider requires date parameters to be set, add a ``date_required = True`` class attribute, and set the
      ``default_from_date`` class attribute to a date string.
    - If the spider doesn't work with the ``pluck`` command, set a ``skip_pluck`` class attribute to the reason.
    - If a spider collects data from CSV or XLSX files, add an ``unflatten = True`` class attribute, so that each item
      is processed by the Unflatten pipeline class, using Flatten Tool's ``unflatten`` function.

    If ``date_required`` is ``True``, or if either the ``from_date`` or ``until_date`` spider arguments are set, then
    ``from_date`` defaults to the ``default_from_date`` class attribute, and ``until_date`` defaults to the
    ``get_default_until_date()`` return value (which is the current time, by default).
@@ -39,6 +40,7 @@
    ocds_version = '1.1'
    date_format = 'date'
    date_required = False
    unflatten = False

    def __init__(self, sample=None, note=None, from_date=None, until_date=None, crawl_time=None,
                 keep_collection_open=None, package_pointer=None, release_pointer=None, truncate=None, *args,
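To make the new ``unflatten`` attribute concrete, here is a minimal sketch of a spider that opts into the Unflatten pipeline. The spider name is hypothetical; the real bolivia_agetic spider added later in this diff follows the same pattern.

from kingfisher_scrapy.base_spider import SimpleSpider


class ExampleFlatSpider(SimpleSpider):
    """
    A hypothetical spider whose publisher releases flat CSV or XLSX files.
    ``unflatten = True`` opts the spider into the new Unflatten pipeline, which
    converts each downloaded file to JSON with Flatten Tool before validation.
    """
    name = 'example_flat'
    data_type = 'release_list'
    unflatten = True

    # Requests are defined as usual; see kingfisher_scrapy/spiders/bolivia_agetic.py
    # below for a complete spider that uses this attribute.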
55 changes: 55 additions & 0 deletions kingfisher_scrapy/pipelines.py
@@ -1,14 +1,20 @@
# https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# https://docs.scrapy.org/en/latest/topics/signals.html#item-signals
import json
import os
import pkgutil
import tempfile
from urllib.parse import urlsplit

import jsonpointer
from flattentool import unflatten
from jsonschema import FormatChecker
from jsonschema.validators import Draft4Validator, RefResolver
from ocdsmerge.util import get_release_schema_url, get_tags
from scrapy.exceptions import DropItem

from kingfisher_scrapy.items import File, FileItem, PluckedItem
from kingfisher_scrapy.util import components


def _json_loads(basename):
@@ -106,6 +112,55 @@ def process_item(self, item, spider):
        return PluckedItem({'value': value})


class Unflatten:
    def process_item(self, item, spider):
        if not spider.unflatten:
            return item

        if not item['file_name']:
            item['file_name'] = urlsplit(item['url']).path.rsplit('/', 1)[-1]

        if item['file_name'].endswith('.csv'):
            input_format = 'csv'
        elif item['file_name'].endswith('.xlsx'):
            input_format = 'xlsx'
        else:
            raise NotImplementedError(f"the file '{item['file_name']}' has no extension or is not CSV or XLSX, "
                                      f"obtained from: {item['url']}")

        with tempfile.TemporaryDirectory() as directory:
            file_path = os.path.join(directory, item['file_name'])
            with open(file_path, 'wb') as f:
                f.write(item['data'])

            if input_format == 'csv':
                input_name = directory
            elif input_format == 'xlsx':
                input_name = file_path

            tags = get_tags()
            for i in range(1, len(tags)):
                tag = tags[-i].replace('__', '.')
                if spider.ocds_version in tag:
                    schema = get_release_schema_url(tags[-i])
                    break

            unflatten(
                input_name,
                root_list_path='releases',
                root_id='ocid',
                schema=schema,
                input_format=input_format,
                output_name=file_path
            )

            with open(file_path, 'r') as f:
                item['data'] = f.read()
            item['file_name'] = components(-1)(item['file_name']) + '.json'

        return item


def _resolve_pointer(data, pointer):
    try:
        return jsonpointer.resolve_pointer(data, pointer)
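As a rough illustration of what the new pipeline does to an item, the sketch below is modelled on tests/pipelines/test_unflatten.py. It assumes it runs inside this repository (spider_with_crawler is the existing test helper) and that Flatten Tool can fetch the OCDS release schema over the network; the URL and CSV content are hypothetical.

from kingfisher_scrapy.items import File
from kingfisher_scrapy.pipelines import Unflatten
from tests import spider_with_crawler  # existing test helper

spider = spider_with_crawler(unflatten=True)  # any spider with unflatten = True
item = File({
    'file_name': 'releases.csv',
    'url': 'http://example.com/releases.csv',  # hypothetical URL
    'data': b'ocid,id\nocds-213czf-000-00001,1\n',  # a minimal flat release for illustration
    'data_type': 'release_list',
})

result = Unflatten().process_item(item, spider)
# The pipeline rewrites the item in place: 'data' should now hold the JSON text
# produced by flattentool.unflatten(), and 'file_name' becomes 'releases.json'.
print(result['file_name'])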
5 changes: 3 additions & 2 deletions kingfisher_scrapy/settings.py
@@ -82,8 +82,9 @@
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'kingfisher_scrapy.pipelines.Sample': 200,
    'kingfisher_scrapy.pipelines.Validate': 300,
    'kingfisher_scrapy.pipelines.Pluck': 301,
    'kingfisher_scrapy.pipelines.Unflatten': 300,
    'kingfisher_scrapy.pipelines.Validate': 301,
    'kingfisher_scrapy.pipelines.Pluck': 302,
}


38 changes: 38 additions & 0 deletions kingfisher_scrapy/spiders/bolivia_agetic.py
@@ -0,0 +1,38 @@
import json

import scrapy

from kingfisher_scrapy.base_spider import SimpleSpider
from kingfisher_scrapy.util import handle_http_error


class BoliviaAgetic(SimpleSpider):
    """
    Domain
      Agencia de Gobierno Electrónico y Tecnologías de Información y Comunicación (AGETIC)
    Spider arguments
      sample
        Downloads the first file in the downloads page.
    Bulk download documentation
      https://datos.gob.bo/id/dataset/contrataciones-agetic-2019-estandar-ocp
    """
    name = 'bolivia_agetic'
    data_type = 'release_list'
    unflatten = True

    def start_requests(self):
        # A CKAN API JSON response.
        url = 'https://datos.gob.bo/api/3/action/package_show?id=contrataciones-agetic-2019-estandar-ocp'
        yield scrapy.Request(url, meta={'file_name': 'list.json'}, callback=self.parse_list)

    @handle_http_error
    def parse_list(self, response):
        data = json.loads(response.text)
        for resource in data['result']['resources']:
            if 'ocds' in resource['description']:
                # Presently, only one URL matches.
                yield scrapy.Request(resource['url'], meta={'file_name': resource['url']}, callback=self.parse_data)

    @handle_http_error
    def parse_data(self, response):
        yield self.build_file(url=response.request.url, data_type=self.data_type, data=response.body)
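For context on what ``parse_list`` iterates over, the abridged sketch below shows the general shape of a CKAN ``package_show`` response. The field values are hypothetical, not real AGETIC data; only ``result`` → ``resources`` → ``description``/``url`` are read by the spider.

# Illustrative only: the general shape of a CKAN package_show response.
# Values are hypothetical placeholders.
ckan_package_show = {
    'success': True,
    'result': {
        'resources': [
            {
                'description': 'Contrataciones 2019 en formato ocds',  # matches the 'ocds' check
                'url': 'https://datos.gob.bo/dataset/example/contrataciones-ocds.xlsx',
            },
            {
                'description': 'Documentación del conjunto de datos',  # skipped
                'url': 'https://datos.gob.bo/dataset/example/documentacion.pdf',
            },
        ],
    },
}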
7 changes: 5 additions & 2 deletions kingfisher_scrapy/util.py
@@ -19,7 +19,8 @@ def _pluck_filename(opts):

def components(start, stop=None):
"""
Returns a function that returns the selected non-empty path components, excluding the ``.json`` extension.
Returns a function that returns the selected non-empty path components, excluding the ``.json``, ``.csv`` or
``.xlsx`` extension.
>>> components(-1)('http://example.com/api/planning.json')
'planning'
@@ -29,8 +30,10 @@
"""
def wrapper(url):
value = '-'.join(list(filter(None, urlsplit(url).path.split('/')))[start:stop])
if value.endswith('.json'):
if value.endswith(('.json', '.xlsx')):
return value[:-5]
if value.endswith('.csv'):
return value[:-4]
return value
return wrapper

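A quick check of the extended extension handling in ``components``; this mirrors the new test in tests/test_util.py and assumes it runs inside this repository.

from kingfisher_scrapy.util import components

last = components(-1)
# .json and .xlsx are five characters, .csv is four; all are stripped.
assert last('http://example.com/api/planning.json') == 'planning'
assert last('http://example.com/api/planning.xlsx') == 'planning'
assert last('http://example.com/api/planning.csv') == 'planning'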
1 change: 1 addition & 0 deletions requirements.in
@@ -1,3 +1,4 @@
flattentool
ijson>=3.1.1
jsonpointer
jsonref
14 changes: 12 additions & 2 deletions requirements.txt
@@ -10,20 +10,27 @@ certifi==2019.11.28 # via requests, sentry-sdk
cffi==1.13.2 # via cryptography
chardet==3.0.4 # via requests
constantly==15.1.0 # via twisted
contextlib2==0.6.0.post1 # via schema
cryptography==2.8 # via pyopenssl, scrapy, service-identity
cssselect==1.1.0 # via parsel, scrapy
defusedxml==0.6.0 # via odfpy
et-xmlfile==1.0.1 # via openpyxl
flattentool==0.14.0 # via -r requirements.in
hyperlink==19.0.0 # via twisted
idna==2.8 # via hyperlink, requests
ijson==3.1.1 # via -r requirements.in
importlib-metadata==1.6.1 # via jsonschema
incremental==17.5.0 # via twisted
itemadapter==0.1.1 # via itemloaders, scrapy
itemloaders==1.0.3 # via scrapy
jdcal==1.4.1 # via openpyxl
jmespath==0.10.0 # via itemloaders
jsonpointer==2.0 # via -r requirements.in
jsonref==0.2 # via -r requirements.in
jsonref==0.2 # via -r requirements.in, flattentool
jsonschema==3.2.0 # via -r requirements.in
lxml==4.4.2 # via parsel, scrapy
lxml==4.4.2 # via flattentool, parsel, scrapy
odfpy==1.4.1 # via flattentool
openpyxl==3.0.5 # via flattentool
parsel==1.5.2 # via itemloaders, scrapy
protego==0.1.16 # via scrapy
pyasn1-modules==0.2.7 # via service-identity
@@ -33,10 +40,12 @@ pydispatcher==2.0.5 # via scrapy
pyhamcrest==1.9.0 # via twisted
pyopenssl==19.1.0 # via scrapy
pyrsistent==0.16.0 # via jsonschema
pytz==2020.1 # via flattentool
queuelib==1.5.0 # via scrapy
rarfile==3.1 # via -r requirements.in
requests==2.22.0 # via -r requirements.in
rfc3987==1.3.8 # via -r requirements.in
schema==0.7.2 # via flattentool
scrapy==2.3.0 # via -r requirements.in, scrapyd, scrapyd-client
scrapyd-client==1.1.0 # via -r requirements.in
scrapyd==1.2.1 # via -r requirements.in
@@ -46,6 +55,7 @@ six==1.13.0 # via automat, cryptography, jsonschema, parsel, prote
twisted==20.3.0 # via scrapy, scrapyd
urllib3==1.25.7 # via requests, sentry-sdk
w3lib==1.21.0 # via itemloaders, parsel, scrapy
xmltodict==0.12.0 # via flattentool
zipp==3.1.0 # via importlib-metadata
zope.interface==4.7.1 # via scrapy, twisted

14 changes: 12 additions & 2 deletions requirements_dev.txt
@@ -11,13 +11,17 @@ cffi==1.13.2 # via -r requirements.txt, cryptography
chardet==3.0.4 # via -r requirements.txt, requests
click==7.1.2 # via pip-tools
constantly==15.1.0 # via -r requirements.txt, twisted
contextlib2==0.6.0.post1 # via -r requirements.txt, schema
coverage==5.0.3 # via coveralls, pytest-cov
coveralls==2.0.0 # via -r requirements_dev.in
cryptography==2.8 # via -r requirements.txt, pyopenssl, scrapy, service-identity
cssselect==1.1.0 # via -r requirements.txt, parsel, scrapy
defusedxml==0.6.0 # via -r requirements.txt, odfpy
docopt==0.6.2 # via coveralls
entrypoints==0.3 # via flake8
et-xmlfile==1.0.1 # via -r requirements.txt, openpyxl
flake8==3.7.9 # via -r requirements_dev.in
flattentool==0.14.0 # via -r requirements.txt
hyperlink==19.0.0 # via -r requirements.txt, twisted
idna==2.8 # via -r requirements.txt, hyperlink, requests
ijson==3.1.1 # via -r requirements.txt
@@ -26,13 +30,16 @@ incremental==17.5.0 # via -r requirements.txt, twisted
isort==4.3.21 # via -r requirements_dev.in
itemadapter==0.1.1 # via -r requirements.txt, itemloaders, scrapy
itemloaders==1.0.3 # via -r requirements.txt, scrapy
jdcal==1.4.1 # via -r requirements.txt, openpyxl
jmespath==0.10.0 # via -r requirements.txt, itemloaders
jsonpointer==2.0 # via -r requirements.txt
jsonref==0.2 # via -r requirements.txt
jsonref==0.2 # via -r requirements.txt, flattentool
jsonschema==3.2.0 # via -r requirements.txt
lxml==4.4.2 # via -r requirements.txt, parsel, scrapy
lxml==4.4.2 # via -r requirements.txt, flattentool, parsel, scrapy
mccabe==0.6.1 # via flake8
more-itertools==8.0.2 # via pytest
odfpy==1.4.1 # via -r requirements.txt, flattentool
openpyxl==3.0.5 # via -r requirements.txt, flattentool
packaging==19.2 # via pytest
parsel==1.5.2 # via -r requirements.txt, itemloaders, scrapy
pip-tools==5.1.0 # via -r requirements_dev.in
@@ -51,10 +58,12 @@ pyparsing==2.4.5 # via packaging
pyrsistent==0.16.0 # via -r requirements.txt, jsonschema
pytest-cov==2.8.1 # via -r requirements_dev.in
pytest==5.3.2 # via -r requirements_dev.in, pytest-cov
pytz==2020.1 # via -r requirements.txt, flattentool
queuelib==1.5.0 # via -r requirements.txt, scrapy
rarfile==3.1 # via -r requirements.txt
requests==2.22.0 # via -r requirements.txt, coveralls
rfc3987==1.3.8 # via -r requirements.txt
schema==0.7.2 # via -r requirements.txt, flattentool
scrapy==2.3.0 # via -r requirements.txt, scrapyd, scrapyd-client
scrapyd-client==1.1.0 # via -r requirements.txt
scrapyd==1.2.1 # via -r requirements.txt
@@ -65,6 +74,7 @@ twisted==20.3.0 # via -r requirements.txt, scrapy, scrapyd
urllib3==1.25.7 # via -r requirements.txt, requests, sentry-sdk
w3lib==1.21.0 # via -r requirements.txt, itemloaders, parsel, scrapy
wcwidth==0.1.7 # via pytest
xmltodict==0.12.0 # via -r requirements.txt, flattentool
zipp==3.1.0 # via -r requirements.txt, importlib-metadata
zope.interface==4.7.1 # via -r requirements.txt, scrapy, twisted

4 changes: 2 additions & 2 deletions tests/__init__.py
@@ -7,10 +7,10 @@
from kingfisher_scrapy.base_spider import BaseSpider


def response_fixture(meta=None, **kwargs):
def response_fixture(meta=None, url_path='', **kwargs):
    if meta is None:
        meta = {'file_name': 'test'}
    request = Request('http://example.com', meta=meta)
    request = Request('http://example.com' + url_path, meta=meta)
    kwargs.setdefault('status', 200)
    kwargs.setdefault('body', b'{"links": {"next": "http://example.com/next"}}')
    return TextResponse(request.url, encoding='utf-8', request=request, **kwargs)
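The new ``url_path`` argument lets tests build fixture responses whose URL ends in a particular file name, which the Unflatten pipeline and ``components`` care about. A small usage sketch, assumed to run inside this repository's test suite:

from tests import response_fixture

response = response_fixture(url_path='/api/planning.csv')
print(response.url)  # http://example.com/api/planning.csv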
47 changes: 47 additions & 0 deletions tests/pipelines/test_unflatten.py
@@ -0,0 +1,47 @@
import pytest
from flattentool.input import BadXLSXZipFile

from kingfisher_scrapy.items import File
from kingfisher_scrapy.pipelines import Unflatten
from tests import spider_with_crawler


def test_process_item():
    spider = spider_with_crawler(unflatten=True)
    pipeline = Unflatten()
    item = File({
        'file_name': 'test.csv',
        'data': b'data',
        'data_type': 'release_list',
        'url': 'http://test.com/test.csv',
    })

    assert pipeline.process_item(item, spider) == item


def test_process_item_error():
    spider = spider_with_crawler(unflatten=True)
    pipeline = Unflatten()
    item = File({
        'file_name': 'file',
        'data': b'data',
        'data_type': 'release_list',
        'url': 'http://test.com/file',
    })

    with pytest.raises(NotImplementedError):
        pipeline.process_item(item, spider)


def test_process_item_xlsx_error():
    spider = spider_with_crawler(unflatten=True)
    pipeline = Unflatten()
    item = File({
        'file_name': 'test.xlsx',
        'data': b'data',
        'data_type': 'release_list',
        'url': 'http://test.com/test.xlsx',
    })

    with pytest.raises(BadXLSXZipFile):
        pipeline.process_item(item, spider)
11 changes: 10 additions & 1 deletion tests/test_util.py
@@ -1,6 +1,6 @@
import pytest

from kingfisher_scrapy.util import get_parameter_value, replace_parameters
from kingfisher_scrapy.util import components, get_parameter_value, replace_parameters


@pytest.mark.parametrize('url,value,expected', [
@@ -20,3 +20,12 @@ def test_replace_parameters(url, value, expected):
])
def test_get_parameter_value(url, expected):
    assert get_parameter_value(url, 'page') == expected


@pytest.mark.parametrize('url,expected', [
    ('http://example.com/example/file.json', 'file'),
    ('http://example.com/example/file.xlsx', 'file'),
    ('http://example.com/example/file.csv', 'file'),
])
def test_components(url, expected):
    assert components(-1)(url) == expected
