base_spider: Rename compressed_file_format string to resize_package boolean
jpmckinney committed Jan 31, 2021
1 parent a3692d0 commit 4d8daeb
Showing 6 changed files with 25 additions and 26 deletions.
18 changes: 6 additions & 12 deletions kingfisher_scrapy/base_spider.py
@@ -26,7 +26,7 @@ class BaseSpider(scrapy.Spider):
     - If a spider requires date parameters to be set, add a ``date_required = True`` class attribute, and set the
       ``default_from_date`` class attribute to a date string.
     - If the spider doesn't work with the ``pluck`` command, set a ``skip_pluck`` class attribute to the reason.
-    - If a spider collects data as CSV or XLSX files, set the class attribute ``unflatten = True`` to convert each
+    - If a spider collects data as CSV or XLSX files, add a ``unflatten = True`` class attribute to convert each
       item to json files in the Unflatten pipeline class using the ``unflatten`` command from Flatten Tool.
       If you need to set more arguments for the unflatten command, set a ``unflatten_args`` dict with them.
     - If the data is not formatted as OCDS (record, release, record package or release package), set the ``root_path``
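As an aside, the ``unflatten = True`` attribute described above can be pictured on a concrete spider. This is a minimal sketch, assuming the project's ``SimpleSpider`` base class handles response parsing; the spider name and URL are hypothetical:

.. code-block:: python

    import scrapy

    from kingfisher_scrapy.base_spider import SimpleSpider


    class ExampleCsvSpider(SimpleSpider):
        # Hypothetical spider, for illustration only.
        name = 'example_csv'
        data_type = 'release_package'
        unflatten = True  # each CSV/XLSX item is converted to JSON by the Unflatten pipeline

        def start_requests(self):
            # Hypothetical URL.
            yield scrapy.Request('https://example.com/releases.csv', meta={'file_name': 'releases.csv'})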
@@ -279,13 +279,7 @@ class CompressedFileSpider(BaseSpider):
     #. Inherit from ``CompressedFileSpider``
     #. Set a ``data_type`` class attribute to the data type of the compressed files
     #. Optionally, set an ``encoding`` class attribute to the encoding of the compressed files (default UTF-8)
-    #. Optionally, set a ``compressed_file_format`` class attribute to the format of the compressed files
-
-       ``release_package``
-         Re-packages the releases in the compressed files in groups of 100, and yields the packages.
-       ``None``
-         Yields each compressed file.
-
+    #. Optionally, add a ``resize_package = True`` class attribute to split large packages (e.g. greater than 100MB)
     #. Write a ``start_requests`` method to request the archive files
 
     .. code-block:: python
@@ -302,7 +296,7 @@ def start_requests(self):
     """
 
     encoding = 'utf-8'
-    compressed_file_format = None
+    resize_package = False
     file_name_must_contain = ''
 
     @handle_http_error
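Taken together, a minimal sketch of a spider that opts into package resizing, assuming a hypothetical archive URL (``CompressedFileSpider`` itself supplies the ``parse`` method shown in the next hunk):

.. code-block:: python

    import scrapy

    from kingfisher_scrapy.base_spider import CompressedFileSpider


    class ExampleZipSpider(CompressedFileSpider):
        # Hypothetical spider, for illustration only.
        name = 'example_zip'
        data_type = 'release_package'
        resize_package = True  # split large packages into smaller ones downstream

        def start_requests(self):
            # Hypothetical URL of a ZIP archive of release packages.
            yield scrapy.Request('https://example.com/releases.zip', meta={'file_name': 'releases.zip'})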
@@ -331,9 +325,9 @@ def parse(self, response):
                 basename += '.json'
 
             compressed_file = archive_file.open(filename)
-            # If `compressed_file_format` is 'release_package', we need to open the file twice: once to extract the
-            # package metadata and then to extract the releases themselves.
-            if self.compressed_file_format == 'release_package':
+            # If `resize_package = True`, then we need to open the file twice: once to extract the package metadata and
+            # then to extract the releases themselves.
+            if self.resize_package:
                 data = {'data': compressed_file, 'package': archive_file.open(filename)}
             else:
                 data = compressed_file
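The two ``archive_file.open(filename)`` calls are the point of this hunk: ijson consumes a stream as it parses, so a handle used to read the package metadata cannot be rewound and reused to iterate the releases. A rough sketch of the idea, using a hypothetical local ZIP file and entry name:

.. code-block:: python

    from zipfile import ZipFile

    import ijson

    archive = ZipFile('releases.zip')            # hypothetical archive
    package_handle = archive.open('data.json')   # first pass: package metadata
    releases_handle = archive.open('data.json')  # second pass: stream the releases

    # Each pass needs its own file object, because ijson cannot rewind a
    # handle it has already advanced.
    for release in ijson.items(releases_handle, 'releases.item'):
        ...  # process one release at a time without loading the whole file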
16 changes: 7 additions & 9 deletions kingfisher_scrapy/middlewares.py
@@ -190,30 +190,28 @@ def process_spider_output(self, response, result, spider):
 
 class ResizePackageMiddleware:
     """
-    If the spider's ``compressed_file_format`` class attribute is "release_package", splits the package into multiple
-    packages. Otherwise, yields the original item.
+    If the spider's ``resize_package`` class attribute is ``True``, splits the package into multiple packages.
+    Otherwise, yields the original item.
     """
     def process_spider_output(self, response, result, spider):
         for item in result:
-            if not isinstance(item, File) or getattr(spider, 'compressed_file_format', None) != 'release_package':
+            if not isinstance(item, File) or not getattr(spider, 'resize_package', False):
                 yield item
                 continue
 
-            list_data = item['data']['data']
-            package_data = item['data']['package']
             data_type = item['data_type']
-            max_releases_per_package = 100
             if spider.sample:
                 size = spider.sample
             else:
-                size = max_releases_per_package
+                size = 100
 
-            package = self._get_package_metadata(package_data, 'releases', data_type)
+            package = self._get_package_metadata(item['data']['package'], 'releases', data_type)
             # We yield a release or record package with a maximum of `size` releases or records
-            for number, items in enumerate(util.grouper(ijson.items(list_data, 'releases.item'), size), 1):
+            for number, items in enumerate(util.grouper(ijson.items(item['data']['data'], 'releases.item'), size), 1):
                 # Avoid reading the rest of a large file, since the rest of the items will be dropped.
                 if spider.sample and number > spider.sample:
                     return
 
                 package['releases'] = filter(None, items)
                 data = json.dumps(package, default=util.default)
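The splitting behaviour is easiest to see through ``util.grouper``, which appears to follow the itertools "grouper" recipe: the last group is padded with ``None``, which is why the middleware applies ``filter(None, items)``. A self-contained sketch under that assumption:

.. code-block:: python

    from itertools import zip_longest


    def grouper(iterable, n, fillvalue=None):
        # itertools "grouper" recipe: the last group is padded with fillvalue.
        args = [iter(iterable)] * n
        return zip_longest(*args, fillvalue=fillvalue)


    releases = ({'id': i} for i in range(250))  # stand-in for ijson.items(..., 'releases.item')
    for number, items in enumerate(grouper(releases, 100), 1):
        package = list(filter(None, items))  # drop the None padding, as the middleware does
        print(number, len(package))  # prints: 1 100, 2 100, 3 50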

7 changes: 5 additions & 2 deletions kingfisher_scrapy/spiders/argentina_buenos_aires.py
@@ -13,10 +13,13 @@ class ArgentinaBuenosAires(CompressedFileSpider):
     Bulk download documentation
       https://data.buenosaires.gob.ar/dataset/buenos-aires-compras/archivo/2a3d077c-71b6-4ba7-8924-f3e38cf1b8fc
     """
 
     name = 'argentina_buenos_aires'
 
+    # SimpleSpider
     data_type = 'release_package'
-    compressed_file_format = 'release_package'
+
+    # CompressedFileSpider
+    resize_package = True
 
     # the data list service takes too long to be downloaded, so we increase the download timeout
     download_timeout = 1000
6 changes: 5 additions & 1 deletion kingfisher_scrapy/spiders/dominican_republic.py
@@ -12,8 +12,12 @@ class DominicanRepublic(CompressedFileSpider):
     https://www.dgcp.gob.do/estandar-mundial-ocds/
     """
     name = 'dominican_republic'
+
+    # SimpleSpider
     data_type = 'release_package'
-    compressed_file_format = 'release_package'
+
+    # CompressedFileSpider
+    resize_package = True
 
     def start_requests(self):
         yield scrapy.Request(
2 changes: 1 addition & 1 deletion tests/middlewares/test_kingfisher_transform_middleware.py
@@ -97,7 +97,7 @@ def test_data_types(data_type, data, root_path):
 def test_parse_release_package(sample, len_releases):
     spider = spider_with_crawler(spider_class=CompressedFileSpider, sample=sample)
     spider.data_type = 'release_package'
-    spider.compressed_file_format = 'release_package'
+    spider.resize_package = True
 
     middleware = ResizePackageMiddleware()
2 changes: 1 addition & 1 deletion tests/test_compressed_file_spider.py
@@ -70,7 +70,7 @@ def test_parse_line_delimited(sample, len_items):
 def test_parse_release_package(sample, len_items, len_releases):
     spider = spider_with_crawler(spider_class=CompressedFileSpider, sample=sample)
     spider.data_type = 'release_package'
-    spider.compressed_file_format = 'release_package'
+    spider.resize_package = True
 
     package = {'releases': []}
     for i in range(200):
