base_spider: Rename compressed_file_format string to resize_package boolean
jpmckinney committed Jan 31, 2021
1 parent a3692d0 commit 4d8daeb
Showing 6 changed files with 25 additions and 26 deletions.
18 changes: 6 additions & 12 deletions kingfisher_scrapy/base_spider.py
@@ -26,7 +26,7 @@ class BaseSpider(scrapy.Spider):
     - If a spider requires date parameters to be set, add a ``date_required = True`` class attribute, and set the
       ``default_from_date`` class attribute to a date string.
     - If the spider doesn't work with the ``pluck`` command, set a ``skip_pluck`` class attribute to the reason.
-    - If a spider collects data as CSV or XLSX files, set the class attribute ``unflatten = True`` to convert each
+    - If a spider collects data as CSV or XLSX files, add a ``unflatten = True`` class attribute to convert each
       item to json files in the Unflatten pipeline class using the ``unflatten`` command from Flatten Tool.
       If you need to set more arguments for the unflatten command, set a ``unflatten_args`` dict with them.
     - If the data is not formatted as OCDS (record, release, record package or release package), set the ``root_path``
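As an aside, the ``unflatten = True`` attribute described above can be pictured on a concrete spider. This is a minimal sketch, assuming the project's ``SimpleSpider`` base class handles response parsing; the spider name and URL are hypothetical:

.. code-block:: python

    import scrapy

    from kingfisher_scrapy.base_spider import SimpleSpider


    class ExampleCsvSpider(SimpleSpider):
        # Hypothetical spider, for illustration only.
        name = 'example_csv'
        data_type = 'release_package'
        unflatten = True  # each CSV/XLSX item is converted to JSON by the Unflatten pipeline

        def start_requests(self):
            # Hypothetical URL.
            yield scrapy.Request('https://example.com/releases.csv', meta={'file_name': 'releases.csv'})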
@@ -279,13 +279,7 @@ class CompressedFileSpider(BaseSpider):
     #. Inherit from ``CompressedFileSpider``
     #. Set a ``data_type`` class attribute to the data type of the compressed files
     #. Optionally, set an ``encoding`` class attribute to the encoding of the compressed files (default UTF-8)
-    #. Optionally, set a ``compressed_file_format`` class attribute to the format of the compressed files
-
-       ``release_package``
-         Re-packages the releases in the compressed files in groups of 100, and yields the packages.
-       ``None``
-         Yields each compressed file.
-
+    #. Optionally, add a ``resize_package = True`` class attribute to split large packages (e.g. greater than 100MB)
     #. Write a ``start_requests`` method to request the archive files
 
     .. code-block:: python
@@ -302,7 +296,7 @@ def start_requests(self):
     """
 
     encoding = 'utf-8'
-    compressed_file_format = None
+    resize_package = False
     file_name_must_contain = ''
 
     @handle_http_error
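Taken together, a minimal sketch of a spider that opts into package resizing, assuming a hypothetical archive URL (``CompressedFileSpider`` itself supplies the ``parse`` method shown in the next hunk):

.. code-block:: python

    import scrapy

    from kingfisher_scrapy.base_spider import CompressedFileSpider


    class ExampleZipSpider(CompressedFileSpider):
        # Hypothetical spider, for illustration only.
        name = 'example_zip'
        data_type = 'release_package'
        resize_package = True  # split large packages into smaller ones downstream

        def start_requests(self):
            # Hypothetical URL of a ZIP archive of release packages.
            yield scrapy.Request('https://example.com/releases.zip', meta={'file_name': 'releases.zip'})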
@@ -331,9 +325,9 @@ def parse(self, response):
                 basename += '.json'
 
             compressed_file = archive_file.open(filename)
-            # If `compressed_file_format` is 'release_package', we need to open the file twice: once to extract the
-            # package metadata and then to extract the releases themselves.
-            if self.compressed_file_format == 'release_package':
+            # If `resize_package = True`, then we need to open the file twice: once to extract the package metadata and
+            # then to extract the releases themselves.
+            if self.resize_package:
                 data = {'data': compressed_file, 'package': archive_file.open(filename)}
             else:
                 data = compressed_file
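The two ``archive_file.open(filename)`` calls are the point of this hunk: ijson consumes a stream as it parses, so a handle used to read the package metadata cannot be rewound and reused to iterate the releases. A rough sketch of the idea, using a hypothetical local ZIP file and entry name:

.. code-block:: python

    from zipfile import ZipFile

    import ijson

    archive = ZipFile('releases.zip')            # hypothetical archive
    package_handle = archive.open('data.json')   # first pass: package metadata
    releases_handle = archive.open('data.json')  # second pass: stream the releases

    # Each pass needs its own file object, because ijson cannot rewind a
    # handle it has already advanced.
    for release in ijson.items(releases_handle, 'releases.item'):
        ...  # process one release at a time without loading the whole file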
16 changes: 7 additions & 9 deletions kingfisher_scrapy/middlewares.py
@@ -190,30 +190,28 @@ def process_spider_output(self, response, result, spider):
 
 class ResizePackageMiddleware:
     """
-    If the spider's ``compressed_file_format`` class attribute is "release_package", splits the package into multiple
-    packages. Otherwise, yields the original item.
+    If the spider's ``resize_package`` class attribute is ``True``, splits the package into multiple packages.
+    Otherwise, yields the original item.
     """
     def process_spider_output(self, response, result, spider):
         for item in result:
-            if not isinstance(item, File) or getattr(spider, 'compressed_file_format', None) != 'release_package':
+            if not isinstance(item, File) or not getattr(spider, 'resize_package', False):
                 yield item
                 continue
 
-            list_data = item['data']['data']
-            package_data = item['data']['package']
             data_type = item['data_type']
-            max_releases_per_package = 100
             if spider.sample:
                 size = spider.sample
             else:
-                size = max_releases_per_package
+                size = 100
 
-            package = self._get_package_metadata(package_data, 'releases', data_type)
+            package = self._get_package_metadata(item['data']['package'], 'releases', data_type)
             # We yield a release or record package with a maximum of `size` releases or records
-            for number, items in enumerate(util.grouper(ijson.items(list_data, 'releases.item'), size), 1):
+            for number, items in enumerate(util.grouper(ijson.items(item['data']['data'], 'releases.item'), size), 1):
                 # Avoid reading the rest of a large file, since the rest of the items will be dropped.
                 if spider.sample and number > spider.sample:
                     return
 
                 package['releases'] = filter(None, items)
                 data = json.dumps(package, default=util.default)
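The splitting behaviour is easiest to see through ``util.grouper``, which appears to follow the itertools "grouper" recipe: the last group is padded with ``None``, which is why the middleware applies ``filter(None, items)``. A self-contained sketch under that assumption:

.. code-block:: python

    from itertools import zip_longest


    def grouper(iterable, n, fillvalue=None):
        # itertools "grouper" recipe: the last group is padded with fillvalue.
        args = [iter(iterable)] * n
        return zip_longest(*args, fillvalue=fillvalue)


    releases = ({'id': i} for i in range(250))  # stand-in for ijson.items(..., 'releases.item')
    for number, items in enumerate(grouper(releases, 100), 1):
        package = list(filter(None, items))  # drop the None padding, as the middleware does
        print(number, len(package))  # prints: 1 100, 2 100, 3 50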

7 changes: 5 additions & 2 deletions kingfisher_scrapy/spiders/argentina_buenos_aires.py
@@ -13,10 +13,13 @@ class ArgentinaBuenosAires(CompressedFileSpider):
     Bulk download documentation
       https://data.buenosaires.gob.ar/dataset/buenos-aires-compras/archivo/2a3d077c-71b6-4ba7-8924-f3e38cf1b8fc
     """
 
     name = 'argentina_buenos_aires'
 
+    # SimpleSpider
     data_type = 'release_package'
-    compressed_file_format = 'release_package'
+
+    # CompressedFileSpider
+    resize_package = True
 
     # the data list service takes too long to be downloaded, so we increase the download timeout
     download_timeout = 1000
6 changes: 5 additions & 1 deletion kingfisher_scrapy/spiders/dominican_republic.py
@@ -12,8 +12,12 @@ class DominicanRepublic(CompressedFileSpider):
     https://www.dgcp.gob.do/estandar-mundial-ocds/
     """
     name = 'dominican_republic'
+
+    # SimpleSpider
     data_type = 'release_package'
-    compressed_file_format = 'release_package'
+
+    # CompressedFileSpider
+    resize_package = True
 
     def start_requests(self):
         yield scrapy.Request(
2 changes: 1 addition & 1 deletion tests/middlewares/test_kingfisher_transform_middleware.py
@@ -97,7 +97,7 @@ def test_data_types(data_type, data, root_path):
 def test_parse_release_package(sample, len_releases):
     spider = spider_with_crawler(spider_class=CompressedFileSpider, sample=sample)
     spider.data_type = 'release_package'
-    spider.compressed_file_format = 'release_package'
+    spider.resize_package = True
 
     middleware = ResizePackageMiddleware()
2 changes: 1 addition & 1 deletion tests/test_compressed_file_spider.py
@@ -70,7 +70,7 @@ def test_parse_line_delimited(sample, len_items):
 def test_parse_release_package(sample, len_items, len_releases):
     spider = spider_with_crawler(spider_class=CompressedFileSpider, sample=sample)
     spider.data_type = 'release_package'
-    spider.compressed_file_format = 'release_package'
+    spider.resize_package = True
 
     package = {'releases': []}
     for i in range(200):
