From 334bbdc09fbd07970f742c0fbee39ac3aa8b42bc Mon Sep 17 00:00:00 2001
From: Yohanna Lisnichuk
Date: Thu, 23 Apr 2020 11:35:50 -0400
Subject: [PATCH 01/14] [WIP] Split large files with ijson

Signed-off-by: Yohanna Lisnichuk
---
 kingfisher_scrapy/base_spider.py              | 56 ++++++++++++++-----
 .../spiders/argentina_buenos_aires.py         | 30 +++++-----
 kingfisher_scrapy/spiders/portugal.py         |  2 +-
 3 files changed, 57 insertions(+), 31 deletions(-)

diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py
index 6598bdb9..c81eba10 100644
--- a/kingfisher_scrapy/base_spider.py
+++ b/kingfisher_scrapy/base_spider.py
@@ -2,9 +2,11 @@
 import json
 import os
 from datetime import datetime
+from decimal import Decimal
 from io import BytesIO
 from zipfile import ZipFile
 
+import ijson
 import scrapy
 
 from kingfisher_scrapy.exceptions import SpiderArgumentError
@@ -36,6 +38,7 @@ class KingfisherSpiderMixin:
         scrapy crawl spider_name -a note='Started by NAME.'
     """
+
     def __init__(self, sample=None, note=None, from_date=None, until_date=None, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -125,19 +128,43 @@ def _get_crawl_path(self):
 # https://rhettinger.wordpress.com/2011/05/26/super-considered-super/
 class BaseSpider(KingfisherSpiderMixin, scrapy.Spider):
+    MAX_SAMPLE = 10
+
+    @staticmethod
+    def json_dumps(data):
+        """
+        From ocdskit, returns the data as JSON.
+        """
+        def default(obj):
+            if isinstance(obj, Decimal):
+                return float(obj)
+            raise TypeError('%s is not JSON serializable' % repr(obj))
+
+        return json.dumps(data, default=default)
+
+    @staticmethod
+    def _parse_json_item(number, line, data_type, url, encoding):
+        yield {
+            'success': True,
+            'number': number,
+            'file_name': 'data.json',
+            'data': line,
+            'data_type': data_type,
+            'url': url,
+            'encoding': encoding,
+        }
+
     def parse_json_lines(self, f, data_type, url, encoding='utf-8'):
         for number, line in enumerate(f, 1):
-            yield {
-                'success': True,
-                'number': number,
-                'file_name': 'data.json',
-                'data': line,
-                'data_type': data_type,
-                'url': url,
-                'encoding': encoding,
-            }
-            if self.sample and number > 9:
+            if self.sample and number > self.MAX_SAMPLE:
                 break
+            yield from self._parse_json_item(number, line, data_type, url, encoding)
+
+    def parse_json_array(self, f, data_type, url, encoding='utf-8', array_field_name='releases'):
+        for number, item in enumerate(ijson.items(f, '{}.item'.format(array_field_name)), 1):
+            if self.sample and number > self.MAX_SAMPLE:
+                break
+            yield from self._parse_json_item(number, self.json_dumps(item), data_type, url, encoding)
 
     @classmethod
     def from_crawler(cls, crawler, *args, **kwargs):
@@ -171,14 +198,15 @@ def parse_zipfile(self, response, data_type, file_format=None, encoding='utf-8')
         Handling response with JSON data in ZIP files
 
         :param str file_format: The zipped files format. If this is set to 'json_lines', then the zipped file will be
-                                slitted by lines before send it to kingfisher-process and only the zip file will be
-                                stored as file.
+                                split into lines before being sent to kingfisher-process, and only the zip file will be
+                                stored as a file. If it is set to 'release_package', the zipped file will be split by
+                                releases.
         :param response response: the response that contains the zip file.
         :param str data_type: the zipped files data_type
         :param str encoding: the zipped files encoding. Default to utf-8
         """
         if response.status == 200:
-            if file_format == 'json_lines':
+            if file_format:
                 self.save_response_to_disk(response, 'file.zip')
                 zip_file = ZipFile(BytesIO(response.body))
                 for finfo in zip_file.infolist():
@@ -188,6 +216,8 @@ def parse_zipfile(self, response, data_type, file_format=None, encoding='utf-8')
                     data = zip_file.open(finfo.filename)
                     if file_format == 'json_lines':
                         yield from self.parse_json_lines(data, data_type, response.request.url, encoding=encoding)
+                    if file_format == 'release_package':
+                        yield from self.parse_json_array(data, data_type, response.request.url, encoding=encoding)
                 else:
                     yield self.save_data_to_disk(data.read(), filename, data_type, response.request.url,
                                                  encoding=encoding)

diff --git a/kingfisher_scrapy/spiders/argentina_buenos_aires.py b/kingfisher_scrapy/spiders/argentina_buenos_aires.py
index 6a4ae297..eb853664 100644
--- a/kingfisher_scrapy/spiders/argentina_buenos_aires.py
+++ b/kingfisher_scrapy/spiders/argentina_buenos_aires.py
@@ -17,29 +17,25 @@ class ArgentinaBuenosAires(BaseSpider):
     def start_requests(self):
         yield scrapy.Request(
             url='https://data.buenosaires.gob.ar/api/3/action/package_show?id=buenos-aires-compras',
-            meta={'type': 'meta'}
+            callback=self.parse_list
         )
 
-    def parse(self, response):
+    @staticmethod
+    def parse_list(response):
         if response.status == 200:
-            if response.request.meta['type'] == 'meta':
-                data = json.loads(response.text)
-                for resource in data['result']['resources']:
-                    if resource['format'].upper() == 'JSON':
-                        yield scrapy.Request(
-                            url=resource['url'],
-                            meta={'type': 'data'}
-                        )
-            else:
-                zip_file = ZipFile(BytesIO(response.body))
-                for finfo in zip_file.infolist():
-                    data = zip_file.open(finfo.filename).read()
-                    yield self.save_data_to_disk(data, finfo.filename, url=response.request.url,
-                                                 data_type='release_package')
+            data = json.loads(response.text)
+            for resource in data['result']['resources']:
+                if resource['format'].upper() == 'JSON':
+                    yield scrapy.Request(
+                        url=resource['url']
+                    )
         else:
             yield {
                 'success': False,
-                'file_name': hashlib.md5(response.request.url.encode('utf-8')).hexdigest() + '.json',
+                'file_name': 'list.json',
                 'url': response.request.url,
                 'errors': {'http_code': response.status}
             }
+
+    def parse(self, response):
+        yield from self.parse_zipfile(response, 'release', file_format='release_package')

diff --git a/kingfisher_scrapy/spiders/portugal.py b/kingfisher_scrapy/spiders/portugal.py
index 3f16e405..be03f42c 100644
--- a/kingfisher_scrapy/spiders/portugal.py
+++ b/kingfisher_scrapy/spiders/portugal.py
@@ -41,5 +41,5 @@ def parse_list(self, response):
         }
 
     def parse(self, response):
-        yield from self.parse_zipfile(response, data_type='record_package_json_lines',
+        yield from self.parse_zipfile(response, data_type='record_package',
                                       file_format='json_lines', encoding='iso-8859-1')
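
Note: before the next commit reworks it, the streaming pattern PATCH 01 adopts is easiest to see in isolation. A minimal, standalone sketch of the same ijson call follows; the file name and the 'releases' field are illustrative assumptions, not part of the patch:

    import ijson

    # ijson.items() yields one element of the top-level "releases" array at a
    # time, so the whole document is never loaded into memory. JSON numbers
    # arrive as decimal.Decimal, which is why the patch adds a custom
    # json_dumps() with a Decimal-aware default handler.
    with open('release_package.json', 'rb') as f:
        for number, release in enumerate(ijson.items(f, 'releases.item'), 1):
            print(number, release.get('ocid'))
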
From 2cec7bb00246949b54a5706d162f18253e61dd06 Mon Sep 17 00:00:00 2001
From: Yohanna Lisnichuk
Date: Mon, 27 Apr 2020 11:18:43 -0400
Subject: [PATCH 02/14] Add package each X releases

Signed-off-by: Yohanna Lisnichuk
---
 kingfisher_scrapy/base_spider.py              | 41 +++++++++++++++--
 .../spiders/argentina_buenos_aires.py         |  9 ++--
 2 files changed, 40 insertions(+), 10 deletions(-)

diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py
index 3533ca0a..7091cb95 100644
--- a/kingfisher_scrapy/base_spider.py
+++ b/kingfisher_scrapy/base_spider.py
@@ -1,6 +1,7 @@
 import hashlib
 import json
 import os
+from copy import copy
 from datetime import datetime
 from decimal import Decimal
 from io import BytesIO
@@ -40,6 +41,7 @@ class BaseSpider(scrapy.Spider):
     """
     MAX_SAMPLE = 10
+    MAX_RELEASES_PER_PACKAGE = 100
 
     def __init__(self, sample=None, note=None, from_date=None, until_date=None, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -175,11 +177,41 @@ def parse_json_lines(self, f, data_type, url, encoding='utf-8'):
                 break
             yield from self._parse_json_item(number, line, data_type, url, encoding)
 
-    def parse_json_array(self, f, data_type, url, encoding='utf-8', array_field_name='releases'):
-        for number, item in enumerate(ijson.items(f, '{}.item'.format(array_field_name)), 1):
+    def get_package(self, f, array_name):
+        package = {'extensions': []}
+        for prefix, event, value in ijson.parse(f):
+            if prefix and 'map' not in event and array_name not in prefix:
+                if 'extensions' in prefix:
+                    if value:
+                        package['extensions'].append(value)
+                elif '.' in prefix:
+                    object_name = prefix.split('.')[0]
+                    object_field = prefix.split('.')[1]
+                    if object_name not in package:
+                        package[object_name] = {}
+                    print(object_name, object_field, value)
+                    package[object_name][object_field] = value
+                else:
+                    package[prefix] = value
+        if not package['extensions']:
+            del(package['extensions'])
+        return package
+
+    def parse_json_array(self, f_package, f_list, data_type, url, encoding='utf-8', array_field_name='releases'):
+        last_number = 0
+        package = self.get_package(f_package, array_field_name)
+        package[array_field_name] = []
+        for number, item in enumerate(ijson.items(f_list, '{}.item'.format(array_field_name)), 1):
             if self.sample and number > self.MAX_SAMPLE:
                 break
-            yield from self._parse_json_item(number, self.json_dumps(item), data_type, url, encoding)
+            last_number = number
+            if len(package[array_field_name]) < self.MAX_RELEASES_PER_PACKAGE:
+                package[array_field_name].append(item)
+            else:
+                yield from self._parse_json_item(number, self.json_dumps(package), data_type, url, encoding)
+                package[array_field_name] = []
+        if package[array_field_name]:
+            yield from self._parse_json_item(last_number + 1, self.json_dumps(package), data_type, url, encoding)
 
     @staticmethod
     def json_dumps(data):
@@ -220,7 +252,8 @@ def parse_zipfile(self, response, data_type, file_format=None, encoding='utf-8')
                     if file_format == 'json_lines':
                         yield from self.parse_json_lines(data, data_type, response.request.url, encoding=encoding)
                     if file_format == 'release_package':
-                        yield from self.parse_json_array(data, data_type, response.request.url, encoding=encoding)
+                        data_package = zip_file.open(finfo.filename)
+                        yield from self.parse_json_array(data_package, data, data_type, response.request.url, encoding=encoding)
                 else:
                     yield self.save_data_to_disk(data.read(), filename, data_type, response.request.url,
                                                  encoding=encoding)

diff --git a/kingfisher_scrapy/spiders/argentina_buenos_aires.py b/kingfisher_scrapy/spiders/argentina_buenos_aires.py
index eb853664..4b7f6d81 100644
--- a/kingfisher_scrapy/spiders/argentina_buenos_aires.py
+++ b/kingfisher_scrapy/spiders/argentina_buenos_aires.py
@@ -1,14 +1,11 @@
-import hashlib
 import json
-from io import BytesIO
-from zipfile import ZipFile
 
 import scrapy
 
-from kingfisher_scrapy.base_spider import BaseSpider
+from kingfisher_scrapy.base_spider import ZipSpider
 
 
-class ArgentinaBuenosAires(BaseSpider):
+class ArgentinaBuenosAires(ZipSpider):
     name = 'argentina_buenos_aires'
     start_urls = ['https://data.buenosaires.gob.ar/api/3/action/package_show?id=buenos-aires-compras']
     # the data list service takes too long to be downloaded, so we increase the download timeout
@@ -38,4 +35,4 @@ def parse_list(response):
             }
 
     def parse(self, response):
-        yield from self.parse_zipfile(response, 'release', file_format='release_package')
+        yield from self.parse_zipfile(response, 'release_package', file_format='release_package')
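
Note: PATCH 02's get_package() walks the low-level event stream that ijson.parse() produces in order to recover the package metadata without reading the releases. A short sketch of those (prefix, event, value) triples, assuming a package such as {"uri": ..., "publisher": {"name": ...}, "releases": [...]}; the file name is an assumption:

    import ijson

    # For {"uri": "http://example.com", "publisher": {"name": "Acme"}, ...}
    # ijson.parse() emits triples such as:
    #   ('', 'start_map', None)
    #   ('', 'map_key', 'uri')
    #   ('uri', 'string', 'http://example.com')
    #   ('publisher', 'start_map', None)
    #   ('publisher.name', 'string', 'Acme')
    #   ('releases', 'start_array', None)
    # get_package() keeps everything whose prefix falls outside the releases
    # array, which this filter approximates:
    with open('release_package.json', 'rb') as f:
        for prefix, event, value in ijson.parse(f):
            if not prefix.startswith('releases'):
                print(prefix, event, value)
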
From 78f61d879501b156fcdd01f1e2a4c7c348ae6387 Mon Sep 17 00:00:00 2001
From: Yohanna Lisnichuk
Date: Mon, 27 Apr 2020 11:32:01 -0400
Subject: [PATCH 03/14] Remove extra print

Signed-off-by: Yohanna Lisnichuk
---
 kingfisher_scrapy/base_spider.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py
index c0e37884..514c28aa 100644
--- a/kingfisher_scrapy/base_spider.py
+++ b/kingfisher_scrapy/base_spider.py
@@ -188,7 +188,6 @@ def get_package(self, f, array_name):
                     object_field = prefix.split('.')[1]
                     if object_name not in package:
                         package[object_name] = {}
-                    print(object_name, object_field, value)
                     package[object_name][object_field] = value
                 else:
                     package[prefix] = value

From 4446890f2d91cdc37d18d91e11192b88715a28ef Mon Sep 17 00:00:00 2001
From: Yohanna Lisnichuk
Date: Mon, 27 Apr 2020 11:34:16 -0400
Subject: [PATCH 04/14] Update base_spider

Signed-off-by: Yohanna Lisnichuk
---
 kingfisher_scrapy/base_spider.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py
index 514c28aa..4bc87120 100644
--- a/kingfisher_scrapy/base_spider.py
+++ b/kingfisher_scrapy/base_spider.py
@@ -223,6 +223,7 @@ def default(obj):
 
         return json.dumps(data, default=default)
 
+
 class ZipSpider(BaseSpider):
 
     def parse_zipfile(self, response, data_type, file_format=None, encoding='utf-8'):
@@ -250,7 +251,8 @@ def parse_zipfile(self, response, data_type, file_format=None, encoding='utf-8')
                         yield from self.parse_json_lines(data, data_type, response.request.url, encoding=encoding)
                     if file_format == 'release_package':
                         data_package = zip_file.open(finfo.filename)
-                        yield from self.parse_json_array(data_package, data, data_type, response.request.url, encoding=encoding)
+                        yield from self.parse_json_array(data_package, data, data_type, response.request.url,
+                                                         encoding=encoding)
                 else:
                     yield self.save_data_to_disk(data.read(), filename, data_type, response.request.url,
                                                  encoding=encoding)

From 95c4968715a017e352a0be50b9681633feffb12a Mon Sep 17 00:00:00 2001
From: Yohanna Lisnichuk
Date: Mon, 27 Apr 2020 14:18:44 -0400
Subject: [PATCH 05/14] Fix base spider

Signed-off-by: Yohanna Lisnichuk
---
 kingfisher_scrapy/base_spider.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py
index 4bc87120..fe26f164 100644
--- a/kingfisher_scrapy/base_spider.py
+++ b/kingfisher_scrapy/base_spider.py
@@ -249,7 +249,7 @@ def parse_zipfile(self, response, data_type, file_format=None, encoding='utf-8')
                     data = zip_file.open(finfo.filename)
                     if file_format == 'json_lines':
                         yield from self.parse_json_lines(data, data_type, response.request.url, encoding=encoding)
-                    if file_format == 'release_package':
+                    elif file_format == 'release_package':
                         data_package = zip_file.open(finfo.filename)
                         yield from self.parse_json_array(data_package, data, data_type, response.request.url,
                                                          encoding=encoding)
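
Note: patches 02 through 05 converge on one idea — copy the package metadata once, then re-emit it with successive groups of at most MAX_RELEASES_PER_PACKAGE releases. Reduced to its essence (the names are illustrative; the spider itself streams the releases with ijson rather than holding a list in memory):

    def repackage(metadata, releases, size=100):
        # Emit the shared metadata repeatedly, each time with the next slice
        # of the releases array.
        for start in range(0, len(releases), size):
            package = dict(metadata)  # shallow copy of the metadata
            package['releases'] = releases[start:start + size]
            yield package

    # e.g. 110 releases with size=100 yield two packages of 100 and 10
    # releases, which is exactly what the test added in the next commit checks.
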
From 5673f2aa1885a8b968604a87f733b60c26450962 Mon Sep 17 00:00:00 2001
From: Yohanna Lisnichuk
Date: Mon, 27 Apr 2020 15:03:25 -0400
Subject: [PATCH 06/14] Add tests

Signed-off-by: Yohanna Lisnichuk
---
 kingfisher_scrapy/base_spider.py | 16 ++++++------
 tests/test_base_spider.py        | 37 +++++++++++++++++++++++++++++---
 2 files changed, 43 insertions(+), 10 deletions(-)

diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py
index fe26f164..e7315d76 100644
--- a/kingfisher_scrapy/base_spider.py
+++ b/kingfisher_scrapy/base_spider.py
@@ -4,6 +4,7 @@
 from datetime import datetime
 from decimal import Decimal
 from io import BytesIO
+from math import ceil
 from zipfile import ZipFile
 
 import ijson
@@ -176,7 +177,8 @@ def parse_json_lines(self, f, data_type, url, encoding='utf-8'):
                 break
             yield from self._parse_json_item(number, line, data_type, url, encoding)
 
-    def get_package(self, f, array_name):
+    @staticmethod
+    def get_package(f, array_name):
         package = {'extensions': []}
         for prefix, event, value in ijson.parse(f):
             if prefix and 'map' not in event and array_name not in prefix:
@@ -196,20 +198,20 @@ def get_package(f, array_name):
         return package
 
     def parse_json_array(self, f_package, f_list, data_type, url, encoding='utf-8', array_field_name='releases'):
-        last_number = 0
+        packages = 0
         package = self.get_package(f_package, array_field_name)
         package[array_field_name] = []
-        for number, item in enumerate(ijson.items(f_list, '{}.item'.format(array_field_name)), 1):
+        for number, item in enumerate(ijson.items(f_list, '{}.item'.format(array_field_name))):
             if self.sample and number > self.MAX_SAMPLE:
                 break
-            last_number = number
             if len(package[array_field_name]) < self.MAX_RELEASES_PER_PACKAGE:
                 package[array_field_name].append(item)
             else:
-                yield from self._parse_json_item(number, self.json_dumps(package), data_type, url, encoding)
-                package[array_field_name] = []
+                packages += 1
+                yield from self._parse_json_item(packages, self.json_dumps(package), data_type, url, encoding)
+                package[array_field_name].clear()
         if package[array_field_name]:
-            yield from self._parse_json_item(last_number + 1, self.json_dumps(package), data_type, url, encoding)
+            yield from self._parse_json_item(packages + 1, self.json_dumps(package), data_type, url, encoding)
 
     @staticmethod
     def json_dumps(data):

diff --git a/tests/test_base_spider.py b/tests/test_base_spider.py
index 01ee92b2..ef8cab99 100644
--- a/tests/test_base_spider.py
+++ b/tests/test_base_spider.py
@@ -223,9 +223,8 @@ def test_parse_zipfile_json_lines():
         tmp = os.path.join(files_store, 'test/20010203_040506')
         os.makedirs(tmp)
         with open(tmp + "test.json", 'w') as f:
-            f.write('{"key": "value"}\n{"key": "value"}\n{"key": "value"}\n{"key": "value"}\n{"key": "value"}'
-                    '\n{"key": "value"}\n{"key": "value"}\n{"key": "value"}\n{"key": "value"}\n{"key": "value"}'
-                    '\n{"key": "value"}')
+            for i in range(10):
+                f.write('{"key": "value"}\n')
         with ZipFile(tmp + '/test.zip', 'w') as z:
             z.write(tmp + "test.json")
         with open(tmp + '/test.zip', 'rb') as z:
@@ -242,6 +241,38 @@ def test_parse_zipfile_json_lines():
     assert total == 10
 
 
+def test_parse_zipfile_release_package():
+    response = text.TextResponse('test')
+    response.status = 200
+    response.request = Mock()
+    response.request.meta = {'kf_filename': 'test.json'}
+    response.request.url = 'url'
+    with TemporaryDirectory() as tmpdirname:
+        files_store = os.path.join(tmpdirname, 'data')
+        tmp = os.path.join(files_store, 'test/20010203_040506')
+        os.makedirs(tmp)
+        with open(tmp + "test.json", 'w') as f:
+            release = {'releases': [], 'publisher': {'name': 'test'},
+                       'extensions': ['a', 'b'], 'license': 'test'}
+            for i in range(110):
+                release['releases'].append({'key': 'value'})
+            json.dump(release, f)
+        with ZipFile(tmp + '/test.zip', 'w') as z:
+            z.write(tmp + "test.json")
+        with open(tmp + '/test.zip', 'rb') as z:
+            response = response.replace(body=z.read())
+        spider = spider_with_crawler(spider_class=ZipSpider)
+        spider.crawler.settings['FILES_STORE'] = files_store
+        for number, actual in enumerate(spider.parse_zipfile(response, None, file_format='release_package'), 1):
+            assert actual['success'] is True and actual['number'] == number
+        spider.sample = True
+        total = 0
+        for item in spider.parse_zipfile(response, None, file_format='release_package'):
+            total = total + 1
+            assert item['success'] is True and item['number'] == total
+        assert total == 1
+
+
 def test_date_arguments():
     test_date = '2000-01-01'
     error_message = "time data 'test' does not match format '%Y-%m-%d'"

From 7053168c99314c36376377bfecc9bd0cf4cf7c17 Mon Sep 17 00:00:00 2001
From: Yohanna Lisnichuk
Date: Mon, 27 Apr 2020 15:05:55 -0400
Subject: [PATCH 07/14] Removed unused methods

Signed-off-by: Yohanna Lisnichuk
---
 kingfisher_scrapy/base_spider.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py
index e7315d76..caf5c8b0 100644
--- a/kingfisher_scrapy/base_spider.py
+++ b/kingfisher_scrapy/base_spider.py
@@ -4,7 +4,6 @@
 from datetime import datetime
 from decimal import Decimal
 from io import BytesIO
-from math import ceil
 from zipfile import ZipFile
 
 import ijson

From 7d406748a36cf073f2cff93a80c23c324324d6f2 Mon Sep 17 00:00:00 2001
From: Yohanna Lisnichuk
Date: Tue, 28 Apr 2020 20:58:42 -0400
Subject: [PATCH 08/14] Update parse array method

Signed-off-by: Yohanna Lisnichuk
---
 kingfisher_scrapy/base_spider.py              | 84 +++++++------
 .../spiders/argentina_buenos_aires.py         | 16 ++--
 kingfisher_scrapy/util.py                     | 53 ++++++++++++
 tests/test_base_spider.py                     | 12 ++-
 4 files changed, 103 insertions(+), 62 deletions(-)
 create mode 100644 kingfisher_scrapy/util.py

diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py
index caf5c8b0..1c4197c8 100644
--- a/kingfisher_scrapy/base_spider.py
+++ b/kingfisher_scrapy/base_spider.py
@@ -2,13 +2,13 @@
 import json
 import os
 from datetime import datetime
-from decimal import Decimal
 from io import BytesIO
 from zipfile import ZipFile
 
 import ijson
 import scrapy
 
+from kingfisher_scrapy import util
 from kingfisher_scrapy.exceptions import SpiderArgumentError
@@ -158,9 +158,8 @@ def _get_crawl_path(self):
             name += '_sample'
         return os.path.join(name, self.get_start_time('%Y%m%d_%H%M%S'))
 
-    @staticmethod
-    def _parse_json_item(number, line, data_type, url, encoding):
-        yield {
+    def _build_file_item(self, number, line, data_type, url, encoding):
+        return {
             'success': True,
             'number': number,
             'file_name': 'data.json',
             'data': line,
             'data_type': data_type,
             'url': url,
             'encoding': encoding,
         }
 
     def parse_json_lines(self, f, data_type, url, encoding='utf-8'):
         for number, line in enumerate(f, 1):
             if self.sample and number > self.MAX_SAMPLE:
                 break
-            yield from self._parse_json_item(number, line, data_type, url, encoding)
+            if isinstance(line, bytes):
+                line = line.decode()
+            yield self._build_file_item(number, line, data_type, url, encoding)
 
-    @staticmethod
-    def get_package(f, array_name):
-        package = {'extensions': []}
-        for prefix, event, value in ijson.parse(f):
-            if prefix and 'map' not in event and array_name not in prefix:
-                if 'extensions' in prefix:
-                    if value:
-                        package['extensions'].append(value)
-                elif '.' in prefix:
-                    object_name = prefix.split('.')[0]
-                    object_field = prefix.split('.')[1]
-                    if object_name not in package:
-                        package[object_name] = {}
-                    package[object_name][object_field] = value
-                else:
-                    package[prefix] = value
-        if not package['extensions']:
-            del(package['extensions'])
+    def get_package(self, f, array_name):
+        """
+        Returns the package data from an array_name_package object.
+        """
+        package = {}
+        for item in util.items(ijson.parse(f), '', array_name=array_name):
+            package.update(item)
         return package
 
     def parse_json_array(self, f_package, f_list, data_type, url, encoding='utf-8', array_field_name='releases'):
-        packages = 0
+        if self.sample:
+            size = self.MAX_SAMPLE
+        else:
+            size = self.MAX_RELEASES_PER_PACKAGE
+
         package = self.get_package(f_package, array_field_name)
-        package[array_field_name] = []
-        for number, item in enumerate(ijson.items(f_list, '{}.item'.format(array_field_name))):
-            if self.sample and number > self.MAX_SAMPLE:
-                break
-            if len(package[array_field_name]) < self.MAX_RELEASES_PER_PACKAGE:
-                package[array_field_name].append(item)
-            else:
-                packages += 1
-                yield from self._parse_json_item(packages, self.json_dumps(package), data_type, url, encoding)
-                package[array_field_name].clear()
-        if package[array_field_name]:
-            yield from self._parse_json_item(packages + 1, self.json_dumps(package), data_type, url, encoding)
-
-    @staticmethod
-    def json_dumps(data):
-        """
-        From ocdskit, returns the data as JSON.
-        """
-        def default(obj):
-            if isinstance(obj, Decimal):
-                return float(obj)
-            raise TypeError('%s is not JSON serializable' % repr(obj))
-
-        return json.dumps(data, default=default)
+
+        for number, items in enumerate(util.grouper(ijson.items(f_list, '{}.item'.format(array_field_name)), size), 1):
+            package[array_field_name] = [item for item in items if item is not None]
+            yield self._build_file_item(number, json.dumps(package, default=util.default), data_type, url, encoding)
+            if self.sample:
+                break
 
 
 class ZipSpider(BaseSpider):
-
     def parse_zipfile(self, response, data_type, file_format=None, encoding='utf-8'):
         """
         Handling response with JSON data in ZIP files
 
-        :param str file_format: The zipped files format. If this is set to 'json_lines', then the zipped file will be
-                                split into lines before being sent to kingfisher-process, and only the zip file will be
-                                stored as a file. If it is set to 'release_package', the zipped file will be split by
-                                releases.
+        :param str file_format: The zipped file's format. If this is set to "json_lines", then each line of the zipped
+                                file will be yielded separately. If this is set to "release_package", then the releases
+                                will be re-packaged in groups of :const:~kingfisher_scrapy.base_spider.BaseSpider.
+                                MAX_RELEASES_PER_PACKAGE and yielded. In both cases, only the zipped file will be saved
+                                to disk. If this is not set, the file will be yielded and saved to disk.
         :param response response: the response that contains the zip file.
         :param str data_type: the zipped files data_type
         :param str encoding: the zipped files encoding. Default to utf-8
         """
@@ -249,8 +225,8 @@ def parse_zipfile(self, response, data_type, file_format=None, encoding='utf-8')
                     data = zip_file.open(finfo.filename)
                     if file_format == 'json_lines':
                         yield from self.parse_json_lines(data, data_type, response.request.url, encoding=encoding)
                     elif file_format == 'release_package':
-                        data_package = zip_file.open(finfo.filename)
-                        yield from self.parse_json_array(data_package, data, data_type, response.request.url,
+                        package = zip_file.open(finfo.filename)
+                        yield from self.parse_json_array(package, data, data_type, response.request.url,
                                                          encoding=encoding)
                 else:
                     yield self.save_data_to_disk(data.read(), filename, data_type, response.request.url,

diff --git a/kingfisher_scrapy/spiders/argentina_buenos_aires.py b/kingfisher_scrapy/spiders/argentina_buenos_aires.py
index 4b7f6d81..fb9ccff2 100644
--- a/kingfisher_scrapy/spiders/argentina_buenos_aires.py
+++ b/kingfisher_scrapy/spiders/argentina_buenos_aires.py
@@ -6,6 +6,15 @@
 
 class ArgentinaBuenosAires(ZipSpider):
+    """
+    Bulk download documentation
+      https://data.buenosaires.gob.ar/dataset/buenos-aires-compras/archivo/2a3d077c-71b6-4ba7-8924-f3e38cf1b8fc
+    API documentation
+      https://data.buenosaires.gob.ar/acerca/ckan
+    Spider arguments
+      sample
+        Downloads the zip file and sends 10 releases to kingfisher process.
+    """
     name = 'argentina_buenos_aires'
     start_urls = ['https://data.buenosaires.gob.ar/api/3/action/package_show?id=buenos-aires-compras']
     # the data list service takes too long to be downloaded, so we increase the download timeout
@@ -17,15 +26,12 @@ def start_requests(self):
         yield scrapy.Request(
             url='https://data.buenosaires.gob.ar/api/3/action/package_show?id=buenos-aires-compras',
             callback=self.parse_list
         )
 
-    @staticmethod
-    def parse_list(response):
+    def parse_list(self, response):
         if response.status == 200:
             data = json.loads(response.text)
             for resource in data['result']['resources']:
                 if resource['format'].upper() == 'JSON':
-                    yield scrapy.Request(
-                        url=resource['url']
-                    )
+                    yield scrapy.Request(url=resource['url'])
         else:
             yield {
                 'success': False,

diff --git a/kingfisher_scrapy/util.py b/kingfisher_scrapy/util.py
new file mode 100644
index 00000000..511435b2
--- /dev/null
+++ b/kingfisher_scrapy/util.py
@@ -0,0 +1,53 @@
+import itertools
+import json
+from decimal import Decimal
+
+from ijson import utils, ObjectBuilder
+
+
+@utils.coroutine
+def items_basecoro(target, prefix, map_type=None, array_name=None):
+    """
+    An couroutine dispatching native Python objects constructed from the events
+    under a given prefix.
+    """
+    while True:
+        current, event, value = (yield)
+        if array_name and array_name in current:
+            continue
+        if current == prefix:
+            if event in ('start_map', 'start_array'):
+                builder = ObjectBuilder(map_type=map_type)
+                end_event = event.replace('start', 'end')
+                while (current, event) != (prefix, end_event):
+                    builder.event(event, value)
+                    current, event, value = (yield)
+                del builder.containers[:]
+                target.send(builder.value)
+            else:
+                target.send(value)
+
+
+def items(events, prefix, map_type=None, array_name=None):
+    """Like ijson.items, but takes events generated via ijson.parse instead of
+    a file"""
+    return utils.coros2gen(events, (items_basecoro, (prefix, ), {'map_type': map_type, 'array_name': array_name}))
+
+
+def default(obj):
+    """
+    From ocdskit, returns the data as JSON.
+    """
+    if isinstance(obj, Decimal):
+        return float(obj)
+    try:
+        iter(obj)
+    except TypeError:
+        pass
+    return json.JSONEncoder().default(obj)
+
+
+# See `grouper` recipe: https://docs.python.org/3.8/library/itertools.html#recipes
+def grouper(iterable, n, fillvalue=None):
+    args = [iter(iterable)] * n
+    return itertools.zip_longest(*args, fillvalue=fillvalue)

diff --git a/tests/test_base_spider.py b/tests/test_base_spider.py
index ef8cab99..dbfdab5d 100644
--- a/tests/test_base_spider.py
+++ b/tests/test_base_spider.py
@@ -253,7 +253,7 @@ def test_parse_zipfile_release_package():
         os.makedirs(tmp)
         with open(tmp + "test.json", 'w') as f:
             release = {'releases': [], 'publisher': {'name': 'test'},
-                       'extensions': ['a', 'b'], 'license': 'test'}
+                       'extensions': ['a', 'b'], 'license': 'test', 'extra': 1.1}
             for i in range(110):
                 release['releases'].append({'key': 'value'})
             json.dump(release, f)
@@ -263,13 +263,19 @@ def test_parse_zipfile_release_package():
             response = response.replace(body=z.read())
         spider = spider_with_crawler(spider_class=ZipSpider)
         spider.crawler.settings['FILES_STORE'] = files_store
-        for number, actual in enumerate(spider.parse_zipfile(response, None, file_format='release_package'), 1):
-            assert actual['success'] is True and actual['number'] == number
+        actual = spider.parse_zipfile(response, None, file_format='release_package').__next__()
+        data = json.loads(actual['data'])
+        assert actual['success'] is True and actual['number'] == 1
+        assert data['publisher']['name'] == 'test'
+        assert data['extensions'] == ['a', 'b']
+        assert len(data['releases']) == spider.MAX_RELEASES_PER_PACKAGE
         spider.sample = True
         total = 0
         for item in spider.parse_zipfile(response, None, file_format='release_package'):
             total = total + 1
+            data = json.loads(item['data'])
             assert item['success'] is True and item['number'] == total
+            assert len(data['releases']) == spider.MAX_SAMPLE
         assert total == 1

From 6744a91cacc2c709920d56a5f2313313466d2b46 Mon Sep 17 00:00:00 2001
From: Yohanna Lisnichuk
Date: Thu, 30 Apr 2020 14:36:54 -0400
Subject: [PATCH 09/14] Update docs

Signed-off-by: Yohanna Lisnichuk
---
 kingfisher_scrapy/base_spider.py |  2 +-
 kingfisher_scrapy/util.py        | 23 +++++++++++++--------
 2 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py
index 8bdb9984..aeb698fb 100644
--- a/kingfisher_scrapy/base_spider.py
+++ b/kingfisher_scrapy/base_spider.py
@@ -209,7 +209,7 @@ def parse_zipfile(self, response, data_type, file_format=None, encoding='utf-8')
         :param str file_format: The zipped file's format. If this is set to "json_lines", then each line of the zipped
                                 file will be yielded separately. If this is set to "release_package", then the releases
                                 will be re-packaged in groups of :const:~kingfisher_scrapy.base_spider.BaseSpider.
-                                MAX_RELEASES_PER_PACKAGE and yielded. In both cases, only the zipped file will be saved
+                                `~kingfisher_scrapy.base_spider.BaseSpider.MAX_RELEASES_PER_PACKAGE` and yielded. In both cases, only the zipped file will be saved
                                 to disk. If this is not set, the file will be yielded and saved to disk.
         :param response response: the response that contains the zip file.
         :param str data_type: the zipped files data_type

diff --git a/kingfisher_scrapy/util.py b/kingfisher_scrapy/util.py
index 511435b2..7317da91 100644
--- a/kingfisher_scrapy/util.py
+++ b/kingfisher_scrapy/util.py
@@ -2,14 +2,14 @@
 import itertools
 import json
 from decimal import Decimal
 
-from ijson import utils, ObjectBuilder
+from ijson import ObjectBuilder, utils
 
 
 @utils.coroutine
 def items_basecoro(target, prefix, map_type=None, array_name=None):
     """
-    An couroutine dispatching native Python objects constructed from the events
-    under a given prefix.
+    This is copied from ``ijson/common.py``. An ``array_name`` argument is added. If the ``array_name`` is in the
+    current path, the current event is skipped. Otherwise, the method is identical.
     """
     while True:
         current, event, value = (yield)
@@ -29,21 +29,28 @@ def items_basecoro(target, prefix, map_type=None, array_name=None):
 
 def items(events, prefix, map_type=None, array_name=None):
-    """Like ijson.items, but takes events generated via ijson.parse instead of
-    a file"""
-    return utils.coros2gen(events, (items_basecoro, (prefix, ), {'map_type': map_type, 'array_name': array_name}))
+    """
+    This is copied from ``ijson/common.py``. An ``array_name`` argument is added, which is passed as a keyword argument
+    to :meth:`~kingfisher_scrapy.util.items_basecoro`. Otherwise, the method is identical.
+
+    """
+    return utils.coros2gen(events,
+                           (items_basecoro, (prefix,), {'map_type': map_type, 'array_name': array_name})
+                           )
 
 
 def default(obj):
     """
-    From ocdskit, returns the data as JSON.
+    Converts the object to a JSON-serializable type, and returns it.
     """
     if isinstance(obj, Decimal):
         return float(obj)
     try:
-        iter(obj)
+        iterable = iter(obj)
     except TypeError:
         pass
+    else:
+        return list(iterable)
     return json.JSONEncoder().default(obj)

From 317250be365b3449651d4bb41b1a11fd765d0610 Mon Sep 17 00:00:00 2001
From: Yohanna Lisnichuk
Date: Thu, 30 Apr 2020 14:51:42 -0400
Subject: [PATCH 10/14] Update docs

Signed-off-by: Yohanna Lisnichuk
---
 kingfisher_scrapy/base_spider.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py
index aeb698fb..c0eda20d 100644
--- a/kingfisher_scrapy/base_spider.py
+++ b/kingfisher_scrapy/base_spider.py
@@ -209,8 +209,9 @@ def parse_zipfile(self, response, data_type, file_format=None, encoding='utf-8')
         :param str file_format: The zipped file's format. If this is set to "json_lines", then each line of the zipped
                                 file will be yielded separately. If this is set to "release_package", then the releases
                                 will be re-packaged in groups of :const:~kingfisher_scrapy.base_spider.BaseSpider.
-                                `~kingfisher_scrapy.base_spider.BaseSpider.MAX_RELEASES_PER_PACKAGE` and yielded. In both cases, only the zipped file will be saved
-                                to disk. If this is not set, the file will be yielded and saved to disk.
+                                `~kingfisher_scrapy.base_spider.BaseSpider.MAX_RELEASES_PER_PACKAGE` and yielded.
+                                In both cases, only the zipped file will be saved to disk. If this is not set, the file
+                                will be yielded and saved to disk.
         :param response response: the response that contains the zip file.
         :param str data_type: the zipped files data_type
         :param str encoding: the zipped files encoding. Default to utf-8
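
Note: the default() helper exists because ijson parses JSON numbers as decimal.Decimal, which the standard json module refuses to serialize. A minimal standalone demonstration of the hook (not part of the patches):

    import json
    from decimal import Decimal

    def default(obj):
        # json.dumps() calls this only for objects it cannot serialize itself.
        if isinstance(obj, Decimal):
            return float(obj)
        raise TypeError('%r is not JSON serializable' % obj)

    print(json.dumps({'amount': Decimal('1.1')}, default=default))
    # {"amount": 1.1}
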
From cfba69b3b68dfc3972125566451011ac63c319f9 Mon Sep 17 00:00:00 2001
From: Yohanna Lisnichuk
Date: Thu, 30 Apr 2020 15:23:37 -0400
Subject: [PATCH 11/14] Add ijson to requirements

Signed-off-by: Yohanna Lisnichuk
---
 requirements.in      | 1 +
 requirements.txt     | 1 +
 requirements_dev.txt | 1 +
 3 files changed, 3 insertions(+)

diff --git a/requirements.in b/requirements.in
index ab64af98..34c50ac3 100644
--- a/requirements.in
+++ b/requirements.in
@@ -5,3 +5,4 @@ rarfile
 requests
 Scrapy
 scrapyd-client
+ijson>=3
\ No newline at end of file

diff --git a/requirements.txt b/requirements.txt
index 6541b823..669121e8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,6 +14,7 @@ cryptography==2.8  # via pyopenssl, scrapy, service-identity
 cssselect==1.1.0  # via parsel, scrapy
 hyperlink==19.0.0  # via twisted
 idna==2.8  # via hyperlink, requests
+ijson==3.0.3
 incremental==17.5.0  # via twisted
 lxml==4.4.2  # via parsel, scrapy
 parsel==1.5.2  # via scrapy

diff --git a/requirements_dev.txt b/requirements_dev.txt
index 3d2cb46d..1ebbde86 100644
--- a/requirements_dev.txt
+++ b/requirements_dev.txt
@@ -20,6 +20,7 @@ entrypoints==0.3  # via flake8
 flake8==3.7.9
 hyperlink==19.0.0
 idna==2.8
+ijson==3.0.3
 importlib-metadata==1.3.0  # via pluggy, pytest
 incremental==17.5.0
 isort==4.3.21

From 3980170cd51a4aed38b95c4614b986ffdbe7a78d Mon Sep 17 00:00:00 2001
From: James McKinney <26463+jpmckinney@users.noreply.github.com>
Date: Thu, 30 Apr 2020 15:56:04 -0400
Subject: [PATCH 12/14] Fix :const: syntax

---
 kingfisher_scrapy/base_spider.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py
index c0eda20d..77927eb5 100644
--- a/kingfisher_scrapy/base_spider.py
+++ b/kingfisher_scrapy/base_spider.py
@@ -207,11 +207,10 @@ def parse_zipfile(self, response, data_type, file_format=None, encoding='utf-8')
         Handling response with JSON data in ZIP files
 
         :param str file_format: The zipped file's format. If this is set to "json_lines", then each line of the zipped
-                                file will be yielded separately. If this is set to "release_package", then the releases
-                                will be re-packaged in groups of :const:~kingfisher_scrapy.base_spider.BaseSpider.
-                                `~kingfisher_scrapy.base_spider.BaseSpider.MAX_RELEASES_PER_PACKAGE` and yielded.
-                                In both cases, only the zipped file will be saved to disk. If this is not set, the file
-                                will be yielded and saved to disk.
+            file will be yielded separately. If this is set to "release_package", then the releases will be re-packaged
+            in groups of :const:`~kingfisher_scrapy.base_spider.BaseSpider.MAX_RELEASES_PER_PACKAGE` and yielded. In
+            both cases, only the zipped file will be saved to disk. If this is not set, the file will be yielded and
+            saved to disk.
         :param response response: the response that contains the zip file.
         :param str data_type: the zipped files data_type
         :param str encoding: the zipped files encoding. Default to utf-8
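
Note: the grouper() recipe underpinning parse_json_array() pads its final group with the fill value, which is why the caller filters out None items before re-packaging. A quick standalone illustration:

    import itertools

    def grouper(iterable, n, fillvalue=None):
        args = [iter(iterable)] * n
        return itertools.zip_longest(*args, fillvalue=fillvalue)

    print(list(grouper('ABCDE', 2)))
    # [('A', 'B'), ('C', 'D'), ('E', None)] -- the trailing None is what the
    # "if item is not None" filter in parse_json_array() drops.
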
From 5dc636cec2be69c6b2c50f9277ef35876a0cc752 Mon Sep 17 00:00:00 2001
From: James McKinney <26463+jpmckinney@users.noreply.github.com>
Date: Thu, 30 Apr 2020 15:56:24 -0400
Subject: [PATCH 13/14] Use identical whitespace to original ijson method, for
 easier visual comparison

---
 kingfisher_scrapy/util.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/kingfisher_scrapy/util.py b/kingfisher_scrapy/util.py
index 7317da91..b78fe0ee 100644
--- a/kingfisher_scrapy/util.py
+++ b/kingfisher_scrapy/util.py
@@ -32,11 +32,10 @@ def items(events, prefix, map_type=None, array_name=None):
     """
     This is copied from ``ijson/common.py``. An ``array_name`` argument is added, which is passed as a keyword argument
     to :meth:`~kingfisher_scrapy.util.items_basecoro`. Otherwise, the method is identical.
-
     """
     return utils.coros2gen(events,
-                           (items_basecoro, (prefix,), {'map_type': map_type, 'array_name': array_name})
-                           )
+        (items_basecoro, (prefix,), {'map_type': map_type, 'array_name': array_name})  # noqa: E128
+    )
 
 
 def default(obj):

From f64e2965a270cf52ffc86423819d1bf317e6b26b Mon Sep 17 00:00:00 2001
From: James McKinney <26463+jpmckinney@users.noreply.github.com>
Date: Thu, 30 Apr 2020 15:58:09 -0400
Subject: [PATCH 14/14] Upgrade pip-tools

---
 requirements.txt     | 5 ++++-
 requirements_dev.txt | 5 +++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 669121e8..16b7f6ad 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,6 +9,7 @@ automat==0.8.0  # via twisted
 certifi==2019.11.28  # via requests
 cffi==1.13.2  # via cryptography
 chardet==3.0.4  # via requests
+click==7.1.2  # via pip-tools
 constantly==15.1.0  # via twisted
 cryptography==2.8  # via pyopenssl, scrapy, service-identity
 cssselect==1.1.0  # via parsel, scrapy
@@ -18,6 +19,7 @@ ijson==3.0.3
 incremental==17.5.0  # via twisted
 lxml==4.4.2  # via parsel, scrapy
 parsel==1.5.2  # via scrapy
+pip-tools==5.1.0
 protego==0.1.16  # via scrapy
 pyasn1-modules==0.2.7  # via service-identity
 pyasn1==0.4.8  # via pyasn1-modules, service-identity
@@ -31,11 +33,12 @@ requests==2.22.0
 scrapy==1.8.0
 scrapyd-client==1.1.0
 service-identity==18.1.0  # via scrapy
-six==1.13.0  # via automat, cryptography, parsel, protego, pyhamcrest, pyopenssl, scrapy, scrapyd-client, w3lib
+six==1.13.0  # via automat, cryptography, parsel, pip-tools, protego, pyhamcrest, pyopenssl, scrapy, scrapyd-client, w3lib
 twisted==20.3.0  # via scrapy
 urllib3==1.25.7  # via requests
 w3lib==1.21.0  # via parsel, scrapy
 zope.interface==4.7.1  # via scrapy, twisted
 
 # The following packages are considered to be unsafe in a requirements file:
+# pip
 # setuptools

diff --git a/requirements_dev.txt b/requirements_dev.txt
index 1ebbde86..1be8d3f0 100644
--- a/requirements_dev.txt
+++ b/requirements_dev.txt
@@ -9,7 +9,7 @@ automat==0.8.0
 certifi==2019.11.28
 cffi==1.13.2
 chardet==3.0.4
-click==7.0  # via pip-tools
+click==7.1.2
 constantly==15.1.0
 coverage==5.0.3  # via coveralls, pytest-cov
 coveralls==2.0.0
@@ -29,7 +29,7 @@ mccabe==0.6.1  # via flake8
 more-itertools==8.0.2  # via pytest, zipp
 packaging==19.2  # via pytest
 parsel==1.5.2
-pip-tools==4.3.0
+pip-tools==5.1.0
 pluggy==0.13.1  # via pytest
 protego==0.1.16
 py==1.8.0  # via pytest
@@ -59,4 +59,5 @@ zipp==0.6.0  # via importlib-metadata
 zope.interface==4.7.1
 
 # The following packages are considered to be unsafe in a requirements file:
+# pip
 # setuptools
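
Note: taken together, the series lets a spider delegate large zipped release packages to the base class with a one-line parse method, as argentina_buenos_aires.py now does. A sketch of the resulting usage; the spider name and URL are placeholders:

    from kingfisher_scrapy.base_spider import ZipSpider

    class ExampleSpider(ZipSpider):
        name = 'example'
        start_urls = ['http://example.com/releases.zip']

        def parse(self, response):
            # Re-packages the zipped release package in groups of
            # MAX_RELEASES_PER_PACKAGE and yields each group.
            yield from self.parse_zipfile(response, 'release_package',
                                          file_format='release_package')
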