From c1c8a7aff0b01238da521b645e4b1add1380e8d0 Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Tue, 19 May 2020 10:44:44 -0400 Subject: [PATCH 01/13] Add KingfisherStoreFiles extension to avoid storing files in spiders Signed-off-by: Yohanna Lisnichuk --- kingfisher_scrapy/base_spider.py | 61 ++++++-------------- kingfisher_scrapy/extensions.py | 50 ++++++++++++++++ kingfisher_scrapy/settings.py | 3 +- tests/test_base_spider.py | 97 ++----------------------------- tests/test_extensions.py | 99 +++++++++++++++++++++++++++++++- 5 files changed, 174 insertions(+), 136 deletions(-) diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py index ffd6bad1c..be036ce26 100644 --- a/kingfisher_scrapy/base_spider.py +++ b/kingfisher_scrapy/base_spider.py @@ -100,63 +100,40 @@ def get_local_file_path_excluding_filestore(self, filename): def save_response_to_disk(self, response, filename, data_type=None, encoding='utf-8'): """ - Writes the response's body to the filename in the crawl's directory. - - Writes a ``.fileinfo`` metadata file in the crawl's directory, and returns a dict with the metadata. + Sends a dict to KingfisherStoreFiles to store the data """ - return self._save_response_to_disk(response.body, filename, response.request.url, data_type, encoding) + return { + 'data': response.body, + 'file_name': filename, + 'url': response.request.url, + 'data_type': data_type, + 'encoding': encoding + } def save_data_to_disk(self, data, filename, url=None, data_type=None, encoding='utf-8'): """ - Writes the data to the filename in the crawl's directory. - - Writes a ``.fileinfo`` metadata file in the crawl's directory, and returns a dict with the metadata. - """ - return self._save_response_to_disk(data, filename, url, data_type, encoding) - - def get_start_time(self, format): - """ - Returns the formatted start time of the crawl. + Sends a dict to KingfisherStoreFiles to store the data """ - return self.crawler.stats.get_value('start_time').strftime(format) - - def _save_response_to_disk(self, data, filename, url, data_type, encoding): - self._write_file(filename, data) - - metadata = { + return { + 'data': data, + 'file_name': filename, 'url': url, 'data_type': data_type, - 'encoding': encoding, + 'encoding': encoding } - self._write_file(filename + '.fileinfo', metadata) - - metadata['success'] = True - metadata['file_name'] = filename - - return metadata - - def _write_file(self, filename, data): - path = self.get_local_file_path_including_filestore(filename) - os.makedirs(os.path.dirname(path), exist_ok=True) - - if isinstance(data, bytes): - mode = 'wb' - else: - mode = 'w' - - with open(path, mode) as f: - if isinstance(data, (bytes, str)): - f.write(data) - else: - json.dump(data, f) - def _get_crawl_path(self): name = self.name if self.sample: name += '_sample' return os.path.join(name, self.get_start_time('%Y%m%d_%H%M%S')) + def get_start_time(self, format): + """ + Returns the formatted start time of the crawl. + """ + return self.crawler.stats.get_value('start_time').strftime(format) + def _build_file_item(self, number, line, data_type, url, encoding): return { 'success': True, diff --git a/kingfisher_scrapy/extensions.py b/kingfisher_scrapy/extensions.py index 93aa97f0e..e53b73cb1 100644 --- a/kingfisher_scrapy/extensions.py +++ b/kingfisher_scrapy/extensions.py @@ -8,6 +8,56 @@ # https://docs.scrapy.org/en/latest/topics/extensions.html#writing-your-own-extension +class KingfisherStoreFiles: + def __init__(self, directory, stats): + self.directory = directory + self.stats = stats + + @classmethod + def from_crawler(cls, crawler): + directory = crawler.settings['FILES_STORE'] + extension = cls(directory, crawler.stats) + crawler.signals.connect(extension.item_scraped, signal=signals.item_scraped) + return extension + + def item_scraped(self, item, spider): + """ + Writes the response's body to the filename in the crawl's directory. + + Writes a ``.fileinfo`` metadata file in the crawl's directory, and returns a dict with the metadata. + """ + self._write_file(item['file_name'], item['data'], spider) + + metadata = { + 'url': item['url'], + 'data_type': item['data_type'], + 'encoding': item['encoding'], + } + + self._write_file(item['file_name'] + '.fileinfo', metadata, spider) + + metadata['success'] = True + metadata['file_name'] = item['file_name'] + item['success'] = True + + return metadata + + def _write_file(self, filename, data, spider): + path = spider.get_local_file_path_including_filestore(filename) + os.makedirs(os.path.dirname(path), exist_ok=True) + + if isinstance(data, bytes): + mode = 'wb' + else: + mode = 'w' + + with open(path, mode) as f: + if isinstance(data, (bytes, str)): + f.write(data) + else: + json.dump(data, f) + + class KingfisherAPI: def __init__(self, url, key, directory=None): """ diff --git a/kingfisher_scrapy/settings.py b/kingfisher_scrapy/settings.py index 6c0a246ee..63f61d93b 100644 --- a/kingfisher_scrapy/settings.py +++ b/kingfisher_scrapy/settings.py @@ -67,7 +67,8 @@ # 'scrapy.extensions.telnet.TelnetConsole': None, #} EXTENSIONS = { - 'kingfisher_scrapy.extensions.KingfisherAPI': 0, + 'kingfisher_scrapy.extensions.KingfisherStoreFiles': 100, + 'kingfisher_scrapy.extensions.KingfisherAPI': 500, } # Configure item pipelines diff --git a/tests/test_base_spider.py b/tests/test_base_spider.py index dbfdab5d8..e2bc9aee3 100644 --- a/tests/test_base_spider.py +++ b/tests/test_base_spider.py @@ -52,91 +52,6 @@ def test_get_local_file_path_excluding_filestore(sample, expected): assert spider.get_local_file_path_excluding_filestore('file.json') == expected -@pytest.mark.parametrize('sample,path', [ - (None, 'test/20010203_040506/file.json'), - ('true', 'test_sample/20010203_040506/file.json'), -]) -def test_save_response_to_disk(sample, path): - spider = spider_with_crawler(sample=sample) - - with TemporaryDirectory() as tmpdirname: - files_store = os.path.join(tmpdirname, 'data') - spider.crawler.settings['FILES_STORE'] = files_store - - response = Mock() - response.body = b'{"key": "value"}' - response.request = Mock() - response.request.url = 'https://example.com/remote.json' - - actual = spider.save_response_to_disk(response, 'file.json', data_type='release_package', - encoding='iso-8859-1') - - with open(os.path.join(files_store, path)) as f: - assert f.read() == '{"key": "value"}' - - with open(os.path.join(files_store, path + '.fileinfo')) as f: - assert json.load(f) == { - 'url': 'https://example.com/remote.json', - 'data_type': 'release_package', - 'encoding': 'iso-8859-1', - } - - assert actual == { - 'success': True, - 'file_name': 'file.json', - "data_type": 'release_package', - "url": 'https://example.com/remote.json', - 'encoding': 'iso-8859-1', - } - - -@pytest.mark.parametrize('sample,path', [ - (None, 'test/20010203_040506/file.json'), - ('true', 'test_sample/20010203_040506/file.json'), -]) -def test_save_data_to_disk(sample, path): - spider = spider_with_crawler(sample=sample) - - with TemporaryDirectory() as tmpdirname: - files_store = os.path.join(tmpdirname, 'data') - spider.crawler.settings['FILES_STORE'] = files_store - - data = b'{"key": "value"}' - url = 'https://example.com/remote.json' - - actual = spider.save_data_to_disk(data, 'file.json', url=url, data_type='release_package', - encoding='iso-8859-1') - - with open(os.path.join(files_store, path)) as f: - assert f.read() == '{"key": "value"}' - - with open(os.path.join(files_store, path + '.fileinfo')) as f: - assert json.load(f) == { - 'url': 'https://example.com/remote.json', - 'data_type': 'release_package', - 'encoding': 'iso-8859-1', - } - - assert actual == { - 'success': True, - 'file_name': 'file.json', - "data_type": 'release_package', - "url": 'https://example.com/remote.json', - 'encoding': 'iso-8859-1', - } - - -def test_save_data_to_disk_with_existing_directory(): - spider = spider_with_crawler() - - with TemporaryDirectory() as tmpdirname: - files_store = os.path.join(tmpdirname, 'data') - spider.crawler.settings['FILES_STORE'] = files_store - os.makedirs(os.path.join(files_store, 'test/20010203_040506')) - - spider.save_data_to_disk(b'{"key": "value"}', 'file.json') # no FileExistsError exception - - def test_next_link(): url = 'https://example.com/remote.json' text_response = text.TextResponse('test') @@ -173,7 +88,7 @@ def test_parse_next_link_200(): spider = spider_with_crawler(spider_class=LinksSpider) spider.crawler.settings['FILES_STORE'] = files_store actual = spider.parse_next_link(response, None).__next__() - assert actual['success'] is True and actual['file_name'] == 'test' + assert actual['file_name'] == 'test' for item in spider.parse_next_link(response, None): assert item @@ -209,7 +124,7 @@ def test_parse_zipfile_200(): spider = spider_with_crawler(spider_class=ZipSpider) spider.crawler.settings['FILES_STORE'] = files_store actual = spider.parse_zipfile(response, None).__next__() - assert actual['success'] is True and actual['file_name'].find('.json') + assert actual['file_name'].find('.json') def test_parse_zipfile_json_lines(): @@ -232,12 +147,12 @@ def test_parse_zipfile_json_lines(): spider = spider_with_crawler(spider_class=ZipSpider) spider.crawler.settings['FILES_STORE'] = files_store actual = spider.parse_zipfile(response, None, file_format='json_lines').__next__() - assert actual['success'] is True and actual['number'] == 1 + assert actual['number'] == 1 spider.sample = True total = 0 for item in spider.parse_zipfile(response, None, file_format='json_lines'): total = total + 1 - assert item['success'] is True and item['number'] == total + assert item['number'] == total assert total == 10 @@ -265,7 +180,7 @@ def test_parse_zipfile_release_package(): spider.crawler.settings['FILES_STORE'] = files_store actual = spider.parse_zipfile(response, None, file_format='release_package').__next__() data = json.loads(actual['data']) - assert actual['success'] is True and actual['number'] == 1 + assert actual['number'] == 1 assert data['publisher']['name'] == 'test' assert data['extensions'] == ['a', 'b'] assert len(data['releases']) == spider.MAX_RELEASES_PER_PACKAGE @@ -274,7 +189,7 @@ def test_parse_zipfile_release_package(): for item in spider.parse_zipfile(response, None, file_format='release_package'): total = total + 1 data = json.loads(item['data']) - assert item['success'] is True and item['number'] == total + assert item['number'] == total assert len(data['releases']) == spider.MAX_SAMPLE assert total == 1 diff --git a/tests/test_extensions.py b/tests/test_extensions.py index 41cbce848..8fa9db5a6 100644 --- a/tests/test_extensions.py +++ b/tests/test_extensions.py @@ -1,9 +1,12 @@ +import json +import os +from tempfile import TemporaryDirectory from unittest.mock import Mock, patch import pytest from scrapy.exceptions import NotConfigured -from kingfisher_scrapy.extensions import KingfisherAPI +from kingfisher_scrapy.extensions import KingfisherAPI, KingfisherStoreFiles from tests import spider_with_crawler @@ -57,8 +60,11 @@ def test_item_scraped_file(sample, is_sample, path, note, encoding, encoding2, d if directory: spider.crawler.settings['KINGFISHER_API_LOCAL_DIRECTORY'] = str(tmpdir.join('xxx')) + extension_store = KingfisherStoreFiles.from_crawler(spider.crawler) + extension = KingfisherAPI.from_crawler(spider.crawler) - spider.save_data_to_disk(b'{"key": "value"}', 'file.json', url='https://example.com/remote.json') + extension_store.item_scraped(spider.save_data_to_disk(b'{"key": "value"}', 'file.json', + url='https://example.com/remote.json'), spider) with patch('requests.post') as mocked: response = Mock() @@ -272,3 +278,92 @@ def test_spider_closed_other_reason(tmpdir): extension.spider_closed(spider, 'xxx') mocked.assert_not_called() + + +@pytest.mark.parametrize('sample,path', [ + (None, 'test/20010203_040506/file.json'), + ('true', 'test_sample/20010203_040506/file.json'), +]) +def test_save_response_to_disk(sample, path, tmpdir): + spider = spider_after_open(tmpdir, sample=sample) + + extension_store = KingfisherStoreFiles.from_crawler(spider.crawler) + + with TemporaryDirectory() as tmpdirname: + files_store = os.path.join(tmpdirname, 'data') + spider.crawler.settings['FILES_STORE'] = files_store + + response = Mock() + response.body = b'{"key": "value"}' + response.request = Mock() + response.request.url = 'https://example.com/remote.json' + + actual = extension_store.item_scraped(spider.save_response_to_disk(response, 'file.json', + data_type='release_package', + encoding='iso-8859-1'), spider) + + with open(os.path.join(files_store, path)) as f: + assert f.read() == '{"key": "value"}' + + with open(os.path.join(files_store, path + '.fileinfo')) as f: + assert json.load(f) == { + 'url': 'https://example.com/remote.json', + 'data_type': 'release_package', + 'encoding': 'iso-8859-1', + } + + assert actual == { + 'success': True, + 'file_name': 'file.json', + "data_type": 'release_package', + "url": 'https://example.com/remote.json', + 'encoding': 'iso-8859-1', + } + + +@pytest.mark.parametrize('sample,path', [ + (None, 'test/20010203_040506/file.json'), + ('true', 'test_sample/20010203_040506/file.json'), +]) +def test_save_data_to_disk(sample, path): + spider = spider_with_crawler(sample=sample) + extension_store = KingfisherStoreFiles.from_crawler(spider.crawler) + with TemporaryDirectory() as tmpdirname: + files_store = os.path.join(tmpdirname, 'data') + spider.crawler.settings['FILES_STORE'] = files_store + + data = b'{"key": "value"}' + url = 'https://example.com/remote.json' + + actual = extension_store.item_scraped(spider.save_data_to_disk(data, 'file.json', url=url, + data_type='release_package', + encoding='iso-8859-1'), spider) + + with open(os.path.join(files_store, path)) as f: + assert f.read() == '{"key": "value"}' + + with open(os.path.join(files_store, path + '.fileinfo')) as f: + assert json.load(f) == { + 'url': 'https://example.com/remote.json', + 'data_type': 'release_package', + 'encoding': 'iso-8859-1', + } + + assert actual == { + 'success': True, + 'file_name': 'file.json', + "data_type": 'release_package', + "url": 'https://example.com/remote.json', + 'encoding': 'iso-8859-1', + } + + +def test_save_data_to_disk_with_existing_directory(): + spider = spider_with_crawler() + extension_store = KingfisherStoreFiles.from_crawler(spider.crawler) + with TemporaryDirectory() as tmpdirname: + files_store = os.path.join(tmpdirname, 'data') + spider.crawler.settings['FILES_STORE'] = files_store + os.makedirs(os.path.join(files_store, 'test/20010203_040506')) + extension_store.item_scraped(spider.save_data_to_disk(b'{"key": "value"}', 'file.json'), + spider) # no FileExistsError exception From ca957417bfd6f8b54d070228d46c4a1f06c942cc Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Tue, 19 May 2020 10:49:46 -0400 Subject: [PATCH 02/13] Removed unused variable Signed-off-by: Yohanna Lisnichuk --- kingfisher_scrapy/extensions.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kingfisher_scrapy/extensions.py b/kingfisher_scrapy/extensions.py index e53b73cb1..84e98feac 100644 --- a/kingfisher_scrapy/extensions.py +++ b/kingfisher_scrapy/extensions.py @@ -9,14 +9,13 @@ # https://docs.scrapy.org/en/latest/topics/extensions.html#writing-your-own-extension class KingfisherStoreFiles: - def __init__(self, directory, stats): + def __init__(self, directory): self.directory = directory - self.stats = stats @classmethod def from_crawler(cls, crawler): directory = crawler.settings['FILES_STORE'] - extension = cls(directory, crawler.stats) + extension = cls(directory) crawler.signals.connect(extension.item_scraped, signal=signals.item_scraped) return extension From 9eddc6664e800749f128103bc265aabe1529cce0 Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Wed, 20 May 2020 15:28:00 -0400 Subject: [PATCH 03/13] Remove os related functions from base_spider Signed-off-by: Yohanna Lisnichuk --- kingfisher_scrapy/base_spider.py | 62 ++++++++++++-------------------- 1 file changed, 23 insertions(+), 39 deletions(-) diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py index be036ce26..fa1363ba7 100644 --- a/kingfisher_scrapy/base_spider.py +++ b/kingfisher_scrapy/base_spider.py @@ -86,72 +86,52 @@ def from_crawler(cls, crawler, *args, **kwargs): return spider - def get_local_file_path_including_filestore(self, filename): + def save_response_to_disk(self, response, filename, data_type=None, encoding='utf-8', post_to_api=True): """ - Prepends Scrapy's storage directory and the crawl's relative directory to the filename. + Returns an item to yield, based on the response to a request. """ - return os.path.join(self.crawler.settings['FILES_STORE'], self._get_crawl_path(), filename) + return self.save_data_to_disk(response.body, filename, response.request.url, data_type, encoding, + post_to_api) - def get_local_file_path_excluding_filestore(self, filename): + def save_data_to_disk(self, data, filename, url=None, data_type=None, encoding='utf-8', post_to_api=True): """ - Prepends the crawl's relative directory to the filename. - """ - return os.path.join(self._get_crawl_path(), filename) - - def save_response_to_disk(self, response, filename, data_type=None, encoding='utf-8'): - """ - Sends a dict to KingfisherStoreFiles to store the data - """ - return { - 'data': response.body, - 'file_name': filename, - 'url': response.request.url, - 'data_type': data_type, - 'encoding': encoding - } - - def save_data_to_disk(self, data, filename, url=None, data_type=None, encoding='utf-8'): - """ - Sends a dict to KingfisherStoreFiles to store the data + Returns an item to yield """ return { 'data': data, 'file_name': filename, 'url': url, 'data_type': data_type, - 'encoding': encoding + 'encoding': encoding, + 'success': True, + 'post_to_api': post_to_api, } - def _get_crawl_path(self): - name = self.name - if self.sample: - name += '_sample' - return os.path.join(name, self.get_start_time('%Y%m%d_%H%M%S')) - def get_start_time(self, format): """ Returns the formatted start time of the crawl. """ return self.crawler.stats.get_value('start_time').strftime(format) - def _build_file_item(self, number, line, data_type, url, encoding): + def _build_file_item(self, number, line, data_type, url, encoding, file_name): return { 'success': True, 'number': number, - 'file_name': 'data.json', + 'file_name': file_name, 'data': line, 'data_type': data_type, 'url': url, 'encoding': encoding, + 'post_to_api': True } - def parse_json_lines(self, f, data_type, url, encoding='utf-8'): + def parse_json_lines(self, f, data_type, url, encoding='utf-8', file_name='data.json'): for number, line in enumerate(f, 1): if self.sample and number > self.MAX_SAMPLE: break if isinstance(line, bytes): line = line.decode(encoding=encoding) - yield self._build_file_item(number, line, data_type, url, encoding) + yield self._build_file_item(number, line, data_type, url, encoding, file_name) def get_package(self, f, array_name): """ @@ -162,7 +142,8 @@ def get_package(self, f, array_name): package.update(item) return package - def parse_json_array(self, f_package, f_list, data_type, url, encoding='utf-8', array_field_name='releases'): + def parse_json_array(self, f_package, f_list, data_type, url, encoding='utf-8', + array_field_name='releases', file_name='data.json'): if self.sample: size = self.MAX_SAMPLE else: @@ -172,7 +153,8 @@ def parse_json_array(self, f_package, f_list, data_type, url, encoding='utf-8', for number, items in enumerate(util.grouper(ijson.items(f_list, '{}.item'.format(array_field_name)), size), 1): package[array_field_name] = filter(None, items) - yield self._build_file_item(number, json.dumps(package, default=util.default), data_type, url, encoding) + yield self._build_file_item(number, json.dumps(package, default=util.default), data_type, url, encoding, + file_name) if self.sample: break @@ -194,7 +176,8 @@ def parse_zipfile(self, response, data_type, file_format=None, encoding='utf-8') if response.status == 200: if file_format: self.save_response_to_disk(response, '{}.zip'.format(hashlib.md5(response.url.encode('utf-8')) - .hexdigest())) + .hexdigest()), + post_to_api=False) zip_file = ZipFile(BytesIO(response.body)) for finfo in zip_file.infolist(): filename = finfo.filename @@ -202,11 +185,12 @@ def parse_zipfile(self, response, data_type, file_format=None, encoding='utf-8') filename += '.json' data = zip_file.open(finfo.filename) if file_format == 'json_lines': - yield from self.parse_json_lines(data, data_type, response.request.url, encoding=encoding) + yield from self.parse_json_lines(data, data_type, response.request.url, encoding=encoding, + file_name=filename) elif file_format == 'release_package': package = zip_file.open(finfo.filename) yield from self.parse_json_array(package, data, data_type, response.request.url, - encoding=encoding) + encoding=encoding, file_name=filename) else: yield self.save_data_to_disk(data.read(), filename, data_type=data_type, url=response.request.url, encoding=encoding) From 1e355c565156d34af251e5532d9ae13441515422 Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Wed, 20 May 2020 15:34:00 -0400 Subject: [PATCH 04/13] Update storefile extension Signed-off-by: Yohanna Lisnichuk --- kingfisher_scrapy/extensions.py | 58 +++++++++++------ kingfisher_scrapy/settings.py | 4 +- tests/test_base_spider.py | 33 ++-------- tests/test_extensions.py | 106 +++++++++++++++++--------------- 4 files changed, 102 insertions(+), 99 deletions(-) diff --git a/kingfisher_scrapy/extensions.py b/kingfisher_scrapy/extensions.py index 84e98feac..f730ecde9 100644 --- a/kingfisher_scrapy/extensions.py +++ b/kingfisher_scrapy/extensions.py @@ -8,7 +8,7 @@ # https://docs.scrapy.org/en/latest/topics/extensions.html#writing-your-own-extension -class KingfisherStoreFiles: +class KingfisherFilesStore: def __init__(self, directory): self.directory = directory @@ -21,28 +21,25 @@ def from_crawler(cls, crawler): def item_scraped(self, item, spider): """ - Writes the response's body to the filename in the crawl's directory. + Writes the item's data to the filename in the crawl's directory. Writes a ``.fileinfo`` metadata file in the crawl's directory, and returns a dict with the metadata. """ - self._write_file(item['file_name'], item['data'], spider) - - metadata = { - 'url': item['url'], - 'data_type': item['data_type'], - 'encoding': item['encoding'], - } - - self._write_file(item['file_name'] + '.fileinfo', metadata, spider) - - metadata['success'] = True - metadata['file_name'] = item['file_name'] - item['success'] = True - - return metadata + if 'number' not in item: + self._write_file(item['file_name'], item['data'], spider) + metadata = { + 'url': item['url'], + 'data_type': item['data_type'], + 'encoding': item['encoding'], + } + self._write_file(item['file_name'] + '.fileinfo', metadata, spider) + item['path_including_file_store'] = self.get_local_file_path_including_filestore(item['file_name'], + spider) + item['path_excluding_file_store'] = self.get_local_file_path_excluding_filestore(item['file_name'], + spider) def _write_file(self, filename, data, spider): - path = spider.get_local_file_path_including_filestore(filename) + path = self.get_local_file_path_including_filestore(filename, spider) os.makedirs(os.path.dirname(path), exist_ok=True) if isinstance(data, bytes): @@ -56,6 +53,24 @@ def _write_file(self, filename, data, spider): else: json.dump(data, f) + def get_local_file_path_including_filestore(self, filename, spider): + """ + Prepends Scrapy's storage directory and the crawl's relative directory to the filename. + """ + return os.path.join(self.directory, self._get_crawl_path(spider), filename) + + def get_local_file_path_excluding_filestore(self, filename, spider): + """ + Prepends the crawl's relative directory to the filename. + """ + return os.path.join(self._get_crawl_path(spider), filename) + + def _get_crawl_path(self, spider): + name = spider.name + if spider.sample: + name += '_sample' + return os.path.join(name, spider.get_start_time('%Y%m%d_%H%M%S')) + class KingfisherAPI: def __init__(self, url, key, directory=None): @@ -103,6 +118,9 @@ def item_scraped(self, item, spider): If the Scrapy item indicates success, sends a Kingfisher Process API request to create either a Kingfisher Process file or file item. Otherwise, sends an API request to create a file error. """ + if not item.get('post_to_api', True): + return + data = { 'collection_source': spider.name, 'collection_data_version': spider.get_start_time('%Y-%m-%d %H:%M:%S'), @@ -127,11 +145,11 @@ def item_scraped(self, item, spider): # File else: if self.directory: - path = spider.get_local_file_path_excluding_filestore(item['file_name']) + path = item['path_excluding_file_store'] data['local_file_name'] = os.path.join(self.directory, path) files = {} else: - path = spider.get_local_file_path_including_filestore(item['file_name']) + path = item['path_including_file_store'] f = open(path, 'rb') files = {'file': (item['file_name'], f, 'application/json')} diff --git a/kingfisher_scrapy/settings.py b/kingfisher_scrapy/settings.py index 63f61d93b..c572a1f64 100644 --- a/kingfisher_scrapy/settings.py +++ b/kingfisher_scrapy/settings.py @@ -67,7 +67,9 @@ # 'scrapy.extensions.telnet.TelnetConsole': None, #} EXTENSIONS = { - 'kingfisher_scrapy.extensions.KingfisherStoreFiles': 100, + 'kingfisher_scrapy.extensions.KingfisherFilesStore': 100, + # KingfisherAPI must have lower priority than KingfisherStoreFiles, because the file needs to be written before the + # request is sent to Kingfisher Process. 'kingfisher_scrapy.extensions.KingfisherAPI': 500, } diff --git a/tests/test_base_spider.py b/tests/test_base_spider.py index e2bc9aee3..d44d555fd 100644 --- a/tests/test_base_spider.py +++ b/tests/test_base_spider.py @@ -31,27 +31,6 @@ def test_sample_no_kwarg(): assert spider.sample is False -@pytest.mark.parametrize('sample,expected', [ - (None, 'data/test/20010203_040506/file.json'), - ('true', 'data/test_sample/20010203_040506/file.json'), -]) -def test_get_local_file_path_including_filestore(sample, expected): - spider = spider_with_crawler(sample=sample) - spider.crawler.settings['FILES_STORE'] = 'data' - - assert spider.get_local_file_path_including_filestore('file.json') == expected - - -@pytest.mark.parametrize('sample,expected', [ - (None, 'test/20010203_040506/file.json'), - ('true', 'test_sample/20010203_040506/file.json'), -]) -def test_get_local_file_path_excluding_filestore(sample, expected): - spider = spider_with_crawler(sample=sample) - - assert spider.get_local_file_path_excluding_filestore('file.json') == expected - - def test_next_link(): url = 'https://example.com/remote.json' text_response = text.TextResponse('test') @@ -88,7 +67,7 @@ def test_parse_next_link_200(): spider = spider_with_crawler(spider_class=LinksSpider) spider.crawler.settings['FILES_STORE'] = files_store actual = spider.parse_next_link(response, None).__next__() - assert actual['file_name'] == 'test' + assert actual['success'] is True and actual['file_name'] == 'test' for item in spider.parse_next_link(response, None): assert item @@ -124,7 +103,7 @@ def test_parse_zipfile_200(): spider = spider_with_crawler(spider_class=ZipSpider) spider.crawler.settings['FILES_STORE'] = files_store actual = spider.parse_zipfile(response, None).__next__() - assert actual['file_name'].find('.json') + assert actual['success'] is True and actual['file_name'].find('.json') def test_parse_zipfile_json_lines(): @@ -147,12 +126,12 @@ def test_parse_zipfile_json_lines(): spider = spider_with_crawler(spider_class=ZipSpider) spider.crawler.settings['FILES_STORE'] = files_store actual = spider.parse_zipfile(response, None, file_format='json_lines').__next__() - assert actual['number'] == 1 + assert actual['success'] is True and actual['number'] == 1 spider.sample = True total = 0 for item in spider.parse_zipfile(response, None, file_format='json_lines'): total = total + 1 - assert item['number'] == total + assert item['success'] is True and item['number'] == total assert total == 10 @@ -180,7 +159,7 @@ def test_parse_zipfile_release_package(): spider.crawler.settings['FILES_STORE'] = files_store actual = spider.parse_zipfile(response, None, file_format='release_package').__next__() data = json.loads(actual['data']) - assert actual['number'] == 1 + assert actual['success'] is True and actual['number'] == 1 assert data['publisher']['name'] == 'test' assert data['extensions'] == ['a', 'b'] assert len(data['releases']) == spider.MAX_RELEASES_PER_PACKAGE @@ -189,7 +168,7 @@ def test_parse_zipfile_release_package(): for item in spider.parse_zipfile(response, None, file_format='release_package'): total = total + 1 data = json.loads(item['data']) - assert item['number'] == total + assert item['success'] is True and item['number'] == total assert len(data['releases']) == spider.MAX_SAMPLE assert total == 1 diff --git a/tests/test_extensions.py b/tests/test_extensions.py index 8fa9db5a6..a9d651bcd 100644 --- a/tests/test_extensions.py +++ b/tests/test_extensions.py @@ -6,7 +6,7 @@ import pytest from scrapy.exceptions import NotConfigured -from kingfisher_scrapy.extensions import KingfisherAPI, KingfisherStoreFiles +from kingfisher_scrapy.extensions import KingfisherAPI, KingfisherFilesStore from tests import spider_with_crawler @@ -59,8 +59,9 @@ def test_item_scraped_file(sample, is_sample, path, note, encoding, encoding2, d if directory: spider.crawler.settings['KINGFISHER_API_LOCAL_DIRECTORY'] = str(tmpdir.join('xxx')) + spider.crawler.settings['FILES_STORE'] = tmpdir - extension_store = KingfisherStoreFiles.from_crawler(spider.crawler) + extension_store = KingfisherFilesStore.from_crawler(spider.crawler) extension = KingfisherAPI.from_crawler(spider.crawler) extension_store.item_scraped(spider.save_data_to_disk(b'{"key": "value"}', 'file.json', @@ -72,15 +73,18 @@ def test_item_scraped_file(sample, is_sample, path, note, encoding, encoding2, d response.status_code = 400 mocked.return_value = response - data = { - 'success': True, - 'file_name': 'file.json', - 'url': 'https://example.com/remote.json', + data = spider.save_data_to_disk( + data=None, + filename='file.json', + url='https://example.com/remote.json', # Specific to this test case. - 'data_type': 'release_package', - } - if encoding: - data['encoding'] = encoding + data_type='release_package', + encoding=encoding2 + ) + data['path_excluding_file_store'] = os.path.join(tmpdir, path) + data['path_including_file_store'] = os.path.join(tmpdir, path) + if directory: + data['path_excluding_file_store'] = tmpdir.join('xxx', path) extension.item_scraped(data, spider) @@ -139,18 +143,15 @@ def test_item_scraped_file_item(sample, is_sample, note, encoding, encoding2, ok response.status_code = 400 mocked.return_value = response - data = { - 'success': True, - 'file_name': 'file.json', - 'url': 'https://example.com/remote.json', + data = spider._build_file_item( + 1, + b'{"key": "value"}', + url='https://example.com/remote.json', # Specific to this test case. - 'data_type': 'release_package', - 'number': 1, - 'data': b'{"key": "value"}', - } - if encoding: - data['encoding'] = encoding - + data_type='release_package', + encoding=encoding2, + file_name='data.json', + ) extension.item_scraped(data, spider) if not ok: @@ -165,13 +166,13 @@ def test_item_scraped_file_item(sample, is_sample, note, encoding, encoding2, ok 'collection_source': 'test', 'collection_data_version': '2001-02-03 04:05:06', 'collection_sample': is_sample, - 'file_name': 'file.json', + 'file_name': 'data.json', 'url': 'https://example.com/remote.json', # Specific to this test case. 'data_type': 'release_package', 'encoding': encoding2, 'number': 1, - 'data': b'{"key": "value"}', + 'data': b'{"key": "value"}' } if note: expected['collection_note'] = note @@ -286,21 +287,18 @@ def test_spider_closed_other_reason(tmpdir): ]) def test_save_response_to_disk(sample, path, tmpdir): spider = spider_after_open(tmpdir, sample=sample) - - extension_store = KingfisherStoreFiles.from_crawler(spider.crawler) - with TemporaryDirectory() as tmpdirname: files_store = os.path.join(tmpdirname, 'data') spider.crawler.settings['FILES_STORE'] = files_store - + extension_store = KingfisherFilesStore.from_crawler(spider.crawler) response = Mock() response.body = b'{"key": "value"}' response.request = Mock() response.request.url = 'https://example.com/remote.json' - actual = extension_store.item_scraped(spider.save_response_to_disk(response, 'file.json', - data_type='release_package', - encoding='iso-8859-1'), spider) + extension_store.item_scraped(spider.save_response_to_disk(response, 'file.json', + data_type='release_package', + encoding='iso-8859-1'), spider) with open(os.path.join(files_store, path)) as f: assert f.read() == '{"key": "value"}' @@ -312,14 +310,6 @@ def test_save_response_to_disk(sample, path, tmpdir): 'encoding': 'iso-8859-1', } - assert actual == { - 'success': True, - 'file_name': 'file.json', - "data_type": 'release_package', - "url": 'https://example.com/remote.json', - 'encoding': 'iso-8859-1', - } - @pytest.mark.parametrize('sample,path', [ (None, 'test/20010203_040506/file.json'), @@ -327,17 +317,18 @@ def test_save_response_to_disk(sample, path, tmpdir): ]) def test_save_data_to_disk(sample, path): spider = spider_with_crawler(sample=sample) - extension_store = KingfisherStoreFiles.from_crawler(spider.crawler) + with TemporaryDirectory() as tmpdirname: files_store = os.path.join(tmpdirname, 'data') spider.crawler.settings['FILES_STORE'] = files_store + extension_store = KingfisherFilesStore.from_crawler(spider.crawler) data = b'{"key": "value"}' url = 'https://example.com/remote.json' - actual = extension_store.item_scraped(spider.save_data_to_disk(data, 'file.json', url=url, - data_type='release_package', - encoding='iso-8859-1'), spider) + extension_store.item_scraped(spider.save_data_to_disk(data, 'file.json', url=url, + data_type='release_package', + encoding='iso-8859-1'), spider) with open(os.path.join(files_store, path)) as f: assert f.read() == '{"key": "value"}' @@ -349,21 +340,34 @@ def test_save_data_to_disk(sample, path): 'encoding': 'iso-8859-1', } - assert actual == { - 'success': True, - 'file_name': 'file.json', - "data_type": 'release_package', - "url": 'https://example.com/remote.json', - 'encoding': 'iso-8859-1', - } - def test_save_data_to_disk_with_existing_directory(): spider = spider_with_crawler() - extension_store = KingfisherStoreFiles.from_crawler(spider.crawler) with TemporaryDirectory() as tmpdirname: files_store = os.path.join(tmpdirname, 'data') spider.crawler.settings['FILES_STORE'] = files_store + extension_store = KingfisherFilesStore.from_crawler(spider.crawler) os.makedirs(os.path.join(files_store, 'test/20010203_040506')) extension_store.item_scraped(spider.save_data_to_disk(b'{"key": "value"}', 'file.json'), spider) # no FileExistsError exception + + +@pytest.mark.parametrize('sample,expected', [ + (None, 'data/test/20010203_040506/file.json'), + ('true', 'data/test_sample/20010203_040506/file.json'), +]) +def test_get_local_file_path_including_filestore(sample, expected): + spider = spider_with_crawler(sample=sample) + spider.crawler.settings['FILES_STORE'] = 'data' + extension_store = KingfisherFilesStore.from_crawler(spider.crawler) + assert extension_store.get_local_file_path_including_filestore('file.json') == expected + + +@pytest.mark.parametrize('sample,expected', [ + (None, 'test/20010203_040506/file.json'), + ('true', 'test_sample/20010203_040506/file.json'), +]) +def test_get_local_file_path_excluding_filestore(sample, expected): + spider = spider_with_crawler(sample=sample) + extension_store = KingfisherFilesStore.from_crawler(spider.crawler) + assert extension_store.get_local_file_path_excluding_filestore('file.json') == expected From f1c77e0898f941f0b8be8ede5d4d06c55f7d3efc Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Wed, 20 May 2020 15:34:42 -0400 Subject: [PATCH 05/13] Update zip/tar affected spiders Signed-off-by: Yohanna Lisnichuk --- kingfisher_scrapy/spiders/digiwhist_base.py | 19 ++--------- kingfisher_scrapy/spiders/georgia_opendata.py | 32 +++---------------- 2 files changed, 8 insertions(+), 43 deletions(-) diff --git a/kingfisher_scrapy/spiders/digiwhist_base.py b/kingfisher_scrapy/spiders/digiwhist_base.py index 8141f1e67..e9c0f7aae 100644 --- a/kingfisher_scrapy/spiders/digiwhist_base.py +++ b/kingfisher_scrapy/spiders/digiwhist_base.py @@ -1,6 +1,7 @@ import json import os import tarfile +from io import BytesIO from kingfisher_scrapy.base_spider import BaseSpider @@ -10,24 +11,10 @@ class DigiwhistBase(BaseSpider): def parse(self, response): if response.status == 200: - save_file_name = self.get_local_file_path_including_filestore('file.tar.gz') - - # Create folder for data - os.makedirs(os.path.dirname(save_file_name), exist_ok=True) - - # Save original file - with open(save_file_name, "wb") as fp: - fp.write(response.body) - - # Save some extra info alongside that file - with open(save_file_name + '.fileinfo', 'w') as f: - f.write(json.dumps({ - 'url': response.request.url, - 'data_type': 'release_package_json_lines', - })) + yield self.save_response_to_disk(response, 'file.tar.gz', post_to_api=False) # Load a line at the time, pass it to API - with tarfile.open(save_file_name, "r:gz") as tar: + with tarfile.open(fileobj=BytesIO(response.body), mode="r:gz") as tar: with tar.extractfile(tar.getnames()[0]) as readfp: yield from self.parse_json_lines(readfp, 'release_package', self.start_urls[0]) else: diff --git a/kingfisher_scrapy/spiders/georgia_opendata.py b/kingfisher_scrapy/spiders/georgia_opendata.py index ed495cb9f..c33634727 100644 --- a/kingfisher_scrapy/spiders/georgia_opendata.py +++ b/kingfisher_scrapy/spiders/georgia_opendata.py @@ -1,11 +1,9 @@ -from zipfile import ZipFile - import scrapy -from kingfisher_scrapy.base_spider import BaseSpider +from kingfisher_scrapy.base_spider import ZipSpider -class GeorgiaOpenData(BaseSpider): +class GeorgiaOpenData(ZipSpider): name = 'georgia_opendata' custom_settings = { # This has to download a 400MB file so ..... @@ -14,32 +12,12 @@ class GeorgiaOpenData(BaseSpider): def start_requests(self): yield scrapy.Request( - url='http://opendata.spa.ge/json/allTenders.zip', - callback=self.parse_zip + url='http://opendata.spa.ge/json/allTenders.zip' ) - def parse_zip(self, response): + def parse(self, response): if response.status == 200: - - # Save original file - save_file_name = self.get_local_file_path_including_filestore('allTenders.zip') - with open(save_file_name, "wb") as fp: - fp.write(response.body) - - # Now extract each file one at a time, save to disk and pass to pipelines for processing - zip_file = ZipFile(save_file_name) - for finfo in zip_file.infolist(): - if finfo.filename.endswith('.json'): - data = zip_file.open(finfo.filename).read() - yield self.save_data_to_disk( - data, - finfo.filename.replace('/', '-'), - data_type='release_package', - url=response.request.url - ) - if self.sample: - return - + yield from self.parse_zipfile(response, 'release_package', file_format='release_package') else: yield { 'success': False, From 7ed9f13807db0e1c86f79574613ee337136fb894 Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Wed, 20 May 2020 15:45:13 -0400 Subject: [PATCH 06/13] Remove unused imports Signed-off-by: Yohanna Lisnichuk --- kingfisher_scrapy/base_spider.py | 1 - kingfisher_scrapy/spiders/digiwhist_base.py | 2 -- 2 files changed, 3 deletions(-) diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py index fa1363ba7..cfbebc24e 100644 --- a/kingfisher_scrapy/base_spider.py +++ b/kingfisher_scrapy/base_spider.py @@ -1,6 +1,5 @@ import hashlib import json -import os from datetime import datetime from io import BytesIO from zipfile import ZipFile diff --git a/kingfisher_scrapy/spiders/digiwhist_base.py b/kingfisher_scrapy/spiders/digiwhist_base.py index e9c0f7aae..574881599 100644 --- a/kingfisher_scrapy/spiders/digiwhist_base.py +++ b/kingfisher_scrapy/spiders/digiwhist_base.py @@ -1,5 +1,3 @@ -import json -import os import tarfile from io import BytesIO From 9930380449cb960ffeef03ca9e5f2e6b3582c934 Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Wed, 20 May 2020 15:49:00 -0400 Subject: [PATCH 07/13] Fix extensions test Signed-off-by: Yohanna Lisnichuk --- tests/test_extensions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_extensions.py b/tests/test_extensions.py index a9d651bcd..3d19e5d49 100644 --- a/tests/test_extensions.py +++ b/tests/test_extensions.py @@ -360,7 +360,7 @@ def test_get_local_file_path_including_filestore(sample, expected): spider = spider_with_crawler(sample=sample) spider.crawler.settings['FILES_STORE'] = 'data' extension_store = KingfisherFilesStore.from_crawler(spider.crawler) - assert extension_store.get_local_file_path_including_filestore('file.json') == expected + assert extension_store.get_local_file_path_including_filestore('file.json', spider) == expected @pytest.mark.parametrize('sample,expected', [ @@ -370,4 +370,4 @@ def test_get_local_file_path_including_filestore(sample, expected): def test_get_local_file_path_excluding_filestore(sample, expected): spider = spider_with_crawler(sample=sample) extension_store = KingfisherFilesStore.from_crawler(spider.crawler) - assert extension_store.get_local_file_path_excluding_filestore('file.json') == expected + assert extension_store.get_local_file_path_excluding_filestore('file.json', spider) == expected From f05e63b97f3f5e176473739ca05e86a09eaeb5ea Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Thu, 21 May 2020 11:00:59 -0400 Subject: [PATCH 08/13] Add custom log formatter Signed-off-by: Yohanna Lisnichuk --- kingfisher_scrapy/log_formatter.py | 22 ++++++++++++++++++++++ kingfisher_scrapy/settings.py | 3 +++ 2 files changed, 25 insertions(+) create mode 100644 kingfisher_scrapy/log_formatter.py diff --git a/kingfisher_scrapy/log_formatter.py b/kingfisher_scrapy/log_formatter.py new file mode 100644 index 000000000..0a0b21fa8 --- /dev/null +++ b/kingfisher_scrapy/log_formatter.py @@ -0,0 +1,22 @@ +import logging + +from scrapy import logformatter +from twisted.python.failure import Failure + + +class KingfisherCustomLoggerFormatter(logformatter.LogFormatter): + # from https://docs.scrapy.org/en/latest/_modules/scrapy/logformatter.html#LogFormatter.scraped + def scraped(self, item, response, spider): + """Logs a message when an item is scraped by a spider.""" + if isinstance(response, Failure): + src = response.getErrorMessage() + else: + src = response + return { + 'level': logging.DEBUG, + # we dont log the item content just the source + 'msg': "Scraped from %(src)s", + 'args': { + 'src': src + } + } diff --git a/kingfisher_scrapy/settings.py b/kingfisher_scrapy/settings.py index c572a1f64..79ee6847b 100644 --- a/kingfisher_scrapy/settings.py +++ b/kingfisher_scrapy/settings.py @@ -88,6 +88,9 @@ # instead of files to Kingfisher Process' API. To enable that, set this to the absolute path to the `FILES_STORE`. KINGFISHER_API_LOCAL_DIRECTORY = os.getenv('KINGFISHER_API_LOCAL_DIRECTORY') +# We use a custom formatter to avoid logging a item content +LOG_FORMATTER = 'kingfisher_scrapy.log_formatter.KingfisherCustomLoggerFormatter' + KINGFISHER_PARAGUAY_HACIENDA_REQUEST_TOKEN = os.getenv('KINGFISHER_PARAGUAY_HACIENDA_REQUEST_TOKEN') KINGFISHER_PARAGUAY_HACIENDA_CLIENT_SECRET = os.getenv('KINGFISHER_PARAGUAY_HACIENDA_CLIENT_SECRET') From eae53f2e5a42afd68f2ab4922aa5778aeff5abbd Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Thu, 21 May 2020 11:21:03 -0400 Subject: [PATCH 09/13] Add post_to_api test Signed-off-by: Yohanna Lisnichuk --- tests/test_extensions.py | 55 +++++++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/tests/test_extensions.py b/tests/test_extensions.py index 3d19e5d49..b01673b84 100644 --- a/tests/test_extensions.py +++ b/tests/test_extensions.py @@ -54,7 +54,9 @@ def test_from_crawler_missing_arguments(api_url, api_key): @pytest.mark.parametrize('encoding,encoding2', [(None, 'utf-8'), ('iso-8859-1', 'iso-8859-1')]) @pytest.mark.parametrize('directory', [False, True]) @pytest.mark.parametrize('ok', [True, False]) -def test_item_scraped_file(sample, is_sample, path, note, encoding, encoding2, directory, ok, tmpdir, caplog): +@pytest.mark.parametrize('post_to_api', [True, True, False]) +def test_item_scraped_file(sample, is_sample, path, note, encoding, encoding2, directory, ok, tmpdir, caplog, + post_to_api): spider = spider_after_open(tmpdir, sample=sample, note=note) if directory: @@ -79,7 +81,8 @@ def test_item_scraped_file(sample, is_sample, path, note, encoding, encoding2, d url='https://example.com/remote.json', # Specific to this test case. data_type='release_package', - encoding=encoding2 + encoding=encoding2, + post_to_api=post_to_api ) data['path_excluding_file_store'] = os.path.join(tmpdir, path) data['path_including_file_store'] = os.path.join(tmpdir, path) @@ -89,12 +92,15 @@ def test_item_scraped_file(sample, is_sample, path, note, encoding, encoding2, d extension.item_scraped(data, spider) if not ok: - message = 'Failed to post [https://example.com/remote.json]. API status code: 400' + if not post_to_api: + assert len(caplog.records) == 0 + else: + message = 'Failed to post [https://example.com/remote.json]. API status code: 400' - assert len(caplog.records) == 1 - assert caplog.records[0].name == 'test' - assert caplog.records[0].levelname == 'WARNING' - assert caplog.records[0].message == message + assert len(caplog.records) == 1 + assert caplog.records[0].name == 'test' + assert caplog.records[0].levelname == 'WARNING' + assert caplog.records[0].message == message expected = { 'collection_source': 'test', @@ -110,22 +116,25 @@ def test_item_scraped_file(sample, is_sample, path, note, encoding, encoding2, d expected['collection_note'] = note if directory: expected['local_file_name'] = tmpdir.join('xxx', path) - - with open(tmpdir.join(path), 'rb') as f: - assert mocked.call_count == 1 - assert mocked.call_args[0] == ('http://httpbin.org/anything/api/v1/submit/file/',) - assert mocked.call_args[1]['headers'] == {'Authorization': 'ApiKey xxx'} - assert mocked.call_args[1]['data'] == expected - assert len(mocked.call_args[1]) == 3 - - if directory: - assert mocked.call_args[1]['files'] == {} - else: - assert len(mocked.call_args[1]['files']) == 1 - assert len(mocked.call_args[1]['files']['file']) == 3 - assert mocked.call_args[1]['files']['file'][0] == 'file.json' - assert mocked.call_args[1]['files']['file'][1].read() == f.read() - assert mocked.call_args[1]['files']['file'][2] == 'application/json' + if not post_to_api: + assert mocked.call_count == 0 + else: + with open(tmpdir.join(path), 'rb') as f: + assert mocked.call_count == 1 + assert mocked.call_args[0] == ('http://httpbin.org/anything/api/v1/submit/file/',) + assert mocked.call_args[1]['headers'] == {'Authorization': 'ApiKey xxx'} + assert mocked.call_args[1]['data'] == expected + if post_to_api: + assert len(mocked.call_args[1]) == 3 + + if directory: + assert mocked.call_args[1]['files'] == {} + else: + assert len(mocked.call_args[1]['files']) == 1 + assert len(mocked.call_args[1]['files']['file']) == 3 + assert mocked.call_args[1]['files']['file'][0] == 'file.json' + assert mocked.call_args[1]['files']['file'][1].read() == f.read() + assert mocked.call_args[1]['files']['file'][2] == 'application/json' @pytest.mark.parametrize('sample,is_sample', [(None, False), ('true', True)]) From d08ff01192797d0ed36d85fa8868e5864e74e7f6 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Mon, 25 May 2020 17:24:00 -0400 Subject: [PATCH 10/13] Add tests for save_response_to_disk and save_data_to_disk --- tests/test_base_spider.py | 40 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/tests/test_base_spider.py b/tests/test_base_spider.py index d44d555fd..daa86698f 100644 --- a/tests/test_base_spider.py +++ b/tests/test_base_spider.py @@ -31,6 +31,46 @@ def test_sample_no_kwarg(): assert spider.sample is False +def test_save_response_to_disk(): + spider = BaseSpider(name='test') + + response = Mock() + response.body = b'{"key": "value"}' + response.request = Mock() + response.request.url = 'https://example.com/remote.json' + + actual = spider.save_response_to_disk(response, 'file.json', data_type='release_package', encoding='iso-8859-1') + + assert actual == { + 'success': True, + 'file_name': 'file.json', + 'data': b'{"key": "value"}', + "data_type": 'release_package', + "url": 'https://example.com/remote.json', + 'encoding': 'iso-8859-1', + 'post_to_api': True, + } + + +def test_save_data_to_disk(): + spider = BaseSpider(name='test') + + data = b'{"key": "value"}' + url = 'https://example.com/remote.json' + + actual = spider.save_data_to_disk(data, 'file.json', url=url, data_type='release_package', encoding='iso-8859-1') + + assert actual == { + 'success': True, + 'file_name': 'file.json', + 'data': b'{"key": "value"}', + "data_type": 'release_package', + "url": 'https://example.com/remote.json', + 'encoding': 'iso-8859-1', + 'post_to_api': True, + } + + def test_next_link(): url = 'https://example.com/remote.json' text_response = text.TextResponse('test') From 1808ae2c95f08f6f96b0ff3670ebb35c0bbca40c Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Mon, 25 May 2020 17:24:34 -0400 Subject: [PATCH 11/13] Use consistent key-order for item dict --- kingfisher_scrapy/base_spider.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py index cfbebc24e..965456581 100644 --- a/kingfisher_scrapy/base_spider.py +++ b/kingfisher_scrapy/base_spider.py @@ -94,15 +94,15 @@ def save_response_to_disk(self, response, filename, data_type=None, encoding='ut def save_data_to_disk(self, data, filename, url=None, data_type=None, encoding='utf-8', post_to_api=True): """ - Returns an item to yield + Returns an item to yield. """ return { - 'data': data, + 'success': True, 'file_name': filename, - 'url': url, + 'data': data, 'data_type': data_type, + 'url': url, 'encoding': encoding, - 'success': True, 'post_to_api': post_to_api, } @@ -121,7 +121,7 @@ def _build_file_item(self, number, line, data_type, url, encoding, file_name): 'data_type': data_type, 'url': url, 'encoding': encoding, - 'post_to_api': True + 'post_to_api': True, } def parse_json_lines(self, f, data_type, url, encoding='utf-8', file_name='data.json'): From 7704e9b97910aeb4cad4deedb2e0af2df21fa387 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Mon, 25 May 2020 21:37:50 -0400 Subject: [PATCH 12/13] Use clearer method and argument names. Put private methods together. Fix docstrings. Fix code style and consistency. --- kingfisher_scrapy/base_spider.py | 90 ++++++++++++++++++-------------- kingfisher_scrapy/util.py | 16 +++--- 2 files changed, 60 insertions(+), 46 deletions(-) diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py index 965456581..1538e71d1 100644 --- a/kingfisher_scrapy/base_spider.py +++ b/kingfisher_scrapy/base_spider.py @@ -48,9 +48,9 @@ def __init__(self, sample=None, note=None, from_date=None, until_date=None, # https://docs.scrapy.org/en/latest/topics/spiders.html#spider-arguments self.sample = sample == 'true' + self.note = note self.from_date = from_date self.until_date = until_date - self.note = note self.date_format = self.VALID_DATE_FORMATS[date_format] spider_arguments = { @@ -69,10 +69,10 @@ def from_crawler(cls, crawler, *args, **kwargs): # Checks Spider date ranges arguments if spider.from_date or spider.until_date: if not spider.from_date: - # 'from_date' defaults to 'default_from_date' spider class attribute + # Default to `default_from_date` class attribute. spider.from_date = spider.default_from_date if not spider.until_date: - # 'until_date' defaults to today + # Default to today. spider.until_date = datetime.now().strftime(spider.date_format) try: spider.from_date = datetime.strptime(spider.from_date, spider.date_format) @@ -85,6 +85,12 @@ def from_crawler(cls, crawler, *args, **kwargs): return spider + def get_start_time(self, format): + """ + Returns the formatted start time of the crawl. + """ + return self.crawler.stats.get_value('start_time').strftime(format) + def save_response_to_disk(self, response, filename, data_type=None, encoding='utf-8', post_to_api=True): """ Returns an item to yield, based on the response to a request. @@ -106,24 +112,32 @@ def save_data_to_disk(self, data, filename, url=None, data_type=None, encoding=' 'post_to_api': post_to_api, } - def get_start_time(self, format): - """ - Returns the formatted start time of the crawl. - """ - return self.crawler.stats.get_value('start_time').strftime(format) - - def _build_file_item(self, number, line, data_type, url, encoding, file_name): + def _build_file_item(self, number, data, data_type, url, encoding, file_name): return { 'success': True, 'number': number, 'file_name': file_name, - 'data': line, + 'data': data, 'data_type': data_type, 'url': url, 'encoding': encoding, 'post_to_api': True, } + def _get_package_metadata(self, f, skip_key): + """ + Returns the package metadata from a file object. + + :param f: a file object + :param str skip_key: the key to skip + :returns: the package metadata + :rtype: dict + """ + package = {} + for item in util.items(ijson.parse(f), '', skip_key=skip_key): + package.update(item) + return package + def parse_json_lines(self, f, data_type, url, encoding='utf-8', file_name='data.json'): for number, line in enumerate(f, 1): if self.sample and number > self.MAX_SAMPLE: @@ -132,28 +146,19 @@ def parse_json_lines(self, f, data_type, url, encoding='utf-8', file_name='data. line = line.decode(encoding=encoding) yield self._build_file_item(number, line, data_type, url, encoding, file_name) - def get_package(self, f, array_name): - """ - Returns the package data from a array_name_package object - """ - package = {} - for item in util.items(ijson.parse(f), '', array_name=array_name): - package.update(item) - return package - - def parse_json_array(self, f_package, f_list, data_type, url, encoding='utf-8', - array_field_name='releases', file_name='data.json'): + def parse_json_array(self, f_package, f_list, data_type, url, encoding='utf-8', array_field_name='releases', + file_name='data.json'): if self.sample: size = self.MAX_SAMPLE else: size = self.MAX_RELEASES_PER_PACKAGE - package = self.get_package(f_package, array_field_name) + package = self._get_package_metadata(f_package, array_field_name) for number, items in enumerate(util.grouper(ijson.items(f_list, '{}.item'.format(array_field_name)), size), 1): package[array_field_name] = filter(None, items) - yield self._build_file_item(number, json.dumps(package, default=util.default), data_type, url, encoding, - file_name) + data = json.dumps(package, default=util.default) + yield self._build_file_item(number, data, data_type, url, encoding, file_name) if self.sample: break @@ -161,28 +166,37 @@ def parse_json_array(self, f_package, f_list, data_type, url, encoding='utf-8', class ZipSpider(BaseSpider): def parse_zipfile(self, response, data_type, file_format=None, encoding='utf-8'): """ - Handling response with JSON data in ZIP files - - :param str file_format: The zipped file's format. If this is set to "json_lines", then each line of the zipped - file will be yielded separately. If this is set to "release_package", then the releases will be re-packaged - in groups of :const:`~kingfisher_scrapy.base_spider.BaseSpider.MAX_RELEASES_PER_PACKAGE` and yielded. In - both cases, only the zipped file will be saved to disk. If this is not set, the file will be yielded and - saved to disk. - :param response response: the response that contains the zip file. - :param str data_type: the zipped files data_type - :param str encoding: the zipped files encoding. Default to utf-8 + Handles a response that is a ZIP file. + + :param response response: the response + :param str data_type: the compressed files' ``data_type`` + :param str file_format: The compressed files' format + + ``json_lines`` + Yields each line of the compressed files. + The ZIP file is saved to disk. + ``release_package`` + Re-packages the releases in the compressed files in groups of + :const:`~kingfisher_scrapy.base_spider.BaseSpider.MAX_RELEASES_PER_PACKAGE`, and yields the packages. + The ZIP file is saved to disk. + ``None`` + Yields each compressed file. + Each compressed file is saved to disk. + :param str encoding: the compressed files' encoding """ if response.status == 200: if file_format: - self.save_response_to_disk(response, '{}.zip'.format(hashlib.md5(response.url.encode('utf-8')) - .hexdigest()), - post_to_api=False) + filename = '{}.zip'.format(hashlib.md5(response.url.encode('utf-8')).hexdigest()) + self.save_response_to_disk(response, filename, post_to_api=False) + zip_file = ZipFile(BytesIO(response.body)) for finfo in zip_file.infolist(): filename = finfo.filename if not filename.endswith('.json'): filename += '.json' + data = zip_file.open(finfo.filename) + if file_format == 'json_lines': yield from self.parse_json_lines(data, data_type, response.request.url, encoding=encoding, file_name=filename) diff --git a/kingfisher_scrapy/util.py b/kingfisher_scrapy/util.py index b78fe0ee7..fce72ac53 100644 --- a/kingfisher_scrapy/util.py +++ b/kingfisher_scrapy/util.py @@ -6,14 +6,14 @@ @utils.coroutine -def items_basecoro(target, prefix, map_type=None, array_name=None): +def items_basecoro(target, prefix, map_type=None, skip_key=None): """ - This is copied from ``ijson/common.py``. An ``array_name`` argument is added. If the ``array_name`` is in the - current path, the current event is skipped. Otherwise, the method is identical. + This is copied from ``ijson/common.py``. A ``skip_key`` argument is added. If the ``skip_key`` is in the current + path, the current event is skipped. Otherwise, the method is identical. """ while True: current, event, value = (yield) - if array_name and array_name in current: + if skip_key and skip_key in current: continue if current == prefix: if event in ('start_map', 'start_array'): @@ -28,13 +28,13 @@ def items_basecoro(target, prefix, map_type=None, array_name=None): target.send(value) -def items(events, prefix, map_type=None, array_name=None): +def items(events, prefix, map_type=None, skip_key=None): """ - This is copied from ``ijson/common.py``. An ``array_name`` argument is added, which is passed as a keyword argument - to :meth:`~kingfisher_scrapy.util.items_basecoro`. Otherwise, the method is identical. + This is copied from ``ijson/common.py``. A ``skip_key`` argument is added, which is passed as a keyword argument to + :meth:`~kingfisher_scrapy.util.items_basecoro`. Otherwise, the method is identical. """ return utils.coros2gen(events, - (items_basecoro, (prefix,), {'map_type': map_type, 'array_name': array_name}) # noqa: E128 + (items_basecoro, (prefix,), {'map_type': map_type, 'skip_key': skip_key}) # noqa: E128 ) From 6f1aceb8baade58495823b90e3b015a8f80909f6 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Mon, 25 May 2020 21:47:27 -0400 Subject: [PATCH 13/13] Simplify the log formatter using super() --- kingfisher_scrapy/log_formatter.py | 26 ++++++++------------------ kingfisher_scrapy/settings.py | 7 +++---- 2 files changed, 11 insertions(+), 22 deletions(-) diff --git a/kingfisher_scrapy/log_formatter.py b/kingfisher_scrapy/log_formatter.py index 0a0b21fa8..9e0b42bd9 100644 --- a/kingfisher_scrapy/log_formatter.py +++ b/kingfisher_scrapy/log_formatter.py @@ -1,22 +1,12 @@ -import logging - from scrapy import logformatter -from twisted.python.failure import Failure -class KingfisherCustomLoggerFormatter(logformatter.LogFormatter): - # from https://docs.scrapy.org/en/latest/_modules/scrapy/logformatter.html#LogFormatter.scraped +class KingfisherLogFormatter(logformatter.LogFormatter): + # https://docs.scrapy.org/en/latest/_modules/scrapy/logformatter.html#LogFormatter.scraped def scraped(self, item, response, spider): - """Logs a message when an item is scraped by a spider.""" - if isinstance(response, Failure): - src = response.getErrorMessage() - else: - src = response - return { - 'level': logging.DEBUG, - # we dont log the item content just the source - 'msg': "Scraped from %(src)s", - 'args': { - 'src': src - } - } + """ + Omits an item's `data` value from the log message. + """ + item = item.copy() + item.pop('data', None) + return super().scraped(item, response, spider) diff --git a/kingfisher_scrapy/settings.py b/kingfisher_scrapy/settings.py index 79ee6847b..fa953acf4 100644 --- a/kingfisher_scrapy/settings.py +++ b/kingfisher_scrapy/settings.py @@ -67,9 +67,9 @@ # 'scrapy.extensions.telnet.TelnetConsole': None, #} EXTENSIONS = { + # `KingfisherFilesStore` must run before `KingfisherAPI`, because the file needs to be written before the request + # is sent to Kingfisher Process. 'kingfisher_scrapy.extensions.KingfisherFilesStore': 100, - # KingfisherAPI must have lower priority than KingfisherStoreFiles, because the file needs to be written before the - # request is sent to Kingfisher Process. 'kingfisher_scrapy.extensions.KingfisherAPI': 500, } @@ -88,8 +88,7 @@ # instead of files to Kingfisher Process' API. To enable that, set this to the absolute path to the `FILES_STORE`. KINGFISHER_API_LOCAL_DIRECTORY = os.getenv('KINGFISHER_API_LOCAL_DIRECTORY') -# We use a custom formatter to avoid logging a item content -LOG_FORMATTER = 'kingfisher_scrapy.log_formatter.KingfisherCustomLoggerFormatter' +LOG_FORMATTER = 'kingfisher_scrapy.log_formatter.KingfisherLogFormatter' KINGFISHER_PARAGUAY_HACIENDA_REQUEST_TOKEN = os.getenv('KINGFISHER_PARAGUAY_HACIENDA_REQUEST_TOKEN') KINGFISHER_PARAGUAY_HACIENDA_CLIENT_SECRET = os.getenv('KINGFISHER_PARAGUAY_HACIENDA_CLIENT_SECRET')