diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py
index ffd6bad1c..1538e71d1 100644
--- a/kingfisher_scrapy/base_spider.py
+++ b/kingfisher_scrapy/base_spider.py
@@ -1,6 +1,5 @@
 import hashlib
 import json
-import os
 from datetime import datetime
 from io import BytesIO
 from zipfile import ZipFile
@@ -49,9 +48,9 @@ def __init__(self, sample=None, note=None, from_date=None, until_date=None,
         # https://docs.scrapy.org/en/latest/topics/spiders.html#spider-arguments
         self.sample = sample == 'true'
+        self.note = note
         self.from_date = from_date
         self.until_date = until_date
-        self.note = note
         self.date_format = self.VALID_DATE_FORMATS[date_format]

         spider_arguments = {
@@ -70,10 +69,10 @@ def from_crawler(cls, crawler, *args, **kwargs):
         # Checks Spider date ranges arguments
         if spider.from_date or spider.until_date:
             if not spider.from_date:
-                # 'from_date' defaults to 'default_from_date' spider class attribute
+                # Default to `default_from_date` class attribute.
                 spider.from_date = spider.default_from_date
             if not spider.until_date:
-                # 'until_date' defaults to today
+                # Default to today.
                 spider.until_date = datetime.now().strftime(spider.date_format)
             try:
                 spider.from_date = datetime.strptime(spider.from_date, spider.date_format)
@@ -86,116 +85,80 @@ def from_crawler(cls, crawler, *args, **kwargs):

         return spider

-    def get_local_file_path_including_filestore(self, filename):
-        """
-        Prepends Scrapy's storage directory and the crawl's relative directory to the filename.
-        """
-        return os.path.join(self.crawler.settings['FILES_STORE'], self._get_crawl_path(), filename)
-
-    def get_local_file_path_excluding_filestore(self, filename):
-        """
-        Prepends the crawl's relative directory to the filename.
+    def get_start_time(self, format):
         """
-        return os.path.join(self._get_crawl_path(), filename)
-
-    def save_response_to_disk(self, response, filename, data_type=None, encoding='utf-8'):
+        Returns the formatted start time of the crawl.
         """
-        Writes the response's body to the filename in the crawl's directory.
+        return self.crawler.stats.get_value('start_time').strftime(format)

-        Writes a ``.fileinfo`` metadata file in the crawl's directory, and returns a dict with the metadata.
+    def save_response_to_disk(self, response, filename, data_type=None, encoding='utf-8', post_to_api=True):
         """
-        return self._save_response_to_disk(response.body, filename, response.request.url, data_type, encoding)
-
-    def save_data_to_disk(self, data, filename, url=None, data_type=None, encoding='utf-8'):
+        Returns an item to yield, based on the response to a request.
         """
-        Writes the data to the filename in the crawl's directory.
+        return self.save_data_to_disk(response.body, filename, response.request.url, data_type, encoding,
+                                      post_to_api)

-        Writes a ``.fileinfo`` metadata file in the crawl's directory, and returns a dict with the metadata.
+    def save_data_to_disk(self, data, filename, url=None, data_type=None, encoding='utf-8', post_to_api=True):
         """
-        return self._save_response_to_disk(data, filename, url, data_type, encoding)
-
-    def get_start_time(self, format):
+        Returns an item to yield.
         """
-        Returns the formatted start time of the crawl.
- """ - return self.crawler.stats.get_value('start_time').strftime(format) - - def _save_response_to_disk(self, data, filename, url, data_type, encoding): - self._write_file(filename, data) - - metadata = { - 'url': url, + return { + 'success': True, + 'file_name': filename, + 'data': data, 'data_type': data_type, + 'url': url, 'encoding': encoding, + 'post_to_api': post_to_api, } - self._write_file(filename + '.fileinfo', metadata) - - metadata['success'] = True - metadata['file_name'] = filename - - return metadata - - def _write_file(self, filename, data): - path = self.get_local_file_path_including_filestore(filename) - os.makedirs(os.path.dirname(path), exist_ok=True) - - if isinstance(data, bytes): - mode = 'wb' - else: - mode = 'w' - - with open(path, mode) as f: - if isinstance(data, (bytes, str)): - f.write(data) - else: - json.dump(data, f) - - def _get_crawl_path(self): - name = self.name - if self.sample: - name += '_sample' - return os.path.join(name, self.get_start_time('%Y%m%d_%H%M%S')) - - def _build_file_item(self, number, line, data_type, url, encoding): + def _build_file_item(self, number, data, data_type, url, encoding, file_name): return { 'success': True, 'number': number, - 'file_name': 'data.json', - 'data': line, + 'file_name': file_name, + 'data': data, 'data_type': data_type, 'url': url, 'encoding': encoding, + 'post_to_api': True, } - def parse_json_lines(self, f, data_type, url, encoding='utf-8'): - for number, line in enumerate(f, 1): - if self.sample and number > self.MAX_SAMPLE: - break - if isinstance(line, bytes): - line = line.decode(encoding=encoding) - yield self._build_file_item(number, line, data_type, url, encoding) - - def get_package(self, f, array_name): + def _get_package_metadata(self, f, skip_key): """ - Returns the package data from a array_name_package object + Returns the package metadata from a file object. 
+
+        :param f: a file object
+        :param str skip_key: the key to skip
+        :returns: the package metadata
+        :rtype: dict
         """
         package = {}
-        for item in util.items(ijson.parse(f), '', array_name=array_name):
+        for item in util.items(ijson.parse(f), '', skip_key=skip_key):
             package.update(item)
         return package

-    def parse_json_array(self, f_package, f_list, data_type, url, encoding='utf-8', array_field_name='releases'):
+    def parse_json_lines(self, f, data_type, url, encoding='utf-8', file_name='data.json'):
+        for number, line in enumerate(f, 1):
+            if self.sample and number > self.MAX_SAMPLE:
+                break
+            if isinstance(line, bytes):
+                line = line.decode(encoding=encoding)
+            yield self._build_file_item(number, line, data_type, url, encoding, file_name)
+
+    def parse_json_array(self, f_package, f_list, data_type, url, encoding='utf-8', array_field_name='releases',
+                         file_name='data.json'):
         if self.sample:
             size = self.MAX_SAMPLE
         else:
             size = self.MAX_RELEASES_PER_PACKAGE
-        package = self.get_package(f_package, array_field_name)
+        package = self._get_package_metadata(f_package, array_field_name)

         for number, items in enumerate(util.grouper(ijson.items(f_list, '{}.item'.format(array_field_name)), size), 1):
             package[array_field_name] = filter(None, items)
-            yield self._build_file_item(number, json.dumps(package, default=util.default), data_type, url, encoding)
+            data = json.dumps(package, default=util.default)
+            yield self._build_file_item(number, data, data_type, url, encoding, file_name)
             if self.sample:
                 break

@@ -203,33 +166,44 @@ def parse_json_array(self, f_package, f_list, data_type, url, encoding='utf-8',
 class ZipSpider(BaseSpider):
     def parse_zipfile(self, response, data_type, file_format=None, encoding='utf-8'):
         """
-        Handling response with JSON data in ZIP files
-
-        :param str file_format: The zipped file's format. If this is set to "json_lines", then each line of the zipped
-            file will be yielded separately. If this is set to "release_package", then the releases will be re-packaged
-            in groups of :const:`~kingfisher_scrapy.base_spider.BaseSpider.MAX_RELEASES_PER_PACKAGE` and yielded. In
-            both cases, only the zipped file will be saved to disk. If this is not set, the file will be yielded and
-            saved to disk.
-        :param response response: the response that contains the zip file.
-        :param str data_type: the zipped files data_type
-        :param str encoding: the zipped files encoding. Default to utf-8
+        Handles a response that is a ZIP file.
+
+        :param response response: the response
+        :param str data_type: the compressed files' ``data_type``
+        :param str file_format: The compressed files' format
+
+            ``json_lines``
+              Yields each line of the compressed files.
+              The ZIP file is saved to disk.
+            ``release_package``
+              Re-packages the releases in the compressed files in groups of
+              :const:`~kingfisher_scrapy.base_spider.BaseSpider.MAX_RELEASES_PER_PACKAGE`, and yields the packages.
+              The ZIP file is saved to disk.
+            ``None``
+              Yields each compressed file.
+              Each compressed file is saved to disk.
+        :param str encoding: the compressed files' encoding
         """
         if response.status == 200:
             if file_format:
-                self.save_response_to_disk(response, '{}.zip'.format(hashlib.md5(response.url.encode('utf-8'))
-                                           .hexdigest()))
+                filename = '{}.zip'.format(hashlib.md5(response.url.encode('utf-8')).hexdigest())
+                self.save_response_to_disk(response, filename, post_to_api=False)
+
             zip_file = ZipFile(BytesIO(response.body))
             for finfo in zip_file.infolist():
                 filename = finfo.filename
                 if not filename.endswith('.json'):
                     filename += '.json'
+
                 data = zip_file.open(finfo.filename)
+
                 if file_format == 'json_lines':
-                    yield from self.parse_json_lines(data, data_type, response.request.url, encoding=encoding)
+                    yield from self.parse_json_lines(data, data_type, response.request.url, encoding=encoding,
+                                                     file_name=filename)
                 elif file_format == 'release_package':
                     package = zip_file.open(finfo.filename)
                     yield from self.parse_json_array(package, data, data_type, response.request.url,
-                                                     encoding=encoding)
+                                                     encoding=encoding, file_name=filename)
                 else:
                     yield self.save_data_to_disk(data.read(), filename, data_type=data_type,
                                                  url=response.request.url, encoding=encoding)
diff --git a/kingfisher_scrapy/extensions.py b/kingfisher_scrapy/extensions.py
index 93aa97f0e..f730ecde9 100644
--- a/kingfisher_scrapy/extensions.py
+++ b/kingfisher_scrapy/extensions.py
@@ -8,6 +8,70 @@
 # https://docs.scrapy.org/en/latest/topics/extensions.html#writing-your-own-extension


+class KingfisherFilesStore:
+    def __init__(self, directory):
+        self.directory = directory
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        directory = crawler.settings['FILES_STORE']
+        extension = cls(directory)
+        crawler.signals.connect(extension.item_scraped, signal=signals.item_scraped)
+        return extension
+
+    def item_scraped(self, item, spider):
+        """
+        Writes the item's data to the filename in the crawl's directory.
+
+        Writes a ``.fileinfo`` metadata file in the crawl's directory, and returns a dict with the metadata.
+        """
+        if 'number' not in item:
+            self._write_file(item['file_name'], item['data'], spider)
+            metadata = {
+                'url': item['url'],
+                'data_type': item['data_type'],
+                'encoding': item['encoding'],
+            }
+            self._write_file(item['file_name'] + '.fileinfo', metadata, spider)
+            item['path_including_file_store'] = self.get_local_file_path_including_filestore(item['file_name'],
+                                                                                             spider)
+            item['path_excluding_file_store'] = self.get_local_file_path_excluding_filestore(item['file_name'],
+                                                                                             spider)
+
+    def _write_file(self, filename, data, spider):
+        path = self.get_local_file_path_including_filestore(filename, spider)
+        os.makedirs(os.path.dirname(path), exist_ok=True)
+
+        if isinstance(data, bytes):
+            mode = 'wb'
+        else:
+            mode = 'w'
+
+        with open(path, mode) as f:
+            if isinstance(data, (bytes, str)):
+                f.write(data)
+            else:
+                json.dump(data, f)
+
+    def get_local_file_path_including_filestore(self, filename, spider):
+        """
+        Prepends Scrapy's storage directory and the crawl's relative directory to the filename.
+        """
+        return os.path.join(self.directory, self._get_crawl_path(spider), filename)
+
+    def get_local_file_path_excluding_filestore(self, filename, spider):
+        """
+        Prepends the crawl's relative directory to the filename.
+ """ + return os.path.join(self._get_crawl_path(spider), filename) + + def _get_crawl_path(self, spider): + name = spider.name + if spider.sample: + name += '_sample' + return os.path.join(name, spider.get_start_time('%Y%m%d_%H%M%S')) + + class KingfisherAPI: def __init__(self, url, key, directory=None): """ @@ -54,6 +118,9 @@ def item_scraped(self, item, spider): If the Scrapy item indicates success, sends a Kingfisher Process API request to create either a Kingfisher Process file or file item. Otherwise, sends an API request to create a file error. """ + if not item.get('post_to_api', True): + return + data = { 'collection_source': spider.name, 'collection_data_version': spider.get_start_time('%Y-%m-%d %H:%M:%S'), @@ -78,11 +145,11 @@ def item_scraped(self, item, spider): # File else: if self.directory: - path = spider.get_local_file_path_excluding_filestore(item['file_name']) + path = item['path_excluding_file_store'] data['local_file_name'] = os.path.join(self.directory, path) files = {} else: - path = spider.get_local_file_path_including_filestore(item['file_name']) + path = item['path_including_file_store'] f = open(path, 'rb') files = {'file': (item['file_name'], f, 'application/json')} diff --git a/kingfisher_scrapy/log_formatter.py b/kingfisher_scrapy/log_formatter.py new file mode 100644 index 000000000..9e0b42bd9 --- /dev/null +++ b/kingfisher_scrapy/log_formatter.py @@ -0,0 +1,12 @@ +from scrapy import logformatter + + +class KingfisherLogFormatter(logformatter.LogFormatter): + # https://docs.scrapy.org/en/latest/_modules/scrapy/logformatter.html#LogFormatter.scraped + def scraped(self, item, response, spider): + """ + Omits an item's `data` value from the log message. + """ + item = item.copy() + item.pop('data', None) + return super().scraped(item, response, spider) diff --git a/kingfisher_scrapy/settings.py b/kingfisher_scrapy/settings.py index 6c0a246ee..fa953acf4 100644 --- a/kingfisher_scrapy/settings.py +++ b/kingfisher_scrapy/settings.py @@ -67,7 +67,10 @@ # 'scrapy.extensions.telnet.TelnetConsole': None, #} EXTENSIONS = { - 'kingfisher_scrapy.extensions.KingfisherAPI': 0, + # `KingfisherFilesStore` must run before `KingfisherAPI`, because the file needs to be written before the request + # is sent to Kingfisher Process. + 'kingfisher_scrapy.extensions.KingfisherFilesStore': 100, + 'kingfisher_scrapy.extensions.KingfisherAPI': 500, } # Configure item pipelines @@ -85,6 +88,8 @@ # instead of files to Kingfisher Process' API. To enable that, set this to the absolute path to the `FILES_STORE`. 
 KINGFISHER_API_LOCAL_DIRECTORY = os.getenv('KINGFISHER_API_LOCAL_DIRECTORY')

+LOG_FORMATTER = 'kingfisher_scrapy.log_formatter.KingfisherLogFormatter'
+
 KINGFISHER_PARAGUAY_HACIENDA_REQUEST_TOKEN = os.getenv('KINGFISHER_PARAGUAY_HACIENDA_REQUEST_TOKEN')
 KINGFISHER_PARAGUAY_HACIENDA_CLIENT_SECRET = os.getenv('KINGFISHER_PARAGUAY_HACIENDA_CLIENT_SECRET')
diff --git a/kingfisher_scrapy/spiders/digiwhist_base.py b/kingfisher_scrapy/spiders/digiwhist_base.py
index 8141f1e67..574881599 100644
--- a/kingfisher_scrapy/spiders/digiwhist_base.py
+++ b/kingfisher_scrapy/spiders/digiwhist_base.py
@@ -1,6 +1,5 @@
-import json
-import os
 import tarfile
+from io import BytesIO

 from kingfisher_scrapy.base_spider import BaseSpider

@@ -10,24 +9,10 @@ class DigiwhistBase(BaseSpider):

     def parse(self, response):
         if response.status == 200:
-            save_file_name = self.get_local_file_path_including_filestore('file.tar.gz')
-
-            # Create folder for data
-            os.makedirs(os.path.dirname(save_file_name), exist_ok=True)
-
-            # Save original file
-            with open(save_file_name, "wb") as fp:
-                fp.write(response.body)
-
-            # Save some extra info alongside that file
-            with open(save_file_name + '.fileinfo', 'w') as f:
-                f.write(json.dumps({
-                    'url': response.request.url,
-                    'data_type': 'release_package_json_lines',
-                }))
+            yield self.save_response_to_disk(response, 'file.tar.gz', post_to_api=False)

             # Load a line at the time, pass it to API
-            with tarfile.open(save_file_name, "r:gz") as tar:
+            with tarfile.open(fileobj=BytesIO(response.body), mode="r:gz") as tar:
                 with tar.extractfile(tar.getnames()[0]) as readfp:
                     yield from self.parse_json_lines(readfp, 'release_package', self.start_urls[0])
         else:
diff --git a/kingfisher_scrapy/spiders/georgia_opendata.py b/kingfisher_scrapy/spiders/georgia_opendata.py
index ed495cb9f..c33634727 100644
--- a/kingfisher_scrapy/spiders/georgia_opendata.py
+++ b/kingfisher_scrapy/spiders/georgia_opendata.py
@@ -1,11 +1,9 @@
-from zipfile import ZipFile
-
 import scrapy

-from kingfisher_scrapy.base_spider import BaseSpider
+from kingfisher_scrapy.base_spider import ZipSpider


-class GeorgiaOpenData(BaseSpider):
+class GeorgiaOpenData(ZipSpider):
     name = 'georgia_opendata'
     custom_settings = {
         # This has to download a 400MB file so .....
@@ -14,32 +12,12 @@ class GeorgiaOpenData(BaseSpider):

     def start_requests(self):
         yield scrapy.Request(
-            url='http://opendata.spa.ge/json/allTenders.zip',
-            callback=self.parse_zip
+            url='http://opendata.spa.ge/json/allTenders.zip'
         )

-    def parse_zip(self, response):
+    def parse(self, response):
         if response.status == 200:
-
-            # Save original file
-            save_file_name = self.get_local_file_path_including_filestore('allTenders.zip')
-            with open(save_file_name, "wb") as fp:
-                fp.write(response.body)
-
-            # Now extract each file one at a time, save to disk and pass to pipelines for processing
-            zip_file = ZipFile(save_file_name)
-            for finfo in zip_file.infolist():
-                if finfo.filename.endswith('.json'):
-                    data = zip_file.open(finfo.filename).read()
-                    yield self.save_data_to_disk(
-                        data,
-                        finfo.filename.replace('/', '-'),
-                        data_type='release_package',
-                        url=response.request.url
-                    )
-                    if self.sample:
-                        return
-
+            yield from self.parse_zipfile(response, 'release_package', file_format='release_package')
         else:
             yield {
                 'success': False,
diff --git a/kingfisher_scrapy/util.py b/kingfisher_scrapy/util.py
index b78fe0ee7..fce72ac53 100644
--- a/kingfisher_scrapy/util.py
+++ b/kingfisher_scrapy/util.py
@@ -6,14 +6,14 @@


 @utils.coroutine
-def items_basecoro(target, prefix, map_type=None, array_name=None):
+def items_basecoro(target, prefix, map_type=None, skip_key=None):
     """
-    This is copied from ``ijson/common.py``. An ``array_name`` argument is added. If the ``array_name`` is in the
-    current path, the current event is skipped. Otherwise, the method is identical.
+    This is copied from ``ijson/common.py``. A ``skip_key`` argument is added. If the ``skip_key`` is in the current
+    path, the current event is skipped. Otherwise, the method is identical.
     """
     while True:
         current, event, value = (yield)
-        if array_name and array_name in current:
+        if skip_key and skip_key in current:
             continue
         if current == prefix:
             if event in ('start_map', 'start_array'):
@@ -28,13 +28,13 @@ def items_basecoro(target, prefix, map_type=None, array_name=None):
             target.send(value)


-def items(events, prefix, map_type=None, array_name=None):
+def items(events, prefix, map_type=None, skip_key=None):
     """
-    This is copied from ``ijson/common.py``. An ``array_name`` argument is added, which is passed as a keyword argument
-    to :meth:`~kingfisher_scrapy.util.items_basecoro`. Otherwise, the method is identical.
+    This is copied from ``ijson/common.py``. A ``skip_key`` argument is added, which is passed as a keyword argument to
+    :meth:`~kingfisher_scrapy.util.items_basecoro`. Otherwise, the method is identical.
""" return utils.coros2gen(events, - (items_basecoro, (prefix,), {'map_type': map_type, 'array_name': array_name}) # noqa: E128 + (items_basecoro, (prefix,), {'map_type': map_type, 'skip_key': skip_key}) # noqa: E128 ) diff --git a/tests/test_base_spider.py b/tests/test_base_spider.py index dbfdab5d8..daa86698f 100644 --- a/tests/test_base_spider.py +++ b/tests/test_base_spider.py @@ -31,110 +31,44 @@ def test_sample_no_kwarg(): assert spider.sample is False -@pytest.mark.parametrize('sample,expected', [ - (None, 'data/test/20010203_040506/file.json'), - ('true', 'data/test_sample/20010203_040506/file.json'), -]) -def test_get_local_file_path_including_filestore(sample, expected): - spider = spider_with_crawler(sample=sample) - spider.crawler.settings['FILES_STORE'] = 'data' - - assert spider.get_local_file_path_including_filestore('file.json') == expected - - -@pytest.mark.parametrize('sample,expected', [ - (None, 'test/20010203_040506/file.json'), - ('true', 'test_sample/20010203_040506/file.json'), -]) -def test_get_local_file_path_excluding_filestore(sample, expected): - spider = spider_with_crawler(sample=sample) - - assert spider.get_local_file_path_excluding_filestore('file.json') == expected - - -@pytest.mark.parametrize('sample,path', [ - (None, 'test/20010203_040506/file.json'), - ('true', 'test_sample/20010203_040506/file.json'), -]) -def test_save_response_to_disk(sample, path): - spider = spider_with_crawler(sample=sample) - - with TemporaryDirectory() as tmpdirname: - files_store = os.path.join(tmpdirname, 'data') - spider.crawler.settings['FILES_STORE'] = files_store - - response = Mock() - response.body = b'{"key": "value"}' - response.request = Mock() - response.request.url = 'https://example.com/remote.json' - - actual = spider.save_response_to_disk(response, 'file.json', data_type='release_package', - encoding='iso-8859-1') - - with open(os.path.join(files_store, path)) as f: - assert f.read() == '{"key": "value"}' - - with open(os.path.join(files_store, path + '.fileinfo')) as f: - assert json.load(f) == { - 'url': 'https://example.com/remote.json', - 'data_type': 'release_package', - 'encoding': 'iso-8859-1', - } - - assert actual == { - 'success': True, - 'file_name': 'file.json', - "data_type": 'release_package', - "url": 'https://example.com/remote.json', - 'encoding': 'iso-8859-1', - } - - -@pytest.mark.parametrize('sample,path', [ - (None, 'test/20010203_040506/file.json'), - ('true', 'test_sample/20010203_040506/file.json'), -]) -def test_save_data_to_disk(sample, path): - spider = spider_with_crawler(sample=sample) - - with TemporaryDirectory() as tmpdirname: - files_store = os.path.join(tmpdirname, 'data') - spider.crawler.settings['FILES_STORE'] = files_store - - data = b'{"key": "value"}' - url = 'https://example.com/remote.json' - - actual = spider.save_data_to_disk(data, 'file.json', url=url, data_type='release_package', - encoding='iso-8859-1') +def test_save_response_to_disk(): + spider = BaseSpider(name='test') - with open(os.path.join(files_store, path)) as f: - assert f.read() == '{"key": "value"}' + response = Mock() + response.body = b'{"key": "value"}' + response.request = Mock() + response.request.url = 'https://example.com/remote.json' - with open(os.path.join(files_store, path + '.fileinfo')) as f: - assert json.load(f) == { - 'url': 'https://example.com/remote.json', - 'data_type': 'release_package', - 'encoding': 'iso-8859-1', - } + actual = spider.save_response_to_disk(response, 'file.json', data_type='release_package', encoding='iso-8859-1') 
-        assert actual == {
-            'success': True,
-            'file_name': 'file.json',
-            "data_type": 'release_package',
-            "url": 'https://example.com/remote.json',
-            'encoding': 'iso-8859-1',
-        }
+    assert actual == {
+        'success': True,
+        'file_name': 'file.json',
+        'data': b'{"key": "value"}',
+        "data_type": 'release_package',
+        "url": 'https://example.com/remote.json',
+        'encoding': 'iso-8859-1',
+        'post_to_api': True,
+    }


-def test_save_data_to_disk_with_existing_directory():
-    spider = spider_with_crawler()
+def test_save_data_to_disk():
+    spider = BaseSpider(name='test')

-    with TemporaryDirectory() as tmpdirname:
-        files_store = os.path.join(tmpdirname, 'data')
-        spider.crawler.settings['FILES_STORE'] = files_store
-        os.makedirs(os.path.join(files_store, 'test/20010203_040506'))
+    data = b'{"key": "value"}'
+    url = 'https://example.com/remote.json'

-        spider.save_data_to_disk(b'{"key": "value"}', 'file.json')  # no FileExistsError exception
+    actual = spider.save_data_to_disk(data, 'file.json', url=url, data_type='release_package', encoding='iso-8859-1')
+
+    assert actual == {
+        'success': True,
+        'file_name': 'file.json',
+        'data': b'{"key": "value"}',
+        "data_type": 'release_package',
+        "url": 'https://example.com/remote.json',
+        'encoding': 'iso-8859-1',
+        'post_to_api': True,
+    }


 def test_next_link():
diff --git a/tests/test_extensions.py b/tests/test_extensions.py
index 41cbce848..b01673b84 100644
--- a/tests/test_extensions.py
+++ b/tests/test_extensions.py
@@ -1,9 +1,12 @@
+import json
+import os
+from tempfile import TemporaryDirectory
 from unittest.mock import Mock, patch

 import pytest
 from scrapy.exceptions import NotConfigured

-from kingfisher_scrapy.extensions import KingfisherAPI
+from kingfisher_scrapy.extensions import KingfisherAPI, KingfisherFilesStore
 from tests import spider_with_crawler


@@ -51,14 +54,20 @@ def test_from_crawler_missing_arguments(api_url, api_key):
 @pytest.mark.parametrize('encoding,encoding2', [(None, 'utf-8'), ('iso-8859-1', 'iso-8859-1')])
 @pytest.mark.parametrize('directory', [False, True])
 @pytest.mark.parametrize('ok', [True, False])
-def test_item_scraped_file(sample, is_sample, path, note, encoding, encoding2, directory, ok, tmpdir, caplog):
+@pytest.mark.parametrize('post_to_api', [True, True, False])
+def test_item_scraped_file(sample, is_sample, path, note, encoding, encoding2, directory, ok, tmpdir, caplog,
+                           post_to_api):
     spider = spider_after_open(tmpdir, sample=sample, note=note)
     if directory:
         spider.crawler.settings['KINGFISHER_API_LOCAL_DIRECTORY'] = str(tmpdir.join('xxx'))
+    spider.crawler.settings['FILES_STORE'] = tmpdir
+
+    extension_store = KingfisherFilesStore.from_crawler(spider.crawler)
     extension = KingfisherAPI.from_crawler(spider.crawler)

-    spider.save_data_to_disk(b'{"key": "value"}', 'file.json', url='https://example.com/remote.json')
+    extension_store.item_scraped(spider.save_data_to_disk(b'{"key": "value"}', 'file.json',
+                                                          url='https://example.com/remote.json'), spider)

     with patch('requests.post') as mocked:
         response = Mock()
@@ -66,25 +75,32 @@ def test_item_scraped_file(sample, is_sample, path, note, encoding, encoding2, d
         response.status_code = 400
         mocked.return_value = response

-        data = {
-            'success': True,
-            'file_name': 'file.json',
-            'url': 'https://example.com/remote.json',
+        data = spider.save_data_to_disk(
+            data=None,
+            filename='file.json',
+            url='https://example.com/remote.json',  # Specific to this test case.
-            'data_type': 'release_package',
-        }
-        if encoding:
-            data['encoding'] = encoding
+            data_type='release_package',
+            encoding=encoding2,
+            post_to_api=post_to_api
+        )
+        data['path_excluding_file_store'] = os.path.join(tmpdir, path)
+        data['path_including_file_store'] = os.path.join(tmpdir, path)
+        if directory:
+            data['path_excluding_file_store'] = tmpdir.join('xxx', path)

         extension.item_scraped(data, spider)

         if not ok:
-            message = 'Failed to post [https://example.com/remote.json]. API status code: 400'
+            if not post_to_api:
+                assert len(caplog.records) == 0
+            else:
+                message = 'Failed to post [https://example.com/remote.json]. API status code: 400'

-            assert len(caplog.records) == 1
-            assert caplog.records[0].name == 'test'
-            assert caplog.records[0].levelname == 'WARNING'
-            assert caplog.records[0].message == message
+                assert len(caplog.records) == 1
+                assert caplog.records[0].name == 'test'
+                assert caplog.records[0].levelname == 'WARNING'
+                assert caplog.records[0].message == message

         expected = {
             'collection_source': 'test',
@@ -100,22 +116,25 @@ def test_item_scraped_file(sample, is_sample, path, note, encoding, encoding2, d
             expected['collection_note'] = note
         if directory:
             expected['local_file_name'] = tmpdir.join('xxx', path)
-
-        with open(tmpdir.join(path), 'rb') as f:
-            assert mocked.call_count == 1
-            assert mocked.call_args[0] == ('http://httpbin.org/anything/api/v1/submit/file/',)
-            assert mocked.call_args[1]['headers'] == {'Authorization': 'ApiKey xxx'}
-            assert mocked.call_args[1]['data'] == expected
-            assert len(mocked.call_args[1]) == 3
-
-            if directory:
-                assert mocked.call_args[1]['files'] == {}
-            else:
-                assert len(mocked.call_args[1]['files']) == 1
-                assert len(mocked.call_args[1]['files']['file']) == 3
-                assert mocked.call_args[1]['files']['file'][0] == 'file.json'
-                assert mocked.call_args[1]['files']['file'][1].read() == f.read()
-                assert mocked.call_args[1]['files']['file'][2] == 'application/json'
+        if not post_to_api:
+            assert mocked.call_count == 0
+        else:
+            with open(tmpdir.join(path), 'rb') as f:
+                assert mocked.call_count == 1
+                assert mocked.call_args[0] == ('http://httpbin.org/anything/api/v1/submit/file/',)
+                assert mocked.call_args[1]['headers'] == {'Authorization': 'ApiKey xxx'}
+                assert mocked.call_args[1]['data'] == expected
+                if post_to_api:
+                    assert len(mocked.call_args[1]) == 3
+
+                if directory:
+                    assert mocked.call_args[1]['files'] == {}
+                else:
+                    assert len(mocked.call_args[1]['files']) == 1
+                    assert len(mocked.call_args[1]['files']['file']) == 3
+                    assert mocked.call_args[1]['files']['file'][0] == 'file.json'
+                    assert mocked.call_args[1]['files']['file'][1].read() == f.read()
+                    assert mocked.call_args[1]['files']['file'][2] == 'application/json'


 @pytest.mark.parametrize('sample,is_sample', [(None, False), ('true', True)])
@@ -133,18 +152,15 @@ def test_item_scraped_file_item(sample, is_sample, note, encoding, encoding2, ok
         response.status_code = 400
         mocked.return_value = response

-        data = {
-            'success': True,
-            'file_name': 'file.json',
-            'url': 'https://example.com/remote.json',
+        data = spider._build_file_item(
+            1,
+            b'{"key": "value"}',
+            url='https://example.com/remote.json',  # Specific to this test case.
-            'data_type': 'release_package',
-            'number': 1,
-            'data': b'{"key": "value"}',
-        }
-        if encoding:
-            data['encoding'] = encoding
-
+            data_type='release_package',
+            encoding=encoding2,
+            file_name='data.json',
+        )
         extension.item_scraped(data, spider)

         if not ok:
@@ -159,13 +175,13 @@ def test_item_scraped_file_item(sample, is_sample, note, encoding, encoding2, ok
             'collection_source': 'test',
             'collection_data_version': '2001-02-03 04:05:06',
             'collection_sample': is_sample,
-            'file_name': 'file.json',
+            'file_name': 'data.json',
             'url': 'https://example.com/remote.json',  # Specific to this test case.
             'data_type': 'release_package',
             'encoding': encoding2,
             'number': 1,
-            'data': b'{"key": "value"}',
+            'data': b'{"key": "value"}'
         }
         if note:
             expected['collection_note'] = note
@@ -272,3 +288,95 @@ def test_spider_closed_other_reason(tmpdir):
         extension.spider_closed(spider, 'xxx')

         mocked.assert_not_called()
+
+
+@pytest.mark.parametrize('sample,path', [
+    (None, 'test/20010203_040506/file.json'),
+    ('true', 'test_sample/20010203_040506/file.json'),
+])
+def test_save_response_to_disk(sample, path, tmpdir):
+    spider = spider_after_open(tmpdir, sample=sample)
+    with TemporaryDirectory() as tmpdirname:
+        files_store = os.path.join(tmpdirname, 'data')
+        spider.crawler.settings['FILES_STORE'] = files_store
+        extension_store = KingfisherFilesStore.from_crawler(spider.crawler)
+        response = Mock()
+        response.body = b'{"key": "value"}'
+        response.request = Mock()
+        response.request.url = 'https://example.com/remote.json'
+
+        extension_store.item_scraped(spider.save_response_to_disk(response, 'file.json',
+                                                                  data_type='release_package',
+                                                                  encoding='iso-8859-1'), spider)
+
+        with open(os.path.join(files_store, path)) as f:
+            assert f.read() == '{"key": "value"}'
+
+        with open(os.path.join(files_store, path + '.fileinfo')) as f:
+            assert json.load(f) == {
+                'url': 'https://example.com/remote.json',
+                'data_type': 'release_package',
+                'encoding': 'iso-8859-1',
+            }
+
+
+@pytest.mark.parametrize('sample,path', [
+    (None, 'test/20010203_040506/file.json'),
+    ('true', 'test_sample/20010203_040506/file.json'),
+])
+def test_save_data_to_disk(sample, path):
+    spider = spider_with_crawler(sample=sample)
+
+    with TemporaryDirectory() as tmpdirname:
+        files_store = os.path.join(tmpdirname, 'data')
+        spider.crawler.settings['FILES_STORE'] = files_store
+        extension_store = KingfisherFilesStore.from_crawler(spider.crawler)
+
+        data = b'{"key": "value"}'
+        url = 'https://example.com/remote.json'
+
+        extension_store.item_scraped(spider.save_data_to_disk(data, 'file.json', url=url,
+                                                              data_type='release_package',
+                                                              encoding='iso-8859-1'), spider)
+
+        with open(os.path.join(files_store, path)) as f:
+            assert f.read() == '{"key": "value"}'
+
+        with open(os.path.join(files_store, path + '.fileinfo')) as f:
+            assert json.load(f) == {
+                'url': 'https://example.com/remote.json',
+                'data_type': 'release_package',
+                'encoding': 'iso-8859-1',
+            }
+
+
+def test_save_data_to_disk_with_existing_directory():
+    spider = spider_with_crawler()
+    with TemporaryDirectory() as tmpdirname:
+        files_store = os.path.join(tmpdirname, 'data')
+        spider.crawler.settings['FILES_STORE'] = files_store
+        extension_store = KingfisherFilesStore.from_crawler(spider.crawler)
+        os.makedirs(os.path.join(files_store, 'test/20010203_040506'))
+        extension_store.item_scraped(spider.save_data_to_disk(b'{"key": "value"}', 'file.json'),
+                                     spider)  # no FileExistsError exception
+
+
+@pytest.mark.parametrize('sample,expected', [
+    (None, 'data/test/20010203_040506/file.json'),
+    ('true', 'data/test_sample/20010203_040506/file.json'),
+])
+def test_get_local_file_path_including_filestore(sample, expected):
+    spider = spider_with_crawler(sample=sample)
+    spider.crawler.settings['FILES_STORE'] = 'data'
+    extension_store = KingfisherFilesStore.from_crawler(spider.crawler)
+    assert extension_store.get_local_file_path_including_filestore('file.json', spider) == expected
+
+
+@pytest.mark.parametrize('sample,expected', [
+    (None, 'test/20010203_040506/file.json'),
+    ('true', 'test_sample/20010203_040506/file.json'),
+])
+def test_get_local_file_path_excluding_filestore(sample, expected):
+    spider = spider_with_crawler(sample=sample)
+    extension_store = KingfisherFilesStore.from_crawler(spider.crawler)
+    assert extension_store.get_local_file_path_excluding_filestore('file.json', spider) == expected
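
A minimal sketch of how a spider would use the reworked flow, assuming a hypothetical ExampleSpider and source URL; only save_response_to_disk() and the two extensions come from the patch above. The spider yields the dict built by save_response_to_disk(); KingfisherFilesStore.item_scraped() then writes the file and its .fileinfo metadata under FILES_STORE, and KingfisherAPI.item_scraped() posts it to Kingfisher Process unless post_to_api is False.

import scrapy

from kingfisher_scrapy.base_spider import BaseSpider


class ExampleSpider(BaseSpider):
    name = 'example'

    def start_requests(self):
        # Hypothetical source URL, for illustration only.
        yield scrapy.Request('https://example.com/releases.json')

    def parse(self, response):
        if response.status == 200:
            # Returns a plain dict item. KingfisherFilesStore.item_scraped() writes
            # <FILES_STORE>/example/<crawl start time>/releases.json and a .fileinfo file,
            # then KingfisherAPI.item_scraped() posts it (skipped when post_to_api=False).
            yield self.save_response_to_disk(response, 'releases.json', data_type='release_package')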