diff --git a/docs/writing-spiders.rst b/docs/writing-spiders.rst index 5252ee0d..a41487df 100644 --- a/docs/writing-spiders.rst +++ b/docs/writing-spiders.rst @@ -66,7 +66,7 @@ Here is a sample: @handle_error def parse(self, response): - yield self.build_file_from_response(response, response.request.meta['kf_filename'], data_type='release_package') + yield self.build_file_from_response(response, data_type='release_package') Spider properties ----------------- diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py index 87f7703c..98cc704d 100644 --- a/kingfisher_scrapy/base_spider.py +++ b/kingfisher_scrapy/base_spider.py @@ -99,20 +99,19 @@ def get_start_time(self, format): """ return self.crawler.stats.get_value('start_time').strftime(format) - def build_file_from_response(self, response, file_name=None, url=None, data=None, data_type=None, encoding='utf-8', - post_to_api=True): + def build_file_from_response(self, response, **kwargs): """ Returns an item to yield, based on the response to a request. """ - if not file_name: - file_name = response.request.meta['kf_filename'] - if not url: - url = response.request.url - if not data: - data = response.body - return self.build_file(data, file_name, url, data_type, encoding, post_to_api) - - def build_file(self, data, file_name, url=None, data_type=None, encoding='utf-8', post_to_api=True): + if 'file_name' not in kwargs: + kwargs['file_name'] = response.request.meta['kf_filename'] + if 'url' not in kwargs: + kwargs['url'] = response.request.url + if 'data' not in kwargs: + kwargs['data'] = response.body + return self.build_file(**kwargs) + + def build_file(self, *, file_name=None, url=None, data=None, data_type=None, encoding='utf-8', post_to_api=True): """ Returns an item to yield. """ @@ -125,7 +124,7 @@ def build_file(self, data, file_name, url=None, data_type=None, encoding='utf-8' 'post_to_api': post_to_api, }) - def build_file_item(self, number, data, data_type, url, encoding, file_name): + def build_file_item(self, *, number=None, file_name=None, url=None, data=None, data_type=None, encoding='utf-8'): return FileItem({ 'number': number, 'file_name': file_name, @@ -165,7 +164,8 @@ def parse_json_lines(self, f, data_type, url, encoding='utf-8', file_name='data. break if isinstance(line, bytes): line = line.decode(encoding=encoding) - yield self.build_file_item(number, line, data_type, url, encoding, file_name) + yield self.build_file_item(number=number, file_name=file_name, url=url, data=line, data_type=data_type, + encoding=encoding) def parse_json_array(self, f_package, f_list, data_type, url, encoding='utf-8', array_field_name='releases', file_name='data.json'): @@ -179,7 +179,8 @@ def parse_json_array(self, f_package, f_list, data_type, url, encoding='utf-8', for number, items in enumerate(util.grouper(ijson.items(f_list, '{}.item'.format(array_field_name)), size), 1): package[array_field_name] = filter(None, items) data = json.dumps(package, default=util.default) - yield self.build_file_item(number, data, data_type, url, encoding, file_name) + yield self.build_file_item(number=number, file_name=file_name, url=url, data=data, data_type=data_type, + encoding=encoding) if self.sample: break @@ -236,7 +237,7 @@ def parse_zipfile(self, response, data_type, file_format=None, encoding='utf-8') """ if file_format: filename = '{}.zip'.format(hashlib.md5(response.url.encode('utf-8')).hexdigest()) - self.build_file_from_response(response, filename, post_to_api=False) + self.build_file_from_response(response, file_name=filename, post_to_api=False) zip_file = ZipFile(BytesIO(response.body)) for finfo in zip_file.infolist(): @@ -254,8 +255,8 @@ def parse_zipfile(self, response, data_type, file_format=None, encoding='utf-8') yield from self.parse_json_array(package, data, data_type, response.request.url, encoding=encoding, file_name=filename) else: - yield self.build_file(data.read(), filename, data_type=data_type, url=response.request.url, - encoding=encoding) + yield self.build_file(file_name=filename, data=data.read(), url=response.request.url, + data_type=data_type, encoding=encoding) class LinksSpider(BaseSpider): @@ -286,7 +287,7 @@ def start_requests(self): @handle_error def parse(self, response): - yield self.build_file_from_response(response, response.request.meta['kf_filename'], data_type=self.data_type) + yield self.build_file_from_response(response, data_type=self.data_type) if not self.sample: yield self.next_link(response) diff --git a/kingfisher_scrapy/spiders/dominican_republic.py b/kingfisher_scrapy/spiders/dominican_republic.py index 7962a749..e57e8de7 100644 --- a/kingfisher_scrapy/spiders/dominican_republic.py +++ b/kingfisher_scrapy/spiders/dominican_republic.py @@ -41,8 +41,8 @@ def parse(self, response): with rarfile.RarFile(file.name, charset='utf-8') as tmpfile: for f in tmpfile.infolist(): with tmpfile.open(f) as jsonFile: - yield self.build_file(jsonFile.read(), f.filename, data_type='release_package', - url=response.request.url) + yield self.build_file(file_name=f.filename, url=response.request.url, data=jsonFile.read(), + data_type='release_package') os.remove(file.name) else: filename = response.request.url.split('/')[-1] diff --git a/kingfisher_scrapy/spiders/openopps.py b/kingfisher_scrapy/spiders/openopps.py index 1ee7dc69..3179f006 100644 --- a/kingfisher_scrapy/spiders/openopps.py +++ b/kingfisher_scrapy/spiders/openopps.py @@ -164,9 +164,9 @@ def parse(self, response): if all_data: yield self.build_file( - all_data, file_name=hashlib.md5(response.request.url.encode('utf-8')).hexdigest() + '.json', url=response.request.url, + data=all_data, data_type='release_package_list' ) if self.sample: diff --git a/tests/test_base_spider.py b/tests/test_base_spider.py index e80295db..61db4438 100644 --- a/tests/test_base_spider.py +++ b/tests/test_base_spider.py @@ -45,7 +45,8 @@ def test_build_file_from_response(): response.request = Mock() response.request.url = 'https://example.com/remote.json' - actual = spider.build_file_from_response(response, 'file.json', data_type='release_package', encoding='iso-8859-1') + actual = spider.build_file_from_response(response, file_name='file.json', data_type='release_package', + encoding='iso-8859-1') assert actual == File({ 'file_name': 'file.json', @@ -63,7 +64,8 @@ def test_build_file(): data = b'{"key": "value"}' url = 'https://example.com/remote.json' - actual = spider.build_file(data, 'file.json', url=url, data_type='release_package', encoding='iso-8859-1') + actual = spider.build_file(file_name='file.json', url=url, data=data, data_type='release_package', + encoding='iso-8859-1') assert actual == File({ 'file_name': 'file.json', diff --git a/tests/test_extensions.py b/tests/test_extensions.py index 33200143..cb51cdfc 100644 --- a/tests/test_extensions.py +++ b/tests/test_extensions.py @@ -69,7 +69,7 @@ def test_item_scraped_file(sample, is_sample, path, note, encoding, encoding2, d kwargs = {} if encoding: kwargs['encoding'] = encoding - item = spider.build_file(b'{"key": "value"}', 'file.json', url='https://example.com/remote.json', + item = spider.build_file(file_name='file.json', url='https://example.com/remote.json', data=b'{"key": "value"}', data_type='release_package', post_to_api=post_to_api, **kwargs) store_extension.item_scraped(item, spider) @@ -146,12 +146,12 @@ def test_item_scraped_file_item(sample, is_sample, note, encoding, encoding2, ok if encoding: kwargs['encoding'] = encoding item = spider.build_file_item( - 1, - b'{"key": "value"}', - data_type='release_package', + number=1, + file_name='data.json', url='https://example.com/remote.json', + data=b'{"key": "value"}', + data_type='release_package', encoding=encoding2, - file_name='data.json', ) api_extension.item_scraped(item, spider) @@ -294,7 +294,8 @@ def test_item_scraped_with_build_file_from_response(sample, path, tmpdir): response.request = Mock() response.request.url = 'https://example.com/remote.json' - item = spider.build_file_from_response(response, 'file.json', data_type='release_package', encoding='iso-8859-1') + item = spider.build_file_from_response(response, file_name='file.json', data_type='release_package', + encoding='iso-8859-1') store_extension.item_scraped(item, spider) with open(tmpdir.join(path)) as f: @@ -322,7 +323,8 @@ def test_item_scraped_with_build_file(sample, path, tmpdir): data = b'{"key": "value"}' url = 'https://example.com/remote.json' - item = spider.build_file(data, 'file.json', url=url, data_type='release_package', encoding='iso-8859-1') + item = spider.build_file(file_name='file.json', url=url, data=data, data_type='release_package', + encoding='iso-8859-1') store_extension.item_scraped(item, spider) with open(tmpdir.join(path)) as f: @@ -349,4 +351,4 @@ def test_build_file_with_existing_directory(): os.makedirs(os.path.join(files_store, 'test', '20010203_040506')) # No FileExistsError exception. - store_extension.item_scraped(spider.build_file(b'{"key": "value"}', 'file.json'), spider) + store_extension.item_scraped(spider.build_file(file_name='file.json', data=b'{"key": "value"}'), spider)