diff --git a/.gitignore b/.gitignore
index dd1f366a..dea9f7a8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,5 +9,5 @@ venv/
 /data
 /docs/_build
 /htmlcov
-/pluck.csv
+/pluck-*.csv
 /pluck_skipped.json
diff --git a/kingfisher_scrapy/commands/pluck.py b/kingfisher_scrapy/commands/pluck.py
index c1543335..69097b1e 100644
--- a/kingfisher_scrapy/commands/pluck.py
+++ b/kingfisher_scrapy/commands/pluck.py
@@ -8,6 +8,8 @@
 from scrapy.crawler import CrawlerProcess
 from scrapy.exceptions import UsageError
 
+from kingfisher_scrapy.util import _pluck_filename
+
 logger = logging.getLogger(__name__)
 
 
@@ -33,8 +35,9 @@ def run(self, args, opts):
         # Limit concurrent requests, to download the minimum.
         self.settings.set('CONCURRENT_REQUESTS', 1)
 
-        if os.path.isfile('pluck.csv'):
-            os.unlink('pluck.csv')
+        filename = _pluck_filename(opts)
+        if os.path.isfile(filename):
+            os.unlink(filename)
 
         runner = CrawlerProcess(settings=self.settings)
diff --git a/kingfisher_scrapy/extensions.py b/kingfisher_scrapy/extensions.py
index ffd322dd..3a301c7c 100644
--- a/kingfisher_scrapy/extensions.py
+++ b/kingfisher_scrapy/extensions.py
@@ -9,20 +9,20 @@
 from kingfisher_scrapy.items import File, FileError, FileItem, PluckedItem
 from kingfisher_scrapy.kingfisher_process import Client
+from kingfisher_scrapy.util import _pluck_filename
 
 
 # https://docs.scrapy.org/en/latest/topics/extensions.html#writing-your-own-extension
 class KingfisherPluck:
-    def __init__(self, filename):
-        self.filename = filename
+    def __init__(self, directory):
+        self.directory = directory
         self.spiders_seen = set()
 
     @classmethod
     def from_crawler(cls, crawler):
-        path = crawler.settings['KINGFISHER_PLUCK_PATH']
-        filename = os.path.join(path, 'pluck.csv')
+        directory = crawler.settings['KINGFISHER_PLUCK_PATH']
 
-        extension = cls(filename=filename)
+        extension = cls(directory=directory)
         crawler.signals.connect(extension.item_scraped, signal=signals.item_scraped)
         crawler.signals.connect(extension.spider_closed, signal=signals.spider_closed)
@@ -33,15 +33,18 @@ def item_scraped(self, item, spider):
             return
 
         self.spiders_seen.add(spider.name)
-        with open(self.filename, 'a+') as output:
-            output.write(f"{item['value']},{spider.name}\n")
+
+        self._write(spider, item['value'])
 
     def spider_closed(self, spider, reason):
         if not spider.pluck or spider.name in self.spiders_seen:
             return
 
-        with open(self.filename, 'a+') as output:
-            output.write(f"{reason},{spider.name}\n")
+        self._write(spider, reason)
+
+    def _write(self, spider, value):
+        with open(os.path.join(self.directory, _pluck_filename(spider)), 'a+') as f:
+            f.write(f'{value},{spider.name}\n')
 
 
 class KingfisherFilesStore:
diff --git a/kingfisher_scrapy/util.py b/kingfisher_scrapy/util.py
index 255f861e..5bbcd25d 100644
--- a/kingfisher_scrapy/util.py
+++ b/kingfisher_scrapy/util.py
@@ -8,6 +8,15 @@
 from ijson import ObjectBuilder, utils
 
 
+def _pluck_filename(opts):
+    if opts.package_pointer:
+        parts = ['pluck', 'package', opts.package_pointer[1:].replace('/', '-')]
+    else:
+        parts = ['pluck', 'release', opts.release_pointer[1:].replace('/', '-')]
+
+    return f"{'-'.join(parts)}.csv"
+
+
 def components(start, stop=None):
     """
     Returns a function that returns the selected non-empty path components, excluding the ``.json`` extension.
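Note on the new helper (not part of the patch): _pluck_filename joins 'pluck', the pointer kind, and the dash-escaped pointer path, so a release pointer of /date yields pluck-release-date.csv, and the extension passes the spider itself, so anything exposing package_pointer and release_pointer attributes works. A minimal sketch of the mapping, using SimpleNamespace stand-ins for the parsed options; the /publishedDate value is illustrative, not taken from the patch:

    from types import SimpleNamespace

    from kingfisher_scrapy.util import _pluck_filename

    # Stand-ins for the parsed command-line options (or a spider with the same attributes).
    release_opts = SimpleNamespace(package_pointer=None, release_pointer='/date')
    package_opts = SimpleNamespace(package_pointer='/publishedDate', release_pointer=None)

    assert _pluck_filename(release_opts) == 'pluck-release-date.csv'
    assert _pluck_filename(package_opts) == 'pluck-package-publishedDate.csv'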
diff --git a/tests/extensions/test_kingfisher_pluck.py b/tests/extensions/test_kingfisher_pluck.py
index 84744364..75436c13 100644
--- a/tests/extensions/test_kingfisher_pluck.py
+++ b/tests/extensions/test_kingfisher_pluck.py
@@ -1,4 +1,5 @@
 import os
+from glob import glob
 from tempfile import TemporaryDirectory
 
 from kingfisher_scrapy.extensions import KingfisherPluck
@@ -17,7 +18,7 @@ def test_disabled():
         extension.item_scraped(item, spider)
         extension.spider_closed(spider, 'itemcount')
 
-        assert not os.path.exists(os.path.join(tmpdirname, 'pluck.csv'))
+        assert not glob(os.path.join(tmpdirname, 'pluck*.csv'))
 
 
 def test_item_scraped():
@@ -30,13 +31,13 @@ def test_item_scraped():
         extension.item_scraped(item, spider)
 
-        with open(os.path.join(tmpdirname, 'pluck.csv')) as f:
+        with open(os.path.join(tmpdirname, 'pluck-release-date.csv')) as f:
             assert '2020-10-01,test\n' == f.read()
 
         # Only one item from the same spider is written.
         extension.item_scraped(item, spider)
 
-        with open(os.path.join(tmpdirname, 'pluck.csv')) as f:
+        with open(os.path.join(tmpdirname, 'pluck-release-date.csv')) as f:
             assert '2020-10-01,test\n' == f.read()
 
         # An item from another spider is appended.
@@ -44,7 +45,7 @@
         item['value'] = '2020-10-02'
         extension.item_scraped(item, spider)
 
-        with open(os.path.join(tmpdirname, 'pluck.csv')) as f:
+        with open(os.path.join(tmpdirname, 'pluck-release-date.csv')) as f:
             assert '2020-10-01,test\n2020-10-02,other\n' == f.read()
 
@@ -59,7 +60,7 @@ def test_spider_closed_with_items():
         extension.item_scraped(item, spider)
         extension.spider_closed(spider, 'itemcount')
 
-        with open(os.path.join(tmpdirname, 'pluck.csv')) as f:
+        with open(os.path.join(tmpdirname, 'pluck-release-date.csv')) as f:
             assert '2020-10-01,test\n' == f.read()
 
@@ -72,5 +73,5 @@
 
         extension.spider_closed(spider, 'itemcount')
 
-        with open(os.path.join(tmpdirname, 'pluck.csv')) as f:
+        with open(os.path.join(tmpdirname, 'pluck-release-date.csv')) as f:
             assert 'itemcount,test\n' == f.read()
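For context on why the tests now read pluck-release-date.csv: the extension derives the filename from the spider's pointer attributes, so the test spider presumably sets release_pointer='/date'. A rough sketch of the write path under that assumption, using a SimpleNamespace in place of the test suite's own spider fixture:

    import os
    from tempfile import TemporaryDirectory
    from types import SimpleNamespace

    from kingfisher_scrapy.extensions import KingfisherPluck
    from kingfisher_scrapy.items import PluckedItem

    with TemporaryDirectory() as tmpdirname:
        extension = KingfisherPluck(directory=tmpdirname)
        # Stand-in spider: only the attributes the extension reads are set here.
        spider = SimpleNamespace(name='test', pluck=True, package_pointer=None, release_pointer='/date')

        extension.item_scraped(PluckedItem({'value': '2020-10-01'}), spider)

        # The value and spider name land in the pointer-specific CSV.
        with open(os.path.join(tmpdirname, 'pluck-release-date.csv')) as f:
            assert f.read() == '2020-10-01,test\n'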