Name the pluck CSV file according to the passed arguments
jpmckinney committed Jul 27, 2020
1 parent fe8efd9 commit 7422426
Showing 5 changed files with 34 additions and 18 deletions.
.gitignore (2 changes: 1 addition & 1 deletion)
@@ -9,5 +9,5 @@ venv/
 /data
 /docs/_build
 /htmlcov
-/pluck.csv
+/pluck-*.csv
 /pluck_skipped.json

kingfisher_scrapy/commands/pluck.py (7 changes: 5 additions & 2 deletions)
@@ -8,6 +8,8 @@
 from scrapy.crawler import CrawlerProcess
 from scrapy.exceptions import UsageError
 
+from kingfisher_scrapy.util import _pluck_filename
+
 logger = logging.getLogger(__name__)

@@ -33,8 +35,9 @@ def run(self, args, opts):
         # Limit concurrent requests, to download the minimum.
         self.settings.set('CONCURRENT_REQUESTS', 1)
 
-        if os.path.isfile('pluck.csv'):
-            os.unlink('pluck.csv')
+        filename = _pluck_filename(opts)
+        if os.path.isfile(filename):
+            os.unlink(filename)
 
         runner = CrawlerProcess(settings=self.settings)
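
The command now derives the output filename from the pointer options instead of hard-coding pluck.csv, so a run only removes the stale file for the pointer it is about to pluck. A sketch of the new pre-crawl cleanup in isolation; the opts object is a hypothetical stand-in for the parsed command-line options, of which only package_pointer and release_pointer are read:

    import os
    from types import SimpleNamespace

    from kingfisher_scrapy.util import _pluck_filename

    # As if the command were invoked with a release pointer of /date.
    opts = SimpleNamespace(package_pointer=None, release_pointer='/date')

    filename = _pluck_filename(opts)  # 'pluck-release-date.csv'
    if os.path.isfile(filename):
        os.unlink(filename)  # remove output from a previous run of the same pluck
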
kingfisher_scrapy/extensions.py (21 changes: 12 additions & 9 deletions)
@@ -9,20 +9,20 @@
 
 from kingfisher_scrapy.items import File, FileError, FileItem, PluckedItem
 from kingfisher_scrapy.kingfisher_process import Client
+from kingfisher_scrapy.util import _pluck_filename
 
 
 # https://docs.scrapy.org/en/latest/topics/extensions.html#writing-your-own-extension
 class KingfisherPluck:
-    def __init__(self, filename):
-        self.filename = filename
+    def __init__(self, directory):
+        self.directory = directory
         self.spiders_seen = set()
 
     @classmethod
     def from_crawler(cls, crawler):
-        path = crawler.settings['KINGFISHER_PLUCK_PATH']
-        filename = os.path.join(path, 'pluck.csv')
+        directory = crawler.settings['KINGFISHER_PLUCK_PATH']
 
-        extension = cls(filename=filename)
+        extension = cls(directory=directory)
         crawler.signals.connect(extension.item_scraped, signal=signals.item_scraped)
         crawler.signals.connect(extension.spider_closed, signal=signals.spider_closed)

@@ -33,15 +33,18 @@ def item_scraped(self, item, spider):
             return
 
         self.spiders_seen.add(spider.name)
-        with open(self.filename, 'a+') as output:
-            output.write(f"{item['value']},{spider.name}\n")
+
+        self._write(spider, item['value'])
 
     def spider_closed(self, spider, reason):
         if not spider.pluck or spider.name in self.spiders_seen:
             return
 
-        with open(self.filename, 'a+') as output:
-            output.write(f"{reason},{spider.name}\n")
+        self._write(spider, reason)
+
+    def _write(self, spider, value):
+        with open(os.path.join(self.directory, _pluck_filename(spider)), 'a+') as f:
+            f.write(f'{value},{spider.name}\n')
 
 
 class KingfisherFilesStore:
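
Both signal handlers now delegate to _write, which joins the configured directory with the derived filename and appends one "value,spider-name" row. A minimal sketch of that path outside a real crawl; passing the spider to _pluck_filename implies spiders expose the same package_pointer and release_pointer attributes as the parsed options, so the stand-in below assumes exactly that:

    from types import SimpleNamespace

    from kingfisher_scrapy.extensions import KingfisherPluck

    extension = KingfisherPluck(directory='/tmp')
    # Stand-in spider with the attributes that _write and _pluck_filename read.
    spider = SimpleNamespace(name='test', package_pointer=None, release_pointer='/date')

    extension._write(spider, '2020-10-01')
    # /tmp/pluck-release-date.csv now ends with the row: 2020-10-01,test
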
kingfisher_scrapy/util.py (9 changes: 9 additions & 0 deletions)
@@ -8,6 +8,15 @@
 from ijson import ObjectBuilder, utils
 
 
+def _pluck_filename(opts):
+    if opts.package_pointer:
+        parts = ['pluck', 'package', opts.package_pointer[1:].replace('/', '-')]
+    else:
+        parts = ['pluck', 'release', opts.release_pointer[1:].replace('/', '-')]
+
+    return f"{'-'.join(parts)}.csv"
+
+
 def components(start, stop=None):
     """
     Returns a function that returns the selected non-empty path components, excluding the ``.json`` extension.
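
A quick check of the filenames this helper derives: the leading slash of the JSON Pointer is dropped and any remaining slashes become hyphens. The release-pointer case below yields the filename the updated tests expect; the nested package pointer is a hypothetical example:

    from types import SimpleNamespace

    from kingfisher_scrapy.util import _pluck_filename

    opts = SimpleNamespace(package_pointer=None, release_pointer='/date')
    print(_pluck_filename(opts))  # pluck-release-date.csv

    opts = SimpleNamespace(package_pointer='/publisher/name', release_pointer=None)
    print(_pluck_filename(opts))  # pluck-package-publisher-name.csv
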
tests/extensions/test_kingfisher_pluck.py (13 changes: 7 additions & 6 deletions)
@@ -1,4 +1,5 @@
 import os
+from glob import glob
 from tempfile import TemporaryDirectory
 
 from kingfisher_scrapy.extensions import KingfisherPluck
@@ -17,7 +18,7 @@ def test_disabled():
     extension.item_scraped(item, spider)
     extension.spider_closed(spider, 'itemcount')
 
-    assert not os.path.exists(os.path.join(tmpdirname, 'pluck.csv'))
+    assert not glob(os.path.join(tmpdirname, 'pluck*.csv'))

def test_item_scraped():
Expand All @@ -30,21 +31,21 @@ def test_item_scraped():

extension.item_scraped(item, spider)

with open(os.path.join(tmpdirname, 'pluck.csv')) as f:
with open(os.path.join(tmpdirname, 'pluck-release-date.csv')) as f:
assert '2020-10-01,test\n' == f.read()

# Only one item from the same spider is written.
extension.item_scraped(item, spider)

with open(os.path.join(tmpdirname, 'pluck.csv')) as f:
with open(os.path.join(tmpdirname, 'pluck-release-date.csv')) as f:
assert '2020-10-01,test\n' == f.read()

# An item from another spider is appended.
spider.name = 'other'
item['value'] = '2020-10-02'
extension.item_scraped(item, spider)

with open(os.path.join(tmpdirname, 'pluck.csv')) as f:
with open(os.path.join(tmpdirname, 'pluck-release-date.csv')) as f:
assert '2020-10-01,test\n2020-10-02,other\n' == f.read()


@@ -59,7 +60,7 @@ def test_spider_closed_with_items():
     extension.item_scraped(item, spider)
     extension.spider_closed(spider, 'itemcount')
 
-    with open(os.path.join(tmpdirname, 'pluck.csv')) as f:
+    with open(os.path.join(tmpdirname, 'pluck-release-date.csv')) as f:
         assert '2020-10-01,test\n' == f.read()

@@ -72,5 +73,5 @@ def test_spider_closed_without_items():
 
     extension.spider_closed(spider, 'itemcount')
 
-    with open(os.path.join(tmpdirname, 'pluck.csv')) as f:
+    with open(os.path.join(tmpdirname, 'pluck-release-date.csv')) as f:
         assert 'itemcount,test\n' == f.read()
