
Commit 1e27d60

Merge fe8efd9 into f010dfb
jpmckinney committed Jul 26, 2020
2 parents f010dfb + fe8efd9 commit 1e27d60
Showing 24 changed files with 406 additions and 264 deletions.
4 changes: 2 additions & 2 deletions .gitignore
@@ -9,5 +9,5 @@ venv/
/data
/docs/_build
/htmlcov
/latestreleasedate.csv
/latestreleasedate_skipped.json
/pluck.csv
/pluck_skipped.json
29 changes: 19 additions & 10 deletions kingfisher_scrapy/base_spider.py
@@ -63,28 +63,36 @@ class BaseSpider(scrapy.Spider):
    ocds_version = '1.1'
    date_format = 'date'

    def __init__(self, sample=None, note=None, from_date=None, until_date=None, latest=None,
                 crawl_time=None, keep_collection_open=None, *args, **kwargs):
    def __init__(self, sample=None, note=None, from_date=None, until_date=None, crawl_time=None,
                 keep_collection_open=None, package_pointer=None, release_pointer=None, truncate=None, *args,
                 **kwargs):
        super().__init__(*args, **kwargs)

        # https://docs.scrapy.org/en/latest/topics/spiders.html#spider-arguments
        self.sample = sample == 'true'
        self.note = note
        self.from_date = from_date
        self.until_date = until_date
        self.date_format = self.VALID_DATE_FORMATS[self.date_format]
        self.latest = latest == 'true'
        self.crawl_time = crawl_time
        self.keep_collection_open = keep_collection_open == 'true'
        # Pluck-related arguments.
        self.package_pointer = package_pointer
        self.release_pointer = release_pointer
        self.truncate = int(truncate) if truncate else None

        self.date_format = self.VALID_DATE_FORMATS[self.date_format]
        self.pluck = bool(package_pointer or release_pointer)

        spider_arguments = {
            'sample': sample,
            'note': note,
            'from_date': from_date,
            'until_date': until_date,
            'latest': latest,
            'crawl_time': crawl_time,
            'keep_collection_open': keep_collection_open,
            'package_pointer': package_pointer,
            'release_pointer': release_pointer,
            'truncate': truncate,
        }
        spider_arguments.update(kwargs)
        self.logger.info('Spider arguments: {!r}'.format(spider_arguments))
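
The new package_pointer, release_pointer and truncate arguments arrive as strings, like all Scrapy spider arguments; truncate is cast to an int and pluck is derived from whichever pointer is set. A rough sketch of the resulting attributes (not part of the commit), assuming VALID_DATE_FORMATS (defined elsewhere in the class) contains the default 'date' key and using a hypothetical spider name and pointer:

from kingfisher_scrapy.base_spider import BaseSpider

# Instantiated directly only for illustration; Scrapy normally builds spiders
# via from_crawler and passes -a arguments as keyword arguments like these.
spider = BaseSpider(name='example', package_pointer='/publishedDate', truncate='10')
assert spider.pluck is True           # a pointer was given
assert spider.truncate == 10          # cast from the string '10'
assert spider.release_pointer is None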
@@ -93,13 +101,14 @@ def __init__(self, sample=None, note=None, from_date=None, until_date=None, late
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(BaseSpider, cls).from_crawler(crawler, *args, **kwargs)

        if spider.package_pointer and spider.release_pointer:
            raise SpiderArgumentError('You cannot specify both package_pointer and release_pointer spider arguments.')

        if spider.crawl_time:
            try:
                spider.crawl_time = datetime.strptime(spider.crawl_time,
                                                      '%Y-%m-%dT%H:%M:%S')
                spider.crawl_time = datetime.strptime(spider.crawl_time, '%Y-%m-%dT%H:%M:%S')
            except ValueError as e:
                raise SpiderArgumentError('spider argument crawl_time: '
                                          'invalid date value: {}'.format(e))
                raise SpiderArgumentError('spider argument crawl_time: invalid date value: {}'.format(e))

        # Checks Spider date ranges arguments
        if spider.from_date or spider.until_date:
@@ -340,7 +349,7 @@ def start_requests(self):
"""

encoding = 'utf-8'
skip_latest_release_date = "This command doesn't yet support identifying the latest release in a archive file."
skip_pluck = 'Archive files are not supported'
compressed_file_format = None
archive_format = 'zip'
file_name_must_contain = ''
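
skip_pluck is the opt-out hook for the new command: any spider class that defines it is skipped by scrapy pluck, and its reason string is grouped in pluck_skipped.json (see commands/pluck.py below). A minimal, hypothetical example of a spider opting out:

from kingfisher_scrapy.base_spider import BaseSpider

class ExampleArchiveSpider(BaseSpider):
    # Hypothetical spider; the attribute's value is a human-readable reason.
    name = 'example_archive'
    skip_pluck = 'Archive files are not supported'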
2 changes: 1 addition & 1 deletion kingfisher_scrapy/commands/dryrun.py
@@ -17,8 +17,8 @@ def run(self, args, opts):
        CompressedFileSpider.parse = yield_nothing

        # Stop after one item or error.
        self.settings.set('CLOSESPIDER_ERRORCOUNT', 1)
        self.settings.set('CLOSESPIDER_ITEMCOUNT', 1)
        self.settings.set('CLOSESPIDER_ERRORCOUNT', 1)
        # Disable LogStats extension.
        self.settings.set('LOGSTATS_INTERVAL', None)
        # Disable custom and Telnet extensions.
41 changes: 0 additions & 41 deletions kingfisher_scrapy/commands/latestreleasedate.py

This file was deleted.

58 changes: 58 additions & 0 deletions kingfisher_scrapy/commands/pluck.py
@@ -0,0 +1,58 @@
import json
import logging
import os
from collections import defaultdict
from datetime import datetime

from scrapy.commands import ScrapyCommand
from scrapy.crawler import CrawlerProcess
from scrapy.exceptions import UsageError

logger = logging.getLogger(__name__)


class Pluck(ScrapyCommand):
    def short_desc(self):
        return 'Pluck one data value per publisher'

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option('-p', '--package-pointer', help='The JSON Pointer to the value in the package')
        parser.add_option('-r', '--release-pointer', help='The JSON Pointer to the value in the release')
        parser.add_option('-t', '--truncate', type=int, help='Truncate the value to this number of characters')

    def run(self, args, opts):
        if not (bool(opts.package_pointer) ^ bool(opts.release_pointer)):
            raise UsageError('Exactly one of --package-pointer or --release-pointer must be set.')

        # Stop after one item or error.
        self.settings.set('CLOSESPIDER_ITEMCOUNT', 1)
        self.settings.set('CLOSESPIDER_ERRORCOUNT', 1)
        # Disable LogStats extension.
        self.settings.set('LOGSTATS_INTERVAL', None)
        # Limit concurrent requests, to download the minimum.
        self.settings.set('CONCURRENT_REQUESTS', 1)

        if os.path.isfile('pluck.csv'):
            os.unlink('pluck.csv')

        runner = CrawlerProcess(settings=self.settings)

        year = datetime.today().year
        skipped = defaultdict(list)
        running = []
        for spider_name in runner.spider_loader.list():
            if spider_name != 'test_fail':
                spidercls = runner.spider_loader.load(spider_name)
                if hasattr(spidercls, 'skip_pluck'):
                    skipped[spidercls.skip_pluck].append(spider_name)
                else:
                    running.append(spider_name)
                    runner.crawl(spidercls, year=year, package_pointer=opts.package_pointer,
                                 release_pointer=opts.release_pointer, truncate=opts.truncate)

        with open('pluck_skipped.json', 'w') as f:
            json.dump(skipped, f, indent=2)

        logger.info(f"Running {len(running)} spiders: {', '.join(sorted(running))}")
        runner.start()
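
Assuming the project's settings already register custom commands (they did for the removed latestreleasedate command), illustrative invocations from the repository root would look like:

scrapy pluck --package-pointer /publishedDate
scrapy pluck -r /date -t 10

Exactly one of the two pointer options is required; plucked values accumulate in pluck.csv via the KingfisherPluck extension below, and skipped spiders are recorded, grouped by reason, in pluck_skipped.json.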
18 changes: 9 additions & 9 deletions kingfisher_scrapy/extensions.py
@@ -7,20 +7,20 @@
from scrapy import signals
from scrapy.exceptions import NotConfigured

from kingfisher_scrapy.items import File, FileError, FileItem, LatestReleaseDateItem
from kingfisher_scrapy.items import File, FileError, FileItem, PluckedItem
from kingfisher_scrapy.kingfisher_process import Client


# https://docs.scrapy.org/en/latest/topics/extensions.html#writing-your-own-extension
class KingfisherLatestDate:
class KingfisherPluck:
    def __init__(self, filename):
        self.filename = filename
        self.spiders_seen = set()

    @classmethod
    def from_crawler(cls, crawler):
        path = crawler.settings['KINGFISHER_LATEST_RELEASE_DATE_PATH']
        filename = os.path.join(path, 'latestreleasedate.csv')
        path = crawler.settings['KINGFISHER_PLUCK_PATH']
        filename = os.path.join(path, 'pluck.csv')

        extension = cls(filename=filename)
        crawler.signals.connect(extension.item_scraped, signal=signals.item_scraped)
@@ -29,15 +29,15 @@ def from_crawler(cls, crawler):
        return extension

    def item_scraped(self, item, spider):
        if not spider.latest or spider.name in self.spiders_seen or not isinstance(item, LatestReleaseDateItem):
        if not spider.pluck or spider.name in self.spiders_seen or not isinstance(item, PluckedItem):
            return

        self.spiders_seen.add(spider.name)
        with open(self.filename, 'a+') as output:
            output.write(f"{item['date']},{spider.name}\n")
            output.write(f"{item['value']},{spider.name}\n")

    def spider_closed(self, spider, reason):
        if not spider.latest or spider.name in self.spiders_seen:
        if not spider.pluck or spider.name in self.spiders_seen:
            return

        with open(self.filename, 'a+') as output:
@@ -128,7 +128,7 @@ def spider_closed(self, spider, reason):
        Sends an API request to end the collection's store step.
        """
        # https://docs.scrapy.org/en/latest/topics/signals.html#spider-closed
        if reason != 'finished' or spider.latest or spider.keep_collection_open:
        if reason != 'finished' or spider.pluck or spider.keep_collection_open:
            return

        response = self.client.end_collection_store({
@@ -146,7 +146,7 @@ def item_scraped(self, item, spider):
        Sends an API request to store the file, file item or file error in Kingfisher Process.
        """

        if not item.get('post_to_api', True) or isinstance(item, LatestReleaseDateItem):
        if not item.get('post_to_api', True) or isinstance(item, PluckedItem):
            return

        data = {
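
The renamed KingfisherPluck extension writes at most one line per spider to pluck.csv — the plucked value followed by the spider name — in the directory given by the KINGFISHER_PLUCK_PATH setting. A minimal sketch of redirecting that output, assuming an otherwise default configuration (the directory shown is hypothetical):

# In the project's settings.py, or passed with -s on the command line:
KINGFISHER_PLUCK_PATH = '/tmp/kingfisher'  # the extension then writes /tmp/kingfisher/pluck.csv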
4 changes: 2 additions & 2 deletions kingfisher_scrapy/items.py
@@ -33,5 +33,5 @@ class FileError(KingfisherItem):
    errors = scrapy.Field()


class LatestReleaseDateItem(scrapy.Item):
    date = scrapy.Field()
class PluckedItem(scrapy.Item):
    value = scrapy.Field()
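
PluckedItem carries a single value field — whatever a spider extracted at the configured JSON Pointer. A rough sketch of constructing one from a parsed package, using a simplified, hypothetical pointer walker (the repository's real pointer-resolution code is not part of this excerpt):

from kingfisher_scrapy.items import PluckedItem

def walk_pointer(data, pointer):
    # Simplified JSON Pointer lookup: follows each '/'-separated token into a dict.
    for token in pointer.lstrip('/').split('/'):
        data = data[token]
    return data

package = {'publishedDate': '2020-06-30T00:00:00Z', 'releases': []}
item = PluckedItem(value=walk_pointer(package, '/publishedDate'))
# The KingfisherPluck extension would then append '2020-06-30T00:00:00Z,<spider name>' to pluck.csv.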
