Commit b2f8ea5

Merge 2c92305 into c6c1880
yolile committed Jul 16, 2020
2 parents c6c1880 + 2c92305 commit b2f8ea5
Showing 25 changed files with 257 additions and 24 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -9,3 +9,4 @@ venv/
/data
/docs/_build
/htmlcov
/latestreleasedate
5 changes: 4 additions & 1 deletion kingfisher_scrapy/base_spider.py
@@ -47,7 +47,7 @@ class BaseSpider(scrapy.Spider):
VALID_DATE_FORMATS = {'date': '%Y-%m-%d', 'datetime': '%Y-%m-%dT%H:%M:%S', 'year-month': '%Y-%m'}

def __init__(self, sample=None, note=None, from_date=None, until_date=None,
date_format='date', *args, **kwargs):
date_format='date', latest=None, *args, **kwargs):
super().__init__(*args, **kwargs)

# https://docs.scrapy.org/en/latest/topics/spiders.html#spider-arguments
@@ -56,12 +56,14 @@ def __init__(self, sample=None, note=None, from_date=None, until_date=None,
self.from_date = from_date
self.until_date = until_date
self.date_format = self.VALID_DATE_FORMATS[date_format]
self.latest = latest == 'true'

spider_arguments = {
'sample': sample,
'note': note,
'from_date': from_date,
'until_date': until_date,
'latest': latest,
}
spider_arguments.update(kwargs)
self.logger.info('Spider arguments: {!r}'.format(spider_arguments))
@@ -305,6 +307,7 @@ def start_requests(self):
"""

encoding = 'utf-8'
skip_latest_release_date = "This command doesn't yet support identifying the latest release in an archive file."
compressed_file_format = None
archive_format = 'zip'
file_name_must_contain = ''
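Note (not part of this diff): a minimal sketch of passing the new latest argument when crawling programmatically, as the latestreleasedate command below does; the spider name is just an example from this repository, and BaseSpider converts the string 'true' to a boolean.

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(settings=get_project_settings())
# self.latest becomes True inside the spider's __init__.
process.crawl('canada_buyandsell', latest='true')
process.start()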
10 changes: 3 additions & 7 deletions kingfisher_scrapy/commands/dryrun.py
@@ -19,19 +19,15 @@ def run(self, args, opts):
# Stop after one item or error.
self.settings.set('CLOSESPIDER_ERRORCOUNT', 1)
self.settings.set('CLOSESPIDER_ITEMCOUNT', 1)

# Disable Kingfisher, Telnet, LogStats extensions.
self.settings.set('EXTENSIONS', {
'scrapy.extensions.telnet.TelnetConsole': None,
})
# Disable LogStats extension.
self.settings.set('LOGSTATS_INTERVAL', None)
# Disable custom and Telnet extensions.
self.settings.set('EXTENSIONS', {'scrapy.extensions.telnet.TelnetConsole': None})

runner = CrawlerProcess(settings=self.settings)

exceptions = {
'test_fail',
# Server unavailable
'mexico_cdmx',
# Require authentication
'openopps',
'paraguay_dncp_records',
45 changes: 45 additions & 0 deletions kingfisher_scrapy/commands/latestreleasedate.py
@@ -0,0 +1,45 @@
import json
import os
from collections import defaultdict
from datetime import datetime

from scrapy.commands import ScrapyCommand
from scrapy.crawler import CrawlerProcess


class LatestReleaseDatePerPublisher(ScrapyCommand):
def short_desc(self):
return 'Get the latest published release date per publisher'

def run(self, args, opts):
# Stop after one item or error.
self.settings.set('CLOSESPIDER_ERRORCOUNT', 1)
self.settings.set('CLOSESPIDER_ITEMCOUNT', 1)
# Disable LogStats extension.
self.settings.set('LOGSTATS_INTERVAL', None)
# Limit concurrent requests, to download the minimum.
self.settings.set('CONCURRENT_REQUESTS', 1)

path = self.settings['KINGFISHER_LATEST_RELEASE_DATE_FILE_PATH']
os.makedirs(path, exist_ok=True)
filename = os.path.join(path, 'dates.csv')
if os.path.isfile(filename):
os.unlink(filename)

runner = CrawlerProcess(settings=self.settings)

year = datetime.today().year
skipped = defaultdict(list)
for spider_name in runner.spider_loader.list():
if spider_name != 'test_fail':
spidercls = runner.spider_loader.load(spider_name)
if hasattr(spidercls, 'skip_latest_release_date'):
skipped[spidercls.skip_latest_release_date].append(spider_name)
else:
runner.crawl(spidercls, latest='true', year=year)

filename = os.path.join(path, 'skipped.json')
with open(filename, 'w') as f:
json.dump(skipped, f, indent=2)

runner.start()
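Usage note (not part of this diff): spiders opt out of this command by setting a skip_latest_release_date class attribute, which the loop above collects into skipped.json instead of crawling the spider. A hypothetical example, following the pattern used by the spiders later in this changeset:

from kingfisher_scrapy.base_spider import SimpleSpider

class ExampleSpider(SimpleSpider):
    name = 'example'
    data_type = 'release_package'
    # The command records this reason in skipped.json and does not crawl the spider.
    skip_latest_release_date = 'Already covered (see code for details)'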
37 changes: 33 additions & 4 deletions kingfisher_scrapy/extensions.py
@@ -7,10 +7,39 @@
from scrapy import signals
from scrapy.exceptions import NotConfigured

from kingfisher_scrapy.items import File, FileError, FileItem
from kingfisher_scrapy.items import File, FileError, FileItem, LatestReleaseDateItem
from kingfisher_scrapy.kingfisher_process import Client


# https://docs.scrapy.org/en/latest/topics/extensions.html#writing-your-own-extension
class KingfisherLatestDate:
def __init__(self, filename):
self.filename = filename
self.spiders_seen = set()

@classmethod
def from_crawler(cls, crawler):
path = crawler.settings['KINGFISHER_LATEST_RELEASE_DATE_FILE_PATH']
os.makedirs(path, exist_ok=True)
filename = os.path.join(path, 'dates.csv')
extension = cls(filename=filename)
crawler.signals.connect(extension.item_scraped, signal=signals.item_scraped)
crawler.signals.connect(extension.spider_closed, signal=signals.spider_closed)
return extension

def item_scraped(self, item, spider):
if not isinstance(item, LatestReleaseDateItem) or spider.name in self.spiders_seen:
return
self.spiders_seen.add(spider.name)
with open(self.filename, 'a+') as output:
output.write(f"{item['date']},{spider.name}\n")

def spider_closed(self, spider, reason):
if spider.name not in self.spiders_seen:
with open(self.filename, 'a+') as output:
output.write(f"{reason},{spider.name}\n")


class KingfisherFilesStore:
def __init__(self, directory):
self.directory = directory
@@ -101,7 +130,7 @@ def spider_closed(self, spider, reason):
Sends an API request to end the collection's store step.
"""
# https://docs.scrapy.org/en/latest/topics/signals.html#spider-closed
if reason != 'finished':
if reason != 'finished' or spider.latest:
return

response = self.client.end_collection_store({
@@ -118,9 +147,9 @@ def item_scraped(self, item, spider):
"""
Sends an API request to store the file, file item or file error in Kingfisher Process.
"""
if not item.get('post_to_api', True):
return

if not item.get('post_to_api', True) or isinstance(item, LatestReleaseDateItem):
return
data = {
'collection_source': spider.name,
'collection_data_version': spider.get_start_time('%Y-%m-%d %H:%M:%S'),
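For illustration (not part of this diff): the KingfisherLatestDate extension appends one value,spider_name row per spider to dates.csv, where the value is either the latest release date or the spider's close reason. A sketch of reading it, assuming the default KINGFISHER_LATEST_RELEASE_DATE_FILE_PATH and illustrative row values:

import csv

with open('latestreleasedate/dates.csv') as f:
    for value, spider_name in csv.reader(f):
        # e.g. ('2020-07-15', 'canada_buyandsell') or ('closespider_errorcount', 'some_spider')
        print(spider_name, value)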
4 changes: 4 additions & 0 deletions kingfisher_scrapy/items.py
@@ -31,3 +31,7 @@ class FileItem(KingfisherItem):

class FileError(KingfisherItem):
errors = scrapy.Field()


class LatestReleaseDateItem(scrapy.Item):
date = scrapy.Field()
11 changes: 9 additions & 2 deletions kingfisher_scrapy/log_formatter.py
@@ -3,10 +3,17 @@

class KingfisherLogFormatter(LogFormatter):
# https://docs.scrapy.org/en/latest/_modules/scrapy/logformatter.html#LogFormatter.scraped
def scraped(self, item, response, spider):
def scraped(self, item, *args):
return self._omit_data('scraped', item, *args)

# https://docs.scrapy.org/en/latest/_modules/scrapy/logformatter.html#LogFormatter.dropped
def dropped(self, item, *args):
return self._omit_data('dropped', item, *args)

def _omit_data(self, method, item, *args):
"""
Omits an item's `data` value from the log message.
"""
item = item.copy()
item.pop('data', None)
return super().scraped(item, response, spider)
return getattr(super(), method)(item, *args)
63 changes: 62 additions & 1 deletion kingfisher_scrapy/pipelines.py
@@ -5,8 +5,9 @@

from jsonschema import FormatChecker
from jsonschema.validators import Draft4Validator, RefResolver
from scrapy.exceptions import DropItem

from kingfisher_scrapy.items import File, FileItem
from kingfisher_scrapy.items import File, FileItem, LatestReleaseDateItem


def _json_loads(basename):
@@ -40,3 +41,63 @@ def process_item(self, item, spider):
self.files.add(key)

return item


class LatestReleaseDate:
def __init__(self):
self.processed = set()

def process_item(self, item, spider):
# Skip this pipeline stage unless the spider is explicitly configured to get the latest release date.
if not spider.latest:
return item

# Drop any extra items that are yielded before the spider closes.
if spider.name in self.processed:
spider.crawler.engine.close_spider(spider, reason='processed')
raise DropItem()

# Drop FileError items, so that we keep trying to get data.
if not isinstance(item, (File, FileItem)):
raise DropItem()

date = None
data = json.loads(item['data'])

if item['data_type'] in ('release_package', 'release_package_list', 'release_package_list_in_results',
'release_list', 'release', 'compiled_release'):
if item['data_type'] == 'release_package':
data = data['releases']
elif item['data_type'] == 'release_package_list':
data = data[0]['releases']
elif item['data_type'] == 'release_package_list_in_results':
data = data['results'][0]['releases']
if data:
if item['data_type'] in ('release', 'compiled_release'):
date = data['date']
else:
date = max(r['date'] for r in data)
elif item['data_type'] in ('record_package', 'record_package_list', 'record_package_list_in_results',
'record_list', 'record'):
if item['data_type'] == 'record_package':
data = data['records']
elif item['data_type'] == 'record_package_list':
data = data[0]['records']
elif item['data_type'] == 'record_package_list_in_results':
data = data['results'][0]['records']
elif item['data_type'] == 'record':
data = [data]
if data:
# This assumes that the first record in the record package has the most recent date.
data = data[0]
if 'releases' in data:
date = max(r['date'] for r in data['releases'])
elif 'compiledRelease' in data:
date = data['compiledRelease']['date']

self.processed.add(spider.name)

if date:
date = date[:10]

return LatestReleaseDateItem({'date': date})
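To make the extraction above concrete (illustrative data, not from the codebase): for a release package, the pipeline takes the maximum release date and truncates it to the day.

import json

item = {
    'data_type': 'release_package',
    'data': json.dumps({'releases': [
        {'ocid': 'ocds-213czf-000-00001', 'date': '2020-06-01T00:00:00Z'},
        {'ocid': 'ocds-213czf-000-00002', 'date': '2020-07-15T00:00:00Z'},
    ]}),
}

data = json.loads(item['data'])['releases']
date = max(r['date'] for r in data)
print(date[:10])  # 2020-07-15, the value stored on LatestReleaseDateItem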
3 changes: 3 additions & 0 deletions kingfisher_scrapy/settings.py
@@ -71,6 +71,7 @@
#}
EXTENSIONS = {
'kingfisher_scrapy.extensions.SentryLogging': -1,
'kingfisher_scrapy.extensions.KingfisherLatestDate': 1,
# `KingfisherFilesStore` must run before `KingfisherProcessAPI`, because the file needs to be written before the
# request is sent to Kingfisher Process.
'kingfisher_scrapy.extensions.KingfisherFilesStore': 100,
@@ -81,6 +82,7 @@
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'kingfisher_scrapy.pipelines.Validate': 300,
'kingfisher_scrapy.pipelines.LatestReleaseDate': 301,
}


@@ -131,6 +133,7 @@
# https://docs.scrapy.org/en/latest/topics/media-pipeline.html#std:setting-FILES_STORE
FILES_STORE = os.getenv('FILES_STORE', 'data')

KINGFISHER_LATEST_RELEASE_DATE_FILE_PATH = os.getenv('KINGFISHER_LATEST_RELEASE_DATE_FILE_PATH', 'latestreleasedate')
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html#httperror-allow-all
HTTPERROR_ALLOW_ALL = True

1 change: 1 addition & 0 deletions kingfisher_scrapy/spiders/afghanistan_records.py
@@ -16,6 +16,7 @@ class AfghanistanRecords(SimpleSpider):
"""
name = 'afghanistan_records'
data_type = 'record'
skip_latest_release_date = 'Already covered (see code for details)' # afghanistan_releases

download_delay = 1

1 change: 0 additions & 1 deletion kingfisher_scrapy/spiders/australia.py
@@ -23,5 +23,4 @@ class Australia(LinksSpider):
def start_requests(self):
url = f'https://api.tenders.gov.au/ocds/findByDates/contractPublished/' \
f'2004-01-01T00:00:00Z/{date.today().year}-12-31T23:59:59Z'

yield scrapy.Request(url, meta={'file_name': 'start.json'})
6 changes: 3 additions & 3 deletions kingfisher_scrapy/spiders/canada_buyandsell.py
@@ -15,10 +15,10 @@ class CanadaBuyAndSell(SimpleSpider):

def start_requests(self):
urls = [
'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-13-14.json',
'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-14-15.json',
'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-15-16.json',
'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-16-17.json',
'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-15-16.json',
'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-14-15.json',
'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-13-14.json',
]
if self.sample:
urls = [urls[0]]
3 changes: 2 additions & 1 deletion kingfisher_scrapy/spiders/chile_base.py
@@ -63,7 +63,8 @@ def parse_list(self, response):
# }
yield from self.handle_item(item)

if 'pagination' in data and (data['pagination']['offset'] + self.limit) < data['pagination']['total']:
if 'pagination' in data and (data['pagination']['offset'] + self.limit) < data['pagination']['total']\
and not self.sample:
year = response.request.meta['year']
month = response.request.meta['month']
offset = data['pagination']['offset']
1 change: 1 addition & 0 deletions kingfisher_scrapy/spiders/chile_compra_records.py
@@ -12,6 +12,7 @@ class ChileCompraRecords(ChileCompraBaseSpider):
"""
name = 'chile_compra_records'
data_type = 'record_package'
skip_latest_release_date = 'Already covered (see code for details)' # chile_compra_releases

def handle_item(self, item):
url = 'https://apis.mercadopublico.cl/OCDS/data/record/' + item['ocid'].replace('ocds-70d2nz-', '')
2 changes: 2 additions & 0 deletions kingfisher_scrapy/spiders/digiwhist_base.py
@@ -8,6 +8,8 @@


class DigiwhistBase(BaseSpider):
skip_latest_release_date = 'JSON Lines is not supported'

def start_requests(self):
# See scrapy.spiders.Spider.start_requests
for url in self.start_urls:
1 change: 1 addition & 0 deletions kingfisher_scrapy/spiders/honduras_oncae.py
@@ -14,6 +14,7 @@ class HondurasONCAE(CompressedFileSpider):
"""
name = 'honduras_oncae'
data_type = 'release_package'
skip_latest_release_date = 'Already covered (see code for details)' # honduras_portal_releases

# the files take too long to be downloaded, so we increase the download timeout
download_timeout = 900
1 change: 1 addition & 0 deletions kingfisher_scrapy/spiders/honduras_portal_bulk_files.py
@@ -16,6 +16,7 @@ class HondurasPortalBulkFiles(SimpleSpider):
"""
name = 'honduras_portal_bulk_files'
data_type = 'release_package'
skip_latest_release_date = 'Already covered (see code for details)' # honduras_portal_releases

def start_requests(self):
yield scrapy.Request(
1 change: 1 addition & 0 deletions kingfisher_scrapy/spiders/honduras_portal_records.py
@@ -19,6 +19,7 @@ class HondurasPortalRecords(LinksSpider):
data_pointer = '/recordPackage'
next_pointer = '/next'
next_page_formatter = staticmethod(parameters('page'))
skip_latest_release_date = 'Already covered (see code for details)' # honduras_portal_releases

download_delay = 0.9

1 change: 1 addition & 0 deletions kingfisher_scrapy/spiders/paraguay_dncp_records.py
@@ -17,6 +17,7 @@ class ParaguayDNCPRecords(ParaguayDNCPBaseSpider):
"""
name = 'paraguay_dncp_records'
data_type = 'record_package'
skip_latest_release_date = 'Already covered (see code for details)' # paraguay_dncp_releases

def get_files_to_download(self, content):
for record in content['records']:
1 change: 1 addition & 0 deletions kingfisher_scrapy/spiders/test_fail.py
@@ -9,6 +9,7 @@
class TestFail(SimpleSpider):
name = 'test_fail'
data_type = 'release_package'
skip_latest_release_date = 'Not a real spider'

def start_requests(self):
# Fine
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/uk_contracts_finder.py
@@ -15,7 +15,7 @@ class UKContractsFinder(SimpleSpider):
encoding = 'iso-8859-1'

def start_requests(self):
url = 'https://www.contractsfinder.service.gov.uk/Published/Notices/OCDS/Search?order=asc&page=1'
url = 'https://www.contractsfinder.service.gov.uk/Published/Notices/OCDS/Search?order=desc&page=1'
yield self.build_request(url, formatter=parameters('page'), callback=self.parse_list)

@handle_http_error
