Commit 358d11a: Merge 8090d1d into ea9055f

yolile committed Jul 16, 2020
2 parents ea9055f + 8090d1d
Showing 24 changed files with 256 additions and 17 deletions.
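In short: the merge adds a `latestreleasedate` command that crawls every spider just long enough to scrape one item, a `LatestReleaseDate` pipeline that reduces that item to a `LatestReleaseDateItem` holding the most recent release date it can extract, and a `KingfisherLatestDate` extension that writes one CSV row per spider. Spiders that the check cannot cover declare a `skip_latest_release_date` reason, and a few spiders are reordered so that their first response contains the newest data.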
5 changes: 4 additions & 1 deletion kingfisher_scrapy/base_spider.py
@@ -45,7 +45,7 @@ class BaseSpider(scrapy.Spider):
     VALID_DATE_FORMATS = {'date': '%Y-%m-%d', 'datetime': '%Y-%m-%dT%H:%M:%S', 'year-month': '%Y-%m'}
 
     def __init__(self, sample=None, note=None, from_date=None, until_date=None,
-                 date_format='date', *args, **kwargs):
+                 date_format='date', latest=None, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
         # https://docs.scrapy.org/en/latest/topics/spiders.html#spider-arguments
@@ -54,12 +54,14 @@ def __init__(self, sample=None, note=None, from_date=None, until_date=None,
         self.from_date = from_date
         self.until_date = until_date
         self.date_format = self.VALID_DATE_FORMATS[date_format]
+        self.latest = latest == 'true'
 
         spider_arguments = {
             'sample': sample,
             'note': note,
             'from_date': from_date,
             'until_date': until_date,
+            'latest': latest,
         }
         spider_arguments.update(kwargs)
         self.logger.info('Spider arguments: {!r}'.format(spider_arguments))
@@ -304,6 +306,7 @@ def start_requests(self):
 
     encoding = 'utf-8'
     zip_file_format = None
+    skip_latest_release_date = "This command doesn't yet support identifying the latest release in a ZIP file."
 
     @handle_http_error
     def parse(self, response):
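Scrapy passes spider arguments as strings, which is why `latest` is compared against the literal `'true'`. A minimal sketch of the equivalent programmatic invocation (the spider name is only an example):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(settings=get_project_settings())
# Keyword arguments to crawl() reach BaseSpider.__init__ unchanged, as strings,
# so latest='true' sets self.latest to True; any other value leaves it False.
process.crawl('canada_buyandsell', latest='true')
process.start()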
34 changes: 34 additions & 0 deletions kingfisher_scrapy/commands/latestreleasedate.py
@@ -0,0 +1,34 @@
import os
from datetime import datetime

from scrapy.commands import ScrapyCommand
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


class LatestReleaseDatePerPublisher(ScrapyCommand):
    def short_desc(self):
        return 'Get the latest published release date per publisher'

    def run(self, args, opts):
        settings = get_project_settings()
        settings.set('CLOSESPIDER_ITEMCOUNT', 1)
        settings.set('CONCURRENT_REQUESTS', 1)
        settings.set('CLOSESPIDER_ERRORCOUNT', 1)

        path = settings['KINGFISHER_LATEST_RELEASE_DATE_FILE_PATH']
        os.makedirs(path, exist_ok=True)
        # Remove the previous run's output, if any; an unconditional unlink would
        # raise FileNotFoundError on the first run, before the file exists.
        csv_path = os.path.join(path, 'latest_dates.csv')
        if os.path.exists(csv_path):
            os.unlink(csv_path)
        filename = os.path.join(path, 'skipped_spiders.txt')

        process = CrawlerProcess(settings=settings)
        spiders = process.spider_loader.list()
        current_year = datetime.today().year
        with open(filename, 'w') as output:
            for spider in spiders:
                spider_cls = process.spider_loader.load(spider)
                if hasattr(spider_cls, 'skip_latest_release_date'):
                    output.write(f'Skipping {spider}. Reason: {spider_cls.skip_latest_release_date}\n')
                else:
                    process.crawl(spider, latest='true', year=current_year)
        process.start()
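A custom Scrapy command is only discovered if the project's settings point at the package that contains it; assuming this project already sets that (shown here for context, it is not part of this diff):

# settings.py
COMMANDS_MODULE = 'kingfisher_scrapy.commands'

the command is then run as `scrapy latestreleasedate`. With `CLOSESPIDER_ITEMCOUNT` set to 1, each crawl closes after its first scraped item, so the run samples every publisher rather than downloading it in full.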
37 changes: 33 additions & 4 deletions kingfisher_scrapy/extensions.py
@@ -7,10 +7,39 @@
 from scrapy import signals
 from scrapy.exceptions import NotConfigured
 
-from kingfisher_scrapy.items import File, FileError, FileItem
+from kingfisher_scrapy.items import File, FileError, FileItem, LatestReleaseDateItem
 from kingfisher_scrapy.kingfisher_process import Client
 
 
+# https://docs.scrapy.org/en/latest/topics/extensions.html#writing-your-own-extension
+class KingfisherLatestDate:
+    def __init__(self, filename):
+        self.filename = filename
+        self.spiders_seen = set()
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        path = crawler.settings['KINGFISHER_LATEST_RELEASE_DATE_FILE_PATH']
+        os.makedirs(path, exist_ok=True)
+        filename = os.path.join(path, 'latest_dates.csv')
+        extension = cls(filename=filename)
+        crawler.signals.connect(extension.item_scraped, signal=signals.item_scraped)
+        crawler.signals.connect(extension.spider_closed, signal=signals.spider_closed)
+        return extension
+
+    def item_scraped(self, item, spider):
+        if not isinstance(item, LatestReleaseDateItem) or spider.name in self.spiders_seen:
+            return
+        self.spiders_seen.add(spider.name)
+        with open(self.filename, 'a+') as output:
+            output.write(f"{spider.name},{item['date']}\n")
+
+    def spider_closed(self, spider, reason):
+        if spider.name not in self.spiders_seen:
+            with open(self.filename, 'a+') as output:
+                output.write(f"{spider.name},{reason}\n")
+
+
 class KingfisherFilesStore:
     def __init__(self, directory):
         self.directory = directory
@@ -101,7 +130,7 @@ def spider_closed(self, spider, reason):
         Sends an API request to end the collection's store step.
         """
         # https://docs.scrapy.org/en/latest/topics/signals.html#spider-closed
-        if reason != 'finished':
+        if reason != 'finished' or spider.latest:
             return
 
         response = self.client.end_collection_store({
@@ -118,9 +147,9 @@ def item_scraped(self, item, spider):
         """
         Sends an API request to store the file, file item or file error in Kingfisher Process.
         """
-        if not item.get('post_to_api', True):
-            return
+        if not item.get('post_to_api', True) or isinstance(item, LatestReleaseDateItem):
+            return
 
         data = {
             'collection_source': spider.name,
             'collection_data_version': spider.get_start_time('%Y-%m-%d %H:%M:%S'),
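Each row of latest_dates.csv is either `spider_name,date` (the spider yielded a `LatestReleaseDateItem`) or `spider_name,close_reason` (it closed without one). A small sketch, not part of the commit, for consuming the file:

import csv

# Maps each spider to its latest date or close reason, e.g.
# {'moldova': '2020-06-30T10:00:00Z', 'test_fail': 'closespider_errorcount'}.
with open('latest_dates/latest_dates.csv', newline='') as f:
    results = dict(csv.reader(f))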
4 changes: 4 additions & 0 deletions kingfisher_scrapy/items.py
@@ -31,3 +31,7 @@ class FileItem(KingfisherItem):
 
 class FileError(KingfisherItem):
     errors = scrapy.Field()
+
+
+class LatestReleaseDateItem(scrapy.Item):
+    date = scrapy.Field()
7 changes: 4 additions & 3 deletions kingfisher_scrapy/log_formatter.py
@@ -7,6 +7,7 @@ def scraped(self, item, response, spider):
         """
         Omits an item's `data` value from the log message.
         """
-        item = item.copy()
-        item.pop('data', None)
-        return super().scraped(item, response, spider)
+        if item:
+            item = item.copy()
+            item.pop('data', None)
+        return super().scraped(item, response, spider)
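The added guard presumably accounts for the new `LatestReleaseDate` pipeline, whose `process_item()` returns `None` once a spider has been processed; that `None` still flows through the log formatter, where the unconditional `item.copy()` would raise `AttributeError`.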
49 changes: 48 additions & 1 deletion kingfisher_scrapy/pipelines.py
@@ -6,7 +6,7 @@
 from jsonschema import FormatChecker
 from jsonschema.validators import Draft4Validator, RefResolver
 
-from kingfisher_scrapy.items import File, FileItem
+from kingfisher_scrapy.items import File, FileItem, LatestReleaseDateItem
 
 
 def _json_loads(basename):
@@ -40,3 +40,50 @@ def process_item(self, item, spider):
         self.files.add(key)
 
         return item
+
+
+class LatestReleaseDate:
+    def __init__(self):
+        self.processed = set()
+
+    def process_item(self, item, spider):
+        if spider.name in self.processed:
+            spider.crawler.engine.close_spider(spider, reason='processed')
+            return
+        if spider.latest and isinstance(item, (File, FileItem)):
+            date = None
+            data = json.loads(item['data'])
+            if item['data_type'] in ('release_package', 'release_package_list', 'release_package_list_in_results',
+                                     'release_list', 'release', 'compiled_release'):
+                if item['data_type'] == 'release_package':
+                    data = data['releases']
+                elif item['data_type'] == 'release_package_list':
+                    data = data[0]['releases']
+                elif item['data_type'] == 'release_package_list_in_results':
+                    data = data['results'][0]['releases']
+                if data:
+                    if item['data_type'] in ('release', 'compiled_release'):
+                        date = data['date']
+                    else:
+                        date = max(r['date'] for r in data)
+            elif item['data_type'] in ('record_package', 'record', 'record_list', 'record_package_list',
+                                       'record_package_list_in_results'):
+                if item['data_type'] == 'record_package':
+                    data = data['records']
+                elif item['data_type'] == 'record_package_list':
+                    data = data[0]['records']
+                elif item['data_type'] == 'record_package_list_in_results':
+                    data = data['results'][0]['records']
+                elif item['data_type'] == 'record':
+                    data = [data]
+                if data:
+                    # This assumes that the first record in the record package has the most recent date.
+                    data = data[0]
+                    if 'releases' in data:
+                        date = max(r['date'] for r in data['releases'])
+                    elif 'compiledRelease' in data:
+                        date = data['compiledRelease']['date']
+            self.processed.add(spider.name)
+            return LatestReleaseDateItem({'date': date})
+        else:
+            return item
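To make the branching concrete, here is a hypothetical release package (invented for illustration) and the value the pipeline extracts when `data_type` is 'release_package':

import json

item_data = json.dumps({
    'uri': 'https://example.com/package.json',
    'releases': [
        {'ocid': 'ocds-213czf-1', 'date': '2020-01-05T00:00:00Z'},
        {'ocid': 'ocds-213czf-2', 'date': '2020-06-30T00:00:00Z'},
    ],
})

data = json.loads(item_data)['releases']
# ISO 8601 dates in a uniform format sort lexicographically, so max() on the
# strings returns the most recent one.
assert max(r['date'] for r in data) == '2020-06-30T00:00:00Z'

For record packages the pipeline inspects only the first record (see the comment in the code), so the result is only correct when the publisher orders records newest-first.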
3 changes: 3 additions & 0 deletions kingfisher_scrapy/settings.py
@@ -71,6 +71,7 @@
 #}
 EXTENSIONS = {
     'kingfisher_scrapy.extensions.SentryLogging': -1,
+    'kingfisher_scrapy.extensions.KingfisherLatestDate': 1,
     # `KingfisherFilesStore` must run before `KingfisherProcessAPI`, because the file needs to be written before the
     # request is sent to Kingfisher Process.
     'kingfisher_scrapy.extensions.KingfisherFilesStore': 100,
@@ -80,6 +81,7 @@
 # Configure item pipelines
 # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
 ITEM_PIPELINES = {
+    'kingfisher_scrapy.pipelines.LatestReleaseDate': 300,
     'kingfisher_scrapy.pipelines.Validate': 300,
 }
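Both pipelines are registered with order 300. Scrapy sorts components by these values (lower runs closer to the spider), and ties fall back to insertion order, so `LatestReleaseDate` should run before `Validate` here; distinct values would make that intent explicit — a sketch, not what the commit does:

ITEM_PIPELINES = {
    'kingfisher_scrapy.pipelines.LatestReleaseDate': 200,
    'kingfisher_scrapy.pipelines.Validate': 300,
}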

@@ -131,6 +133,7 @@
 # https://docs.scrapy.org/en/latest/topics/media-pipeline.html#std:setting-FILES_STORE
 FILES_STORE = os.getenv('FILES_STORE', 'data')
 
+KINGFISHER_LATEST_RELEASE_DATE_FILE_PATH = os.getenv('KINGFISHER_LATEST_RELEASE_DATE_FILE_PATH', 'latest_dates')
 # https://docs.scrapy.org/en/latest/topics/spider-middleware.html#httperror-allow-all
 HTTPERROR_ALLOW_ALL = True
 
1 change: 1 addition & 0 deletions kingfisher_scrapy/spiders/afghanistan_records.py
@@ -16,6 +16,7 @@ class AfghanistanRecords(SimpleSpider):
     """
     name = 'afghanistan_records'
     data_type = 'record'
+    skip_latest_release_date = 'Already covered by afghanistan_releases'
 
     download_delay = 1
 
1 change: 0 additions & 1 deletion kingfisher_scrapy/spiders/australia.py
@@ -23,5 +23,4 @@ class Australia(LinksSpider):
     def start_requests(self):
         url = f'https://api.tenders.gov.au/ocds/findByDates/contractPublished/' \
               f'2004-01-01T00:00:00Z/{date.today().year}-12-31T23:59:59Z'
-
         yield scrapy.Request(url, meta={'file_name': 'start.json'})
6 changes: 3 additions & 3 deletions kingfisher_scrapy/spiders/canada_buyandsell.py
@@ -15,10 +15,10 @@ class CanadaBuyAndSell(SimpleSpider):
 
     def start_requests(self):
         urls = [
-            'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-13-14.json',
-            'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-14-15.json',
-            'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-15-16.json',
             'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-16-17.json',
+            'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-15-16.json',
+            'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-14-15.json',
+            'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-13-14.json',
         ]
         if self.sample:
             urls = [urls[0]]
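Reversing the list puts the most recent fiscal year first — presumably so that a `sample` run (which keeps only `urls[0]`) and a `latest` run (which closes after one item) both see the newest data. The `uk_contracts_finder` change below, from `order=asc` to `order=desc`, follows the same logic.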
3 changes: 2 additions & 1 deletion kingfisher_scrapy/spiders/chile_base.py
@@ -63,7 +63,8 @@ def parse_list(self, response):
             # }
             yield from self.handle_item(item)
 
-        if 'pagination' in data and (data['pagination']['offset'] + self.limit) < data['pagination']['total']:
+        if 'pagination' in data and (data['pagination']['offset'] + self.limit) < data['pagination']['total']\
+                and not self.sample:
             year = response.request.meta['year']
             month = response.request.meta['month']
             offset = data['pagination']['offset']
1 change: 1 addition & 0 deletions kingfisher_scrapy/spiders/chile_compra_records.py
@@ -12,6 +12,7 @@ class ChileCompraRecords(ChileCompraBaseSpider):
     """
     name = 'chile_compra_records'
     data_type = 'record_package'
+    skip_latest_release_date = 'Already covered by chile_compra_releases'
 
     def handle_item(self, item):
         url = 'https://apis.mercadopublico.cl/OCDS/data/record/' + item['ocid'].replace('ocds-70d2nz-', '')
2 changes: 2 additions & 0 deletions kingfisher_scrapy/spiders/digiwhist_base.py
@@ -8,6 +8,8 @@
 
 
 class DigiwhistBase(BaseSpider):
+    skip_latest_release_date = 'Unordered JSON Lines files'
+
     def start_requests(self):
         # See scrapy.spiders.Spider.start_requests
         for url in self.start_urls:
1 change: 1 addition & 0 deletions kingfisher_scrapy/spiders/dominican_republic.py
@@ -17,6 +17,7 @@ class DominicanRepublic(BaseSpider):
     Downloads a release package for the oldest year (2018, first link in the downloads page).
     """
     name = 'dominican_republic'
+    skip_latest_release_date = "This command doesn't yet support identifying the latest release in a RAR file."
 
     def start_requests(self):
         yield scrapy.Request(
1 change: 1 addition & 0 deletions kingfisher_scrapy/spiders/honduras_oncae.py
@@ -14,6 +14,7 @@ class HondurasONCAE(ZipSpider):
     """
     name = 'honduras_oncae'
     data_type = 'release_package'
+    skip_latest_release_date = 'Already covered by honduras_portal_releases'
 
     # the files take too long to be downloaded, so we increase the download timeout
     download_timeout = 900
1 change: 1 addition & 0 deletions kingfisher_scrapy/spiders/honduras_portal_bulk_files.py
@@ -16,6 +16,7 @@ class HondurasPortalBulkFiles(SimpleSpider):
     """
     name = 'honduras_portal_bulk_files'
     data_type = 'release_package'
+    skip_latest_release_date = 'Already covered by honduras_portal_releases'
 
     def start_requests(self):
         yield scrapy.Request(
1 change: 1 addition & 0 deletions kingfisher_scrapy/spiders/honduras_portal_records.py
@@ -19,6 +19,7 @@ class HondurasPortalRecords(LinksSpider):
     data_pointer = '/recordPackage'
     next_pointer = '/next'
     next_page_formatter = staticmethod(parameters('page'))
+    skip_latest_release_date = 'Already covered by honduras_portal_releases'
 
     download_delay = 0.9
 
1 change: 1 addition & 0 deletions kingfisher_scrapy/spiders/moldova_old.py
@@ -12,6 +12,7 @@ class MoldovaOld(SimpleSpider):
     """
     name = 'moldova_old'
     data_type = 'release_package'
+    skip_latest_release_date = 'Old endpoint'
 
     def start_requests(self):
         pattern = 'http://opencontracting.date.gov.md/ocds-api/year/{}'
1 change: 1 addition & 0 deletions kingfisher_scrapy/spiders/paraguay_dncp_records.py
@@ -17,6 +17,7 @@ class ParaguayDNCPRecords(ParaguayDNCPBaseSpider):
     """
     name = 'paraguay_dncp_records'
     data_type = 'record_package'
+    skip_latest_release_date = 'Already covered by paraguay_dncp_releases'
 
     def get_files_to_download(self, content):
         for record in content['records']:
1 change: 1 addition & 0 deletions kingfisher_scrapy/spiders/test_fail.py
@@ -9,6 +9,7 @@
 class TestFail(SimpleSpider):
     name = 'test_fail'
     data_type = 'release_package'
+    skip_latest_release_date = 'Not a real spider'
 
     def start_requests(self):
         # Fine
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/uk_contracts_finder.py
@@ -15,7 +15,7 @@ class UKContractsFinder(SimpleSpider):
     encoding = 'iso-8859-1'
 
     def start_requests(self):
-        url = 'https://www.contractsfinder.service.gov.uk/Published/Notices/OCDS/Search?order=asc&page=1'
+        url = 'https://www.contractsfinder.service.gov.uk/Published/Notices/OCDS/Search?order=desc&page=1'
         yield self.build_request(url, formatter=parameters('page'), callback=self.parse_list)
 
     @handle_http_error
1 change: 1 addition & 0 deletions kingfisher_scrapy/spiders/uruguay_records.py
@@ -12,6 +12,7 @@ class UruguayRecords(UruguayBase):
     """
     name = 'uruguay_records'
     data_type = 'record_package'
+    skip_latest_release_date = 'Already covered by uruguay_releases'
 
     @handle_http_error
     def parse_list(self, response):
