Add dryrun command and clean up issues it identified #413

Merged · 15 commits · Jun 2, 2020
2 changes: 1 addition & 1 deletion kingfisher_scrapy/base_spider.py
@@ -308,7 +308,7 @@ def start_requests(self):
@handle_error
def parse(self, response):
if self.zip_file_format:
- self.build_file_from_response(response, data_type='zip', post_to_api=False)
+ yield self.build_file_from_response(response, data_type='zip', post_to_api=False)

zip_file = ZipFile(BytesIO(response.body))
for finfo in zip_file.infolist():
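The hunk above fixes a quiet bug: parse() is a generator, so a File built by build_file_from_response() but never yielded is simply discarded and never reaches the item pipeline or the files-store extension shown below. A minimal illustration with generic names, not the spider's real code:

def build_item():
    return {'data_type': 'zip'}

def parse_without_yield():
    build_item()          # item is constructed, then silently dropped
    yield 'first entry'

def parse_with_yield():
    yield build_item()    # item is emitted to the caller
    yield 'first entry'

print(list(parse_without_yield()))  # ['first entry']
print(list(parse_with_yield()))     # [{'data_type': 'zip'}, 'first entry']

The new tests at the bottom of this diff consume that first yielded File with next(generator).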
47 changes: 47 additions & 0 deletions kingfisher_scrapy/commands/dryrun.py
@@ -0,0 +1,47 @@
from scrapy.commands import ScrapyCommand
from scrapy.crawler import CrawlerProcess

from kingfisher_scrapy.base_spider import BaseSpider, ZipSpider


def yield_nothing(*args, **kwargs):
yield


class DryRun(ScrapyCommand):
def short_desc(self):
return 'Run a dry run of all spiders'

def run(self, args, opts):
BaseSpider.parse_json_lines = yield_nothing
ZipSpider.parse = yield_nothing

# Stop after one item or error.
self.settings.set('CLOSESPIDER_ERRORCOUNT', 1)
self.settings.set('CLOSESPIDER_ITEMCOUNT', 1)

# Disable Kingfisher, Telnet, LogStats extensions.
self.settings.set('EXTENSIONS', {
'scrapy.extensions.telnet.TelnetConsole': None,
})
self.settings.set('LOGSTATS_INTERVAL', None)

runner = CrawlerProcess(settings=self.settings)

exceptions = {
'test_fail',
# Server unavailable
'mexico_cdmx',
# Require authentication
'openopps',
'paraguay_dncp_records',
'paraguay_dncp_releases',
'paraguay_hacienda',
}

for spider_name in runner.spider_loader.list():
if spider_name not in exceptions:
spidercls = runner.spider_loader.load(spider_name)
runner.crawl(spidercls)

runner.start()
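For context, Scrapy discovers project commands through its COMMANDS_MODULE setting, and the module name becomes the command name, so commands/dryrun.py is exposed as `scrapy dryrun`. A sketch of the wiring, assuming the project registers the package in settings.py (that setting is not part of this diff):

# kingfisher_scrapy/settings.py (sketch, assumption: not shown in this diff)
COMMANDS_MODULE = 'kingfisher_scrapy.commands'

# Then, from the project root:
#   $ scrapy dryrun
# Each spider is closed after its first item or first error (CLOSESPIDER_*),
# so every spider's request-building and parsing code paths are exercised
# without performing a full download.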
5 changes: 5 additions & 0 deletions kingfisher_scrapy/extensions.py
@@ -17,8 +17,13 @@ def __init__(self, directory):
@classmethod
def from_crawler(cls, crawler):
directory = crawler.settings['FILES_STORE']

if not directory:
raise NotConfigured('FILES_STORE is not set.')

extension = cls(directory)
crawler.signals.connect(extension.item_scraped, signal=signals.item_scraped)

return extension

def item_scraped(self, item, spider):
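Raising NotConfigured from from_crawler() is the standard Scrapy way for a component to opt out: the extension is disabled and the crawl continues, instead of failing later with a less obvious error when FILES_STORE is missing. A minimal sketch of the pattern, with an illustrative class name:

from scrapy.exceptions import NotConfigured

class RequiresFilesStore:  # hypothetical extension, for illustration only
    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings.get('FILES_STORE'):
            # Scrapy catches NotConfigured and skips this extension.
            raise NotConfigured('FILES_STORE is not set.')
        return cls()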
4 changes: 2 additions & 2 deletions kingfisher_scrapy/log_formatter.py
@@ -1,7 +1,7 @@
- from scrapy import logformatter
+ from scrapy.logformatter import LogFormatter


- class KingfisherLogFormatter(logformatter.LogFormatter):
+ class KingfisherLogFormatter(LogFormatter):
# https://docs.scrapy.org/en/latest/_modules/scrapy/logformatter.html#LogFormatter.scraped
def scraped(self, item, response, spider):
"""
3 changes: 3 additions & 0 deletions kingfisher_scrapy/settings.py
@@ -32,6 +32,9 @@

# The maximum response size (in bytes) that downloader will download (default: 1073741824):
DOWNLOAD_MAXSIZE = 4000000000
+ DOWNLOAD_WARNSIZE = 0
+ # Many spiders time out when using the default of 180.
+ DOWNLOAD_TIMEOUT = 360

# The download delay setting will honor only one of:
CONCURRENT_REQUESTS_PER_DOMAIN = 2
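These new project-wide defaults explain the per-spider deletions further down: spiders that set download_warnsize = 0 or a 6-minute timeout no longer need to, while bulk spiders can still override the default through Scrapy's per-spider download_timeout attribute. A short sketch with a hypothetical spider name:

import scrapy

class SlowBulkSpider(scrapy.Spider):  # hypothetical spider, for illustration only
    name = 'slow_bulk'
    # Recognized by Scrapy's DownloadTimeoutMiddleware; overrides the
    # project-wide DOWNLOAD_TIMEOUT = 360 set above for this spider only.
    download_timeout = 99999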
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/armenia.py
@@ -8,7 +8,7 @@ class Armenia(LinksSpider):
name = 'armenia'
data_type = 'release_package'
next_pointer = '/next_page/uri'
- next_page_formatter = parameters('offset')
+ next_page_formatter = staticmethod(parameters('offset'))

def start_requests(self):
url = 'https://armeps.am/ocds/release'
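parameters('offset') returns a plain function. Assigned directly as a class attribute, here and in the spiders below, Python's descriptor protocol would turn it into an instance method, so the spider instance would be passed as its first argument at call time. Wrapping it in staticmethod() keeps it a plain callable. An illustrative sketch, not the spider's real code:

def make_formatter(suffix):
    def formatter(url):
        return url + suffix
    return formatter

class Broken:
    next_page_formatter = make_formatter('?page=2')                 # becomes a bound method

class Fixed:
    next_page_formatter = staticmethod(make_formatter('?page=2'))   # stays a plain function

# Broken().next_page_formatter('http://example.com')
#   TypeError: formatter() takes 1 positional argument but 2 were given
print(Fixed().next_page_formatter('http://example.com'))  # http://example.com?page=2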
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/australia.py
@@ -9,7 +9,7 @@
class Australia(LinksSpider):
name = 'australia'
data_type = 'release_package'
- next_page_formatter = parameters('cursor')
+ next_page_formatter = staticmethod(parameters('cursor'))

def start_requests(self):
url = f'https://api.tenders.gov.au/ocds/findByDates/contractPublished/' \
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/chile_base.py
@@ -9,7 +9,7 @@ class ChileCompraBaseSpider(SimpleSpider):
custom_settings = {
'DOWNLOAD_FAIL_ON_DATALOSS': False,
}
- download_timeout = 300

limit = 100
base_list_url = 'https://apis.mercadopublico.cl/OCDS/data/listaA%C3%B1oMes/{0.year:d}/{0.month:02d}/{1}/{2}'

1 change: 0 additions & 1 deletion kingfisher_scrapy/spiders/chile_compra_bulk.py
@@ -8,7 +8,6 @@ class ChileCompraBulk(ZipSpider):
name = 'chile_compra_bulk'
data_type = 'record_package'

- download_warnsize = 0
download_timeout = 99999
custom_settings = {
'DOWNLOAD_FAIL_ON_DATALOSS': False,
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/colombia.py
@@ -10,7 +10,7 @@

class Colombia(LinksSpider):
name = 'colombia'
- next_page_formatter = parameters('page')
+ next_page_formatter = staticmethod(parameters('page'))

def start_requests(self):
base_url = 'https://apiocds.colombiacompra.gov.co:8443/apiCCE2.0/rest/releases'
1 change: 0 additions & 1 deletion kingfisher_scrapy/spiders/colombia_bulk.py
@@ -17,7 +17,6 @@ class ColombiaBulk(ZipSpider):
encoding = 'iso-8859-1'
zip_file_format = 'json_lines'

- download_warnsize = 0
download_timeout = 99999
custom_settings = {
'DOWNLOAD_FAIL_ON_DATALOSS': False,
2 changes: 0 additions & 2 deletions kingfisher_scrapy/spiders/dominican_republic.py
@@ -11,8 +11,6 @@
class DominicanRepublic(BaseSpider):
name = 'dominican_republic'

- download_timeout = 360  # 6min

def start_requests(self):
yield scrapy.Request(
'https://www.dgcp.gob.do/estandar-mundial-ocds/',
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/georgia_records.py
@@ -7,7 +7,7 @@
class GeorgiaRecords(LinksSpider):
name = 'georgia_records'
data_type = 'record_package'
- next_page_formatter = parameters('page')
+ next_page_formatter = staticmethod(parameters('page'))

def start_requests(self):
url = 'https://odapi.spa.ge/api/records.json'
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/georgia_releases.py
@@ -7,7 +7,7 @@
class GeorgiaReleases(LinksSpider):
name = 'georgia_releases'
data_type = 'release_package'
- next_page_formatter = parameters('page')
+ next_page_formatter = staticmethod(parameters('page'))

def start_requests(self):
url = 'https://odapi.spa.ge/api/releases.json'
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/honduras_portal_records.py
@@ -9,7 +9,7 @@ class HondurasPortalRecords(LinksSpider):
data_type = 'record_package'
data_pointer = '/recordPackage'
next_pointer = '/next'
- next_page_formatter = parameters('page')
+ next_page_formatter = staticmethod(parameters('page'))

download_delay = 0.9

2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/honduras_portal_releases.py
@@ -9,7 +9,7 @@ class HondurasPortalReleases(LinksSpider):
data_type = 'release_package'
data_pointer = '/releasePackage'
next_pointer = '/next'
- next_page_formatter = parameters('page')
+ next_page_formatter = staticmethod(parameters('page'))

download_delay = 0.9

3 changes: 1 addition & 2 deletions kingfisher_scrapy/spiders/openopps.py
@@ -56,8 +56,7 @@ def from_crawler(cls, crawler, *args, **kwargs):
spider.username = crawler.settings.get('KINGFISHER_OPENOPPS_USERNAME')
spider.password = crawler.settings.get('KINGFISHER_OPENOPPS_PASSWORD')
if spider.username is None or spider.password is None:
- spider.logger.error('Please set the environment variables '
-                     'KINGFISHER_OPENOPPS_USERNAME and KINGFISHER_OPENOPPS_PASSWORD')
+ spider.logger.error('KINGFISHER_OPENOPPS_USERNAME and/or KINGFISHER_OPENOPPS_PASSWORD is not set.')
raise scrapy.exceptions.CloseSpider('authentication_credentials_missing')

return spider
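The new message points users at environment variables; a sketch of how such credentials typically reach crawler.settings, assuming the project maps them in settings.py (that mapping is not part of this diff):

import os

# kingfisher_scrapy/settings.py (sketch, assumption): expose credentials from the
# environment so crawler.settings.get('KINGFISHER_OPENOPPS_USERNAME') returns them.
KINGFISHER_OPENOPPS_USERNAME = os.getenv('KINGFISHER_OPENOPPS_USERNAME')
KINGFISHER_OPENOPPS_PASSWORD = os.getenv('KINGFISHER_OPENOPPS_PASSWORD')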
3 changes: 1 addition & 2 deletions kingfisher_scrapy/spiders/paraguay_dncp_base.py
@@ -1,5 +1,4 @@
import json
- import logging
from datetime import datetime

import scrapy
@@ -44,7 +43,7 @@ def from_crawler(cls, crawler, *args, **kwargs):
spider.request_token = crawler.settings.get('KINGFISHER_PARAGUAY_DNCP_REQUEST_TOKEN')

if spider.request_token is None:
- logging.error('No request token available')
+ spider.logger.error('KINGFISHER_PARAGUAY_DNCP_REQUEST_TOKEN is not set.')
raise scrapy.exceptions.CloseSpider('authentication_credentials_missing')

return spider
3 changes: 2 additions & 1 deletion kingfisher_scrapy/spiders/paraguay_hacienda.py
@@ -34,7 +34,8 @@ def from_crawler(cls, crawler, *args, **kwargs):
spider.request_token = crawler.settings.get('KINGFISHER_PARAGUAY_HACIENDA_REQUEST_TOKEN')
spider.client_secret = crawler.settings.get('KINGFISHER_PARAGUAY_HACIENDA_CLIENT_SECRET')
if spider.request_token is None or spider.client_secret is None:
- spider.logger.error('No request token or client secret available')
+ spider.logger.error('KINGFISHER_PARAGUAY_HACIENDA_REQUEST_TOKEN and/or '
+                     'KINGFISHER_PARAGUAY_HACIENDA_CLIENT_SECRET is not set.')
raise scrapy.exceptions.CloseSpider('authentication_credentials_missing')

return spider
1 change: 0 additions & 1 deletion kingfisher_scrapy/spiders/portugal.py
@@ -12,7 +12,6 @@ class Portugal(ZipSpider):
encoding = 'iso-8859-1'
zip_file_format = 'json_lines'

- download_warnsize = 0
download_timeout = 9999

def start_requests(self):
8 changes: 4 additions & 4 deletions kingfisher_scrapy/spiders/uk_contracts_finder.py
@@ -1,10 +1,10 @@
import json

- from kingfisher_scrapy.base_spider import BaseSpider
- from kingfisher_scrapy.util import components, handle_error, parameters, replace_parameter
+ from kingfisher_scrapy.base_spider import SimpleSpider
+ from kingfisher_scrapy.util import handle_error, parameters, replace_parameter


- class UKContractsFinder(BaseSpider):
+ class UKContractsFinder(SimpleSpider):
name = 'uk_contracts_finder'
data_type = 'release_package_list_in_results'
encoding = 'iso-8859-1'
@@ -22,4 +22,4 @@ def parse_list(self, response):
total = data['maxPage']
for page in range(2, total + 1):
url = replace_parameter(response.request.url, 'page', page)
- yield self.build_request(url, formatter=components('page'))
+ yield self.build_request(url, formatter=parameters('page'))
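The pagination URLs built by replace_parameter() vary only in their query string, so a path-based formatter produces the same file name for every page, while a query-based formatter keeps them distinct. A simplified sketch of the two helpers, not the project's exact implementation:

from urllib.parse import parse_qs, urlsplit

def components(start, stop=None):
    # simplified: derive a name from URL path segments
    def formatter(url):
        return '-'.join(urlsplit(url).path.strip('/').split('/')[start:stop])
    return formatter

def parameters(*keys):
    # simplified: derive a name from query-string values
    def formatter(url):
        query = parse_qs(urlsplit(url).query)
        return '-'.join('{}-{}'.format(key, query[key][0]) for key in keys)
    return formatter

url = 'https://example.com/Published/Notices/OCDS/Search?order=asc&page=3'
print(components(-1)(url))      # 'Search'  -- identical for every page
print(parameters('page')(url))  # 'page-3'  -- distinct per page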
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/uk_fts.py
@@ -7,7 +7,7 @@
class UKContractsFinder(LinksSpider):
name = 'uk_fts'
data_type = 'release_package_in_ocdsReleasePackage_in_list_in_results'
- next_page_formatter = parameters('cursor')
+ next_page_formatter = staticmethod(parameters('cursor'))

def start_requests(self):
# This URL was provided by the publisher and is not the production URL.
2 changes: 1 addition & 1 deletion kingfisher_scrapy/util.py
@@ -96,7 +96,7 @@ def replace_parameter(url, key, value):
parsed = urlsplit(url)
query = parse_qs(parsed.query)
if value is None:
- del query[key]
+ query.pop(key, None)
else:
query[key] = [value]
return parsed._replace(query=urlencode(query, doseq=True)).geturl()
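The switch from del to pop() with a default matters when the parameter is absent from the URL, which the new test file below exercises with the ('http://example.com/', None, 'http://example.com/') case. A minimal illustration:

query = {}                 # parsed query string with no 'page' parameter
# del query['page']        # old behaviour: raises KeyError
query.pop('page', None)    # new behaviour: no-op when the key is missing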
12 changes: 12 additions & 0 deletions tests/test_util.py
@@ -0,0 +1,12 @@
import pytest

from kingfisher_scrapy.util import replace_parameter


@pytest.mark.parametrize('url,value,expected', [
('http://example.com/?page=1', 2, 'http://example.com/?page=2'),
('http://example.com/?page=1', None, 'http://example.com/'),
('http://example.com/', None, 'http://example.com/'),
])
def test_replace_parameter(url, value, expected):
assert replace_parameter(url, 'page', value) == expected
20 changes: 19 additions & 1 deletion tests/test_zip_spider.py
@@ -51,9 +51,18 @@ def test_parse_json_lines(sample, len_items):

response = response_fixture(body=io.getvalue())
generator = spider.parse(response)
item = next(generator)
items = list(generator)

assert type(item) is File
assert len(item) == 6
assert item['file_name'] == 'test'
assert item['url'] == 'http://example.com'
assert item['data_type'] == 'zip'
assert item['encoding'] == 'utf-8'
assert item['post_to_api'] is False

assert len(items) == len_items

for i, item in enumerate(items, 1):
assert type(item) is FileItem
@@ -83,8 +92,17 @@ def test_parse_release_package(sample, len_items, len_releases):

response = response_fixture(body=io.getvalue())
generator = spider.parse(response)
item = next(generator)
items = list(generator)

assert type(item) is File
assert len(item) == 6
assert item['file_name'] == 'test'
assert item['url'] == 'http://example.com'
assert item['data_type'] == 'zip'
assert item['encoding'] == 'utf-8'
assert item['post_to_api'] is False

assert len(items) == len_items

for i, item in enumerate(items, 1):