Commit

Merge branch 'main' into fix-incremental-file-names

jpmckinney committed Sep 19, 2023
2 parents 8224fb1 + 49528bf commit c5c8f12
Showing 33 changed files with 76 additions and 80 deletions.
36 changes: 17 additions & 19 deletions kingfisher_scrapy/base_spiders/base_spider.py
@@ -69,8 +69,8 @@ class BaseSpider(scrapy.Spider):
     available_steps = {'compile', 'check'}
 
     def __init__(self, sample=None, path=None, from_date=None, until_date=None, crawl_time=None, note=None,
-                 keep_collection_open=None, steps=None, compile_releases=None, package_pointer=None,
-                 release_pointer=None, truncate=None, table_name=None, *args, **kwargs):
+                 keep_collection_open=None, steps=None, compile_releases=None, table_name=None, package_pointer=None,
+                 release_pointer=None, truncate=None, *args, **kwargs):
         """
         :param sample: the number of items to download (``'true'`` means ``1``; ``'false'`` and ``None`` mean no limit)
         :param path: path components to append to the URLs yielded by the ``start_requests`` method (see :ref:`filter`)
@@ -104,35 +104,33 @@ def __init__(self, sample=None, path=None, from_date=None, until_date=None, craw
         self.from_date = from_date
         self.until_date = until_date
 
-        # Related to incremental crawls.
+        # Related to incremental crawls (whether KingfisherProcessAPI2 data_version or DatabaseStore directory).
         self.crawl_time = crawl_time
 
-        # Related to Kingfisher Process.
-        self.note = note
-        self.keep_collection_open = keep_collection_open == 'true'
+        # KingfisherProcessAPI2 extension.
+        self.kingfisher_process_note = note
+        self.kingfisher_process_keep_collection_open = keep_collection_open == 'true'
         if steps is None:
-            self.steps = self.available_steps
+            self.kingfisher_process_steps = self.available_steps
         else:
-            self.steps = set(steps.split(',')) & self.available_steps
+            self.kingfisher_process_steps = set(steps.split(',')) & self.available_steps
 
-        # Related to the DatabaseStore extension.
-        self.compile_releases = compile_releases == 'true'
+        # DatabaseStore extension.
+        self.database_store_compile_releases = compile_releases == 'true'
+        self.database_store_table_name = table_name
 
-        # Related to the pluck command.
-        self.package_pointer = package_pointer
-        self.release_pointer = release_pointer
-        self.truncate = int(truncate) if truncate else None
-
-        # Related to the database store extension.
-        self.table_name = table_name
+        # Pluck pipeline.
+        self.pluck_package_pointer = package_pointer
+        self.pluck_release_pointer = release_pointer
+        self.pluck_truncate = int(truncate) if truncate else None
+        self.pluck = bool(package_pointer or release_pointer)
 
         self.query_string_parameters = {}
         for key, value in kwargs.items():
             if key.startswith('qs:'):
                 self.query_string_parameters[key[3:]] = value
 
         self.date_format = self.VALID_DATE_FORMATS[self.date_format]
-        self.pluck = bool(package_pointer or release_pointer)
 
         if hasattr(self, 'start_requests'):
             if path:
@@ -167,7 +165,7 @@ def __init__(self, sample=None, path=None, from_date=None, until_date=None, craw
     def from_crawler(cls, crawler, *args, **kwargs):
         spider = super(BaseSpider, cls).from_crawler(crawler, *args, **kwargs)
 
-        if spider.package_pointer and spider.release_pointer:
+        if spider.pluck_package_pointer and spider.pluck_release_pointer:
             raise SpiderArgumentError('You cannot specify both package_pointer and release_pointer spider arguments.')
 
         if spider.sample:
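The rename follows a single pattern: each spider argument keeps its public name (package_pointer, truncate, ...), while the attribute it sets gains a prefix naming the component that reads it (pluck_, kingfisher_process_, database_store_). A minimal standalone sketch of the pattern, distilled from the diff above (the class below is illustrative, not repository code):

    class PluckArgumentsSketch:
        def __init__(self, package_pointer=None, release_pointer=None, truncate=None):
            # Pluck pipeline.
            self.pluck_package_pointer = package_pointer
            self.pluck_release_pointer = release_pointer
            self.pluck_truncate = int(truncate) if truncate else None
            self.pluck = bool(package_pointer or release_pointer)

    # Spider arguments arrive as strings (e.g. scrapy crawl spider -a truncate=10).
    spider = PluckArgumentsSketch(package_pointer='/publishedDate', truncate='10')
    assert spider.pluck is True
    assert spider.pluck_truncate == 10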
2 changes: 1 addition & 1 deletion kingfisher_scrapy/commands/checkall.py
@@ -120,7 +120,7 @@ def check(self):
             expected_spider_arguments.update({'from_date', 'until_date'})
         elif self.cls.date_required:
             expected_spider_arguments.update({'from_date', 'until_date'})
-            # Ukraine requires a date, but only supports from_date
+            # Ukraine requires a date, but only supports from_date.
             if self.cls.__name__ == 'Ukraine':
                 expected_spider_arguments.remove('until_date')
12 changes: 7 additions & 5 deletions kingfisher_scrapy/commands/pluck.py
@@ -20,14 +20,16 @@ def syntax(self):
 
     def add_options(self, parser):
         ScrapyCommand.add_options(self, parser)
-        parser.add_argument('-p', '--package-pointer', help='The JSON Pointer to the value in the package')
-        parser.add_argument('-r', '--release-pointer', help='The JSON Pointer to the value in the release')
+        parser.add_argument('-p', '--package-pointer', dest='pluck_package_pointer',
+                            help='The JSON Pointer to the value in the package')
+        parser.add_argument('-r', '--release-pointer', dest='pluck_release_pointer',
+                            help='The JSON Pointer to the value in the release')
         parser.add_argument('-t', '--truncate', type=int, help='Truncate the value to this number of characters')
         parser.add_argument('--max-bytes', type=int,
                             help='Stop downloading an OCDS file after reading this many bytes')
 
     def run(self, args, opts):
-        if not (bool(opts.package_pointer) ^ bool(opts.release_pointer)):
+        if not (bool(opts.pluck_package_pointer) ^ bool(opts.pluck_release_pointer)):
             raise UsageError('Exactly one of --package-pointer or --release-pointer must be set.')
 
         # Stop after one item or error.
@@ -55,8 +57,8 @@ def run(self, args, opts):
                 skipped[spidercls.skip_pluck].append(spider_name)
             else:
                 running.append(spider_name)
-                self.crawler_process.crawl(spidercls, sample=1, package_pointer=opts.package_pointer,
-                                           release_pointer=opts.release_pointer, truncate=opts.truncate)
+                self.crawler_process.crawl(spidercls, sample=1, package_pointer=opts.pluck_package_pointer,
+                                           release_pointer=opts.pluck_release_pointer, truncate=opts.truncate)
 
         with open('pluck_skipped.json', 'w') as f:
             json.dump(skipped, f, indent=2)
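The run() guard uses XOR over the two options' truthiness to require exactly one pointer. A quick standalone illustration (not repository code):

    def exactly_one(a, b):
        return bool(a) ^ bool(b)

    assert exactly_one('/publishedDate', None)    # only --package-pointer: accepted
    assert not exactly_one(None, None)            # neither option: rejected
    assert not exactly_one('/date', '/ocid')      # both options: rejected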
12 changes: 6 additions & 6 deletions kingfisher_scrapy/extensions/database_store.py
@@ -17,9 +17,9 @@ class DatabaseStore:
     If the ``DATABASE_URL`` Scrapy setting and the ``crawl_time`` spider argument are set, the OCDS data is stored in a
     PostgreSQL database, incrementally.
 
-    This extension stores data in the "data" column of a table named after the spider, or ``table_name`` (if set).
-    When the spider is opened, if the table doesn't exist, it is created. The spider's ``from_date`` attribute is then
-    set, in order of precedence, to: the ``from_date`` spider argument (unless equal to the spider's
+    This extension stores data in the "data" column of a table named after the spider, or the ``table_name`` spider
+    argument (if set). When the spider is opened, if the table doesn't exist, it is created. The spider's ``from_date``
+    attribute is then set, in order of precedence, to: the ``from_date`` spider argument (unless equal to the spider's
     ``default_from_date`` class attribute); the maximum value of the ``date`` field of the stored data (if any); the
     spider's ``default_from_date`` class attribute (if set).
 
@@ -93,7 +93,7 @@ def spider_closed(self, spider, reason):
         if reason not in ('finished', 'sample'):
             return
 
-        if spider.compile_releases:
+        if spider.database_store_compile_releases:
             if 'release' in spider.data_type:
                 prefix = ''
             else:
@@ -108,7 +108,7 @@ def spider_closed(self, spider, reason):
         table_name = self.get_table_name(spider)
 
         data = self.yield_items_from_directory(crawl_directory, prefix)
-        if spider.compile_releases:
+        if spider.database_store_compile_releases:
             spider.logger.info('Creating generator of compiled releases')
             data = merge(data)
 
@@ -170,4 +170,4 @@ def execute(self, statement, variables=None, **kwargs):
         self.cursor.execute(statement, variables)
 
     def get_table_name(self, spider):
-        return spider.table_name if spider.table_name else spider.name
+        return spider.database_store_table_name if spider.database_store_table_name else spider.name
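The docstring's from_date precedence reads as a three-step fallback. A standalone sketch of that logic (function and argument names are illustrative, not the extension's API):

    def resolve_from_date(from_date_argument, default_from_date, max_stored_date):
        # 1. The from_date spider argument, unless equal to default_from_date.
        if from_date_argument and from_date_argument != default_from_date:
            return from_date_argument
        # 2. The maximum value of the "date" field of the stored data, if any.
        if max_stored_date:
            return max_stored_date
        # 3. The spider's default_from_date class attribute, if set.
        return default_from_date

    assert resolve_from_date(None, '2016-01-01', '2023-06-30') == '2023-06-30'
    assert resolve_from_date('2020-01-01', '2016-01-01', '2023-06-30') == '2020-01-01'
    assert resolve_from_date(None, '2016-01-01', None) == '2016-01-01'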
6 changes: 3 additions & 3 deletions kingfisher_scrapy/extensions/kingfisher_process_api2.py
@@ -83,12 +83,12 @@ def spider_opened(self, spider):
             'source_id': spider.name,
             'data_version': spider.get_start_time('%Y-%m-%d %H:%M:%S'),
             'sample': bool(spider.sample),
-            'note': spider.note,
+            'note': spider.kingfisher_process_note,
             'job': getattr(spider, '_job', None),
             'upgrade': spider.ocds_version == '1.0',
         }
 
-        for step in spider.steps:
+        for step in spider.kingfisher_process_steps:
             data[step] = True
 
         # This request must be synchronous, to have the collection ID for the item_scraped handler.
@@ -106,7 +106,7 @@ def spider_closed(self, spider, reason):
         """
        Sends an API request to close the collection in Kingfisher Process.
         """
-        if spider.pluck or spider.keep_collection_open:
+        if spider.pluck or spider.kingfisher_process_keep_collection_open:
            return
 
         if not self.collection_id:
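For context, the spider_opened handler builds one payload in which each requested step becomes a boolean flag. A sketch of the resulting dict (values are illustrative, not repository code):

    kingfisher_process_steps = {'compile', 'check'}  # spider.kingfisher_process_steps

    data = {
        'source_id': 'moldova_old',
        'data_version': '2023-09-19 00:00:00',
        'sample': False,
        'note': None,      # spider.kingfisher_process_note
        'job': None,
        'upgrade': False,  # True only when ocds_version == '1.0'
    }
    for step in kingfisher_process_steps:
        data[step] = True  # data['compile'] = True, data['check'] = True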
16 changes: 8 additions & 8 deletions kingfisher_scrapy/pipelines.py
@@ -96,8 +96,8 @@ def process_item(self, item, spider):
             return item
 
         value = None
-        if spider.package_pointer:
-            pointer = spider.package_pointer
+        if spider.pluck_package_pointer:
+            pointer = spider.pluck_package_pointer
             if isinstance(item['data'], dict):
                 value = _resolve_pointer(item['data'], pointer)
             else:
@@ -119,7 +119,7 @@ def process_item(self, item, spider):
                     value = f'error: {pointer} not found within initial bytes'
                 else:
                     raise
-        else:  # spider.release_pointer
+        else:  # spider.pluck_release_pointer
             if isinstance(item['data'], dict):
                 data = item['data']
             else:
@@ -128,19 +128,19 @@ def process_item(self, item, spider):
             if item['data_type'].startswith('release'):
                 releases = data['releases']
                 if releases:
-                    value = max(_resolve_pointer(r, spider.release_pointer) for r in releases)
+                    value = max(_resolve_pointer(r, spider.pluck_release_pointer) for r in releases)
             elif item['data_type'].startswith('record'):
                 records = data['records']
                 if records:
                     # This assumes that the first record in the record package has the desired value.
                     record = records[0]
                     if 'releases' in record:
-                        value = max(_resolve_pointer(r, spider.release_pointer) for r in record['releases'])
+                        value = max(_resolve_pointer(r, spider.pluck_release_pointer) for r in record['releases'])
                     elif 'compiledRelease' in record:
-                        value = _resolve_pointer(record['compiledRelease'], spider.release_pointer)
+                        value = _resolve_pointer(record['compiledRelease'], spider.pluck_release_pointer)
 
-        if value and spider.truncate:
-            value = value[:spider.truncate]
+        if value and spider.pluck_truncate:
+            value = value[:spider.pluck_truncate]
 
         return PluckedItem({'value': value})
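A standalone illustration of the release-pointer branch: the pipeline resolves a JSON Pointer against each release and keeps the maximum value. _resolve_pointer is repository code; resolve() below is a simplified stand-in that assumes the pointer addresses dict keys only:

    def resolve(data, pointer):
        for token in pointer.split('/')[1:]:
            data = data[token]
        return data

    package = {'releases': [{'date': '2023-01-01'}, {'date': '2023-05-02'}]}
    assert max(resolve(r, '/date') for r in package['releases']) == '2023-05-02'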
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/argentina_buenos_aires.py
@@ -25,7 +25,7 @@ class ArgentinaBuenosAires(SimpleSpider):
     def start_requests(self):
         # A CKAN API JSON response.
         url = 'https://data.buenosaires.gob.ar/api/3/action/package_show?id=buenos-aires-compras'
-        yield scrapy.Request(url, meta={'file_name': 'list.json'}, callback=self.parse_list)
+        yield scrapy.Request(url, meta={'file_name': 'package_show.json'}, callback=self.parse_list)
 
     @handle_http_error
     def parse_list(self, response):
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/bolivia_agetic.py
@@ -22,7 +22,7 @@ class BoliviaAgetic(SimpleSpider):
     def start_requests(self):
         # A CKAN API JSON response.
         url = 'https://datos.gob.bo/api/3/action/package_show?id=contrataciones-agetic-2019-estandar-ocp'
-        yield scrapy.Request(url, meta={'file_name': 'list.json'}, callback=self.parse_list)
+        yield scrapy.Request(url, meta={'file_name': 'package_show.json'}, callback=self.parse_list)
 
     @handle_http_error
     def parse_list(self, response):
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/canada_quebec.py
@@ -19,7 +19,7 @@ class CanadaQuebec(SimpleSpider):
     def start_requests(self):
         # A CKAN API JSON response.
         url = 'https://www.donneesquebec.ca/api/3/action/package_show?id=d23b2e02-085d-43e5-9e6e-e1d558ebfdd5'
-        yield scrapy.Request(url, meta={'file_name': 'list.json'}, callback=self.parse_list)
+        yield scrapy.Request(url, meta={'file_name': 'package_show.json'}, callback=self.parse_list)
 
     @handle_http_error
     def parse_list(self, response):
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/chile_compra_api_base.py
@@ -20,7 +20,7 @@ class ChileCompraAPIBase(IndexSpider, PeriodicSpider):
     # PeriodicSpider
     # The path parameters are {system}/{year}/{month}/{offset}/{limit}.
     pattern = 'http://api.mercadopublico.cl/APISOCDS/OCDS/{0}/{1.year:d}/{1.month:02d}/{2}/{3}'
-    formatter = staticmethod(components(-4, -1))
+    formatter = staticmethod(components(-4, -1))  # year-month-offset
     start_requests_callback = 'parse_list'
 
     # IndexSpider
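The new comments document which URL path components each formatter keeps. A sketch (under assumptions) of how a components(start, stop)-style formatter could behave; the real helper lives in kingfisher_scrapy.util and may differ in details such as extension stripping:

    from urllib.parse import urlsplit

    def components(start, stop=None):
        def formatter(url):
            parts = [p for p in urlsplit(url).path.split('/') if p]
            return '-'.join(parts[start:stop])
        return formatter

    # For the ChileCompraAPIBase pattern {system}/{year}/{month}/{offset}/{limit}:
    url = 'http://api.mercadopublico.cl/APISOCDS/OCDS/listaOCDSAgnoMes/2021/03/0/100'
    assert components(-4, -1)(url) == '2021-03-0'  # hence "year-month-offset"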
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/chile_compra_bulk.py
@@ -35,7 +35,7 @@ class ChileCompraBulk(CompressedFileSpider, PeriodicSpider):
 
     # PeriodicSpider
     pattern = 'https://ocds.blob.core.windows.net/ocds/{0.year:d}{0.month:02d}.zip'
-    formatter = staticmethod(components(-1))
+    formatter = staticmethod(components(-1))  # filename containing year-month
 
     def build_file(self, file_name=None, url=None, data=None, **kwargs):
         json_data = json.loads(data)
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/costa_rica_poder_judicial_records.py
@@ -31,7 +31,7 @@ class CostaRicaPoderJudicialRecords(SimpleSpider):
     def start_requests(self):
         url = 'https://ckanpj.azurewebsites.net/api/3/action/package_show?id=estandar-de-datos-de' \
               '-contrataciones-abiertas-ocds'
-        yield scrapy.Request(url, meta={'file_name': 'list.json'}, callback=self.parse_list)
+        yield scrapy.Request(url, meta={'file_name': 'package_show.json'}, callback=self.parse_list)
 
     @handle_http_error
     def parse_list(self, response):
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/costa_rica_poder_judicial_releases.py
@@ -23,7 +23,7 @@ class CostaRicaPoderJudicialReleases(CompressedFileSpider):
     def start_requests(self):
         url = 'https://ckanpj.azurewebsites.net/api/3/action/package_show?id=estandar-de-datos-de-contrataciones' \
               '-abiertas-ocds'
-        yield scrapy.Request(url, meta={'file_name': 'list.json'}, callback=self.parse_list)
+        yield scrapy.Request(url, meta={'file_name': 'package_show.json'}, callback=self.parse_list)
 
     @handle_http_error
     def parse_list(self, response):
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/dominican_republic_api.py
@@ -28,7 +28,7 @@ class DominicanRepublicAPI(LinksSpider, PeriodicSpider):
     data_type = 'release_package'
 
     # LinksSpider
-    formatter = staticmethod(components(-2))
+    formatter = staticmethod(components(-2))  # year
     next_pointer = '/pagination/next'
 
     # PeriodicSpider
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/france.py
@@ -22,7 +22,7 @@ def start_requests(self):
         # https://www.data.gouv.fr/fr/datasets/donnees-essentielles-de-la-commande-publique-fichiers-consolides/
         url = 'https://www.data.gouv.fr/api/1/datasets/donnees-essentielles-de-la-commande-publique-fichiers' \
               '-consolides/'
-        yield scrapy.Request(url, meta={'file_name': 'page-1.json'}, callback=self.parse_list)
+        yield scrapy.Request(url, meta={'file_name': 'package_show.json'}, callback=self.parse_list)
 
     @handle_http_error
     def parse_list(self, response):
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/honduras_oncae.py
@@ -40,7 +40,7 @@ class HondurasONCAE(CompressedFileSpider, PeriodicSpider):
 
     # PeriodicSpider
     pattern = 'https://datosabiertos.oncae.gob.hn/datosabiertos/{}'
-    formatter = staticmethod(components(-1))
+    formatter = staticmethod(components(-1))  # year
 
     # Local
     available_systems = ['HC1', 'CE', 'DDC']
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/honduras_sefin_api.py
@@ -30,7 +30,7 @@ class HondurasSEFINAPI(PeriodicSpider):
 
     # PeriodicSpider
     pattern = 'https://guancasco.sefin.gob.hn/EDCA_WEBAPI/api/listaOcids/{}'
-    formatter = staticmethod(components(-1))
+    formatter = staticmethod(components(-1))  # year
 
     @handle_http_error
     def parse(self, response):
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/honduras_sefin_bulk.py
@@ -28,4 +28,4 @@ class HondurasSEFINBulk(CompressedFileSpider, PeriodicSpider):
 
     # PeriodicSpider
     pattern = 'https://piep.sefin.gob.hn/edca/ocid_sefin_{}.zip'
-    formatter = staticmethod(components(-1))
+    formatter = staticmethod(components(-1))  # filename containing year
4 changes: 2 additions & 2 deletions kingfisher_scrapy/spiders/india_assam_finance_department.py
@@ -29,7 +29,7 @@ class IndiaAssamFinanceDepartment(PeriodicSpider):
 
     # PeriodicSpider
     date_format = 'year'
-    formatter = staticmethod(components(-1))
+    formatter = staticmethod(components(-1))  # filename containing year
     default_from_date = '2016'
     default_until_date = '2021'
 
@@ -38,4 +38,4 @@ def build_urls(self, date):
         """
         Yields one or more URLs for the given date.
         """
         url = 'https://data.gov.in/files/ogdpv2dms/s3fs-public/ocds_mapped_procurement_data_fiscal_year'
-        yield f'{url}_{date}_{date+1}.csv'
+        yield f'{url}_{date}_{date + 1}.csv'
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/italy_anac.py
@@ -21,7 +21,7 @@ class ItalyANAC(SimpleSpider):
 
     def start_requests(self):
         url = 'https://dati.anticorruzione.it/opendata/api/3/action/package_search?q=ocds'
-        yield scrapy.Request(url, meta={'file_name': 'list.json'}, callback=self.parse_list)
+        yield scrapy.Request(url, meta={'file_name': 'package_search.json'}, callback=self.parse_list)
 
     @handle_http_error
     def parse_list(self, response):
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/moldova_old.py
@@ -26,4 +26,4 @@ class MoldovaOld(PeriodicSpider):
 
     # PeriodicSpider
     pattern = 'http://opencontracting.date.gov.md/ocds-api/year/{}'
-    formatter = staticmethod(components(-1))
+    formatter = staticmethod(components(-1))  # year
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/nepal.py
@@ -27,4 +27,4 @@ class Nepal(PeriodicSpider):
 
     # PeriodicSpider
     pattern = 'http://ppip.gov.np/bulk-download/{}'
-    formatter = staticmethod(components(-1))
+    formatter = staticmethod(components(-1))  # year
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/nigeria_abia_state.py
@@ -26,4 +26,4 @@ class NigeriaAbiaState(PeriodicSpider):
 
     # PeriodicSpider
     pattern = 'http://abiaeprocurement.ab.gov.ng/media/ocds{}.json'
-    formatter = staticmethod(components(-1))
+    formatter = staticmethod(components(-1))  # filename containing year
6 changes: 1 addition & 5 deletions kingfisher_scrapy/spiders/nigeria_budeshi_base.py
@@ -12,11 +12,7 @@ class NigeriaBudeshiBase(SimpleSpider):
     base_url = 'https://budeshi.ng/api/'
 
     def start_requests(self):
-        yield scrapy.Request(
-            f'{self.base_url}project_list',
-            meta={'file_name': 'project_list.json'},
-            callback=self.parse_list
-        )
+        yield scrapy.Request(f'{self.base_url}project_list', meta={'file_name': 'list.json'}, callback=self.parse_list)
 
     @handle_http_error
     def parse_list(self, response):
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/nigeria_cross_river_state.py
@@ -21,7 +21,7 @@ def start_requests(self):
 
     @handle_http_error
     def parse_list(self, response):
-        formatter = join(components(-1), parameters('year', 'month'))
+        formatter = join(components(-1), parameters('year', 'month'))  # format
         for item in response.json():
             date = datetime(item['year'], item['month'], 1)
 
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/nigeria_ebonyi_state.py
@@ -24,7 +24,7 @@ class NigeriaEbonyiState(PeriodicSpider):
 
     # PeriodicSpider
     pattern = 'http://ebonyieprocure.eb.gov.ng/media/ocds{}.json'
-    formatter = staticmethod(components(-1))
+    formatter = staticmethod(components(-1))  # filename containing year
 
     # SimpleSpider
     data_type = 'release_package'