
Commit

Merge 4dfcbea into 71bb82e
yolile committed Mar 26, 2021
2 parents 71bb82e + 4dfcbea commit b34488b
Showing 16 changed files with 72 additions and 54 deletions.
16 changes: 11 additions & 5 deletions kingfisher_scrapy/base_spider.py
@@ -210,6 +210,7 @@ def build_request(self, url, formatter, **kwargs):
         meta = {'file_name': file_name}
         if 'meta' in kwargs:
             meta.update(kwargs.pop('meta'))
+
         return scrapy.Request(url, meta=meta, **kwargs)

     def build_file_from_response(self, response, **kwargs):
@@ -490,8 +491,9 @@ def start_requests(self):
         date_range = util.date_range_by_month(start, stop)

         for date in date_range:
-            for url in self.build_urls(date):
-                yield self.build_request(url, self.get_formatter(), callback=self.start_requests_callback)
+            for number, url in enumerate(self.build_urls(date)):
+                yield self.build_request(url, self.get_formatter(), callback=self.start_requests_callback,
+                                         priority=number * -1)

     def build_urls(self, date):
         """
@@ -577,8 +579,9 @@ def parse_list(self, response, **kwargs):
             data = response.json()
         except ValueError:
             data = None
-        for value in self.range_generator(data, response):
-            yield self.build_request(self.url_builder(value, data, response), formatter=self.formatter, **kwargs)
+        for number, value in enumerate(self.range_generator(data, response)):
+            yield self.build_request(self.url_builder(value, data, response), formatter=self.formatter,
+                                     priority=number * -1, **kwargs)

     def pages_from_total_range_generator(self, data, response):
         pages = resolve_pointer(data, self.total_pages_pointer)
@@ -592,7 +595,10 @@ def pages_url_builder(self, value, data, response):
     def limit_offset_range_generator(self, data, response):
         limit = self._resolve_limit(data)
         count = resolve_pointer(data, self.count_pointer)
-        return range(self.limit, count, limit)
+        if not self.yield_list_results:
+            return range(0, count, limit)
+        else:
+            return range(self.limit, count, limit)

     def limit_offset_url_builder(self, value, data, response):
         return self._build_url({
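
Note on limit_offset_range_generator: when yield_list_results is False, the first response appears to be used only to read the result count, so pagination must also request offset 0; when it is True, the first page's results are already yielded and the next offset is self.limit. A worked example, assuming count=2500 and limit=1000:

    count, limit = 2500, 1000
    assert list(range(0, count, limit)) == [0, 1000, 2000]   # yield_list_results is False
    assert list(range(limit, count, limit)) == [1000, 2000]  # yield_list_results is True
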
4 changes: 1 addition & 3 deletions kingfisher_scrapy/commands/pluck.py
@@ -2,7 +2,6 @@
 import logging
 import os
 from collections import defaultdict
-from datetime import datetime

 from scrapy.commands import ScrapyCommand
 from scrapy.exceptions import UsageError
@@ -46,7 +45,6 @@ def run(self, args, opts):
         if os.path.isfile(filename):
             os.unlink(filename)

-        year = datetime.today().year
         skipped = defaultdict(list)
         running = []
         for spider_name in self.crawler_process.spider_loader.list():
@@ -56,7 +54,7 @@ def run(self, args, opts):
                 skipped[spidercls.skip_pluck].append(spider_name)
             else:
                 running.append(spider_name)
-            self.crawler_process.crawl(spidercls, year=year, sample=1, package_pointer=opts.package_pointer,
+            self.crawler_process.crawl(spidercls, sample=1, package_pointer=opts.package_pointer,
                                        release_pointer=opts.release_pointer, truncate=opts.truncate)

         with open('pluck_skipped.json', 'w') as f:
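
Note: instead of pinning every pluck crawl to the current year, spiders whose data another spider already covers now declare a skip_pluck attribute, which this command groups into pluck_skipped.json. A hedged sketch of that skipping logic (FakeSpider is illustrative, not a real spider):

    from collections import defaultdict

    class FakeSpider:
        skip_pluck = 'Already covered (see code for details)'  # chile_compra_records

    skipped = defaultdict(list)
    for spider_name, spidercls in {'chile_compra_bulk': FakeSpider}.items():
        if getattr(spidercls, 'skip_pluck', None):
            skipped[spidercls.skip_pluck].append(spider_name)
    print(dict(skipped))  # {'Already covered (see code for details)': ['chile_compra_bulk']}
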
3 changes: 3 additions & 0 deletions kingfisher_scrapy/spiders/chile_compra_bulk.py
@@ -19,6 +19,9 @@ class ChileCompraBulk(CompressedFileSpider):
         'DOWNLOAD_FAIL_ON_DATALOSS': False,
     }

+    # BaseSpider
+    skip_pluck = 'Already covered (see code for details)'  # chile_compra_records
+
     # SimpleSpider
     data_type = 'record_package'

1 change: 1 addition & 0 deletions kingfisher_scrapy/spiders/colombia_bulk.py
@@ -21,6 +21,7 @@ class ColombiaBulk(CompressedFileSpider):
     line_delimited = True
     root_path = 'Release'
     root_path_max_length = 1
+    skip_pluck = 'Already covered (see code for details)'  # colombia

     # SimpleSpider
     data_type = 'release'
3 changes: 3 additions & 0 deletions kingfisher_scrapy/spiders/dominican_republic.py
@@ -13,6 +13,9 @@ class DominicanRepublic(CompressedFileSpider):
     """
     name = 'dominican_republic'

+    # BaseSpider
+    skip_pluck = 'Already covered (see code for details)'  # dominican_republic_api
+
     # SimpleSpider
     data_type = 'release_package'

@@ -16,6 +16,7 @@ class MexicoAdministracionPublicaFederalAPI(IndexSpider):

     # BaseSpider
     root_path = 'results.item'
+    skip_pluck = 'Already covered (see code for details)'  # mexico_administracion_publica_federal_bulk

     # SimpleSpider
     data_type = 'record_package'
3 changes: 3 additions & 0 deletions kingfisher_scrapy/spiders/mexico_nuevo_leon_releases.py
@@ -10,6 +10,9 @@ class MexicoNuevoLeonReleases(MexicoNuevoLeonBase):
     """
     name = 'mexico_nuevo_leon_releases'

+    # BaseSpider
+    skip_pluck = 'Already covered (see code for details)'  # mexico_nuevo_leon_records
+
     # SimpleSpider
     data_type = 'release_package'

@@ -15,6 +15,7 @@ class MexicoPlataformaDigitalNacional(CompressedFileSpider):

     # BaseSpider
     root_path = 'item'
+    skip_pluck = 'Already covered (see code for details)'  # mexico_administracion_publica_federal_bulk

     # CompressedFileSpider
     data_type = 'release'
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/mexico_quien_es_quien.py
@@ -25,7 +25,7 @@ class MexicoQuienEsQuien(IndexSpider):
     # IndexSpider
     count_pointer = '/data/0/collections/contracts/count'
     limit = 1000
-    base_url = 'https://api.quienesquien.wiki/v2/contracts'
+    base_url = 'https://api.quienesquien.wiki/v2/contracts?sort=-compiledRelease.date'
     formatter = staticmethod(parameters('offset'))
     yield_list_results = False

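
Note: the added sort=-compiledRelease.date parameter asks the API for the newest contracts first, which pairs with the new negative request priorities so a sample fetches recent data. Illustrative page URLs only — the exact offset/limit query parameters are assumptions, not this spider's formatter output:

    base_url = 'https://api.quienesquien.wiki/v2/contracts?sort=-compiledRelease.date'
    limit = 1000
    for offset in (0, 1000, 2000):
        print(f'{base_url}&offset={offset}&limit={limit}')
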
2 changes: 2 additions & 0 deletions kingfisher_scrapy/spiders/pakistan_ppra_bulk.py
@@ -11,6 +11,8 @@ class PakistanPPRABulk(SimpleSpider):
     https://www.ppra.org.pk/api/
     """
     name = 'pakistan_ppra_bulk'
+    # BaseSpider
+    skip_pluck = 'Already covered (see code for details)'  # pakistan_ppra_releases

     # SimpleSpider
     data_type = 'release_package'
4 changes: 2 additions & 2 deletions kingfisher_scrapy/spiders/paraguay_dncp_base.py
@@ -46,8 +46,8 @@ def from_crawler(cls, crawler, *args, **kwargs):

     def start_requests(self):
         url = f'{self.base_url}/search/processes?tipo_fecha=fecha_release&' \
-              f'fecha_desde={self.from_date.strftime(self.date_format)}&' \
-              f'fecha_hasta={self.until_date.strftime(self.date_format)}'
+              f'fecha_desde={self.from_date.strftime(self.date_format)}-4:00&' \
+              f'fecha_hasta={self.until_date.strftime(self.date_format)}-4:00'

         yield self.build_request(
             url,
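
Note: the appended -4:00 appears to pin fecha_desde and fecha_hasta to Paraguay's UTC-4 offset. Illustrative output, assuming a date_format like '%Y-%m-%dT%H:%M:%S' (an assumption, not necessarily this spider's value):

    from datetime import datetime

    date_format = '%Y-%m-%dT%H:%M:%S'
    from_date = datetime(2021, 3, 1)
    print(f'fecha_desde={from_date.strftime(date_format)}-4:00')
    # fecha_desde=2021-03-01T00:00:00-4:00
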
1 change: 1 addition & 0 deletions kingfisher_scrapy/spiders/portugal.py
@@ -20,6 +20,7 @@ class Portugal(CompressedFileSpider):

     # BaseSpider
     line_delimited = True
+    skip_pluck = 'Already covered (see code for details)'  # portugal_releases

     # SimpleSpider
     data_type = 'record_package'
41 changes: 0 additions & 41 deletions kingfisher_scrapy/spiders/scotland_base.py

This file was deleted.

42 changes: 40 additions & 2 deletions kingfisher_scrapy/spiders/scotland_public_contracts.py
@@ -1,7 +1,10 @@
-from kingfisher_scrapy.spiders.scotland_base import ScotlandBase
+from datetime import date
+
+from kingfisher_scrapy.base_spider import PeriodicSpider
+from kingfisher_scrapy.util import parameters


-class ScotlandPublicContracts(ScotlandBase):
+class ScotlandPublicContracts(PeriodicSpider):
     """
     Domain
       Public Contracts Scotland
@@ -15,8 +18,43 @@ class ScotlandPublicContracts(ScotlandBase):
     """
     name = 'scotland_public_contracts'

+    # BaseSpider
+    date_format = 'year-month'
+    default_from_date = date(date.today().year - 1, date.today().month, 1)
+
     # SimpleSpider
     data_type = 'release_package'

+    # PeriodicSpider
+    pattern = 'https://api.publiccontractsscotland.gov.uk/v1/Notices?dateFrom={:%m-%Y}&outputType=0&noticeType={}'
+
+    notice_types = [
+        1,  # OJEU - F1 - Prior Information Notice
+        2,  # OJEU - F2 - Contract Notice
+        3,  # OJEU - F3 - Contract Award Notice
+        4,  # OJEU - F4 - Prior Information Notice(Utilities)
+        5,  # OJEU - F5 - Contract Notice(Utilities)
+        6,  # OJEU - F6 - Contract Award Notice(Utilities)
+        7,  # OJEU - F7 - Qualification Systems(Utilities)
+        12,  # OJEU - F12 - Design Contest Notice
+        13,  # OJEU - F13 - Results Of Design Contest
+        14,  # OJEU - F14 - Corrigendum
+        15,  # OJEU - F15 - Voluntary Ex Ante Transparency Notice
+        20,  # OJEU - F20 - Modification Notice
+        21,  # OJEU - F21 - Social And other Specific Services(Public Contracts)
+        22,  # OJEU - F22 - Social And other Specific Services(Utilities)
+        23,  # OJEU - F23 - Social And other Specific Services(Concessions)
+        24,  # OJEU - F24 - Concession Notice
+        25,  # OJEU - F25 - Concession Award Notice
+        101,  # Site Notice - Website Contract Notice
+        102,  # Site Notice - Website Prior Information Notice
+        103,  # Site Notice - Website Contract Award Notice
+        104,  # Site Notice - Quick Quote Award
+    ]

+    def build_urls(self, date):
+        for notice_type in self.notice_types:
+            yield self.pattern.format(date, notice_type)

+    def get_formatter(self):
+        return parameters('noticeType', 'dateFrom')
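
Note: the spider now inherits PeriodicSpider directly (replacing the deleted ScotlandBase), yielding one request per month and notice type. How the pattern expands, for illustration:

    from datetime import date

    pattern = ('https://api.publiccontractsscotland.gov.uk/v1/Notices'
               '?dateFrom={:%m-%Y}&outputType=0&noticeType={}')
    print(pattern.format(date(2021, 3, 1), 2))
    # https://api.publiccontractsscotland.gov.uk/v1/Notices?dateFrom=03-2021&outputType=0&noticeType=2
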
1 change: 1 addition & 0 deletions kingfisher_scrapy/spiders/uruguay_historical.py
@@ -22,6 +22,7 @@ class UruguayHistorical(CompressedFileSpider, PeriodicSpider):
     date_format = 'year'
     default_from_date = '2002'
     default_until_date = '2017'
+    skip_pluck = 'Already covered (see code for details)'  # uruguay_releases

     # SimpleSpider
     data_type = 'release_package'
1 change: 1 addition & 0 deletions kingfisher_scrapy/spiders/zambia.py
@@ -26,6 +26,7 @@ def start_requests(self):

     @handle_http_error
     def parse_list(self, response):
+
         urls = response.json()['packagesPerMonth']

         for url in reversed(urls):
