
Commit

Merge 4dfcbea into 71bb82e
yolile committed Mar 26, 2021
2 parents 71bb82e + 4dfcbea commit b34488b
Showing 16 changed files with 72 additions and 54 deletions.
16 changes: 11 additions & 5 deletions kingfisher_scrapy/base_spider.py
@@ -210,6 +210,7 @@ def build_request(self, url, formatter, **kwargs):
         meta = {'file_name': file_name}
         if 'meta' in kwargs:
             meta.update(kwargs.pop('meta'))
+
         return scrapy.Request(url, meta=meta, **kwargs)

     def build_file_from_response(self, response, **kwargs):
@@ -490,8 +491,9 @@ def start_requests(self):
         date_range = util.date_range_by_month(start, stop)

         for date in date_range:
-            for url in self.build_urls(date):
-                yield self.build_request(url, self.get_formatter(), callback=self.start_requests_callback)
+            for number, url in enumerate(self.build_urls(date)):
+                yield self.build_request(url, self.get_formatter(), callback=self.start_requests_callback,
+                                         priority=number * -1)

     def build_urls(self, date):
         """
@@ -577,8 +579,9 @@ def parse_list(self, response, **kwargs):
             data = response.json()
         except ValueError:
             data = None
-        for value in self.range_generator(data, response):
-            yield self.build_request(self.url_builder(value, data, response), formatter=self.formatter, **kwargs)
+        for number, value in enumerate(self.range_generator(data, response)):
+            yield self.build_request(self.url_builder(value, data, response), formatter=self.formatter,
+                                     priority=number * -1, **kwargs)

     def pages_from_total_range_generator(self, data, response):
         pages = resolve_pointer(data, self.total_pages_pointer)
@@ -592,7 +595,10 @@ def pages_url_builder(self, value, data, response):
     def limit_offset_range_generator(self, data, response):
         limit = self._resolve_limit(data)
         count = resolve_pointer(data, self.count_pointer)
-        return range(self.limit, count, limit)
+        if not self.yield_list_results:
+            return range(0, count, limit)
+        else:
+            return range(self.limit, count, limit)

     def limit_offset_url_builder(self, value, data, response):
         return self._build_url({
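
Note on limit_offset_range_generator: when yield_list_results is False, the first response appears to be used only to read the result count, so pagination must also request offset 0; when it is True, the first page's results are already yielded and the next offset is self.limit. A worked example, assuming count=2500 and limit=1000:

    count, limit = 2500, 1000
    assert list(range(0, count, limit)) == [0, 1000, 2000]   # yield_list_results is False
    assert list(range(limit, count, limit)) == [1000, 2000]  # yield_list_results is True
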
4 changes: 1 addition & 3 deletions kingfisher_scrapy/commands/pluck.py
@@ -2,7 +2,6 @@
 import logging
 import os
 from collections import defaultdict
-from datetime import datetime

 from scrapy.commands import ScrapyCommand
 from scrapy.exceptions import UsageError
@@ -46,7 +45,6 @@ def run(self, args, opts):
         if os.path.isfile(filename):
             os.unlink(filename)

-        year = datetime.today().year
         skipped = defaultdict(list)
         running = []
         for spider_name in self.crawler_process.spider_loader.list():
@@ -56,7 +54,7 @@ def run(self, args, opts):
                 skipped[spidercls.skip_pluck].append(spider_name)
             else:
                 running.append(spider_name)
-            self.crawler_process.crawl(spidercls, year=year, sample=1, package_pointer=opts.package_pointer,
+            self.crawler_process.crawl(spidercls, sample=1, package_pointer=opts.package_pointer,
                                        release_pointer=opts.release_pointer, truncate=opts.truncate)

         with open('pluck_skipped.json', 'w') as f:
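
Note: instead of pinning every pluck crawl to the current year, spiders whose data another spider already covers now declare a skip_pluck attribute, which this command groups into pluck_skipped.json. A hedged sketch of that skipping logic (FakeSpider is illustrative, not a real spider):

    from collections import defaultdict

    class FakeSpider:
        skip_pluck = 'Already covered (see code for details)'  # chile_compra_records

    skipped = defaultdict(list)
    for spider_name, spidercls in {'chile_compra_bulk': FakeSpider}.items():
        if getattr(spidercls, 'skip_pluck', None):
            skipped[spidercls.skip_pluck].append(spider_name)
    print(dict(skipped))  # {'Already covered (see code for details)': ['chile_compra_bulk']}
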
3 changes: 3 additions & 0 deletions kingfisher_scrapy/spiders/chile_compra_bulk.py
@@ -19,6 +19,9 @@ class ChileCompraBulk(CompressedFileSpider):
         'DOWNLOAD_FAIL_ON_DATALOSS': False,
     }

+    # BaseSpider
+    skip_pluck = 'Already covered (see code for details)'  # chile_compra_records
+
     # SimpleSpider
     data_type = 'record_package'

1 change: 1 addition & 0 deletions kingfisher_scrapy/spiders/colombia_bulk.py
@@ -21,6 +21,7 @@ class ColombiaBulk(CompressedFileSpider):
     line_delimited = True
     root_path = 'Release'
     root_path_max_length = 1
+    skip_pluck = 'Already covered (see code for details)'  # colombia

     # SimpleSpider
     data_type = 'release'
3 changes: 3 additions & 0 deletions kingfisher_scrapy/spiders/dominican_republic.py
@@ -13,6 +13,9 @@ class DominicanRepublic(CompressedFileSpider):
     """
     name = 'dominican_republic'

+    # BaseSpider
+    skip_pluck = 'Already covered (see code for details)'  # dominican_republic_api
+
     # SimpleSpider
     data_type = 'release_package'

@@ -16,6 +16,7 @@ class MexicoAdministracionPublicaFederalAPI(IndexSpider):

     # BaseSpider
     root_path = 'results.item'
+    skip_pluck = 'Already covered (see code for details)'  # mexico_administracion_publica_federal_bulk

     # SimpleSpider
     data_type = 'record_package'
3 changes: 3 additions & 0 deletions kingfisher_scrapy/spiders/mexico_nuevo_leon_releases.py
@@ -10,6 +10,9 @@ class MexicoNuevoLeonReleases(MexicoNuevoLeonBase):
     """
     name = 'mexico_nuevo_leon_releases'

+    # BaseSpider
+    skip_pluck = 'Already covered (see code for details)'  # mexico_nuevo_leon_records
+
     # SimpleSpider
     data_type = 'release_package'

@@ -15,6 +15,7 @@ class MexicoPlataformaDigitalNacional(CompressedFileSpider):

     # BaseSpider
     root_path = 'item'
+    skip_pluck = 'Already covered (see code for details)'  # mexico_administracion_publica_federal_bulk

     # CompressedFileSpider
     data_type = 'release'
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/mexico_quien_es_quien.py
@@ -25,7 +25,7 @@ class MexicoQuienEsQuien(IndexSpider):
     # IndexSpider
     count_pointer = '/data/0/collections/contracts/count'
     limit = 1000
-    base_url = 'https://api.quienesquien.wiki/v2/contracts'
+    base_url = 'https://api.quienesquien.wiki/v2/contracts?sort=-compiledRelease.date'
     formatter = staticmethod(parameters('offset'))
     yield_list_results = False

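
Note: the added sort=-compiledRelease.date parameter asks the API for the newest contracts first, which pairs with the new negative request priorities so a sample fetches recent data. Illustrative page URLs only — the exact offset/limit query parameters are assumptions, not this spider's formatter output:

    base_url = 'https://api.quienesquien.wiki/v2/contracts?sort=-compiledRelease.date'
    limit = 1000
    for offset in (0, 1000, 2000):
        print(f'{base_url}&offset={offset}&limit={limit}')
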
2 changes: 2 additions & 0 deletions kingfisher_scrapy/spiders/pakistan_ppra_bulk.py
@@ -11,6 +11,8 @@ class PakistanPPRABulk(SimpleSpider):
     https://www.ppra.org.pk/api/
     """
     name = 'pakistan_ppra_bulk'
+    # BaseSpider
+    skip_pluck = 'Already covered (see code for details)'  # pakistan_ppra_releases

     # SimpleSpider
     data_type = 'release_package'
4 changes: 2 additions & 2 deletions kingfisher_scrapy/spiders/paraguay_dncp_base.py
@@ -46,8 +46,8 @@ def from_crawler(cls, crawler, *args, **kwargs):

     def start_requests(self):
         url = f'{self.base_url}/search/processes?tipo_fecha=fecha_release&' \
-              f'fecha_desde={self.from_date.strftime(self.date_format)}&' \
-              f'fecha_hasta={self.until_date.strftime(self.date_format)}'
+              f'fecha_desde={self.from_date.strftime(self.date_format)}-4:00&' \
+              f'fecha_hasta={self.until_date.strftime(self.date_format)}-4:00'

         yield self.build_request(
             url,
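
Note: the appended -4:00 appears to pin fecha_desde and fecha_hasta to Paraguay's UTC-4 offset. Illustrative output, assuming a date_format like '%Y-%m-%dT%H:%M:%S' (an assumption, not necessarily this spider's value):

    from datetime import datetime

    date_format = '%Y-%m-%dT%H:%M:%S'
    from_date = datetime(2021, 3, 1)
    print(f'fecha_desde={from_date.strftime(date_format)}-4:00')
    # fecha_desde=2021-03-01T00:00:00-4:00
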
1 change: 1 addition & 0 deletions kingfisher_scrapy/spiders/portugal.py
@@ -20,6 +20,7 @@ class Portugal(CompressedFileSpider):

     # BaseSpider
     line_delimited = True
+    skip_pluck = 'Already covered (see code for details)'  # portugal_releases

     # SimpleSpider
     data_type = 'record_package'
41 changes: 0 additions & 41 deletions kingfisher_scrapy/spiders/scotland_base.py

This file was deleted.

42 changes: 40 additions & 2 deletions kingfisher_scrapy/spiders/scotland_public_contracts.py
@@ -1,7 +1,10 @@
-from kingfisher_scrapy.spiders.scotland_base import ScotlandBase
+from datetime import date
+
+from kingfisher_scrapy.base_spider import PeriodicSpider
+from kingfisher_scrapy.util import parameters


-class ScotlandPublicContracts(ScotlandBase):
+class ScotlandPublicContracts(PeriodicSpider):
     """
     Domain
       Public Contracts Scotland
@@ -15,8 +18,43 @@ class ScotlandPublicContracts(ScotlandBase):
     """
     name = 'scotland_public_contracts'

+    # BaseSpider
+    date_format = 'year-month'
+    default_from_date = date(date.today().year - 1, date.today().month, 1)
+
     # SimpleSpider
     data_type = 'release_package'

+    # PeriodicSpider
+    pattern = 'https://api.publiccontractsscotland.gov.uk/v1/Notices?dateFrom={:%m-%Y}&outputType=0&noticeType={}'
+
+    notice_types = [
+        1,  # OJEU - F1 - Prior Information Notice
+        2,  # OJEU - F2 - Contract Notice
+        3,  # OJEU - F3 - Contract Award Notice
+        4,  # OJEU - F4 - Prior Information Notice(Utilities)
+        5,  # OJEU - F5 - Contract Notice(Utilities)
+        6,  # OJEU - F6 - Contract Award Notice(Utilities)
+        7,  # OJEU - F7 - Qualification Systems(Utilities)
+        12,  # OJEU - F12 - Design Contest Notice
+        13,  # OJEU - F13 - Results Of Design Contest
+        14,  # OJEU - F14 - Corrigendum
+        15,  # OJEU - F15 - Voluntary Ex Ante Transparency Notice
+        20,  # OJEU - F20 - Modification Notice
+        21,  # OJEU - F21 - Social And other Specific Services(Public Contracts)
+        22,  # OJEU - F22 - Social And other Specific Services(Utilities)
+        23,  # OJEU - F23 - Social And other Specific Services(Concessions)
+        24,  # OJEU - F24 - Concession Notice
+        25,  # OJEU - F25 - Concession Award Notice
+        101,  # Site Notice - Website Contract Notice
+        102,  # Site Notice - Website Prior Information Notice
+        103,  # Site Notice - Website Contract Award Notice
+        104,  # Site Notice - Quick Quote Award
+    ]

+    def build_urls(self, date):
+        for notice_type in self.notice_types:
+            yield self.pattern.format(date, notice_type)

+    def get_formatter(self):
+        return parameters('noticeType', 'dateFrom')
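
Note: the spider now inherits PeriodicSpider directly (replacing the deleted ScotlandBase), yielding one request per month and notice type. How the pattern expands, for illustration:

    from datetime import date

    pattern = ('https://api.publiccontractsscotland.gov.uk/v1/Notices'
               '?dateFrom={:%m-%Y}&outputType=0&noticeType={}')
    print(pattern.format(date(2021, 3, 1), 2))
    # https://api.publiccontractsscotland.gov.uk/v1/Notices?dateFrom=03-2021&outputType=0&noticeType=2
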
1 change: 1 addition & 0 deletions kingfisher_scrapy/spiders/uruguay_historical.py
@@ -22,6 +22,7 @@ class UruguayHistorical(CompressedFileSpider, PeriodicSpider):
     date_format = 'year'
     default_from_date = '2002'
     default_until_date = '2017'
+    skip_pluck = 'Already covered (see code for details)'  # uruguay_releases

     # SimpleSpider
     data_type = 'release_package'
1 change: 1 addition & 0 deletions kingfisher_scrapy/spiders/zambia.py
@@ -26,6 +26,7 @@ def start_requests(self):

     @handle_http_error
     def parse_list(self, response):
+
         urls = response.json()['packagesPerMonth']

         for url in reversed(urls):
