Commit 358d11a: Merge 8090d1d into ea9055f

yolile committed Jul 16, 2020
2 parents ea9055f + 8090d1d
Showing 24 changed files with 256 additions and 17 deletions.
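In short: the merge adds a `latestreleasedate` command that crawls every spider just long enough to scrape one item, a `LatestReleaseDate` pipeline that reduces that item to a `LatestReleaseDateItem` holding the most recent release date it can extract, and a `KingfisherLatestDate` extension that writes one CSV row per spider. Spiders that the check cannot cover declare a `skip_latest_release_date` reason, and a few spiders are reordered so that their first response contains the newest data.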
5 changes: 4 additions & 1 deletion kingfisher_scrapy/base_spider.py
@@ -45,7 +45,7 @@ class BaseSpider(scrapy.Spider):
     VALID_DATE_FORMATS = {'date': '%Y-%m-%d', 'datetime': '%Y-%m-%dT%H:%M:%S', 'year-month': '%Y-%m'}
 
     def __init__(self, sample=None, note=None, from_date=None, until_date=None,
-                 date_format='date', *args, **kwargs):
+                 date_format='date', latest=None, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
         # https://docs.scrapy.org/en/latest/topics/spiders.html#spider-arguments
@@ -54,12 +54,14 @@ def __init__(self, sample=None, note=None, from_date=None, until_date=None,
         self.from_date = from_date
         self.until_date = until_date
         self.date_format = self.VALID_DATE_FORMATS[date_format]
+        self.latest = latest == 'true'
 
         spider_arguments = {
             'sample': sample,
             'note': note,
             'from_date': from_date,
             'until_date': until_date,
+            'latest': latest,
         }
         spider_arguments.update(kwargs)
         self.logger.info('Spider arguments: {!r}'.format(spider_arguments))
@@ -304,6 +306,7 @@ def start_requests(self):
 
     encoding = 'utf-8'
     zip_file_format = None
+    skip_latest_release_date = "This command doesn't yet support identifying the latest release in a ZIP file."
 
     @handle_http_error
     def parse(self, response):
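Scrapy passes spider arguments as strings, which is why `latest` is compared against the literal `'true'`. A minimal sketch of the equivalent programmatic invocation (the spider name is only an example):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(settings=get_project_settings())
# Keyword arguments to crawl() reach BaseSpider.__init__ unchanged, as strings,
# so latest='true' sets self.latest to True; any other value leaves it False.
process.crawl('canada_buyandsell', latest='true')
process.start()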
34 changes: 34 additions & 0 deletions kingfisher_scrapy/commands/latestreleasedate.py
@@ -0,0 +1,34 @@
import os
from datetime import datetime

from scrapy.commands import ScrapyCommand
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


class LatestReleaseDatePerPublisher(ScrapyCommand):
    def short_desc(self):
        return 'Get the latest published release date per publisher'

    def run(self, args, opts):
        settings = get_project_settings()
        settings.set('CLOSESPIDER_ITEMCOUNT', 1)
        settings.set('CONCURRENT_REQUESTS', 1)
        settings.set('CLOSESPIDER_ERRORCOUNT', 1)

        path = settings['KINGFISHER_LATEST_RELEASE_DATE_FILE_PATH']
        os.makedirs(path, exist_ok=True)
        # Remove the previous run's output, if any; an unconditional unlink would
        # raise FileNotFoundError on the first run, before the file exists.
        csv_path = os.path.join(path, 'latest_dates.csv')
        if os.path.exists(csv_path):
            os.unlink(csv_path)
        filename = os.path.join(path, 'skipped_spiders.txt')

        process = CrawlerProcess(settings=settings)
        spiders = process.spider_loader.list()
        current_year = datetime.today().year
        with open(filename, 'w') as output:
            for spider in spiders:
                spider_cls = process.spider_loader.load(spider)
                if hasattr(spider_cls, 'skip_latest_release_date'):
                    output.write(f'Skipping {spider}. Reason: {spider_cls.skip_latest_release_date}\n')
                else:
                    process.crawl(spider, latest='true', year=current_year)
        process.start()
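A custom Scrapy command is only discovered if the project's settings point at the package that contains it; assuming this project already sets that (shown here for context, it is not part of this diff):

# settings.py
COMMANDS_MODULE = 'kingfisher_scrapy.commands'

the command is then run as `scrapy latestreleasedate`. With `CLOSESPIDER_ITEMCOUNT` set to 1, each crawl closes after its first scraped item, so the run samples every publisher rather than downloading it in full.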
37 changes: 33 additions & 4 deletions kingfisher_scrapy/extensions.py
@@ -7,10 +7,39 @@
 from scrapy import signals
 from scrapy.exceptions import NotConfigured
 
-from kingfisher_scrapy.items import File, FileError, FileItem
+from kingfisher_scrapy.items import File, FileError, FileItem, LatestReleaseDateItem
 from kingfisher_scrapy.kingfisher_process import Client
 
 
+# https://docs.scrapy.org/en/latest/topics/extensions.html#writing-your-own-extension
+class KingfisherLatestDate:
+    def __init__(self, filename):
+        self.filename = filename
+        self.spiders_seen = set()
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        path = crawler.settings['KINGFISHER_LATEST_RELEASE_DATE_FILE_PATH']
+        os.makedirs(path, exist_ok=True)
+        filename = os.path.join(path, 'latest_dates.csv')
+        extension = cls(filename=filename)
+        crawler.signals.connect(extension.item_scraped, signal=signals.item_scraped)
+        crawler.signals.connect(extension.spider_closed, signal=signals.spider_closed)
+        return extension
+
+    def item_scraped(self, item, spider):
+        if not isinstance(item, LatestReleaseDateItem) or spider.name in self.spiders_seen:
+            return
+        self.spiders_seen.add(spider.name)
+        with open(self.filename, 'a+') as output:
+            output.write(f"{spider.name},{item['date']}\n")
+
+    def spider_closed(self, spider, reason):
+        if spider.name not in self.spiders_seen:
+            with open(self.filename, 'a+') as output:
+                output.write(f"{spider.name},{reason}\n")
+
+
 class KingfisherFilesStore:
     def __init__(self, directory):
         self.directory = directory
@@ -101,7 +130,7 @@ def spider_closed(self, spider, reason):
         Sends an API request to end the collection's store step.
         """
         # https://docs.scrapy.org/en/latest/topics/signals.html#spider-closed
-        if reason != 'finished':
+        if reason != 'finished' or spider.latest:
             return
 
         response = self.client.end_collection_store({
@@ -118,9 +147,9 @@ def item_scraped(self, item, spider):
         """
         Sends an API request to store the file, file item or file error in Kingfisher Process.
         """
-        if not item.get('post_to_api', True):
-            return
+        if not item.get('post_to_api', True) or isinstance(item, LatestReleaseDateItem):
+            return
 
         data = {
             'collection_source': spider.name,
             'collection_data_version': spider.get_start_time('%Y-%m-%d %H:%M:%S'),
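Each row of latest_dates.csv is either `spider_name,date` (the spider yielded a `LatestReleaseDateItem`) or `spider_name,close_reason` (it closed without one). A small sketch, not part of the commit, for consuming the file:

import csv

# Maps each spider to its latest date or close reason, e.g.
# {'moldova': '2020-06-30T10:00:00Z', 'test_fail': 'closespider_errorcount'}.
with open('latest_dates/latest_dates.csv', newline='') as f:
    results = dict(csv.reader(f))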
4 changes: 4 additions & 0 deletions kingfisher_scrapy/items.py
@@ -31,3 +31,7 @@ class FileItem(KingfisherItem):
 
 class FileError(KingfisherItem):
     errors = scrapy.Field()
+
+
+class LatestReleaseDateItem(scrapy.Item):
+    date = scrapy.Field()
7 changes: 4 additions & 3 deletions kingfisher_scrapy/log_formatter.py
@@ -7,6 +7,7 @@ def scraped(self, item, response, spider):
         """
         Omits an item's `data` value from the log message.
         """
-        item = item.copy()
-        item.pop('data', None)
-        return super().scraped(item, response, spider)
+        if item:
+            item = item.copy()
+            item.pop('data', None)
+        return super().scraped(item, response, spider)
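The added guard presumably accounts for the new `LatestReleaseDate` pipeline, whose `process_item()` returns `None` once a spider has been processed; that `None` still flows through the log formatter, where the unconditional `item.copy()` would raise `AttributeError`.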
49 changes: 48 additions & 1 deletion kingfisher_scrapy/pipelines.py
@@ -6,7 +6,7 @@
 from jsonschema import FormatChecker
 from jsonschema.validators import Draft4Validator, RefResolver
 
-from kingfisher_scrapy.items import File, FileItem
+from kingfisher_scrapy.items import File, FileItem, LatestReleaseDateItem
 
 
 def _json_loads(basename):
@@ -40,3 +40,50 @@ def process_item(self, item, spider):
         self.files.add(key)
 
         return item
+
+
+class LatestReleaseDate:
+    def __init__(self):
+        self.processed = set()
+
+    def process_item(self, item, spider):
+        if spider.name in self.processed:
+            spider.crawler.engine.close_spider(spider, reason='processed')
+            return
+        if spider.latest and isinstance(item, (File, FileItem)):
+            date = None
+            data = json.loads(item['data'])
+            if item['data_type'] in ('release_package', 'release_package_list', 'release_package_list_in_results',
+                                     'release_list', 'release', 'compiled_release'):
+                if item['data_type'] == 'release_package':
+                    data = data['releases']
+                elif item['data_type'] == 'release_package_list':
+                    data = data[0]['releases']
+                elif item['data_type'] == 'release_package_list_in_results':
+                    data = data['results'][0]['releases']
+                if data:
+                    if item['data_type'] in ('release', 'compiled_release'):
+                        date = data['date']
+                    else:
+                        date = max(r['date'] for r in data)
+            elif item['data_type'] in ('record_package', 'record', 'record_list', 'record_package_list',
+                                       'record_package_list_in_results'):
+                if item['data_type'] == 'record_package':
+                    data = data['records']
+                elif item['data_type'] == 'record_package_list':
+                    data = data[0]['records']
+                elif item['data_type'] == 'record_package_list_in_results':
+                    data = data['results'][0]['records']
+                elif item['data_type'] == 'record':
+                    data = [data]
+                if data:
+                    # This assumes that the first record in the record package has the most recent date.
+                    data = data[0]
+                    if 'releases' in data:
+                        date = max(r['date'] for r in data['releases'])
+                    elif 'compiledRelease' in data:
+                        date = data['compiledRelease']['date']
+            self.processed.add(spider.name)
+            return LatestReleaseDateItem({'date': date})
+        else:
+            return item
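To make the branching concrete, here is a hypothetical release package (invented for illustration) and the value the pipeline extracts when `data_type` is 'release_package':

import json

item_data = json.dumps({
    'uri': 'https://example.com/package.json',
    'releases': [
        {'ocid': 'ocds-213czf-1', 'date': '2020-01-05T00:00:00Z'},
        {'ocid': 'ocds-213czf-2', 'date': '2020-06-30T00:00:00Z'},
    ],
})

data = json.loads(item_data)['releases']
# ISO 8601 dates in a uniform format sort lexicographically, so max() on the
# strings returns the most recent one.
assert max(r['date'] for r in data) == '2020-06-30T00:00:00Z'

For record packages the pipeline inspects only the first record (see the comment in the code), so the result is only correct when the publisher orders records newest-first.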
3 changes: 3 additions & 0 deletions kingfisher_scrapy/settings.py
@@ -71,6 +71,7 @@
 #}
 EXTENSIONS = {
     'kingfisher_scrapy.extensions.SentryLogging': -1,
+    'kingfisher_scrapy.extensions.KingfisherLatestDate': 1,
     # `KingfisherFilesStore` must run before `KingfisherProcessAPI`, because the file needs to be written before the
     # request is sent to Kingfisher Process.
     'kingfisher_scrapy.extensions.KingfisherFilesStore': 100,
@@ -80,6 +81,7 @@
 # Configure item pipelines
 # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
 ITEM_PIPELINES = {
+    'kingfisher_scrapy.pipelines.LatestReleaseDate': 300,
     'kingfisher_scrapy.pipelines.Validate': 300,
 }
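Both pipelines are registered with order 300. Scrapy sorts components by these values (lower runs closer to the spider), and ties fall back to insertion order, so `LatestReleaseDate` should run before `Validate` here; distinct values would make that intent explicit — a sketch, not what the commit does:

ITEM_PIPELINES = {
    'kingfisher_scrapy.pipelines.LatestReleaseDate': 200,
    'kingfisher_scrapy.pipelines.Validate': 300,
}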

@@ -131,6 +133,7 @@
 # https://docs.scrapy.org/en/latest/topics/media-pipeline.html#std:setting-FILES_STORE
 FILES_STORE = os.getenv('FILES_STORE', 'data')
 
+KINGFISHER_LATEST_RELEASE_DATE_FILE_PATH = os.getenv('KINGFISHER_LATEST_RELEASE_DATE_FILE_PATH', 'latest_dates')
 # https://docs.scrapy.org/en/latest/topics/spider-middleware.html#httperror-allow-all
 HTTPERROR_ALLOW_ALL = True
 
1 change: 1 addition & 0 deletions kingfisher_scrapy/spiders/afghanistan_records.py
@@ -16,6 +16,7 @@ class AfghanistanRecords(SimpleSpider):
     """
     name = 'afghanistan_records'
     data_type = 'record'
+    skip_latest_release_date = 'Already covered by afghanistan_releases'
 
     download_delay = 1
 
1 change: 0 additions & 1 deletion kingfisher_scrapy/spiders/australia.py
@@ -23,5 +23,4 @@ class Australia(LinksSpider):
     def start_requests(self):
         url = f'https://api.tenders.gov.au/ocds/findByDates/contractPublished/' \
               f'2004-01-01T00:00:00Z/{date.today().year}-12-31T23:59:59Z'
-
         yield scrapy.Request(url, meta={'file_name': 'start.json'})
6 changes: 3 additions & 3 deletions kingfisher_scrapy/spiders/canada_buyandsell.py
@@ -15,10 +15,10 @@ class CanadaBuyAndSell(SimpleSpider):
 
     def start_requests(self):
         urls = [
-            'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-13-14.json',
-            'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-14-15.json',
-            'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-15-16.json',
             'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-16-17.json',
+            'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-15-16.json',
+            'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-14-15.json',
+            'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-13-14.json',
         ]
         if self.sample:
             urls = [urls[0]]
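Reversing the list puts the most recent fiscal year first — presumably so that a `sample` run (which keeps only `urls[0]`) and a `latest` run (which closes after one item) both see the newest data. The `uk_contracts_finder` change below, from `order=asc` to `order=desc`, follows the same logic.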
3 changes: 2 additions & 1 deletion kingfisher_scrapy/spiders/chile_base.py
@@ -63,7 +63,8 @@ def parse_list(self, response):
             # }
             yield from self.handle_item(item)
 
-        if 'pagination' in data and (data['pagination']['offset'] + self.limit) < data['pagination']['total']:
+        if 'pagination' in data and (data['pagination']['offset'] + self.limit) < data['pagination']['total']\
+                and not self.sample:
             year = response.request.meta['year']
             month = response.request.meta['month']
             offset = data['pagination']['offset']
1 change: 1 addition & 0 deletions kingfisher_scrapy/spiders/chile_compra_records.py
@@ -12,6 +12,7 @@ class ChileCompraRecords(ChileCompraBaseSpider):
     """
     name = 'chile_compra_records'
     data_type = 'record_package'
+    skip_latest_release_date = 'Already covered by chile_compra_releases'
 
     def handle_item(self, item):
         url = 'https://apis.mercadopublico.cl/OCDS/data/record/' + item['ocid'].replace('ocds-70d2nz-', '')
2 changes: 2 additions & 0 deletions kingfisher_scrapy/spiders/digiwhist_base.py
@@ -8,6 +8,8 @@
 
 
 class DigiwhistBase(BaseSpider):
+    skip_latest_release_date = 'Unordered JSON Lines files'
+
     def start_requests(self):
         # See scrapy.spiders.Spider.start_requests
         for url in self.start_urls:
1 change: 1 addition & 0 deletions kingfisher_scrapy/spiders/dominican_republic.py
@@ -17,6 +17,7 @@ class DominicanRepublic(BaseSpider):
     Downloads a release package for the oldest year (2018, first link in the downloads page).
     """
     name = 'dominican_republic'
+    skip_latest_release_date = "This command doesn't yet support identifying the latest release in a RAR file."
 
     def start_requests(self):
         yield scrapy.Request(
1 change: 1 addition & 0 deletions kingfisher_scrapy/spiders/honduras_oncae.py
@@ -14,6 +14,7 @@ class HondurasONCAE(ZipSpider):
     """
     name = 'honduras_oncae'
     data_type = 'release_package'
+    skip_latest_release_date = 'Already covered by honduras_portal_releases'
 
     # the files take too long to be downloaded, so we increase the download timeout
     download_timeout = 900
1 change: 1 addition & 0 deletions kingfisher_scrapy/spiders/honduras_portal_bulk_files.py
@@ -16,6 +16,7 @@ class HondurasPortalBulkFiles(SimpleSpider):
     """
     name = 'honduras_portal_bulk_files'
     data_type = 'release_package'
+    skip_latest_release_date = 'Already covered by honduras_portal_releases'
 
     def start_requests(self):
         yield scrapy.Request(
1 change: 1 addition & 0 deletions kingfisher_scrapy/spiders/honduras_portal_records.py
@@ -19,6 +19,7 @@ class HondurasPortalRecords(LinksSpider):
     data_pointer = '/recordPackage'
     next_pointer = '/next'
     next_page_formatter = staticmethod(parameters('page'))
+    skip_latest_release_date = 'Already covered by honduras_portal_releases'
 
     download_delay = 0.9
 
1 change: 1 addition & 0 deletions kingfisher_scrapy/spiders/moldova_old.py
@@ -12,6 +12,7 @@ class MoldovaOld(SimpleSpider):
     """
     name = 'moldova_old'
     data_type = 'release_package'
+    skip_latest_release_date = 'Old endpoint'
 
     def start_requests(self):
         pattern = 'http://opencontracting.date.gov.md/ocds-api/year/{}'
1 change: 1 addition & 0 deletions kingfisher_scrapy/spiders/paraguay_dncp_records.py
@@ -17,6 +17,7 @@ class ParaguayDNCPRecords(ParaguayDNCPBaseSpider):
     """
     name = 'paraguay_dncp_records'
     data_type = 'record_package'
+    skip_latest_release_date = 'Already covered by paraguay_dncp_releases'
 
     def get_files_to_download(self, content):
         for record in content['records']:
1 change: 1 addition & 0 deletions kingfisher_scrapy/spiders/test_fail.py
@@ -9,6 +9,7 @@
 class TestFail(SimpleSpider):
     name = 'test_fail'
     data_type = 'release_package'
+    skip_latest_release_date = 'Not a real spider'
 
     def start_requests(self):
         # Fine
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/uk_contracts_finder.py
@@ -15,7 +15,7 @@ class UKContractsFinder(SimpleSpider):
     encoding = 'iso-8859-1'
 
     def start_requests(self):
-        url = 'https://www.contractsfinder.service.gov.uk/Published/Notices/OCDS/Search?order=asc&page=1'
+        url = 'https://www.contractsfinder.service.gov.uk/Published/Notices/OCDS/Search?order=desc&page=1'
         yield self.build_request(url, formatter=parameters('page'), callback=self.parse_list)
 
     @handle_http_error
1 change: 1 addition & 0 deletions kingfisher_scrapy/spiders/uruguay_records.py
@@ -12,6 +12,7 @@ class UruguayRecords(UruguayBase):
     """
     name = 'uruguay_records'
     data_type = 'record_package'
+    skip_latest_release_date = 'Already covered by uruguay_releases'
 
     @handle_http_error
     def parse_list(self, response):
