
Commit

Merge a6327e5 into 4b2ff8b
jpmckinney committed May 29, 2020
2 parents 4b2ff8b + a6327e5 commit 0923031
Showing 25 changed files with 174 additions and 81 deletions.
10 changes: 5 additions & 5 deletions kingfisher_scrapy/base_spider.py
@@ -129,14 +129,14 @@ def build_file_item(self, number, data, data_type, url, encoding, file_name):
})

def build_file_error_from_response(self, response, **kwargs):
file_error = {
item = FileError({
'url': response.request.url,
'errors': {'http_code': response.status},
}
})
if 'kf_filename' in response.request.meta:
file_error['file_name'] = response.request.meta['kf_filename']
file_error.update(kwargs)
return FileError(file_error)
item['file_name'] = response.request.meta['kf_filename']
item.update(kwargs)
return item

def _get_package_metadata(self, f, skip_key):
"""
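For context (this sketch is not part of the commit), a spider callback might use the refactored helper roughly as follows; the success branch and the exact arguments to build_file_from_response are illustrative:

    def parse(self, response):
        # Hypothetical callback: yield a File item on success, otherwise yield the
        # FileError item that build_file_error_from_response now returns directly.
        if self.is_http_success(response):
            yield self.build_file_from_response(response, response.request.meta['kf_filename'])
        else:
            yield self.build_file_error_from_response(response)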
6 changes: 5 additions & 1 deletion kingfisher_scrapy/exceptions.py
@@ -7,4 +7,8 @@ class AuthenticationError(KingfisherScrapyError):


class SpiderArgumentError(KingfisherScrapyError):
"""Raises when a spider argument's value is invalid"""
"""Raised when a spider argument's value is invalid"""


class MissingRequiredFieldError(KingfisherScrapyError, KeyError):
"""Raised when an item is missing a required field"""
2 changes: 1 addition & 1 deletion kingfisher_scrapy/extensions.py
@@ -40,7 +40,7 @@ def item_scraped(self, item, spider):
metadata = {
'url': item['url'],
'data_type': item['data_type'],
'encoding': item['encoding'],
'encoding': item.get('encoding', 'utf-8'),
}
self._write_file(path + '.fileinfo', metadata, spider)

48 changes: 41 additions & 7 deletions kingfisher_scrapy/items.py
@@ -1,9 +1,26 @@
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy

from kingfisher_scrapy.exceptions import MissingRequiredFieldError

class File(scrapy.Item):

class KingfisherItem(scrapy.Item):
file_name = scrapy.Field()
url = scrapy.Field()

def validate(self):
"""
Raises an error if any required field is missing.
:raises kingfisher_scrapy.extensions.MissingRequiredFieldError: if any required field is missing
"""
if hasattr(self, 'required'):
for field in self.required:
if field not in self:
raise MissingRequiredFieldError(field)


class File(KingfisherItem):
data = scrapy.Field()
data_type = scrapy.Field()
encoding = scrapy.Field()
@@ -15,17 +32,34 @@ class File(scrapy.Item):
path = scrapy.Field()
files_store = scrapy.Field()

required = [
'file_name',
'url',
'data',
'data_type',
]


class FileItem(scrapy.Item):
class FileItem(KingfisherItem):
number = scrapy.Field()
file_name = scrapy.Field()
url = scrapy.Field()
data = scrapy.Field()
data_type = scrapy.Field()
encoding = scrapy.Field()

required = [
'number',
'file_name',
'url',
'data',
'data_type',
]

class FileError(scrapy.Item):
file_name = scrapy.Field()
url = scrapy.Field()

class FileError(KingfisherItem):
errors = scrapy.Field()

required = [
'file_name',
'url',
'errors',
]
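To illustrate the new required-field validation (this example is not part of the commit; the field values are made up):

    from kingfisher_scrapy.exceptions import MissingRequiredFieldError
    from kingfisher_scrapy.items import FileError

    item = FileError({'url': 'http://example.com/1.json', 'errors': {'http_code': 500}})
    try:
        item.validate()  # 'file_name' is in FileError.required but missing here
    except MissingRequiredFieldError:  # also catchable as KeyError
        item['file_name'] = '1.json'
    item.validate()  # passes once all required fields are present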
5 changes: 0 additions & 5 deletions kingfisher_scrapy/middlewares.py
@@ -1,8 +1,3 @@
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

import logging
7 changes: 5 additions & 2 deletions kingfisher_scrapy/pipelines.py
@@ -2,8 +2,11 @@
# https://docs.scrapy.org/en/latest/topics/signals.html#item-signals


class KingfisherScrapyPipeline:
class Validate:
def process_item(self, item, spider):
item.validate()
if hasattr(item, 'validate'):
# We call this in the item pipeline to guarantee that all items are validated. However, its backtrace isn't
# as helpful for debugging, so we could also call it in ``BaseSpider`` if this becomes an issue.
item.validate()

return item
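A minimal sketch of the pipeline's behaviour (not part of the commit; the item contents are illustrative): Kingfisher items are validated, while objects without a validate method pass through untouched.

    from kingfisher_scrapy.items import File
    from kingfisher_scrapy.pipelines import Validate

    pipeline = Validate()
    item = File({
        'file_name': 'example.json',
        'url': 'http://example.com/example.json',
        'data': b'{"releases": []}',
        'data_type': 'release_package',
    })
    assert pipeline.process_item(item, spider=None) is item  # valid item is returned unchanged
    pipeline.process_item({'not': 'an item'}, spider=None)   # plain dicts skip validation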
6 changes: 3 additions & 3 deletions kingfisher_scrapy/settings.py
@@ -75,9 +75,9 @@

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'kingfisher_scrapy.pipelines.KingfisherScrapyPipeline': 300,
#}
ITEM_PIPELINES = {
'kingfisher_scrapy.pipelines.Validate': 300,
}

# To send items to Kingfisher Process, set this to, for example, "http://kingfisher.example.com" (no trailing slash).
KINGFISHER_API_URI = os.getenv('KINGFISHER_API_URI')
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/argentina_buenos_aires.py
@@ -34,4 +34,4 @@ def parse_list(self, response):
data = json.loads(response.text)
for resource in data['result']['resources']:
if resource['format'].upper() == 'JSON':
yield scrapy.Request(url=resource['url'])
yield scrapy.Request(resource['url'], meta={'kf_filename': resource['url'].rsplit('/', 1)[-1]})
17 changes: 12 additions & 5 deletions kingfisher_scrapy/spiders/australia_nsw.py
@@ -17,41 +17,48 @@ def start_requests(self):
for release_type in release_types:
yield scrapy.Request(
url.format(release_type, page_limit),
meta={'release_type': release_type},
meta={
'kf_filename': '{}.json'.format(release_type),
'release_type': release_type,
},
callback=self.parse_list
)

def parse_list(self, response):
if self.is_http_success(response):

json_data = json.loads(response.text)
release_type = response.request.meta['release_type']

# More Pages?
if 'links' in json_data and isinstance(json_data['links'], dict) and 'next' in json_data['links'] \
and not self.sample:
yield scrapy.Request(
json_data['links']['next'],
meta={'release_type': response.request.meta['release_type']},
meta={
'kf_filename': hashlib.md5(json_data['links']['next'].encode('utf-8')).hexdigest() + '.json',
'release_type': release_type,
},
callback=self.parse_list
)

# Data?
for release in json_data['releases']:
if response.request.meta['release_type'] == 'planning':
if release_type == 'planning':
uuid = release['tender']['plannedProcurementUUID']
yield scrapy.Request(
'https://tenders.nsw.gov.au/?event=public.api.planning.view&PlannedProcurementUUID=%s' % uuid,
meta={'kf_filename': 'plannning-%s.json' % uuid},
callback=self.parse
)
if response.request.meta['release_type'] == 'tender':
if release_type == 'tender':
uuid = release['tender']['RFTUUID']
yield scrapy.Request(
'https://tenders.nsw.gov.au/?event=public.api.tender.view&RFTUUID=%s' % uuid,
meta={'kf_filename': 'tender-%s.json' % uuid},
callback=self.parse
)
if response.request.meta['release_type'] == 'contract':
if release_type == 'contract':
for award in release['awards']:
uuid = award['CNUUID']
yield scrapy.Request(
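The spider changes in this commit add a kf_filename to each request's meta so downstream code can name files; two conventions recur, illustrated here with made-up URLs:

    import hashlib

    # Paginated or list requests: derive a stable name from a hash of the URL.
    list_url = 'https://example.com/api/releases?page=2'
    hashed_name = hashlib.md5(list_url.encode('utf-8')).hexdigest() + '.json'

    # Direct file downloads: reuse the last path segment of the URL.
    file_url = 'https://example.com/files/2020-01.json'
    path_name = file_url.rsplit('/', 1)[-1]  # '2020-01.json'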
14 changes: 6 additions & 8 deletions kingfisher_scrapy/spiders/chile_base.py
@@ -25,16 +25,14 @@ def get_year_month_until(self):
until_month = 12 if self.start_year != datetime.datetime.now().year else until_month
return until_year, until_month

def get_sample_request(self):
return scrapy.Request(
url=self.base_list_url.format(2017, 10, 0, 10),
meta={'year': 2017, 'month': 10}
)

def start_requests(self):
if self.sample:
yield self.get_sample_request()
yield scrapy.Request(
url=self.base_list_url.format(2017, 10, 0, 10),
meta={'kf_filename': 'list-2017-10.json', 'year': 2017, 'month': 10},
)
return

until_year, until_month = self.get_year_month_until()
for year in range(self.start_year, until_year):
for month in range(1, 13):
@@ -43,7 +41,7 @@ def start_requests(self):
break
yield scrapy.Request(
url=self.base_list_url.format(year, month, 0, self.limit),
meta={'year': year, 'month': month}
meta={'kf_filename': 'list-{}-{:02d}.json'.format(year, month), 'year': year, 'month': month},
)

def base_parse(self, response, package_type):
3 changes: 2 additions & 1 deletion kingfisher_scrapy/spiders/colombia_bulk.py
@@ -26,7 +26,8 @@ class ColombiaBulk(ZipSpider):
def start_requests(self):
yield scrapy.Request(
url='https://www.colombiacompra.gov.co/transparencia/datos-json',
callback=self.parse_list
meta={'kf_filename': 'list.html'},
callback=self.parse_list,
)

@handle_error
9 changes: 8 additions & 1 deletion kingfisher_scrapy/spiders/digiwhist_base.py
@@ -1,14 +1,21 @@
import tarfile
from io import BytesIO

import scrapy

from kingfisher_scrapy.base_spider import BaseSpider
from kingfisher_scrapy.util import handle_error


class DigiwhistBase(BaseSpider):
def start_requests(self):
# See scrapy.spiders.Spider.start_requests
for url in self.start_urls:
yield scrapy.Request(url, dont_filter=True, meta={'kf_filename': 'file.tar.gz'})

@handle_error
def parse(self, response):
yield self.build_file_from_response(response, 'file.tar.gz', post_to_api=False)
yield self.build_file_from_response(response, response.request.meta['kf_filename'], post_to_api=False)

# Load a line at a time, pass it to the API
with tarfile.open(fileobj=BytesIO(response.body), mode="r:gz") as tar:
11 changes: 7 additions & 4 deletions kingfisher_scrapy/spiders/dominican_republic.py
@@ -15,11 +15,14 @@ class DominicanRepublic(BaseSpider):
}

def start_requests(self):
yield scrapy.Request('https://www.dgcp.gob.do/estandar-mundial-ocds/',
callback=self.parse_main_page)
yield scrapy.Request(
'https://www.dgcp.gob.do/estandar-mundial-ocds/',
meta={'kf_filename': 'list.html'},
callback=self.parse_list,
)

@handle_error
def parse_main_page(self, response):
def parse_list(self, response):
urls = response.css('.fileLink::attr(href)').getall()
json_urls = list(filter(lambda x: '/JSON_DGCP_' in x, urls))

@@ -28,7 +31,7 @@ def parse_main_page(self, response):

for url in json_urls:
if '/JSON_DGCP_' in url:
yield scrapy.Request('https:' + url)
yield scrapy.Request('https:' + url, meta={'kf_filename': url.rsplit('/', 1)[-1]})

def parse(self, response):
if self.is_http_success(response):
8 changes: 5 additions & 3 deletions kingfisher_scrapy/spiders/france.py
@@ -13,11 +13,12 @@ class France(BaseSpider):
def start_requests(self):
yield scrapy.Request(
url='https://www.data.gouv.fr/api/1/datasets/?organization=534fff75a3a7292c64a77de4',
callback=self.parse_item
meta={'kf_filename': 'list.json'},
callback=self.parse_list,
)

@handle_error
def parse_item(self, response):
def parse_list(self, response):
json_data = json.loads(response.text)
data = json_data['data']
for item in data:
@@ -40,7 +41,8 @@ def parse_item(self, response):
if next_page:
yield scrapy.Request(
next_page,
callback=self.parse_item
meta={'kf_filename': hashlib.md5(next_page.encode('utf-8')).hexdigest() + '.json'},
callback=self.parse_list
)

@handle_error
7 changes: 6 additions & 1 deletion kingfisher_scrapy/spiders/honduras_cost.py
@@ -8,7 +8,12 @@

class HondurasCoST(BaseSpider):
name = 'honduras_cost'
start_urls = ['http://app.sisocs.org/protected/ocdsShow/']

def start_requests(self):
yield scrapy.Request(
'http://app.sisocs.org/protected/ocdsShow/',
meta={'kf_filename': 'list.html'},
)

@handle_error
def parse(self, response):
8 changes: 6 additions & 2 deletions kingfisher_scrapy/spiders/honduras_oncae.py
@@ -8,11 +8,15 @@

class HondurasONCAE(ZipSpider):
name = 'honduras_oncae'
start_urls = ['http://oncae.gob.hn/datosabiertos']

# the files take too long to be downloaded, so we increase the download timeout
download_timeout = 900

def start_requests(self):
yield scrapy.Request(
'http://oncae.gob.hn/datosabiertos',
meta={'kf_filename': 'list.html'},
)

@handle_error
def parse(self, response):
urls = response.css(".article-content ul")\
11 changes: 7 additions & 4 deletions kingfisher_scrapy/spiders/honduras_portal_bulk_files.py
@@ -13,19 +13,22 @@ class HondurasPortalBulkFiles(BaseSpider):
def start_requests(self):
yield scrapy.Request(
'http://www.contratacionesabiertas.gob.hn/api/v1/descargas/?format=json',
callback=self.parse_json_list
meta={'kf_filename': 'list.json'},
callback=self.parse_list,
)

@handle_error
def parse_json_list(self, response):
def parse_list(self, response):
filelist = json.loads(response.text)

if self.sample:
yield scrapy.Request(filelist[0]['urls']['json'])
url = filelist[0]['urls']['json']
yield scrapy.Request(url, meta={'kf_filename': url.rsplit('/', 1)[-1]})

else:
for item in filelist:
yield scrapy.Request(item['urls']['json'])
url = item['urls']['json']
yield scrapy.Request(url, meta={'kf_filename': url.rsplit('/', 1)[-1]})

def parse(self, response):
filename = urlparse(response.request.url).path.split('/')[-2]
1 change: 1 addition & 0 deletions kingfisher_scrapy/spiders/indonesia_bandung.py
@@ -39,6 +39,7 @@ def parse_data(self, response):
if next_page_url:
yield scrapy.Request(
next_page_url,
meta={'kf_filename': next_page_url.rsplit('/', 1)[-1] + '.json'},
callback=self.parse_data
)
