From 24a6720de4807808d2820da615e9e8e9df71f3cc Mon Sep 17 00:00:00 2001
From: James McKinney <26463+jpmckinney@users.noreply.github.com>
Date: Fri, 29 May 2020 17:29:07 -0400
Subject: [PATCH 1/6] Add kf_filename meta to all initial requests

---
 kingfisher_scrapy/spiders/chile_base.py         | 14 ++++++--------
 kingfisher_scrapy/spiders/colombia_bulk.py      |  3 ++-
 kingfisher_scrapy/spiders/digiwhist_base.py     |  9 ++++++++-
 kingfisher_scrapy/spiders/dominican_republic.py |  9 ++++++---
 kingfisher_scrapy/spiders/france.py             |  5 +++--
 kingfisher_scrapy/spiders/honduras_cost.py      |  7 ++++++-
 kingfisher_scrapy/spiders/honduras_oncae.py     |  8 ++++++--
 .../spiders/honduras_portal_bulk_files.py       |  5 +++--
 kingfisher_scrapy/spiders/nepal_dhangadhi.py    |  5 +++--
 kingfisher_scrapy/spiders/nigeria_portal.py     |  7 ++++++-
 kingfisher_scrapy/spiders/uruguay_historical.py |  4 +---
 11 files changed, 50 insertions(+), 26 deletions(-)

diff --git a/kingfisher_scrapy/spiders/chile_base.py b/kingfisher_scrapy/spiders/chile_base.py
index b04d113a4..8c242de0d 100644
--- a/kingfisher_scrapy/spiders/chile_base.py
+++ b/kingfisher_scrapy/spiders/chile_base.py
@@ -25,16 +25,14 @@ def get_year_month_until(self):
         until_month = 12 if self.start_year != datetime.datetime.now().year else until_month
         return until_year, until_month
 
-    def get_sample_request(self):
-        return scrapy.Request(
-            url=self.base_list_url.format(2017, 10, 0, 10),
-            meta={'year': 2017, 'month': 10}
-        )
-
     def start_requests(self):
         if self.sample:
-            yield self.get_sample_request()
+            yield scrapy.Request(
+                url=self.base_list_url.format(2017, 10, 0, 10),
+                meta={'kf_filename': 'list-2017-10.json', 'year': 2017, 'month': 10},
+            )
             return
+
         until_year, until_month = self.get_year_month_until()
         for year in range(self.start_year, until_year):
             for month in range(1, 13):
@@ -43,7 +41,7 @@ def start_requests(self):
                     break
                 yield scrapy.Request(
                     url=self.base_list_url.format(year, month, 0, self.limit),
-                    meta={'year': year, 'month': month}
+                    meta={'kf_filename': 'list-{}-{:02d}.json'.format(year, month), 'year': year, 'month': month},
                 )
 
     def base_parse(self, response, package_type):
diff --git a/kingfisher_scrapy/spiders/colombia_bulk.py b/kingfisher_scrapy/spiders/colombia_bulk.py
index 8b5dcc755..874b3c97e 100644
--- a/kingfisher_scrapy/spiders/colombia_bulk.py
+++ b/kingfisher_scrapy/spiders/colombia_bulk.py
@@ -26,7 +26,8 @@ class ColombiaBulk(ZipSpider):
     def start_requests(self):
         yield scrapy.Request(
             url='https://www.colombiacompra.gov.co/transparencia/datos-json',
-            callback=self.parse_list
+            meta={'kf_filename': 'list.html'},
+            callback=self.parse_list,
         )
 
     @handle_error
diff --git a/kingfisher_scrapy/spiders/digiwhist_base.py b/kingfisher_scrapy/spiders/digiwhist_base.py
index c587e8186..a35d418c7 100644
--- a/kingfisher_scrapy/spiders/digiwhist_base.py
+++ b/kingfisher_scrapy/spiders/digiwhist_base.py
@@ -1,14 +1,21 @@
 import tarfile
 from io import BytesIO
 
+import scrapy
+
 from kingfisher_scrapy.base_spider import BaseSpider
 from kingfisher_scrapy.util import handle_error
 
 
 class DigiwhistBase(BaseSpider):
+    def start_requests(self):
+        # See scrapy.spiders.Spider.start_requests
+        for url in self.start_urls:
+            yield scrapy.Request(url, dont_filter=True, meta={'kf_filename': 'file.tar.gz'})
+
     @handle_error
     def parse(self, response):
-        yield self.build_file_from_response(response, 'file.tar.gz', post_to_api=False)
+        yield self.build_file_from_response(response, response.request.meta['kf_filename'], post_to_api=False)
 
         # Load a line at the time, pass it to API
         with tarfile.open(fileobj=BytesIO(response.body), mode="r:gz") as tar:
diff --git a/kingfisher_scrapy/spiders/dominican_republic.py b/kingfisher_scrapy/spiders/dominican_republic.py
index ac4a14e80..fe732e0e5 100644
--- a/kingfisher_scrapy/spiders/dominican_republic.py
+++ b/kingfisher_scrapy/spiders/dominican_republic.py
@@ -15,11 +15,14 @@ class DominicanRepublic(BaseSpider):
     }
 
     def start_requests(self):
-        yield scrapy.Request('https://www.dgcp.gob.do/estandar-mundial-ocds/',
-                             callback=self.parse_main_page)
+        yield scrapy.Request(
+            'https://www.dgcp.gob.do/estandar-mundial-ocds/',
+            meta={'kf_filename': 'list.html'},
+            callback=self.parse_list,
+        )
 
     @handle_error
-    def parse_main_page(self, response):
+    def parse_list(self, response):
         urls = response.css('.fileLink::attr(href)').getall()
         json_urls = list(filter(lambda x: '/JSON_DGCP_' in x, urls))
 
diff --git a/kingfisher_scrapy/spiders/france.py b/kingfisher_scrapy/spiders/france.py
index 4d7fcc13f..bf2989ddb 100644
--- a/kingfisher_scrapy/spiders/france.py
+++ b/kingfisher_scrapy/spiders/france.py
@@ -13,11 +13,12 @@ class France(BaseSpider):
     def start_requests(self):
         yield scrapy.Request(
             url='https://www.data.gouv.fr/api/1/datasets/?organization=534fff75a3a7292c64a77de4',
-            callback=self.parse_item
+            meta={'kf_filename': 'list.json'},
+            callback=self.parse_list,
         )
 
     @handle_error
-    def parse_item(self, response):
+    def parse_list(self, response):
         json_data = json.loads(response.text)
         data = json_data['data']
         for item in data:
diff --git a/kingfisher_scrapy/spiders/honduras_cost.py b/kingfisher_scrapy/spiders/honduras_cost.py
index 950c2e141..6d3fb9fda 100644
--- a/kingfisher_scrapy/spiders/honduras_cost.py
+++ b/kingfisher_scrapy/spiders/honduras_cost.py
@@ -8,7 +8,12 @@ class HondurasCoST(BaseSpider):
     name = 'honduras_cost'
-    start_urls = ['http://app.sisocs.org/protected/ocdsShow/']
+
+    def start_requests(self):
+        yield scrapy.Request(
+            'http://app.sisocs.org/protected/ocdsShow/',
+            meta={'kf_filename': 'list.html'},
+        )
 
     @handle_error
     def parse(self, response):
diff --git a/kingfisher_scrapy/spiders/honduras_oncae.py b/kingfisher_scrapy/spiders/honduras_oncae.py
index 7912b3728..cd9672006 100644
--- a/kingfisher_scrapy/spiders/honduras_oncae.py
+++ b/kingfisher_scrapy/spiders/honduras_oncae.py
@@ -8,11 +8,15 @@ class HondurasONCAE(ZipSpider):
     name = 'honduras_oncae'
-    start_urls = ['http://oncae.gob.hn/datosabiertos']
-
     # the files take too long to be downloaded, so we increase the download timeout
     download_timeout = 900
 
+    def start_requests(self):
+        yield scrapy.Request(
+            'http://oncae.gob.hn/datosabiertos',
+            meta={'kf_filename': 'list.html'},
+        )
+
     @handle_error
     def parse(self, response):
         urls = response.css(".article-content ul")\
diff --git a/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py b/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py
index 01efbed89..889624f5c 100644
--- a/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py
+++ b/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py
@@ -13,11 +13,12 @@ class HondurasPortalBulkFiles(BaseSpider):
     def start_requests(self):
         yield scrapy.Request(
             'http://www.contratacionesabiertas.gob.hn/api/v1/descargas/?format=json',
-            callback=self.parse_json_list
+            meta={'kf_filename': 'list.json'},
+            callback=self.parse_list,
         )
 
     @handle_error
-    def parse_json_list(self, response):
+    def parse_list(self, response):
         filelist = json.loads(response.text)
 
         if self.sample:
diff --git a/kingfisher_scrapy/spiders/nepal_dhangadhi.py b/kingfisher_scrapy/spiders/nepal_dhangadhi.py
index 9cb5536bf..342e9d208 100644
--- a/kingfisher_scrapy/spiders/nepal_dhangadhi.py
+++ b/kingfisher_scrapy/spiders/nepal_dhangadhi.py
@@ -13,11 +13,12 @@ class NepalDhangadhi(BaseSpider):
     def start_requests(self):
         yield scrapy.Request(
             'https://admin.ims.susasan.org/api/static-data/dhangadhi',
-            callback=self.parse_item,
+            meta={'kf_filename': 'list.json'},
+            callback=self.parse_list,
         )
 
     @handle_error
-    def parse_item(self, response):
+    def parse_list(self, response):
         url = 'https://admin.ims.susasan.org/ocds/json/dhangadhi-{}.json'
         json_data = json.loads(response.text)
         fiscal_years = json_data['data']['fiscal_years']
diff --git a/kingfisher_scrapy/spiders/nigeria_portal.py b/kingfisher_scrapy/spiders/nigeria_portal.py
index f5a2b5db7..8b532503f 100644
--- a/kingfisher_scrapy/spiders/nigeria_portal.py
+++ b/kingfisher_scrapy/spiders/nigeria_portal.py
@@ -8,10 +8,15 @@ class NigeriaPortal(BaseSpider):
     name = 'nigeria_portal'
-    start_urls = ['http://nocopo.bpp.gov.ng/OpenData.aspx']
     download_delay = 0.9
     user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'  # noqa: E501
 
+    def start_requests(self):
+        yield scrapy.Request(
+            'http://nocopo.bpp.gov.ng/OpenData.aspx',
+            meta={'kf_filename': 'list.html'},
+        )
+
     @handle_error
     def parse(self, response):
         formdata = {
diff --git a/kingfisher_scrapy/spiders/uruguay_historical.py b/kingfisher_scrapy/spiders/uruguay_historical.py
index cb7b77901..e7c0b96d0 100644
--- a/kingfisher_scrapy/spiders/uruguay_historical.py
+++ b/kingfisher_scrapy/spiders/uruguay_historical.py
@@ -23,6 +23,4 @@ def start_requests(self):
         if self.sample:
             end_year = 2003
         for year in range(2002, end_year):
-            yield scrapy.Request(
-                url=base_url.format(year)
-            )
+            yield scrapy.Request(base_url.format(year), meta={'kf_filename': 'OCDS-{}.zip'.format(year)})

From 9dbc0ea1459daa7a0b91b25b53b3ac5b5c17d43e Mon Sep 17 00:00:00 2001
From: James McKinney <26463+jpmckinney@users.noreply.github.com>
Date: Fri, 29 May 2020 17:30:01 -0400
Subject: [PATCH 2/6] Test that, if an initial request errors, it returns a FileError item with a file_name key

---
 tests/test_spiders.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/test_spiders.py b/tests/test_spiders.py
index d6c68a88a..4cb4c5478 100644
--- a/tests/test_spiders.py
+++ b/tests/test_spiders.py
@@ -35,8 +35,9 @@ def test_start_requests_http_error(spider_name):
         assert len(items) == 1
         for item in items:
             assert isinstance(item, FileError)
-            assert len(item) <= 3
+            assert len(item) == 3
             assert item['errors'] == {'http_code': 555}
+            assert item['file_name']
             assert item['url']
     except CloseSpider as e:
         warnings.warn('{}: {}'.format(spidercls.name, e.reason))
@@ -46,4 +47,4 @@
 def test_start_urls_start_requests(spider_name):
     spidercls = runner.spider_loader.load(spider_name)
 
-    assert hasattr(spidercls, 'start_urls') ^ method_is_overridden(spidercls, scrapy.Spider, 'start_requests')
+    assert 'start_urls' not in spidercls.__dict__ or 'start_requests' not in spidercls.__dict__

From e9441b5623f7caee6d90dcdf7ed938ad093419b5 Mon Sep 17 00:00:00 2001
From: James McKinney <26463+jpmckinney@users.noreply.github.com>
Date: Fri, 29 May 2020 17:32:03 -0400
Subject: [PATCH 3/6] Add validation for required fields in items

---
 kingfisher_scrapy/base_spider.py | 10 +++----
 kingfisher_scrapy/exceptions.py  |  6 +++-
 kingfisher_scrapy/extensions.py  |  2 +-
 kingfisher_scrapy/items.py       | 48 +++++++++++++++++++++++++++-----
 kingfisher_scrapy/middlewares.py |  5 ----
 kingfisher_scrapy/pipelines.py   |  7 +++--
 kingfisher_scrapy/settings.py    |  6 ++--
 7 files changed, 60 insertions(+), 24 deletions(-)

diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py
index 7cc90d4d0..b97dbec2e 100644
--- a/kingfisher_scrapy/base_spider.py
+++ b/kingfisher_scrapy/base_spider.py
@@ -129,14 +129,14 @@ def build_file_item(self, number, data, data_type, url, encoding, file_name):
         })
 
     def build_file_error_from_response(self, response, **kwargs):
-        file_error = {
+        item = FileError({
             'url': response.request.url,
             'errors': {'http_code': response.status},
-        }
+        })
         if 'kf_filename' in response.request.meta:
-            file_error['file_name'] = response.request.meta['kf_filename']
-        file_error.update(kwargs)
-        return FileError(file_error)
+            item['file_name'] = response.request.meta['kf_filename']
+        item.update(kwargs)
+        return item
 
     def _get_package_metadata(self, f, skip_key):
         """
diff --git a/kingfisher_scrapy/exceptions.py b/kingfisher_scrapy/exceptions.py
index 04f1a6c08..da83a3a5f 100644
--- a/kingfisher_scrapy/exceptions.py
+++ b/kingfisher_scrapy/exceptions.py
@@ -7,4 +7,8 @@ class AuthenticationError(KingfisherScrapyError):
 
 
 class SpiderArgumentError(KingfisherScrapyError):
-    """Raises when a spider argument's value is invalid"""
+    """Raised when a spider argument's value is invalid"""
+
+
+class MissingRequiredFieldError(KingfisherScrapyError, KeyError):
+    """Raised when an item is missing a required field"""
diff --git a/kingfisher_scrapy/extensions.py b/kingfisher_scrapy/extensions.py
index 10d77e243..297dd9afe 100644
--- a/kingfisher_scrapy/extensions.py
+++ b/kingfisher_scrapy/extensions.py
@@ -40,7 +40,7 @@ def item_scraped(self, item, spider):
         metadata = {
             'url': item['url'],
             'data_type': item['data_type'],
-            'encoding': item['encoding'],
+            'encoding': item.get('encoding', 'utf-8'),
         }
 
         self._write_file(path + '.fileinfo', metadata, spider)
diff --git a/kingfisher_scrapy/items.py b/kingfisher_scrapy/items.py
index 29c2fef17..a05d5c2bf 100644
--- a/kingfisher_scrapy/items.py
+++ b/kingfisher_scrapy/items.py
@@ -1,9 +1,26 @@
+# https://docs.scrapy.org/en/latest/topics/items.html
 import scrapy
 
+from kingfisher_scrapy.exceptions import MissingRequiredFieldError
 
-class File(scrapy.Item):
+
+class KingfisherItem(scrapy.Item):
     file_name = scrapy.Field()
     url = scrapy.Field()
+
+    def validate(self):
+        """
+        Raises an error if any required field is missing.
+
+        :raises kingfisher_scrapy.extensions.MissingRequiredFieldError: if any required field is missing
+        """
+        if hasattr(self, 'required'):
+            for field in self.required:
+                if field not in self:
+                    raise MissingRequiredFieldError(field)
+
+
+class File(KingfisherItem):
     data = scrapy.Field()
     data_type = scrapy.Field()
     encoding = scrapy.Field()
@@ -15,17 +32,34 @@ class File(scrapy.Item):
     path = scrapy.Field()
     files_store = scrapy.Field()
 
+    required = [
+        'file_name',
+        'url',
+        'data',
+        'data_type',
+    ]
+
 
-class FileItem(scrapy.Item):
+class FileItem(KingfisherItem):
     number = scrapy.Field()
-    file_name = scrapy.Field()
-    url = scrapy.Field()
     data = scrapy.Field()
     data_type = scrapy.Field()
     encoding = scrapy.Field()
 
+    required = [
+        'number',
+        'file_name',
+        'url',
+        'data',
+        'data_type',
+    ]
+
 
-class FileError(scrapy.Item):
-    file_name = scrapy.Field()
-    url = scrapy.Field()
+class FileError(KingfisherItem):
     errors = scrapy.Field()
+
+    required = [
+        'file_name',
+        'url',
+        'errors',
+    ]
diff --git a/kingfisher_scrapy/middlewares.py b/kingfisher_scrapy/middlewares.py
index 1d8aba0c2..b0e7cc5e5 100644
--- a/kingfisher_scrapy/middlewares.py
+++ b/kingfisher_scrapy/middlewares.py
@@ -1,8 +1,3 @@
-# -*- coding: utf-8 -*-
-
-# Define here the models for your spider middleware
-#
-# See documentation in:
 # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
 
 import logging
diff --git a/kingfisher_scrapy/pipelines.py b/kingfisher_scrapy/pipelines.py
index 5ab05e299..2fc034159 100644
--- a/kingfisher_scrapy/pipelines.py
+++ b/kingfisher_scrapy/pipelines.py
@@ -2,8 +2,11 @@
 # https://docs.scrapy.org/en/latest/topics/signals.html#item-signals
 
 
-class KingfisherScrapyPipeline:
+class Validate:
     def process_item(self, item, spider):
-        item.validate()
+        if hasattr(item, 'validate'):
+            # We call this in the item pipeline to guarantee that all items are validated. However, its backtrace isn't
+            # as helpful for debugging, so we could also call it in ``BaseSpider`` if this becomes an issue.
+            item.validate()
 
         return item
diff --git a/kingfisher_scrapy/settings.py b/kingfisher_scrapy/settings.py
index a37532416..53fc8d389 100644
--- a/kingfisher_scrapy/settings.py
+++ b/kingfisher_scrapy/settings.py
@@ -75,9 +75,9 @@
 # Configure item pipelines
 # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'kingfisher_scrapy.pipelines.KingfisherScrapyPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'kingfisher_scrapy.pipelines.Validate': 300,
+}
 
 # To send items to Kingfishet Process, set this to, for example, "http://kingfisher.example.com" (no trailing slash).
 KINGFISHER_API_URI = os.getenv('KINGFISHER_API_URI')

From 78c0aa405b1c8713d1715215159d6f30e081dd1d Mon Sep 17 00:00:00 2001
From: James McKinney <26463+jpmckinney@users.noreply.github.com>
Date: Fri, 29 May 2020 17:34:35 -0400
Subject: [PATCH 4/6] flake8

---
 tests/test_spiders.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/test_spiders.py b/tests/test_spiders.py
index 4cb4c5478..1a4879765 100644
--- a/tests/test_spiders.py
+++ b/tests/test_spiders.py
@@ -1,11 +1,9 @@
 import warnings
 
 import pytest
-import scrapy
 from scrapy.crawler import Crawler, CrawlerRunner
 from scrapy.exceptions import CloseSpider
 from scrapy.http import Response
-from scrapy.utils.deprecate import method_is_overridden
 from scrapy.utils.project import get_project_settings
 
 from kingfisher_scrapy.items import FileError

From a6327e504d43a895b3b9e56fa503a12afd7272d8 Mon Sep 17 00:00:00 2001
From: James McKinney <26463+jpmckinney@users.noreply.github.com>
Date: Fri, 29 May 2020 18:20:12 -0400
Subject: [PATCH 5/6] Add kf_filename meta to all requests that can yield file errors

---
 .../spiders/argentina_buenos_aires.py      |  2 +-
 kingfisher_scrapy/spiders/australia_nsw.py | 17 ++++++----
 .../spiders/dominican_republic.py          |  2 +-
 kingfisher_scrapy/spiders/france.py        |  3 ++-
 .../spiders/honduras_portal_bulk_files.py  |  6 +++--
 .../spiders/indonesia_bandung.py           |  1 +
 kingfisher_scrapy/spiders/openopps.py      | 27 +++++++++++--------
 .../spiders/paraguay_dncp_base.py          | 11 ++++++--
 .../spiders/paraguay_hacienda.py           | 19 ++++++++++---
 9 files changed, 61 insertions(+), 27 deletions(-)

diff --git a/kingfisher_scrapy/spiders/argentina_buenos_aires.py b/kingfisher_scrapy/spiders/argentina_buenos_aires.py
index a434a0164..d1847e3d6 100644
--- a/kingfisher_scrapy/spiders/argentina_buenos_aires.py
+++ b/kingfisher_scrapy/spiders/argentina_buenos_aires.py
@@ -34,4 +34,4 @@ def parse_list(self, response):
         data = json.loads(response.text)
         for resource in data['result']['resources']:
             if resource['format'].upper() == 'JSON':
-                yield scrapy.Request(url=resource['url'])
+                yield scrapy.Request(resource['url'], meta={'kf_filename': resource['url'].rsplit('/', 1)[-1]})
diff --git a/kingfisher_scrapy/spiders/australia_nsw.py b/kingfisher_scrapy/spiders/australia_nsw.py
index d2313d454..7a3d53478 100644
--- a/kingfisher_scrapy/spiders/australia_nsw.py
+++ b/kingfisher_scrapy/spiders/australia_nsw.py
@@ -17,7 +17,10 @@ def start_requests(self):
         for release_type in release_types:
             yield scrapy.Request(
                 url.format(release_type, page_limit),
-                meta={'release_type': release_type},
+                meta={
+                    'kf_filename': '{}.json'.format(release_type),
+                    'release_type': release_type,
+                },
                 callback=self.parse_list
             )
 
@@ -25,33 +28,37 @@ def parse_list(self, response):
         if self.is_http_success(response):
             json_data = json.loads(response.text)
+            release_type = response.request.meta['release_type']
+
             # More Pages?
             if 'links' in json_data and isinstance(json_data['links'], dict) and 'next' in json_data['links'] \
                     and not self.sample:
                 yield scrapy.Request(
                     json_data['links']['next'],
-                    meta={'release_type': response.request.meta['release_type']},
+                    meta={
+                        'kf_filename': hashlib.md5(json_data['links']['next'].encode('utf-8')).hexdigest() + '.json',
+                        'release_type': release_type,
+                    },
                     callback=self.parse_list
                 )
 
             # Data?
             for release in json_data['releases']:
-                if response.request.meta['release_type'] == 'planning':
+                if release_type == 'planning':
                     uuid = release['tender']['plannedProcurementUUID']
                     yield scrapy.Request(
                         'https://tenders.nsw.gov.au/?event=public.api.planning.view&PlannedProcurementUUID=%s' % uuid,
                         meta={'kf_filename': 'plannning-%s.json' % uuid},
                         callback=self.parse
                     )
-                if response.request.meta['release_type'] == 'tender':
+                if release_type == 'tender':
                     uuid = release['tender']['RFTUUID']
                     yield scrapy.Request(
                         'https://tenders.nsw.gov.au/?event=public.api.tender.view&RFTUUID=%s' % uuid,
                         meta={'kf_filename': 'tender-%s.json' % uuid},
                         callback=self.parse
                     )
-                if response.request.meta['release_type'] == 'contract':
+                if release_type == 'contract':
                     for award in release['awards']:
                         uuid = award['CNUUID']
                         yield scrapy.Request(
diff --git a/kingfisher_scrapy/spiders/dominican_republic.py b/kingfisher_scrapy/spiders/dominican_republic.py
index fe732e0e5..7962a749d 100644
--- a/kingfisher_scrapy/spiders/dominican_republic.py
+++ b/kingfisher_scrapy/spiders/dominican_republic.py
@@ -31,7 +31,7 @@ def parse_list(self, response):
         for url in json_urls:
             if '/JSON_DGCP_' in url:
-                yield scrapy.Request('https:' + url)
+                yield scrapy.Request('https:' + url, meta={'kf_filename': url.rsplit('/', 1)[-1]})
 
     def parse(self, response):
         if self.is_http_success(response):
diff --git a/kingfisher_scrapy/spiders/france.py b/kingfisher_scrapy/spiders/france.py
index bf2989ddb..203b9ba0e 100644
--- a/kingfisher_scrapy/spiders/france.py
+++ b/kingfisher_scrapy/spiders/france.py
@@ -41,7 +41,8 @@ def parse_list(self, response):
         if next_page:
             yield scrapy.Request(
                 next_page,
-                callback=self.parse_item
+                meta={'kf_filename': hashlib.md5(next_page.encode('utf-8')).hexdigest() + '.json'},
+                callback=self.parse_list
             )
 
     @handle_error
diff --git a/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py b/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py
index 889624f5c..e60582bae 100644
--- a/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py
+++ b/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py
@@ -22,11 +22,13 @@ def parse_list(self, response):
         filelist = json.loads(response.text)
 
         if self.sample:
-            yield scrapy.Request(filelist[0]['urls']['json'])
+            url = filelist[0]['urls']['json']
+            yield scrapy.Request(url, meta={'kf_filename': url.rsplit('/', 1)[-1]})
 
         else:
             for item in filelist:
-                yield scrapy.Request(item['urls']['json'])
+                url = item['urls']['json']
+                yield scrapy.Request(url, meta={'kf_filename': url.rsplit('/', 1)[-1]})
 
     def parse(self, response):
         filename = urlparse(response.request.url).path.split('/')[-2]
diff --git a/kingfisher_scrapy/spiders/indonesia_bandung.py b/kingfisher_scrapy/spiders/indonesia_bandung.py
index cec93b1e1..0b6c7a683 100644
--- a/kingfisher_scrapy/spiders/indonesia_bandung.py
+++ b/kingfisher_scrapy/spiders/indonesia_bandung.py
@@ -39,6 +39,7 @@ def parse_data(self, response):
         if next_page_url:
             yield scrapy.Request(
                 next_page_url,
+                meta={'kf_filename': next_page_url.rsplit('/', 1)[-1] + '.json'},
                 callback=self.parse_data
             )
diff --git a/kingfisher_scrapy/spiders/openopps.py b/kingfisher_scrapy/spiders/openopps.py
index 930edc130..79bb8bb59 100644
--- a/kingfisher_scrapy/spiders/openopps.py
+++ b/kingfisher_scrapy/spiders/openopps.py
@@ -175,9 +175,13 @@ def parse(self, response):
                 next_url = results.get('next')
                 if next_url:
                     yield scrapy.Request(
-                        url=next_url,
-                        headers={"Accept": "*/*", "Content-Type": "application/json"},
-                        meta={"release_date": release_date, "search_h": search_h},
+                        next_url,
+                        meta={
+                            'kf_filename': hashlib.md5(next_url.encode('utf-8')).hexdigest() + '.json',
+                            'release_date': release_date,
+                            'search_h': search_h,
+                        },
+                        headers={'Accept': '*/*', 'Content-Type': 'application/json'}
                     )
 
                 # Tells if we have to re-authenticate before the token expires
@@ -221,15 +225,16 @@ def parse(self, response):
                     self.logger.info('Changing filters, split in {}: {}.'.format(parts, response.request.url))
                     for i in range(len(start_hour_list)):
+                        url = self.base_page_url.format(start_hour_list[i], end_hour_list[i])
                         yield scrapy.Request(
-                            url=self.base_page_url.format(
-                                start_hour_list[i],
-                                end_hour_list[i]
-                            ),
-                            headers={"Accept": "*/*", "Content-Type": "application/json"},
-                            meta={"release_date": start_hour_list[i],  # release_date with star hour
-                                  "last_hour": end_hour_list[i],  # release_date with last hour
-                                  "search_h": split_h},  # new search range
+                            url,
+                            meta={
+                                'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json',
+                                'release_date': start_hour_list[i],  # release_date with star hour
+                                'last_hour': end_hour_list[i],  # release_date with last hour
+                                'search_h': split_h,  # new search range
+                            },
+                            headers={'Accept': '*/*', 'Content-Type': 'application/json'}
                         )
                     else:
                         # Message for pages that exceed the 10,000 search results in the range of one hour
diff --git a/kingfisher_scrapy/spiders/paraguay_dncp_base.py b/kingfisher_scrapy/spiders/paraguay_dncp_base.py
index 1725e0504..69b2d4a27 100644
--- a/kingfisher_scrapy/spiders/paraguay_dncp_base.py
+++ b/kingfisher_scrapy/spiders/paraguay_dncp_base.py
@@ -51,10 +51,15 @@ def from_crawler(cls, crawler, *args, **kwargs):
     def start_requests(self):
         if self.from_date:
+            from_date = self.from_date.strftime(self.date_format)
             self.base_page_url = '{}/search/processes?tipo_fecha=fecha_release&fecha_desde={}'\
-                .format(self.base_url, self.from_date.strftime(self.date_format))
+                .format(self.base_url, from_date)
         yield scrapy.Request(
             self.base_page_url,
+            meta={
+                'kf_filename': '{}-1.json'.format(from_date),
+                'from_date': from_date,
+            },
             # send duplicate requests when the token expired and in the continuation of last_request saved.
             dont_filter=True,
             callback=self.parse_pages
         )
@@ -123,9 +128,11 @@ def parse_pages(self, response):
             )
         pagination = content['pagination']
         if pagination['current_page'] < pagination['total_pages'] and not self.sample:
-            url = '{}&page={}'.format(self.base_page_url, pagination['current_page'] + 1)
+            page = pagination['current_page'] + 1
+            url = '{}&page={}'.format(self.base_page_url, page)
             yield scrapy.Request(
                 url,
+                meta={'kf_filename': '{}-{}.json'.format(response.request.meta['from_date'], page)},
                 dont_filter=True,
                 callback=self.parse_pages
             )
diff --git a/kingfisher_scrapy/spiders/paraguay_hacienda.py b/kingfisher_scrapy/spiders/paraguay_hacienda.py
index cbc98cf4e..b0db40f4d 100644
--- a/kingfisher_scrapy/spiders/paraguay_hacienda.py
+++ b/kingfisher_scrapy/spiders/paraguay_hacienda.py
@@ -44,9 +44,13 @@ def start_requests(self):
         # so we first iterate over this list that is paginated
         yield scrapy.Request(
             self.base_list_url.format(1),
+            meta={
+                'kf_filename': 'list-1.json',
+                'meta': True,
+                'first': True,
+            },
             # send duplicate requests when the token expired and in the continuation of last_request saved.
             dont_filter=True,
-            meta={'meta': True, 'first': True}
         )
 
     @handle_error
@@ -60,7 +64,11 @@ def parse(self, response):
             for page in range(2, total_pages+1):
                 yield scrapy.Request(
                     url=self.base_list_url.format(page),
-                    meta={'meta': True, 'first': False},
+                    meta={
+                        'kf_filename': 'list-{}.json'.format(page),
+                        'meta': True,
+                        'first': False,
+                    },
                     dont_filter=True
                 )
 
@@ -76,8 +84,11 @@ def parse(self, response):
                 self.release_ids.append(row['idLlamado'])
                 yield scrapy.Request(
                     url=base_url.format(row['idLlamado']),
-                    meta={'meta': False, 'first': False,
-                          'kf_filename': 'release-{}.json'.format(row['idLlamado'])},
+                    meta={
+                        'kf_filename': 'release-{}.json'.format(row['idLlamado']),
+                        'meta': False,
+                        'first': False,
+                    },
                     dont_filter=True
                 )
             else:

From b2d0b88ca49540b552d93241d1d4605956f78a6c Mon Sep 17 00:00:00 2001
From: James McKinney <26463+jpmckinney@users.noreply.github.com>
Date: Fri, 29 May 2020 18:29:27 -0400
Subject: [PATCH 6/6] Add test for Validate item pipeline

---
 tests/test_validate.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 tests/test_validate.py

diff --git a/tests/test_validate.py b/tests/test_validate.py
new file mode 100644
index 000000000..641c73e9d
--- /dev/null
+++ b/tests/test_validate.py
@@ -0,0 +1,24 @@
+import pytest
+from kingfisher_scrapy.exceptions import MissingRequiredFieldError
+from kingfisher_scrapy.items import File
+from kingfisher_scrapy.pipelines import Validate
+
+
+def test_process_item():
+    pipeline = Validate()
+    item = File({
+        'file_name': '',
+        'data': '',
+        'data_type': '',
+        'url': '',
+    })
+
+    assert pipeline.process_item(item, None) == item
+
+
+def test_process_item_error():
+    pipeline = Validate()
+    item = File()
+
+    with pytest.raises(MissingRequiredFieldError):
+        pipeline.process_item(item, None)
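
As a rough, hypothetical sketch (not part of the patch series above), the pieces combine like this: a spider sets a 'kf_filename' key on every request it yields; if the request errors, build_file_error_from_response() copies that value into the FileError item's required 'file_name' field, and the Validate pipeline raises MissingRequiredFieldError whenever a required field is absent. The spider name, URL and data_type below are invented for illustration, and build_file_from_response() is assumed to accept a data_type keyword as it is used elsewhere in this repository.

    import scrapy

    from kingfisher_scrapy.base_spider import BaseSpider
    from kingfisher_scrapy.util import handle_error


    class ExampleSpider(BaseSpider):
        # Hypothetical spider, for illustration only.
        name = 'example'

        def start_requests(self):
            # 'kf_filename' lets build_file_error_from_response() populate the
            # required FileError 'file_name' field if this request fails.
            yield scrapy.Request(
                'https://example.com/releases.json',
                meta={'kf_filename': 'list.json'},
            )

        @handle_error
        def parse(self, response):
            # On success, emit a File item named after the request's kf_filename.
            yield self.build_file_from_response(
                response,
                response.request.meta['kf_filename'],
                data_type='release_package',
            )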