diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py
index badf778d2..f9ef1a168 100644
--- a/kingfisher_scrapy/base_spider.py
+++ b/kingfisher_scrapy/base_spider.py
@@ -45,7 +45,7 @@ class BaseSpider(scrapy.Spider):
     VALID_DATE_FORMATS = {'date': '%Y-%m-%d', 'datetime': '%Y-%m-%dT%H:%M:%S', 'year-month': '%Y-%m'}
 
     def __init__(self, sample=None, note=None, from_date=None, until_date=None,
-                 date_format='date', *args, **kwargs):
+                 date_format='date', latest=None, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
         # https://docs.scrapy.org/en/latest/topics/spiders.html#spider-arguments
@@ -54,12 +54,14 @@ def __init__(self, sample=None, note=None, from_date=None, until_date=None,
         self.from_date = from_date
         self.until_date = until_date
         self.date_format = self.VALID_DATE_FORMATS[date_format]
+        self.latest = latest == 'true'
 
         spider_arguments = {
             'sample': sample,
             'note': note,
             'from_date': from_date,
             'until_date': until_date,
+            'latest': latest,
         }
         spider_arguments.update(kwargs)
         self.logger.info('Spider arguments: {!r}'.format(spider_arguments))
@@ -304,6 +306,7 @@ def start_requests(self):
 
     encoding = 'utf-8'
     zip_file_format = None
+    skip_latest_release_date = "This command doesn't yet support identifying the latest release in a ZIP file."
 
     @handle_http_error
     def parse(self, response):
diff --git a/kingfisher_scrapy/commands/latestreleasedate.py b/kingfisher_scrapy/commands/latestreleasedate.py
new file mode 100644
index 000000000..e48f7484c
--- /dev/null
+++ b/kingfisher_scrapy/commands/latestreleasedate.py
@@ -0,0 +1,34 @@
+import os
+from datetime import datetime
+
+from scrapy.commands import ScrapyCommand
+from scrapy.crawler import CrawlerProcess
+from scrapy.utils.project import get_project_settings
+
+
+class LatestReleaseDatePerPublisher(ScrapyCommand):
+    def short_desc(self):
+        return 'Get the latest published release date per publisher'
+
+    def run(self, args, opts):
+        settings = get_project_settings()
+        settings.set('CLOSESPIDER_ITEMCOUNT', 1)
+        settings.set('CONCURRENT_REQUESTS', 1)
+        settings.set('CLOSESPIDER_ERRORCOUNT', 1)
+
+        path = settings['KINGFISHER_LATEST_RELEASE_DATE_FILE_PATH']
+        os.makedirs(path, exist_ok=True)
+        os.unlink(os.path.join(path, 'latest_dates.csv'))
+        filename = os.path.join(path, 'skipped_spiders.txt')
+
+        process = CrawlerProcess(settings=settings)
+        spiders = process.spider_loader.list()
+        current_year = datetime.today().year
+        with open(filename, 'w') as output:
+            for spider in spiders:
+                spider_cls = process.spider_loader.load(spider)
+                if hasattr(spider_cls, 'skip_latest_release_date'):
+                    output.write(f'Skipping {spider}. Reason: {spider_cls.skip_latest_release_date}\n')
+                else:
+                    process.crawl(spider, latest='true', year=current_year)
+        process.start()
diff --git a/kingfisher_scrapy/extensions.py b/kingfisher_scrapy/extensions.py
index a812fdd23..0cada7dae 100644
--- a/kingfisher_scrapy/extensions.py
+++ b/kingfisher_scrapy/extensions.py
@@ -7,10 +7,39 @@
 from scrapy import signals
 from scrapy.exceptions import NotConfigured
 
-from kingfisher_scrapy.items import File, FileError, FileItem
+from kingfisher_scrapy.items import File, FileError, FileItem, LatestReleaseDateItem
 from kingfisher_scrapy.kingfisher_process import Client
 
 
+# https://docs.scrapy.org/en/latest/topics/extensions.html#writing-your-own-extension
+class KingfisherLatestDate:
+    def __init__(self, filename):
+        self.filename = filename
+        self.spiders_seen = set()
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        path = crawler.settings['KINGFISHER_LATEST_RELEASE_DATE_FILE_PATH']
+        os.makedirs(path, exist_ok=True)
+        filename = os.path.join(path, 'latest_dates.csv')
+        extension = cls(filename=filename)
+        crawler.signals.connect(extension.item_scraped, signal=signals.item_scraped)
+        crawler.signals.connect(extension.spider_closed, signal=signals.spider_closed)
+        return extension
+
+    def item_scraped(self, item, spider):
+        if not isinstance(item, LatestReleaseDateItem) or spider.name in self.spiders_seen:
+            return
+        self.spiders_seen.add(spider.name)
+        with open(self.filename, 'a+') as output:
+            output.write(f"{spider.name},{item['date']}\n")
+
+    def spider_closed(self, spider, reason):
+        if spider.name not in self.spiders_seen:
+            with open(self.filename, 'a+') as output:
+                output.write(f"{spider.name},{reason}\n")
+
+
 class KingfisherFilesStore:
     def __init__(self, directory):
         self.directory = directory
@@ -101,7 +130,7 @@ def spider_closed(self, spider, reason):
         Sends an API request to end the collection's store step.
         """
         # https://docs.scrapy.org/en/latest/topics/signals.html#spider-closed
-        if reason != 'finished':
+        if reason != 'finished' or spider.latest:
             return
 
         response = self.client.end_collection_store({
@@ -118,9 +147,9 @@ def item_scraped(self, item, spider):
         """
         Sends an API request to store the file, file item or file error in Kingfisher Process.
         """
-        if not item.get('post_to_api', True):
-            return
+        if not item.get('post_to_api', True) or isinstance(item, LatestReleaseDateItem):
+            return
 
         data = {
             'collection_source': spider.name,
             'collection_data_version': spider.get_start_time('%Y-%m-%d %H:%M:%S'),
diff --git a/kingfisher_scrapy/items.py b/kingfisher_scrapy/items.py
index 7ce0e95e5..03e9692ac 100644
--- a/kingfisher_scrapy/items.py
+++ b/kingfisher_scrapy/items.py
@@ -31,3 +31,7 @@ class FileItem(KingfisherItem):
 
 class FileError(KingfisherItem):
     errors = scrapy.Field()
+
+
+class LatestReleaseDateItem(scrapy.Item):
+    date = scrapy.Field()
diff --git a/kingfisher_scrapy/log_formatter.py b/kingfisher_scrapy/log_formatter.py
index a7298837b..cc8fcb19f 100644
--- a/kingfisher_scrapy/log_formatter.py
+++ b/kingfisher_scrapy/log_formatter.py
@@ -7,6 +7,7 @@ def scraped(self, item, response, spider):
         """
         Omits an item's `data` value from the log message.
         """
-        item = item.copy()
-        item.pop('data', None)
-        return super().scraped(item, response, spider)
+        if item:
+            item = item.copy()
+            item.pop('data', None)
+        return super().scraped(item, response, spider)
diff --git a/kingfisher_scrapy/pipelines.py b/kingfisher_scrapy/pipelines.py
index b0935f8fb..f1d918a60 100644
--- a/kingfisher_scrapy/pipelines.py
+++ b/kingfisher_scrapy/pipelines.py
@@ -6,7 +6,7 @@
 from jsonschema import FormatChecker
 from jsonschema.validators import Draft4Validator, RefResolver
 
-from kingfisher_scrapy.items import File, FileItem
+from kingfisher_scrapy.items import File, FileItem, LatestReleaseDateItem
 
 
 def _json_loads(basename):
@@ -40,3 +40,50 @@ def process_item(self, item, spider):
             self.files.add(key)
 
         return item
+
+
+class LatestReleaseDate:
+    def __init__(self):
+        self.processed = set()
+
+    def process_item(self, item, spider):
+        if spider.name in self.processed:
+            spider.crawler.engine.close_spider(spider, reason='processed')
+            return
+        if spider.latest and isinstance(item, (File, FileItem)):
+            date = None
+            data = json.loads(item['data'])
+            if item['data_type'] in ('release_package', 'release_package_list', 'release_package_list_in_results',
+                                     'release_list', 'release', 'compiled_release'):
+                if item['data_type'] == 'release_package':
+                    data = data['releases']
+                elif item['data_type'] == 'release_package_list':
+                    data = data[0]['releases']
+                elif item['data_type'] == 'release_package_list_in_results':
+                    data = data['results'][0]['releases']
+                if data:
+                    if item['data_type'] in ('release', 'compiled_release'):
+                        date = data['date']
+                    else:
+                        date = max(r['date'] for r in data)
+            elif item['data_type'] in ('record_package', 'record', 'record_list', 'record_package_list',
+                                       'record_package_list_in_results'):
+                if item['data_type'] == 'record_package':
+                    data = data['records']
+                elif item['data_type'] == 'record_package_list':
+                    data = data[0]['records']
+                elif item['data_type'] == 'record_package_list_in_results':
+                    data = data['results'][0]['records']
+                elif item['data_type'] == 'record':
+                    data = [data]
+                if data:
+                    # This assumes that the first record in the record package has the most recent date.
+                    data = data[0]
+                    if 'releases' in data:
+                        date = max(r['date'] for r in data['releases'])
+                    elif 'compiledRelease' in data:
+                        date = data['compiledRelease']['date']
+            self.processed.add(spider.name)
+            return LatestReleaseDateItem({'date': date})
+        else:
+            return item
diff --git a/kingfisher_scrapy/settings.py b/kingfisher_scrapy/settings.py
index dd784a3a4..ece766ad0 100644
--- a/kingfisher_scrapy/settings.py
+++ b/kingfisher_scrapy/settings.py
@@ -71,6 +71,7 @@
 #}
 EXTENSIONS = {
     'kingfisher_scrapy.extensions.SentryLogging': -1,
+    'kingfisher_scrapy.extensions.KingfisherLatestDate': 1,
     # `KingfisherFilesStore` must run before `KingfisherProcessAPI`, because the file needs to be written before the
     # request is sent to Kingfisher Process.
     'kingfisher_scrapy.extensions.KingfisherFilesStore': 100,
@@ -80,6 +81,7 @@
 
 # Configure item pipelines
 # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
 ITEM_PIPELINES = {
+    'kingfisher_scrapy.pipelines.LatestReleaseDate': 300,
     'kingfisher_scrapy.pipelines.Validate': 300,
 }
@@ -131,6 +133,7 @@
 
 # https://docs.scrapy.org/en/latest/topics/media-pipeline.html#std:setting-FILES_STORE
 FILES_STORE = os.getenv('FILES_STORE', 'data')
+KINGFISHER_LATEST_RELEASE_DATE_FILE_PATH = os.getenv('KINGFISHER_LATEST_RELEASE_DATE_FILE_PATH', 'latest_dates')
 
 # https://docs.scrapy.org/en/latest/topics/spider-middleware.html#httperror-allow-all
 HTTPERROR_ALLOW_ALL = True
diff --git a/kingfisher_scrapy/spiders/afghanistan_records.py b/kingfisher_scrapy/spiders/afghanistan_records.py
index 9e5060e7d..bed722415 100644
--- a/kingfisher_scrapy/spiders/afghanistan_records.py
+++ b/kingfisher_scrapy/spiders/afghanistan_records.py
@@ -16,6 +16,7 @@ class AfghanistanRecords(SimpleSpider):
     """
     name = 'afghanistan_records'
     data_type = 'record'
+    skip_latest_release_date = 'Already covered by afghanistan_releases'
 
     download_delay = 1
 
diff --git a/kingfisher_scrapy/spiders/australia.py b/kingfisher_scrapy/spiders/australia.py
index 593cd3a72..1ed5b0c10 100644
--- a/kingfisher_scrapy/spiders/australia.py
+++ b/kingfisher_scrapy/spiders/australia.py
@@ -23,5 +23,4 @@ class Australia(LinksSpider):
     def start_requests(self):
         url = f'https://api.tenders.gov.au/ocds/findByDates/contractPublished/' \
               f'2004-01-01T00:00:00Z/{date.today().year}-12-31T23:59:59Z'
-
         yield scrapy.Request(url, meta={'file_name': 'start.json'})
diff --git a/kingfisher_scrapy/spiders/canada_buyandsell.py b/kingfisher_scrapy/spiders/canada_buyandsell.py
index 416f78d61..948be2b7a 100644
--- a/kingfisher_scrapy/spiders/canada_buyandsell.py
+++ b/kingfisher_scrapy/spiders/canada_buyandsell.py
@@ -15,10 +15,10 @@ class CanadaBuyAndSell(SimpleSpider):
 
     def start_requests(self):
         urls = [
-            'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-13-14.json',
-            'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-14-15.json',
-            'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-15-16.json',
             'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-16-17.json',
+            'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-15-16.json',
+            'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-14-15.json',
+            'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-13-14.json',
         ]
         if self.sample:
             urls = [urls[0]]
diff --git a/kingfisher_scrapy/spiders/chile_base.py b/kingfisher_scrapy/spiders/chile_base.py
index 57e468304..876a51bec 100644
--- a/kingfisher_scrapy/spiders/chile_base.py
+++ b/kingfisher_scrapy/spiders/chile_base.py
@@ -63,7 +63,8 @@ def parse_list(self, response):
             # }
             yield from self.handle_item(item)
 
-        if 'pagination' in data and (data['pagination']['offset'] + self.limit) < data['pagination']['total']:
+        if 'pagination' in data and (data['pagination']['offset'] + self.limit) < data['pagination']['total']\
+                and not self.sample:
             year = response.request.meta['year']
             month = response.request.meta['month']
             offset = data['pagination']['offset']
diff --git a/kingfisher_scrapy/spiders/chile_compra_records.py b/kingfisher_scrapy/spiders/chile_compra_records.py
index a40596233..62143338f 100644
--- a/kingfisher_scrapy/spiders/chile_compra_records.py
+++ b/kingfisher_scrapy/spiders/chile_compra_records.py
@@ -12,6 +12,7 @@ class ChileCompraRecords(ChileCompraBaseSpider):
     """
     name = 'chile_compra_records'
    data_type = 'record_package'
+    skip_latest_release_date = 'Already covered by chile_compra_releases'
 
     def handle_item(self, item):
         url = 'https://apis.mercadopublico.cl/OCDS/data/record/' + item['ocid'].replace('ocds-70d2nz-', '')
diff --git a/kingfisher_scrapy/spiders/digiwhist_base.py b/kingfisher_scrapy/spiders/digiwhist_base.py
index a19e15bab..124fdc20f 100644
--- a/kingfisher_scrapy/spiders/digiwhist_base.py
+++ b/kingfisher_scrapy/spiders/digiwhist_base.py
@@ -8,6 +8,8 @@
 
 
 class DigiwhistBase(BaseSpider):
+    skip_latest_release_date = 'Unordered JSON Lines files'
+
     def start_requests(self):
         # See scrapy.spiders.Spider.start_requests
         for url in self.start_urls:
diff --git a/kingfisher_scrapy/spiders/dominican_republic.py b/kingfisher_scrapy/spiders/dominican_republic.py
index e6a41baf2..c52354979 100644
--- a/kingfisher_scrapy/spiders/dominican_republic.py
+++ b/kingfisher_scrapy/spiders/dominican_republic.py
@@ -17,6 +17,7 @@ class DominicanRepublic(BaseSpider):
     Downloads a release package for the oldest year (2018, first link in the downloads page).
     """
     name = 'dominican_republic'
+    skip_latest_release_date = "This command doesn't yet support identifying the latest release in a RAR file."
 
     def start_requests(self):
         yield scrapy.Request(
diff --git a/kingfisher_scrapy/spiders/honduras_oncae.py b/kingfisher_scrapy/spiders/honduras_oncae.py
index 8ee680ac5..7d898e139 100644
--- a/kingfisher_scrapy/spiders/honduras_oncae.py
+++ b/kingfisher_scrapy/spiders/honduras_oncae.py
@@ -14,6 +14,7 @@ class HondurasONCAE(ZipSpider):
     """
     name = 'honduras_oncae'
     data_type = 'release_package'
+    skip_latest_release_date = 'Already covered by honduras_portal_releases'
 
     # the files take too long to be downloaded, so we increase the download timeout
     download_timeout = 900
diff --git a/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py b/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py
index 2ba869669..0593d3f3c 100644
--- a/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py
+++ b/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py
@@ -16,6 +16,7 @@ class HondurasPortalBulkFiles(SimpleSpider):
     """
     name = 'honduras_portal_bulk_files'
     data_type = 'release_package'
+    skip_latest_release_date = 'Already covered by honduras_portal_releases'
 
     def start_requests(self):
         yield scrapy.Request(
diff --git a/kingfisher_scrapy/spiders/honduras_portal_records.py b/kingfisher_scrapy/spiders/honduras_portal_records.py
index 33ba9d976..9168cdf4b 100644
--- a/kingfisher_scrapy/spiders/honduras_portal_records.py
+++ b/kingfisher_scrapy/spiders/honduras_portal_records.py
@@ -19,6 +19,7 @@ class HondurasPortalRecords(LinksSpider):
     data_pointer = '/recordPackage'
     next_pointer = '/next'
     next_page_formatter = staticmethod(parameters('page'))
+    skip_latest_release_date = 'Already covered by honduras_portal_releases'
 
     download_delay = 0.9
 
diff --git a/kingfisher_scrapy/spiders/moldova_old.py b/kingfisher_scrapy/spiders/moldova_old.py
index 938b1113e..36f5970e2 100644
--- a/kingfisher_scrapy/spiders/moldova_old.py
+++ b/kingfisher_scrapy/spiders/moldova_old.py
@@ -12,6 +12,7 @@ class MoldovaOld(SimpleSpider):
     """
     name = 'moldova_old'
     data_type = 'release_package'
+    skip_latest_release_date = 'Old endpoint'
 
     def start_requests(self):
         pattern = 'http://opencontracting.date.gov.md/ocds-api/year/{}'
diff --git a/kingfisher_scrapy/spiders/paraguay_dncp_records.py b/kingfisher_scrapy/spiders/paraguay_dncp_records.py
index 721e666ad..8d3acfa9f 100644
--- a/kingfisher_scrapy/spiders/paraguay_dncp_records.py
+++ b/kingfisher_scrapy/spiders/paraguay_dncp_records.py
@@ -17,6 +17,7 @@ class ParaguayDNCPRecords(ParaguayDNCPBaseSpider):
     """
     name = 'paraguay_dncp_records'
     data_type = 'record_package'
+    skip_latest_release_date = 'Already covered by paraguay_dncp_releases'
 
     def get_files_to_download(self, content):
         for record in content['records']:
diff --git a/kingfisher_scrapy/spiders/test_fail.py b/kingfisher_scrapy/spiders/test_fail.py
index f71bd70af..af04bad63 100644
--- a/kingfisher_scrapy/spiders/test_fail.py
+++ b/kingfisher_scrapy/spiders/test_fail.py
@@ -9,6 +9,7 @@
 class TestFail(SimpleSpider):
     name = 'test_fail'
     data_type = 'release_package'
+    skip_latest_release_date = 'Not a real spider'
 
     def start_requests(self):
         # Fine
diff --git a/kingfisher_scrapy/spiders/uk_contracts_finder.py b/kingfisher_scrapy/spiders/uk_contracts_finder.py
index 5fd8407c7..354ce75fd 100644
--- a/kingfisher_scrapy/spiders/uk_contracts_finder.py
+++ b/kingfisher_scrapy/spiders/uk_contracts_finder.py
@@ -15,7 +15,7 @@ class UKContractsFinder(SimpleSpider):
     encoding = 'iso-8859-1'
 
     def start_requests(self):
-        url = 'https://www.contractsfinder.service.gov.uk/Published/Notices/OCDS/Search?order=asc&page=1'
+        url = 'https://www.contractsfinder.service.gov.uk/Published/Notices/OCDS/Search?order=desc&page=1'
         yield self.build_request(url, formatter=parameters('page'), callback=self.parse_list)
 
     @handle_http_error
diff --git a/kingfisher_scrapy/spiders/uruguay_records.py b/kingfisher_scrapy/spiders/uruguay_records.py
index b3053ed4e..42b91077c 100644
--- a/kingfisher_scrapy/spiders/uruguay_records.py
+++ b/kingfisher_scrapy/spiders/uruguay_records.py
@@ -12,6 +12,7 @@ class UruguayRecords(UruguayBase):
     """
     name = 'uruguay_records'
     data_type = 'record_package'
+    skip_latest_release_date = 'Already covered by uruguay_releases'
 
     @handle_http_error
     def parse_list(self, response):
diff --git a/tests/test_extensions.py b/tests/test_extensions.py
index 3e1972b4f..4c7e7a58e 100644
--- a/tests/test_extensions.py
+++ b/tests/test_extensions.py
@@ -6,8 +6,8 @@
 import pytest
 from scrapy.exceptions import NotConfigured
 
-from kingfisher_scrapy.extensions import KingfisherFilesStore, KingfisherProcessAPI
-from kingfisher_scrapy.items import FileError
+from kingfisher_scrapy.extensions import KingfisherFilesStore, KingfisherLatestDate, KingfisherProcessAPI
+from kingfisher_scrapy.items import FileError, LatestReleaseDateItem
 from tests import spider_with_crawler
 
 
@@ -365,3 +365,30 @@ def test_build_file_with_existing_directory():
 
     # No FileExistsError exception.
     store_extension.item_scraped(spider.build_file(file_name='file.json', data=b'{"key": "value"}'), spider)
+
+
+def test_item_scraped_latest_date():
+    with TemporaryDirectory() as tmpdirname:
+        spider = spider_with_files_store(tmpdirname, latest=True)
+        spider.crawler.settings['KINGFISHER_LATEST_RELEASE_DATE_FILE_PATH'] = tmpdirname
+
+        latest_extension = KingfisherLatestDate.from_crawler(spider.crawler)
+        item = LatestReleaseDateItem({'date': '2020-10-01T00:00:00Z'})
+        latest_extension.item_scraped(item, spider)
+
+        with open(os.path.join(tmpdirname, 'latest_dates.csv')) as f:
+            assert 'test,2020-10-01T00:00:00Z\n' == f.read()
+
+        # The same item is processed only once.
+        latest_extension.item_scraped(item, spider)
+
+        with open(os.path.join(tmpdirname, 'latest_dates.csv')) as f:
+            assert 'test,2020-10-01T00:00:00Z\n' == f.read()
+
+        # A spider that never scraped a date item is recorded with its close reason.
+        spider.name = 'no date'
+
+        latest_extension.spider_closed(spider, 'itemcount')
+
+        with open(os.path.join(tmpdirname, 'latest_dates.csv')) as f:
+            assert 'test,2020-10-01T00:00:00Z\nno date,itemcount\n' == f.read()
diff --git a/tests/test_latest_release_date.py b/tests/test_latest_release_date.py
new file mode 100644
index 000000000..396dae89e
--- /dev/null
+++ b/tests/test_latest_release_date.py
@@ -0,0 +1,79 @@
+import json
+
+from kingfisher_scrapy.items import File, LatestReleaseDateItem
+from kingfisher_scrapy.pipelines import LatestReleaseDate
+from tests import spider_with_crawler
+
+
+def test_process_item():
+    spider = spider_with_crawler()
+    spider.latest = True
+    pipeline = LatestReleaseDate()
+    release_package = {"releases": [{"date": "2020-01-01T00:00:00Z"}, {"date": "2020-10-01T00:00:00Z"}]}
+    record_package = {"records": [release_package]}
+    item = File({
+        'file_name': 'test',
+        'data': json.dumps(release_package),
+        'data_type': 'release_package',
+        'url': 'http://test.com',
+    })
+    expected_item = LatestReleaseDateItem({'date': '2020-10-01T00:00:00Z'})
+    assert pipeline.process_item(item, spider) == expected_item
+
+    item['data_type'] = 'release_list'
+    item['data'] = json.dumps(release_package['releases'])
+    spider.name = 'test3'
+    assert pipeline.process_item(item, spider) == expected_item
+
+    item['data_type'] = 'release_package_list'
+    item['data'] = json.dumps([release_package])
+    spider.name = 'test4'
+    assert pipeline.process_item(item, spider) == expected_item
+
+    item['data_type'] = 'release_package_list_in_results'
+    item['data'] = json.dumps({'results': [release_package]})
+    spider.name = 'test5'
+    assert pipeline.process_item(item, spider) == expected_item
+
+    item['data_type'] = 'release'
+    item['data'] = json.dumps(release_package['releases'][1])
+    spider.name = 'test6'
+    assert pipeline.process_item(item, spider) == expected_item
+
+    item['data_type'] = 'compiled_release'
+    spider.name = 'test7'
+    assert pipeline.process_item(item, spider) == expected_item
+
+    item['data_type'] = 'record_package'
+    item['data'] = json.dumps(record_package)
+    spider.name = 'test2'
+    assert pipeline.process_item(item, spider) == expected_item
+
+    item['data_type'] = 'record'
+    item['data'] = json.dumps(record_package['records'][0])
+    spider.name = 'test8'
+    assert pipeline.process_item(item, spider) == expected_item
+
+    item['data_type'] = 'record'
+    item['data'] = json.dumps({'compiledRelease': release_package['releases'][1]})
+    spider.name = 'test-compiledRelease'
+    assert pipeline.process_item(item, spider) == expected_item
+
+    item['data_type'] = 'record_list'
+    item['data'] = json.dumps([record_package['records'][0]])
+    spider.name = 'test9'
+    assert pipeline.process_item(item, spider) == expected_item
+
+    item['data_type'] = 'record_package_list'
+    item['data'] = json.dumps([record_package])
+    spider.name = 'test10'
+    assert pipeline.process_item(item, spider) == expected_item
+
+    item['data_type'] = 'record_package_list_in_results'
+    item['data'] = json.dumps({'results': [record_package]})
+    spider.name = 'test11'
+    assert pipeline.process_item(item, spider) == expected_item
+
+    spider.latest = False
+    spider.name = 'other'
+    assert pipeline.process_item(item, spider) == item