From adbeb2ea45d2c7934bbebfc8af40af0e4dc73fbb Mon Sep 17 00:00:00 2001
From: Yohanna Lisnichuk
Date: Wed, 17 Jun 2020 13:49:29 -0400
Subject: [PATCH 1/3] Add tests to validation pipeline

Signed-off-by: Yohanna Lisnichuk
---
 kingfisher_scrapy/pipelines.py | 15 ++++++++++++
 tests/test_validate.py        | 42 +++++++++++++++++++++++++++++++++-
 2 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/kingfisher_scrapy/pipelines.py b/kingfisher_scrapy/pipelines.py
index 2fc034159..8807cf662 100644
--- a/kingfisher_scrapy/pipelines.py
+++ b/kingfisher_scrapy/pipelines.py
@@ -1,12 +1,27 @@
 # https://docs.scrapy.org/en/latest/topics/item-pipeline.html
 # https://docs.scrapy.org/en/latest/topics/signals.html#item-signals
+from kingfisher_scrapy.items import File, FileItem
 
 
 class Validate:
+    def __init__(self):
+        self.file_names = set()
+        self.file_items = set()
+
     def process_item(self, item, spider):
         if hasattr(item, 'validate'):
             # We call this in the item pipeline to guarantee that all items are validated. However, its backtrace isn't
             # as helpful for debugging, so we could also call it in ``BaseSpider`` if this becomes an issue.
             item.validate()
 
+        if isinstance(item, FileItem):
+            if (item['file_name'], item['number']) in self.file_items:
+                spider.logger.warning('Duplicated filename and number pair: {}-{}'.format(item['file_name'],
+                                                                                          item['number']))
+            self.file_items.add((item['file_name'], item['number']))
+        elif isinstance(item, File):
+            if item['file_name'] in self.file_names:
+                spider.logger.warning('Duplicated filename: {}'.format(item['file_name']))
+            self.file_names.add(item['file_name'])
+
         return item
diff --git a/tests/test_validate.py b/tests/test_validate.py
index 9ebac699c..c44c16926 100644
--- a/tests/test_validate.py
+++ b/tests/test_validate.py
@@ -1,8 +1,9 @@
 import pytest
 
 from kingfisher_scrapy.exceptions import MissingRequiredFieldError
-from kingfisher_scrapy.items import File
+from kingfisher_scrapy.items import File, FileItem
 from kingfisher_scrapy.pipelines import Validate
+from tests import spider_with_crawler
 
 
 def test_process_item():
@@ -23,3 +24,42 @@ def test_process_item_error():
 
     with pytest.raises(MissingRequiredFieldError):
         pipeline.process_item(item, None)
+
+
+def test_duplicated_filename(caplog):
+    pipeline = Validate()
+    spider = spider_with_crawler()
+    item = File({
+        'file_name': 'test1',
+        'data': '',
+        'data_type': '',
+        'url': '',
+    })
+    assert pipeline.process_item(item, spider) == item
+    pipeline.process_item(item, spider)
+    assert caplog.messages[0] == 'Duplicated filename: test1'
+    assert len(pipeline.file_names) == 1
+    item2 = item.copy()
+    item2['file_name'] = 'file2'
+    pipeline.process_item(item2, spider)
+    assert len(pipeline.file_names) == 2
+
+
+def test_duplicated_fileitem(caplog):
+    pipeline = Validate()
+    spider = spider_with_crawler()
+    item = FileItem({
+        'file_name': 'test1',
+        'data': '',
+        'data_type': '',
+        'url': '',
+        'number': 1
+    })
+    assert pipeline.process_item(item, spider) == item
+    pipeline.process_item(item, spider)
+    assert caplog.messages[0] == 'Duplicated filename and number pair: test1-1'
+    assert len(pipeline.file_items) == 1
+    item2 = item.copy()
+    item2['number'] = 2
+    pipeline.process_item(item2, spider)
+    assert len(pipeline.file_items) == 2
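
Note (not part of the patch itself): Validate.process_item() only runs once
the pipeline is registered in the project's Scrapy settings. A minimal sketch
of that wiring, assuming Scrapy's standard ITEM_PIPELINES setting; the
priority value 300 is an illustrative assumption, not necessarily what
kingfisher_scrapy uses:

    # settings.py (sketch): register the pipeline so Scrapy passes every item
    # a spider yields through Validate.process_item(). The integer is the
    # pipeline's run order (lower runs earlier); 300 is an arbitrary choice.
    ITEM_PIPELINES = {
        'kingfisher_scrapy.pipelines.Validate': 300,
    }
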
From abff011a67de1b5f9b2ff34df163fef83bc547ca Mon Sep 17 00:00:00 2001
From: James McKinney <26463+jpmckinney@users.noreply.github.com>
Date: Mon, 22 Jun 2020 22:21:17 -0400
Subject: [PATCH 2/3] pipelines: Reduce repetition of unique keys. Use
 consistent naming scheme.

---
 kingfisher_scrapy/pipelines.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/kingfisher_scrapy/pipelines.py b/kingfisher_scrapy/pipelines.py
index 8807cf662..72255757e 100644
--- a/kingfisher_scrapy/pipelines.py
+++ b/kingfisher_scrapy/pipelines.py
@@ -5,7 +5,7 @@
 
 class Validate:
     def __init__(self):
-        self.file_names = set()
+        self.files = set()
         self.file_items = set()
 
     def process_item(self, item, spider):
@@ -15,13 +15,14 @@ def process_item(self, item, spider):
             item.validate()
 
         if isinstance(item, FileItem):
-            if (item['file_name'], item['number']) in self.file_items:
-                spider.logger.warning('Duplicated filename and number pair: {}-{}'.format(item['file_name'],
-                                                                                          item['number']))
-            self.file_items.add((item['file_name'], item['number']))
+            key = (item['file_name'], item['number'])
+            if key in self.file_items:
+                spider.logger.warning('Duplicate FileItem: {!r}'.format(key))
+            self.file_items.add(key)
         elif isinstance(item, File):
-            if item['file_name'] in self.file_names:
-                spider.logger.warning('Duplicated filename: {}'.format(item['file_name']))
-            self.file_names.add(item['file_name'])
+            key = item['file_name']
+            if key in self.files:
+                spider.logger.warning('Duplicate File: {!r}'.format(key))
+            self.files.add(key)
 
         return item

From f0e60060f92125bf44304af247973776424acf55 Mon Sep 17 00:00:00 2001
From: James McKinney <26463+jpmckinney@users.noreply.github.com>
Date: Mon, 22 Jun 2020 22:28:08 -0400
Subject: [PATCH 3/3] pipelines: Don't test class internals. Add whitespace
 between assertions and other code.

---
 tests/test_validate.py | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/tests/test_validate.py b/tests/test_validate.py
index c44c16926..a89fa4ba3 100644
--- a/tests/test_validate.py
+++ b/tests/test_validate.py
@@ -26,7 +26,7 @@ def test_process_item_error():
         pipeline.process_item(item, None)
 
 
-def test_duplicated_filename(caplog):
+def test_duplicate_file(caplog):
     pipeline = Validate()
     spider = spider_with_crawler()
     item = File({
@@ -35,17 +35,18 @@ def test_duplicated_filename(caplog):
         'data_type': '',
         'url': '',
     })
-    assert pipeline.process_item(item, spider) == item
+
+    pipeline.process_item(item, spider)
     pipeline.process_item(item, spider)
-    assert caplog.messages[0] == 'Duplicated filename: test1'
-    assert len(pipeline.file_names) == 1
     item2 = item.copy()
     item2['file_name'] = 'file2'
     pipeline.process_item(item2, spider)
-    assert len(pipeline.file_names) == 2
+
+    assert len(caplog.messages) == 1
+    assert caplog.messages[0] == "Duplicate File: 'test1'"
 
 
-def test_duplicated_fileitem(caplog):
+def test_duplicate_file_item(caplog):
     pipeline = Validate()
     spider = spider_with_crawler()
     item = FileItem({
@@ -55,11 +56,12 @@ def test_duplicated_fileitem(caplog):
         'url': '',
         'number': 1
     })
-    assert pipeline.process_item(item, spider) == item
+
+    pipeline.process_item(item, spider)
     pipeline.process_item(item, spider)
-    assert caplog.messages[0] == 'Duplicated filename and number pair: test1-1'
-    assert len(pipeline.file_items) == 1
     item2 = item.copy()
     item2['number'] = 2
     pipeline.process_item(item2, spider)
-    assert len(pipeline.file_items) == 2
+
+    assert len(caplog.messages) == 1
+    assert caplog.messages[0] == "Duplicate FileItem: ('test1', 1)"
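
Note: with all three patches applied, Validate warns once for each repeated
key and still passes every item through unchanged. A standalone sketch of
that end behavior; StubSpider below is a hypothetical stand-in for the tests'
spider_with_crawler() helper, which is imported above but not defined in this
series (the pipeline only touches spider.logger, so a stub suffices):

    import logging

    from kingfisher_scrapy.items import File
    from kingfisher_scrapy.pipelines import Validate

    logging.basicConfig(level=logging.WARNING)


    class StubSpider:
        # Only the logger attribute is used by Validate.process_item().
        logger = logging.getLogger('stub')


    pipeline = Validate()
    spider = StubSpider()
    item = File({'file_name': 'test1', 'data': '', 'data_type': '', 'url': ''})

    pipeline.process_item(item, spider)  # first occurrence: no warning
    pipeline.process_item(item, spider)  # warns: Duplicate File: 'test1'
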