Skip to content

Commit

Permalink
Merge b6bb7de into 273a7b4
Browse files Browse the repository at this point in the history
  • Loading branch information
yolile committed Jun 17, 2020
2 parents 273a7b4 + b6bb7de commit 2fb2e26
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 1 deletion.
15 changes: 15 additions & 0 deletions kingfisher_scrapy/pipelines.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,27 @@
# https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# https://docs.scrapy.org/en/latest/topics/signals.html#item-signals
from kingfisher_scrapy.items import File, FileItem


class Validate:
    """Item pipeline that validates items and warns about duplicate identifiers.

    For ``File`` items the identifier is ``file_name``; for ``FileItem`` items it
    is the ``(file_name, number)`` pair. Duplicates are logged as warnings, but
    the item is always passed through unchanged.
    """

    def __init__(self):
        # Identifiers seen so far during the crawl.
        self.file_names = set()
        self.file_items = set()

    def process_item(self, item, spider):
        """Validate the item (if it supports validation), warn on duplicates,
        and return the item.

        :raises: whatever ``item.validate()`` raises on an invalid item.
        """
        if hasattr(item, 'validate'):
            # We call this in the item pipeline to guarantee that all items are validated. However, its backtrace isn't
            # as helpful for debugging, so we could also call it in ``BaseSpider`` if this becomes an issue.
            item.validate()

        if isinstance(item, FileItem):
            # Build the key once; it is used for both the lookup and the insert.
            key = (item['file_name'], item['number'])
            if key in self.file_items:
                # Lazy %-args: the message is only rendered if the record is emitted.
                spider.logger.warning('Duplicated filename and number pair: %s-%s',
                                      item['file_name'], item['number'])
            self.file_items.add(key)
        elif isinstance(item, File):
            if item['file_name'] in self.file_names:
                spider.logger.warning('Duplicated filename: %s', item['file_name'])
            self.file_names.add(item['file_name'])

        return item
42 changes: 41 additions & 1 deletion tests/test_validate.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import pytest

from kingfisher_scrapy.exceptions import MissingRequiredFieldError
from kingfisher_scrapy.items import File
from kingfisher_scrapy.items import File, FileItem
from kingfisher_scrapy.pipelines import Validate
from tests import spider_with_crawler


def test_process_item():
Expand All @@ -23,3 +24,42 @@ def test_process_item_error():

with pytest.raises(MissingRequiredFieldError):
pipeline.process_item(item, None)


def test_duplicated_filename(caplog):
    """A repeated File file_name triggers a warning, but the item still passes through."""
    pipeline = Validate()
    spider = spider_with_crawler()

    original = File({
        'file_name': 'test1',
        'data': '',
        'data_type': '',
        'url': '',
    })

    # First occurrence is returned unchanged and recorded.
    assert pipeline.process_item(original, spider) == original

    # Second occurrence warns and does not grow the seen set.
    pipeline.process_item(original, spider)
    assert caplog.messages[0] == 'Duplicated filename: test1'
    assert len(pipeline.file_names) == 1

    # A different file_name is a new entry.
    renamed = original.copy()
    renamed['file_name'] = 'file2'
    pipeline.process_item(renamed, spider)
    assert len(pipeline.file_names) == 2


def test_duplicated_fileitem(caplog):
    """A repeated (file_name, number) pair triggers a warning, but the item still passes through."""
    pipeline = Validate()
    spider = spider_with_crawler()

    original = FileItem({
        'file_name': 'test1',
        'data': '',
        'data_type': '',
        'url': '',
        'number': 1
    })

    # First occurrence is returned unchanged and recorded.
    assert pipeline.process_item(original, spider) == original

    # Second occurrence warns and does not grow the seen set.
    pipeline.process_item(original, spider)
    assert caplog.messages[0] == 'Duplicated filename and number pair: test1-1'
    assert len(pipeline.file_items) == 1

    # Same file_name with a different number is a new entry.
    renumbered = original.copy()
    renumbered['number'] = 2
    pipeline.process_item(renumbered, spider)
    assert len(pipeline.file_items) == 2

0 comments on commit 2fb2e26

Please sign in to comment.