Check next link on first page
aguilerapy committed Jun 19, 2020
1 parent 8d7a4ba commit 0128bbe
Showing 3 changed files with 15 additions and 2 deletions.
5 changes: 4 additions & 1 deletion kingfisher_scrapy/base_spider.py
@@ -8,7 +8,7 @@
 from jsonpointer import resolve_pointer

 from kingfisher_scrapy import util
-from kingfisher_scrapy.exceptions import SpiderArgumentError
+from kingfisher_scrapy.exceptions import KingfisherScrapyError, SpiderArgumentError
 from kingfisher_scrapy.items import File, FileError, FileItem
 from kingfisher_scrapy.util import handle_http_error

@@ -377,3 +377,6 @@ def next_link(self, response):
         url = resolve_pointer(data, self.next_pointer, None)
         if url:
             return self.build_request(url, formatter=self.next_page_formatter)
+
+        if response.meta['depth'] == 0:
+            raise KingfisherScrapyError('next link not found on the first page: {}'.format(response.url))
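
To see the change in context, here is a minimal sketch of the method after this commit, shown unindented for brevity. Only the tail of the method appears in the diff, so the json.loads step is an assumption; the final if block is what this commit adds.

def next_link(self, response):
    # Resolve the configured JSON pointer (self.next_pointer) against the response body.
    data = json.loads(response.text)  # assumed: earlier lines of the method are not in this diff
    url = resolve_pointer(data, self.next_pointer, None)
    if url:
        return self.build_request(url, formatter=self.next_page_formatter)

    # New in this commit: a missing next link on the first page (depth 0) is treated as an
    # error instead of silently ending the crawl; deeper pages may legitimately have no next link.
    if response.meta['depth'] == 0:
        raise KingfisherScrapyError('next link not found on the first page: {}'.format(response.url))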
2 changes: 1 addition & 1 deletion tests/__init__.py
@@ -8,7 +8,7 @@


 def response_fixture(**kwargs):
-    request = Request('http://example.com', meta={'file_name': 'test'})
+    request = Request('http://example.com', meta={'file_name': 'test', 'depth': 0})
     if 'status' not in kwargs:
         kwargs['status'] = 200
     if 'body' not in kwargs:
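
The added 'depth': 0 is what lets the new first-page check run in tests: next_link now reads response.meta['depth'], and a fixture response without that key would fail with a KeyError before reaching the check. A small sketch of the assumption:

# Sketch only: assumes the fixture attaches this request to the response it returns,
# which is what makes response.meta['depth'] available to next_link in the tests.
response = response_fixture(body='{"links": {"next": ""}}')
assert response.meta == {'file_name': 'test', 'depth': 0}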
10 changes: 10 additions & 0 deletions tests/test_links_spider.py
@@ -2,6 +2,7 @@
 from scrapy.http import Request

 from kingfisher_scrapy.base_spider import LinksSpider
+from kingfisher_scrapy.exceptions import KingfisherScrapyError
 from kingfisher_scrapy.items import File, FileError
 from tests import response_fixture, spider_with_crawler

@@ -59,3 +60,12 @@ def test_parse_200():

     with pytest.raises(StopIteration):
         next(generator)
+
+
+def test_next_link_not_found():
+    spider = spider_with_crawler(spider_class=LinksSpider)
+    spider.next_page_formatter = lambda url: 'next.json'
+
+    with pytest.raises(KingfisherScrapyError) as e:
+        assert spider.next_link(response_fixture(body='{"links": {"next": ""}}'))
+    assert str(e.value) == 'next link not found on the first page: http://example.com'
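
The new import assumes a KingfisherScrapyError class in kingfisher_scrapy/exceptions.py, which is not part of this diff. For the behaviour exercised here, a plain Exception subclass is enough; a minimal sketch under that assumption:

# Sketch only: kingfisher_scrapy/exceptions.py is not shown in this commit.
# A bare Exception subclass satisfies both the new raise in next_link and this test.
class KingfisherScrapyError(Exception):
    pass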
