Check next link on first page
aguilerapy committed Jun 19, 2020
1 parent 8d7a4ba commit 0128bbe
Showing 3 changed files with 15 additions and 2 deletions.
5 changes: 4 additions & 1 deletion kingfisher_scrapy/base_spider.py
@@ -8,7 +8,7 @@
 from jsonpointer import resolve_pointer

 from kingfisher_scrapy import util
-from kingfisher_scrapy.exceptions import SpiderArgumentError
+from kingfisher_scrapy.exceptions import KingfisherScrapyError, SpiderArgumentError
 from kingfisher_scrapy.items import File, FileError, FileItem
 from kingfisher_scrapy.util import handle_http_error

@@ -377,3 +377,6 @@ def next_link(self, response):
         url = resolve_pointer(data, self.next_pointer, None)
         if url:
             return self.build_request(url, formatter=self.next_page_formatter)
+
+        if response.meta['depth'] == 0:
+            raise KingfisherScrapyError('next link not found on the first page: {}'.format(response.url))
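
To see the change in context, here is a minimal sketch of the method after this commit, shown unindented for brevity. Only the tail of the method appears in the diff, so the json.loads step is an assumption; the final if block is what this commit adds.

def next_link(self, response):
    # Resolve the configured JSON pointer (self.next_pointer) against the response body.
    data = json.loads(response.text)  # assumed: earlier lines of the method are not in this diff
    url = resolve_pointer(data, self.next_pointer, None)
    if url:
        return self.build_request(url, formatter=self.next_page_formatter)

    # New in this commit: a missing next link on the first page (depth 0) is treated as an
    # error instead of silently ending the crawl; deeper pages may legitimately have no next link.
    if response.meta['depth'] == 0:
        raise KingfisherScrapyError('next link not found on the first page: {}'.format(response.url))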
2 changes: 1 addition & 1 deletion tests/__init__.py
@@ -8,7 +8,7 @@


 def response_fixture(**kwargs):
-    request = Request('http://example.com', meta={'file_name': 'test'})
+    request = Request('http://example.com', meta={'file_name': 'test', 'depth': 0})
     if 'status' not in kwargs:
         kwargs['status'] = 200
     if 'body' not in kwargs:
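
The added 'depth': 0 is what lets the new first-page check run in tests: next_link now reads response.meta['depth'], and a fixture response without that key would fail with a KeyError before reaching the check. A small sketch of the assumption:

# Sketch only: assumes the fixture attaches this request to the response it returns,
# which is what makes response.meta['depth'] available to next_link in the tests.
response = response_fixture(body='{"links": {"next": ""}}')
assert response.meta == {'file_name': 'test', 'depth': 0}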
10 changes: 10 additions & 0 deletions tests/test_links_spider.py
@@ -2,6 +2,7 @@
 from scrapy.http import Request

 from kingfisher_scrapy.base_spider import LinksSpider
+from kingfisher_scrapy.exceptions import KingfisherScrapyError
 from kingfisher_scrapy.items import File, FileError
 from tests import response_fixture, spider_with_crawler

@@ -59,3 +60,12 @@ def test_parse_200():

     with pytest.raises(StopIteration):
         next(generator)
+
+
+def test_next_link_not_found():
+    spider = spider_with_crawler(spider_class=LinksSpider)
+    spider.next_page_formatter = lambda url: 'next.json'
+
+    with pytest.raises(KingfisherScrapyError) as e:
+        assert spider.next_link(response_fixture(body='{"links": {"next": ""}}'))
+    assert str(e.value) == 'next link not found on the first page: http://example.com'
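
The new import assumes a KingfisherScrapyError class in kingfisher_scrapy/exceptions.py, which is not part of this diff. For the behaviour exercised here, a plain Exception subclass is enough; a minimal sketch under that assumption:

# Sketch only: kingfisher_scrapy/exceptions.py is not shown in this commit.
# A bare Exception subclass satisfies both the new raise in next_link and this test.
class KingfisherScrapyError(Exception):
    pass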
