Skip to content

Commit

Permalink
Merge acb59fe into 9fd1ea4
Browse files Browse the repository at this point in the history
  • Loading branch information
aguilerapy committed Sep 2, 2020
2 parents 9fd1ea4 + acb59fe commit 3ad013b
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 16 deletions.
15 changes: 4 additions & 11 deletions kingfisher_scrapy/spiders/armenia.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
from urllib.parse import parse_qs, urlsplit

import scrapy

from kingfisher_scrapy.base_spider import LinksSpider
from kingfisher_scrapy.util import parameters, replace_parameter
from kingfisher_scrapy.util import get_parameter_value, parameters, replace_parameter

MILLISECONDS_PER_DAY = 86400000
EXPONENT_LIMIT = 10 # 1024 days
Expand Down Expand Up @@ -42,13 +40,13 @@ def parse(self, response):
yield self.build_file_error_from_response(response)

# If the error occurs on the first request, we have no starting offset.
if self.get_offset(response):
if get_parameter_value(response.request.url, 'offset'):
yield from self.parse_date_range(response)

# Exponential search (https://en.wikipedia.org/wiki/Exponential_search). We can do an elaborate alternative
# (https://www.slac.stanford.edu/cgi-bin/getdoc/slac-pub-1679.pdf), but we keep it simpler for now.
def parse_date_range(self, response):
offset = self.get_offset(response)
offset = get_parameter_value(response.request.url, 'offset')

# Scrapy uses `datetime.datetime.utcnow()`, so we don't need to worry about time zones.
start_time = int(self.crawler.stats.get_value('start_time').timestamp() * 1000)
Expand All @@ -74,7 +72,7 @@ def parse_date_range(self, response):
# We use one of the alternative binary search methods (https://en.wikipedia.org/wiki/Binary_search_algorithm),
# because we only know if an offset succeeds, not whether an offset is greater than the target value.
def parse_binary_search(self, response, minimum=None, maximum=None):
offset = self.get_offset(response)
offset = get_parameter_value(response.request.url, 'offset')

first_offset = response.request.meta['first']

Expand Down Expand Up @@ -106,8 +104,3 @@ def _build_request(self, url, callback, meta):
meta['dont_retry'] = True
# We need to set `formatter` in case we want to re-use the response to build a file or file error.
return self.build_request(url, formatter=parameters('offset'), dont_filter=True, meta=meta, callback=callback)

def get_offset(self, response):
    """
    Returns the ``offset`` query-string parameter of the request URL as an
    ``int``, or ``None`` (implicitly) when the URL has no ``offset`` parameter.
    """
    query = parse_qs(urlsplit(response.request.url).query)
    if 'offset' in query:
        return int(query['offset'][0])
6 changes: 2 additions & 4 deletions kingfisher_scrapy/spiders/portugal_base.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
from io import BytesIO
from urllib.parse import parse_qs, urlsplit

import ijson
import scrapy

from kingfisher_scrapy.base_spider import SimpleSpider
from kingfisher_scrapy.util import handle_http_error, parameters, replace_parameter
from kingfisher_scrapy.util import get_parameter_value, handle_http_error, parameters, replace_parameter


class PortugalBase(SimpleSpider):
Expand All @@ -27,7 +26,6 @@ def parse_data(self, response):

if not self.sample:
next_url = response.request.url
query = parse_qs(urlsplit(next_url).query)
offset = int(query['offset'][0])
offset = get_parameter_value(next_url, 'offset')
url = replace_parameter(next_url, 'offset', offset + 1)
yield self.build_request(url, formatter=parameters('offset'), callback=self.parse_data)
9 changes: 9 additions & 0 deletions kingfisher_scrapy/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,3 +163,12 @@ def default(obj):
def grouper(iterable, n, fillvalue=None):
args = [iter(iterable)] * n
return itertools.zip_longest(*args, fillvalue=fillvalue)


def get_parameter_value(url, key):
    """
    Returns the value of the given ``key`` in ``url``.

    The value is converted to ``int``. If the parameter is absent — including
    a bare ``?key`` with no value, which ``parse_qs`` drops — ``None`` is
    returned instead.
    """
    parameters = parse_qs(urlsplit(url).query)
    values = parameters.get(key)
    if values is None:
        return None
    return int(values[0])
11 changes: 10 additions & 1 deletion tests/test_util.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pytest

from kingfisher_scrapy.util import replace_parameter
from kingfisher_scrapy.util import get_parameter_value, replace_parameter


@pytest.mark.parametrize('url,value,expected', [
Expand All @@ -10,3 +10,12 @@
])
def test_replace_parameter(url, value, expected):
assert replace_parameter(url, 'page', value) == expected


@pytest.mark.parametrize('url,parameter_name,expected', [
    ('http://example.com/?page=1', 'page', 1),
    ('http://example.com/?page=1', 'offset', None),
    ('http://example.com/?page', 'page', None),
])
def test_get_parameter_value(url, parameter_name, expected):
    # Bug fix: `parameter_name` was parametrized but never used — the body
    # hard-coded 'page', so the second and third original cases (which passed
    # None as the name) did not test what they claimed to. Now each case
    # exercises its own key: present key, absent key, and a valueless `?page`
    # (dropped by parse_qs, so None is expected).
    assert get_parameter_value(url, parameter_name) == expected

0 comments on commit 3ad013b

Please sign in to comment.