Skip to content

Commit

Permalink
Merge pull request #504 from open-contracting/470-querystring-params
Browse files Browse the repository at this point in the history
Add query parameters to `start_requests`' URL
  • Loading branch information
yolile committed Sep 23, 2020
2 parents 7d452a8 + 6c66a01 commit 156e5a6
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 3 deletions.
21 changes: 19 additions & 2 deletions kingfisher_scrapy/base_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from kingfisher_scrapy import util
from kingfisher_scrapy.exceptions import MissingNextLinkError, SpiderArgumentError
from kingfisher_scrapy.items import File, FileError, FileItem
from kingfisher_scrapy.util import handle_http_error
from kingfisher_scrapy.util import handle_http_error, request_add_qs


class BaseSpider(scrapy.Spider):
Expand Down Expand Up @@ -56,6 +56,18 @@ class BaseSpider(scrapy.Spider):
.. code:: bash
scrapy crawl spider_name -a keep_collection_open=true
Add a GET parameter to the start URLs (returned by `start_requests`):
.. code:: bash
scrapy crawl spider_name -a qs=param1:value,param2:value2
If the parameter value contains a comma, use a backslash to escape it:
.. code:: bash
scrapy crawl spider_name -a qs=param:value\\,value2
"""

MAX_SAMPLE = 10
Expand All @@ -66,7 +78,7 @@ class BaseSpider(scrapy.Spider):
date_format = 'date'

def __init__(self, sample=None, note=None, from_date=None, until_date=None, crawl_time=None,
keep_collection_open=None, package_pointer=None, release_pointer=None, truncate=None, *args,
keep_collection_open=None, package_pointer=None, release_pointer=None, truncate=None, qs=None, *args,
**kwargs):
super().__init__(*args, **kwargs)

Expand All @@ -81,10 +93,14 @@ def __init__(self, sample=None, note=None, from_date=None, until_date=None, craw
self.package_pointer = package_pointer
self.release_pointer = release_pointer
self.truncate = int(truncate) if truncate else None
self.qs = qs

self.date_format = self.VALID_DATE_FORMATS[self.date_format]
self.pluck = bool(package_pointer or release_pointer)

if self.qs and hasattr(self, 'start_requests'):
self.start_requests = request_add_qs(self.start_requests, qs)

spider_arguments = {
'sample': sample,
'note': note,
Expand All @@ -95,6 +111,7 @@ def __init__(self, sample=None, note=None, from_date=None, until_date=None, craw
'package_pointer': package_pointer,
'release_pointer': release_pointer,
'truncate': truncate,
'qs': qs
}
spider_arguments.update(kwargs)
self.logger.info('Spider arguments: {!r}'.format(spider_arguments))
Expand Down
2 changes: 1 addition & 1 deletion kingfisher_scrapy/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@
KINGFISHER_PARAGUAY_HACIENDA_REQUEST_TOKEN = os.getenv('KINGFISHER_PARAGUAY_HACIENDA_REQUEST_TOKEN')
KINGFISHER_PARAGUAY_HACIENDA_CLIENT_SECRET = os.getenv('KINGFISHER_PARAGUAY_HACIENDA_CLIENT_SECRET')

# To get an API account, visit https://contrataciones.gov.py/datos/signup
# To get an API account, visit https://www.contrataciones.gov.py/datos/adm/signup
KINGFISHER_PARAGUAY_DNCP_REQUEST_TOKEN = os.getenv('KINGFISHER_PARAGUAY_DNCP_REQUEST_TOKEN')

# To get an API account, contact contact@openopps.com.
Expand Down
12 changes: 12 additions & 0 deletions kingfisher_scrapy/util.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import itertools
import json
import re
from datetime import date
from decimal import Decimal
from functools import wraps
Expand Down Expand Up @@ -173,3 +174,14 @@ def default(obj):
def grouper(iterable, n, fillvalue=None):
    """
    Collects the items of ``iterable`` into tuples of length ``n``, padding the final tuple with ``fillvalue`` if
    the items don't divide evenly.
    """
    # n references to one shared iterator: zip_longest pulls n consecutive items per output tuple.
    shared = iter(iterable)
    return itertools.zip_longest(*([shared] * n), fillvalue=fillvalue)


def request_add_qs(func, qs):
    """
    Returns a replacement for a spider's ``start_requests`` method that adds the query string parameters in ``qs``
    to each yielded request's URL.

    :param func: the original ``start_requests`` method (or any callable yielding requests)
    :param str qs: comma-separated ``key:value`` pairs, e.g. ``param1:value1,param2:value2``; a comma inside a
                   value is escaped with a backslash, e.g. ``param:value1\\,value2``
    """
    pattern = re.compile(r',?([^:,]+):((?:[^:,]+?(?:\\\,)?)+)')
    # Unescape "\," back to "," in each value. (Only the documented escape sequence is unescaped: a backslash not
    # followed by a comma is preserved, instead of being silently stripped.)
    params = {key: value.replace('\\,', ',') for (key, value) in pattern.findall(qs)}

    # wraps() preserves the wrapped method's name and docstring for logging and debugging.
    @wraps(func)
    def wrapper(*args, **kwargs):
        for request in func(*args, **kwargs):
            yield request.replace(url=replace_parameters(request.url, **params))
    return wrapper
16 changes: 16 additions & 0 deletions tests/test_base_spider.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from unittest.mock import Mock

import pytest
import scrapy
from scrapy.http import TextResponse

from kingfisher_scrapy.base_spider import BaseSpider
Expand Down Expand Up @@ -101,3 +102,18 @@ def test_custom_collection_data_version():
assert spider_with_crawler(crawl_time='2020')
assert str(e.value) == 'spider argument crawl_time: invalid date value: {}'.format(
error_message)


@pytest.mark.parametrize('arguments,expected', [
    ('param1:val1', '?param1=val1'),
    ('param1:val1,param2:val2', '?param1=val1&param2=val2'),
    ('param1:val1,param2:Ministerio de Urbanismo\\, Vivienda y Habitat',
     '?param1=val1&param2=Ministerio+de+Urbanismo%2C+Vivienda+y+Habitat'),
])
def test_qs_parameters(arguments, expected):
    # The ``qs`` spider argument should add the given query string parameters to each start URL,
    # including values with escaped (backslashed) commas.
    spider_class = type('TestSpider', (BaseSpider,), {
        'start_requests': lambda _self: [scrapy.Request('http://example.com')],
    })
    spider = spider_with_crawler(spider_class, qs=arguments)

    for request in spider.start_requests():
        assert expected in request.url

0 comments on commit 156e5a6

Please sign in to comment.