Skip to content

Commit

Permalink
Use keyword-only arguments for build_file and build_file_item. Update docs.
Browse files Browse the repository at this point in the history
  • Loading branch information
jpmckinney committed May 30, 2020
1 parent 036bb9a commit e59db78
Show file tree
Hide file tree
Showing 6 changed files with 37 additions and 32 deletions.
2 changes: 1 addition & 1 deletion docs/writing-spiders.rst
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ Here is a sample:
@handle_error
def parse(self, response):
yield self.build_file_from_response(response, response.request.meta['kf_filename'], data_type='release_package')
yield self.build_file_from_response(response, data_type='release_package')
Spider properties
-----------------
Expand Down
37 changes: 19 additions & 18 deletions kingfisher_scrapy/base_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,20 +99,19 @@ def get_start_time(self, format):
"""
return self.crawler.stats.get_value('start_time').strftime(format)

def build_file_from_response(self, response, file_name=None, url=None, data=None, data_type=None, encoding='utf-8',
post_to_api=True):
def build_file_from_response(self, response, **kwargs):
"""
Returns an item to yield, based on the response to a request.
"""
if not file_name:
file_name = response.request.meta['kf_filename']
if not url:
url = response.request.url
if not data:
data = response.body
return self.build_file(data, file_name, url, data_type, encoding, post_to_api)

def build_file(self, data, file_name, url=None, data_type=None, encoding='utf-8', post_to_api=True):
if 'file_name' not in kwargs:
kwargs['file_name'] = response.request.meta['kf_filename']
if 'url' not in kwargs:
kwargs['url'] = response.request.url
if 'data' not in kwargs:
kwargs['data'] = response.body
return self.build_file(**kwargs)

def build_file(self, *, file_name=None, url=None, data=None, data_type=None, encoding='utf-8', post_to_api=True):
"""
Returns an item to yield.
"""
Expand All @@ -125,7 +124,7 @@ def build_file(self, data, file_name, url=None, data_type=None, encoding='utf-8'
'post_to_api': post_to_api,
})

def build_file_item(self, number, data, data_type, url, encoding, file_name):
def build_file_item(self, *, number=None, file_name=None, url=None, data=None, data_type=None, encoding='utf-8'):
return FileItem({
'number': number,
'file_name': file_name,
Expand Down Expand Up @@ -165,7 +164,8 @@ def parse_json_lines(self, f, data_type, url, encoding='utf-8', file_name='data.
break
if isinstance(line, bytes):
line = line.decode(encoding=encoding)
yield self.build_file_item(number, line, data_type, url, encoding, file_name)
yield self.build_file_item(number=number, file_name=file_name, url=url, data=line, data_type=data_type,
encoding=encoding)

def parse_json_array(self, f_package, f_list, data_type, url, encoding='utf-8', array_field_name='releases',
file_name='data.json'):
Expand All @@ -179,7 +179,8 @@ def parse_json_array(self, f_package, f_list, data_type, url, encoding='utf-8',
for number, items in enumerate(util.grouper(ijson.items(f_list, '{}.item'.format(array_field_name)), size), 1):
package[array_field_name] = filter(None, items)
data = json.dumps(package, default=util.default)
yield self.build_file_item(number, data, data_type, url, encoding, file_name)
yield self.build_file_item(number=number, file_name=file_name, url=url, data=data, data_type=data_type,
encoding=encoding)
if self.sample:
break

Expand Down Expand Up @@ -236,7 +237,7 @@ def parse_zipfile(self, response, data_type, file_format=None, encoding='utf-8')
"""
if file_format:
filename = '{}.zip'.format(hashlib.md5(response.url.encode('utf-8')).hexdigest())
self.build_file_from_response(response, filename, post_to_api=False)
self.build_file_from_response(response, file_name=filename, post_to_api=False)

zip_file = ZipFile(BytesIO(response.body))
for finfo in zip_file.infolist():
Expand All @@ -254,8 +255,8 @@ def parse_zipfile(self, response, data_type, file_format=None, encoding='utf-8')
yield from self.parse_json_array(package, data, data_type, response.request.url,
encoding=encoding, file_name=filename)
else:
yield self.build_file(data.read(), filename, data_type=data_type, url=response.request.url,
encoding=encoding)
yield self.build_file(file_name=filename, data=data.read(), url=response.request.url,
data_type=data_type, encoding=encoding)


class LinksSpider(BaseSpider):
Expand Down Expand Up @@ -286,7 +287,7 @@ def start_requests(self):

@handle_error
def parse(self, response):
yield self.build_file_from_response(response, response.request.meta['kf_filename'], data_type=self.data_type)
yield self.build_file_from_response(response, data_type=self.data_type)

if not self.sample:
yield self.next_link(response)
Expand Down
4 changes: 2 additions & 2 deletions kingfisher_scrapy/spiders/dominican_republic.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ def parse(self, response):
with rarfile.RarFile(file.name, charset='utf-8') as tmpfile:
for f in tmpfile.infolist():
with tmpfile.open(f) as jsonFile:
yield self.build_file(jsonFile.read(), f.filename, data_type='release_package',
url=response.request.url)
yield self.build_file(file_name=f.filename, url=response.request.url, data=jsonFile.read(),
data_type='release_package')
os.remove(file.name)
else:
filename = response.request.url.split('/')[-1]
Expand Down
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/openopps.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,9 +164,9 @@ def parse(self, response):

if all_data:
yield self.build_file(
all_data,
file_name=hashlib.md5(response.request.url.encode('utf-8')).hexdigest() + '.json',
url=response.request.url,
data=all_data,
data_type='release_package_list'
)
if self.sample:
Expand Down
6 changes: 4 additions & 2 deletions tests/test_base_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@ def test_build_file_from_response():
response.request = Mock()
response.request.url = 'https://example.com/remote.json'

actual = spider.build_file_from_response(response, 'file.json', data_type='release_package', encoding='iso-8859-1')
actual = spider.build_file_from_response(response, file_name='file.json', data_type='release_package',
encoding='iso-8859-1')

assert actual == File({
'file_name': 'file.json',
Expand All @@ -63,7 +64,8 @@ def test_build_file():
data = b'{"key": "value"}'
url = 'https://example.com/remote.json'

actual = spider.build_file(data, 'file.json', url=url, data_type='release_package', encoding='iso-8859-1')
actual = spider.build_file(file_name='file.json', url=url, data=data, data_type='release_package',
encoding='iso-8859-1')

assert actual == File({
'file_name': 'file.json',
Expand Down
18 changes: 10 additions & 8 deletions tests/test_extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def test_item_scraped_file(sample, is_sample, path, note, encoding, encoding2, d
kwargs = {}
if encoding:
kwargs['encoding'] = encoding
item = spider.build_file(b'{"key": "value"}', 'file.json', url='https://example.com/remote.json',
item = spider.build_file(file_name='file.json', url='https://example.com/remote.json', data=b'{"key": "value"}',
data_type='release_package', post_to_api=post_to_api, **kwargs)

store_extension.item_scraped(item, spider)
Expand Down Expand Up @@ -146,12 +146,12 @@ def test_item_scraped_file_item(sample, is_sample, note, encoding, encoding2, ok
if encoding:
kwargs['encoding'] = encoding
item = spider.build_file_item(
1,
b'{"key": "value"}',
data_type='release_package',
number=1,
file_name='data.json',
url='https://example.com/remote.json',
data=b'{"key": "value"}',
data_type='release_package',
encoding=encoding2,
file_name='data.json',
)

api_extension.item_scraped(item, spider)
Expand Down Expand Up @@ -294,7 +294,8 @@ def test_item_scraped_with_build_file_from_response(sample, path, tmpdir):
response.request = Mock()
response.request.url = 'https://example.com/remote.json'

item = spider.build_file_from_response(response, 'file.json', data_type='release_package', encoding='iso-8859-1')
item = spider.build_file_from_response(response, file_name='file.json', data_type='release_package',
encoding='iso-8859-1')
store_extension.item_scraped(item, spider)

with open(tmpdir.join(path)) as f:
Expand Down Expand Up @@ -322,7 +323,8 @@ def test_item_scraped_with_build_file(sample, path, tmpdir):
data = b'{"key": "value"}'
url = 'https://example.com/remote.json'

item = spider.build_file(data, 'file.json', url=url, data_type='release_package', encoding='iso-8859-1')
item = spider.build_file(file_name='file.json', url=url, data=data, data_type='release_package',
encoding='iso-8859-1')
store_extension.item_scraped(item, spider)

with open(tmpdir.join(path)) as f:
Expand All @@ -349,4 +351,4 @@ def test_build_file_with_existing_directory():
os.makedirs(os.path.join(files_store, 'test', '20010203_040506'))

# No FileExistsError exception.
store_extension.item_scraped(spider.build_file(b'{"key": "value"}', 'file.json'), spider)
store_extension.item_scraped(spider.build_file(file_name='file.json', data=b'{"key": "value"}'), spider)

0 comments on commit e59db78

Please sign in to comment.