Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Support rar and zip files in the same spider #602

Merged
merged 3 commits into from
Jan 28, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
14 changes: 7 additions & 7 deletions kingfisher_scrapy/base_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ def build_request(self, url, formatter, **kwargs):
:rtype: scrapy.Request
"""
file_name = formatter(url)
if not file_name.endswith(('.json', '.zip', '.xlsx', '.csv')):
if not file_name.endswith(('.json', '.zip', '.xlsx', '.csv', '.rar')):
file_name += '.json'
meta = {'file_name': file_name}
if 'meta' in kwargs:
Expand Down Expand Up @@ -335,7 +335,6 @@ class CompressedFileSpider(BaseSpider):
#. Inherit from ``CompressedFileSpider``
#. Set a ``data_type`` class attribute to the data type of the compressed files
#. Optionally, set an ``encoding`` class attribute to the encoding of the compressed files (default UTF-8)
#. Optionally, set a ``archive_format`` class attribute to the archive file format ("zip" or "rar").
#. Optionally, set a ``compressed_file_format`` class attribute to the format of the compressed files

``json_lines``
Expand Down Expand Up @@ -367,14 +366,15 @@ def start_requests(self):
encoding = 'utf-8'
skip_pluck = 'Archive files are not supported'
compressed_file_format = None
archive_format = 'zip'
file_name_must_contain = ''

@handle_http_error
def parse(self, response):
archive_name, archive_format = os.path.splitext(response.request.meta['file_name'])
archive_format = archive_format[1:].lower()
if self.compressed_file_format:
yield self.build_file_from_response(response, data_type=self.archive_format, post_to_api=False)
if self.archive_format == 'zip':
yield self.build_file_from_response(response, data_type=archive_format, post_to_api=False)
if archive_format == 'zip':
cls = ZipFile
else:
cls = RarFile
Expand All @@ -384,9 +384,9 @@ def parse(self, response):
basename = os.path.basename(filename)
if self.file_name_must_contain not in basename:
continue
if self.archive_format == 'rar' and file_info.isdir():
if archive_format == 'rar' and file_info.isdir():
continue
if self.archive_format == 'zip' and file_info.is_dir():
if archive_format == 'zip' and file_info.is_dir():
continue
if not basename.endswith('.json'):
basename += '.json'
Expand Down
8 changes: 3 additions & 5 deletions kingfisher_scrapy/spiders/dominican_republic.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ class DominicanRepublic(CompressedFileSpider):
name = 'dominican_republic'
data_type = 'release_package'
compressed_file_format = 'release_package'
archive_format = 'rar'

def start_requests(self):
yield scrapy.Request(
Expand All @@ -25,9 +24,8 @@ def start_requests(self):

@handle_http_error
def parse_list(self, response):
urls = response.css('.fileLink::attr(href)').getall()
json_urls = list(filter(lambda x: '/JSON_DGCP_' in x, urls))
urls = response.css('.download::attr(href)').getall()
json_urls = list(filter(lambda x: '/JSON' in x, urls))

for url in json_urls:
if '/JSON_DGCP_' in url:
yield self.build_request(url, formatter=components(-1))
yield self.build_request(url, formatter=components(-1))
1 change: 0 additions & 1 deletion kingfisher_scrapy/spiders/mexico_nuevo_leon_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@


class MexicoNuevoLeonBase(CompressedFileSpider):
archive_format = 'rar'

def start_requests(self):
yield self.build_request(
Expand Down
15 changes: 7 additions & 8 deletions tests/test_compressed_file_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def test_parse():
with ZipFile(io, 'w', compression=ZIP_DEFLATED) as zipfile:
zipfile.writestr('test.json', '{}')

response = response_fixture(body=io.getvalue())
response = response_fixture(body=io.getvalue(), meta={'file_name': 'test.zip'})
generator = spider.parse(response)
item = next(generator)

Expand Down Expand Up @@ -51,14 +51,14 @@ def test_parse_json_lines(sample, len_items):
with ZipFile(io, 'w', compression=ZIP_DEFLATED) as zipfile:
zipfile.writestr('test.json', ''.join(content))

response = response_fixture(body=io.getvalue())
response = response_fixture(body=io.getvalue(), meta={'file_name': 'test.zip'})
generator = spider.parse(response)
item = next(generator)
items = list(generator)

assert type(item) is File
assert len(item) == 6
assert item['file_name'] == 'test'
assert item['file_name'] == 'test.zip'
assert item['url'] == 'http://example.com'
assert item['data_type'] == 'zip'
assert item['encoding'] == 'utf-8'
Expand Down Expand Up @@ -92,14 +92,14 @@ def test_parse_release_package(sample, len_items, len_releases):
with ZipFile(io, 'w', compression=ZIP_DEFLATED) as zipfile:
zipfile.writestr('test.json', json.dumps(package))

response = response_fixture(body=io.getvalue())
response = response_fixture(body=io.getvalue(), meta={'file_name': 'test.zip'})
generator = spider.parse(response)
item = next(generator)
items = list(generator)

assert type(item) is File
assert len(item) == 6
assert item['file_name'] == 'test'
assert item['file_name'] == 'test.zip'
assert item['url'] == 'http://example.com'
assert item['data_type'] == 'zip'
assert item['encoding'] == 'utf-8'
Expand All @@ -125,7 +125,7 @@ def test_parse_zip_empty_dir():
with ZipFile(io, 'w', compression=ZIP_DEFLATED) as zipfile:
empty_folder = ZipInfo(os.path.join('test', 'test', '/'))
zipfile.writestr(empty_folder, '')
response = response_fixture(body=io.getvalue())
response = response_fixture(body=io.getvalue(), meta={'file_name': 'test.zip'})
generator = spider.parse(response)
with pytest.raises(StopIteration):
next(generator)
Expand All @@ -134,13 +134,12 @@ def test_parse_zip_empty_dir():
def test_parse_rar_file():
spider = spider_with_crawler(spider_class=CompressedFileSpider)
spider.data_type = 'release_package'
spider.archive_format = 'rar'

# the rar library doesn't support the write mode so we use a static rar file
rar_file_path = os.path.join(pathlib.Path(__file__).parent.absolute(), 'data', 'test.rar')
with open(rar_file_path, 'rb') as f:
io = BytesIO(f.read())
response = response_fixture(body=io.getvalue())
response = response_fixture(body=io.getvalue(), meta={'file_name': 'test.rar'})
generator = spider.parse(response)
item = next(generator)

Expand Down