Add KingfisherStoreFiles extension to avoid storing files in spiders
Signed-off-by: Yohanna Lisnichuk <yohanitalisnichuk@gmail.com>
yolile committed May 19, 2020
1 parent 37b4676 commit c1c8a7a
Showing 5 changed files with 174 additions and 136 deletions.
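In short, spiders no longer write files themselves: their save_* helpers now just return a plain dict describing the file, and the new KingfisherStoreFiles extension writes it to disk when Scrapy fires the item_scraped signal. A minimal sketch of the new flow (not part of the commit; ExampleSpider and its URL are hypothetical, the rest follows the diff below):

from kingfisher_scrapy.base_spider import BaseSpider

class ExampleSpider(BaseSpider):
    name = 'example'
    start_urls = ['https://example.com/release_package.json']  # hypothetical source

    def parse(self, response):
        # The returned dict is yielded as an item; KingfisherStoreFiles.item_scraped
        # (see kingfisher_scrapy/extensions.py below) writes the file and its .fileinfo.
        yield self.save_response_to_disk(response, 'file.json', data_type='release_package')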
61 changes: 19 additions & 42 deletions kingfisher_scrapy/base_spider.py
@@ -100,63 +100,40 @@ def get_local_file_path_excluding_filestore(self, filename):

     def save_response_to_disk(self, response, filename, data_type=None, encoding='utf-8'):
         """
-        Writes the response's body to the filename in the crawl's directory.
-        Writes a ``<filename>.fileinfo`` metadata file in the crawl's directory, and returns a dict with the metadata.
+        Returns a dict with the file data, for the KingfisherStoreFiles extension to store.
         """
-        return self._save_response_to_disk(response.body, filename, response.request.url, data_type, encoding)
+        return {
+            'data': response.body,
+            'file_name': filename,
+            'url': response.request.url,
+            'data_type': data_type,
+            'encoding': encoding
+        }

     def save_data_to_disk(self, data, filename, url=None, data_type=None, encoding='utf-8'):
         """
-        Writes the data to the filename in the crawl's directory.
-        Writes a ``<filename>.fileinfo`` metadata file in the crawl's directory, and returns a dict with the metadata.
-        """
-        return self._save_response_to_disk(data, filename, url, data_type, encoding)
-
-    def get_start_time(self, format):
-        """
-        Returns the formatted start time of the crawl.
+        Returns a dict with the file data, for the KingfisherStoreFiles extension to store.
         """
-        return self.crawler.stats.get_value('start_time').strftime(format)
-
-    def _save_response_to_disk(self, data, filename, url, data_type, encoding):
-        self._write_file(filename, data)
-
-        metadata = {
+        return {
+            'data': data,
+            'file_name': filename,
             'url': url,
             'data_type': data_type,
-            'encoding': encoding,
+            'encoding': encoding
         }

-        self._write_file(filename + '.fileinfo', metadata)
-
-        metadata['success'] = True
-        metadata['file_name'] = filename
-
-        return metadata
-
-    def _write_file(self, filename, data):
-        path = self.get_local_file_path_including_filestore(filename)
-        os.makedirs(os.path.dirname(path), exist_ok=True)
-
-        if isinstance(data, bytes):
-            mode = 'wb'
-        else:
-            mode = 'w'
-
-        with open(path, mode) as f:
-            if isinstance(data, (bytes, str)):
-                f.write(data)
-            else:
-                json.dump(data, f)
-
     def _get_crawl_path(self):
         name = self.name
         if self.sample:
             name += '_sample'
         return os.path.join(name, self.get_start_time('%Y%m%d_%H%M%S'))

+    def get_start_time(self, format):
+        """
+        Returns the formatted start time of the crawl.
+        """
+        return self.crawler.stats.get_value('start_time').strftime(format)
+
     def _build_file_item(self, number, line, data_type, url, encoding):
         return {
             'success': True,
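As a worked example of where files end up, combining _get_crawl_path and get_start_time above with the fixture values used in tests/test_base_spider.py below (spider name 'test', sample mode on, crawl started 2001-02-03 04:05:06):

import os

files_store = 'data'                          # the FILES_STORE setting
crawl_path = os.path.join('test_sample',      # spider name plus '_sample' when sampling
                          '20010203_040506')  # get_start_time('%Y%m%d_%H%M%S')
print(os.path.join(files_store, crawl_path, 'file.json'))
# data/test_sample/20010203_040506/file.json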
50 changes: 50 additions & 0 deletions kingfisher_scrapy/extensions.py
@@ -8,6 +8,56 @@


 # https://docs.scrapy.org/en/latest/topics/extensions.html#writing-your-own-extension
+class KingfisherStoreFiles:
+    def __init__(self, directory, stats):
+        self.directory = directory
+        self.stats = stats
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        directory = crawler.settings['FILES_STORE']
+        extension = cls(directory, crawler.stats)
+        crawler.signals.connect(extension.item_scraped, signal=signals.item_scraped)
+        return extension
+
+    def item_scraped(self, item, spider):
+        """
+        Writes the item's data to the file name in the crawl's directory, writes a
+        ``<filename>.fileinfo`` metadata file next to it, and returns a dict with the metadata.
+        """
+        self._write_file(item['file_name'], item['data'], spider)
+
+        metadata = {
+            'url': item['url'],
+            'data_type': item['data_type'],
+            'encoding': item['encoding'],
+        }
+
+        self._write_file(item['file_name'] + '.fileinfo', metadata, spider)
+
+        metadata['success'] = True
+        metadata['file_name'] = item['file_name']
+        item['success'] = True
+
+        return metadata
+
+    def _write_file(self, filename, data, spider):
+        path = spider.get_local_file_path_including_filestore(filename)
+        os.makedirs(os.path.dirname(path), exist_ok=True)
+
+        if isinstance(data, bytes):
+            mode = 'wb'
+        else:
+            mode = 'w'
+
+        with open(path, mode) as f:
+            if isinstance(data, (bytes, str)):
+                f.write(data)
+            else:
+                json.dump(data, f)
+
+
 class KingfisherAPI:
     def __init__(self, url, key, directory=None):
         """
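A hedged sketch of driving the new extension directly, for example from a test; spider_with_crawler is the helper already used in tests/test_base_spider.py (its import path here is assumed), and everything else follows the code above:

from tempfile import TemporaryDirectory

from tests import spider_with_crawler  # assumed location of the test helper
from kingfisher_scrapy.extensions import KingfisherStoreFiles

spider = spider_with_crawler()
with TemporaryDirectory() as tmpdirname:
    spider.crawler.settings['FILES_STORE'] = tmpdirname
    extension = KingfisherStoreFiles.from_crawler(spider.crawler)
    item = spider.save_data_to_disk(b'{"key": "value"}', 'file.json',
                                    url='https://example.com/remote.json',
                                    data_type='release_package')
    extension.item_scraped(item, spider)  # writes file.json and file.json.fileinfo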
3 changes: 2 additions & 1 deletion kingfisher_scrapy/settings.py
@@ -67,7 +67,8 @@
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
 EXTENSIONS = {
-    'kingfisher_scrapy.extensions.KingfisherAPI': 0,
+    'kingfisher_scrapy.extensions.KingfisherStoreFiles': 100,
+    'kingfisher_scrapy.extensions.KingfisherAPI': 500,
 }

# Configure item pipelines
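Loosely, what Scrapy does with this setting at startup; a simplified sketch of the extension manager, not the project's code (the real manager also handles disabled entries and Scrapy's built-in defaults):

from scrapy.utils.misc import load_object

def build_extensions(crawler, extensions_setting):
    built = []
    # Entries are ordered by their integer value; each class is instantiated via
    # from_crawler(), which is where KingfisherStoreFiles connects item_scraped.
    for path in sorted(extensions_setting, key=extensions_setting.get):
        built.append(load_object(path).from_crawler(crawler))
    return built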
97 changes: 6 additions & 91 deletions tests/test_base_spider.py
@@ -52,91 +52,6 @@ def test_get_local_file_path_excluding_filestore(sample, expected):
     assert spider.get_local_file_path_excluding_filestore('file.json') == expected


-@pytest.mark.parametrize('sample,path', [
-    (None, 'test/20010203_040506/file.json'),
-    ('true', 'test_sample/20010203_040506/file.json'),
-])
-def test_save_response_to_disk(sample, path):
-    spider = spider_with_crawler(sample=sample)
-
-    with TemporaryDirectory() as tmpdirname:
-        files_store = os.path.join(tmpdirname, 'data')
-        spider.crawler.settings['FILES_STORE'] = files_store
-
-        response = Mock()
-        response.body = b'{"key": "value"}'
-        response.request = Mock()
-        response.request.url = 'https://example.com/remote.json'
-
-        actual = spider.save_response_to_disk(response, 'file.json', data_type='release_package',
-                                              encoding='iso-8859-1')
-
-        with open(os.path.join(files_store, path)) as f:
-            assert f.read() == '{"key": "value"}'
-
-        with open(os.path.join(files_store, path + '.fileinfo')) as f:
-            assert json.load(f) == {
-                'url': 'https://example.com/remote.json',
-                'data_type': 'release_package',
-                'encoding': 'iso-8859-1',
-            }
-
-        assert actual == {
-            'success': True,
-            'file_name': 'file.json',
-            "data_type": 'release_package',
-            "url": 'https://example.com/remote.json',
-            'encoding': 'iso-8859-1',
-        }
-
-
-@pytest.mark.parametrize('sample,path', [
-    (None, 'test/20010203_040506/file.json'),
-    ('true', 'test_sample/20010203_040506/file.json'),
-])
-def test_save_data_to_disk(sample, path):
-    spider = spider_with_crawler(sample=sample)
-
-    with TemporaryDirectory() as tmpdirname:
-        files_store = os.path.join(tmpdirname, 'data')
-        spider.crawler.settings['FILES_STORE'] = files_store
-
-        data = b'{"key": "value"}'
-        url = 'https://example.com/remote.json'
-
-        actual = spider.save_data_to_disk(data, 'file.json', url=url, data_type='release_package',
-                                          encoding='iso-8859-1')
-
-        with open(os.path.join(files_store, path)) as f:
-            assert f.read() == '{"key": "value"}'
-
-        with open(os.path.join(files_store, path + '.fileinfo')) as f:
-            assert json.load(f) == {
-                'url': 'https://example.com/remote.json',
-                'data_type': 'release_package',
-                'encoding': 'iso-8859-1',
-            }
-
-        assert actual == {
-            'success': True,
-            'file_name': 'file.json',
-            "data_type": 'release_package',
-            "url": 'https://example.com/remote.json',
-            'encoding': 'iso-8859-1',
-        }
-
-
-def test_save_data_to_disk_with_existing_directory():
-    spider = spider_with_crawler()
-
-    with TemporaryDirectory() as tmpdirname:
-        files_store = os.path.join(tmpdirname, 'data')
-        spider.crawler.settings['FILES_STORE'] = files_store
-        os.makedirs(os.path.join(files_store, 'test/20010203_040506'))
-
-        spider.save_data_to_disk(b'{"key": "value"}', 'file.json')  # no FileExistsError exception
-
-
 def test_next_link():
     url = 'https://example.com/remote.json'
     text_response = text.TextResponse('test')
@@ -173,7 +88,7 @@ def test_parse_next_link_200():
         spider = spider_with_crawler(spider_class=LinksSpider)
         spider.crawler.settings['FILES_STORE'] = files_store
         actual = spider.parse_next_link(response, None).__next__()
-        assert actual['success'] is True and actual['file_name'] == 'test'
+        assert actual['file_name'] == 'test'
         for item in spider.parse_next_link(response, None):
             assert item

@@ -209,7 +124,7 @@ def test_parse_zipfile_200():
         spider = spider_with_crawler(spider_class=ZipSpider)
         spider.crawler.settings['FILES_STORE'] = files_store
         actual = spider.parse_zipfile(response, None).__next__()
-        assert actual['success'] is True and actual['file_name'].find('.json')
+        assert actual['file_name'].find('.json')


def test_parse_zipfile_json_lines():
@@ -232,12 +147,12 @@ def test_parse_zipfile_json_lines():
         spider = spider_with_crawler(spider_class=ZipSpider)
         spider.crawler.settings['FILES_STORE'] = files_store
         actual = spider.parse_zipfile(response, None, file_format='json_lines').__next__()
-        assert actual['success'] is True and actual['number'] == 1
+        assert actual['number'] == 1
         spider.sample = True
         total = 0
         for item in spider.parse_zipfile(response, None, file_format='json_lines'):
             total = total + 1
-            assert item['success'] is True and item['number'] == total
+            assert item['number'] == total
         assert total == 10


@@ -265,7 +180,7 @@ def test_parse_zipfile_release_package():
         spider.crawler.settings['FILES_STORE'] = files_store
         actual = spider.parse_zipfile(response, None, file_format='release_package').__next__()
         data = json.loads(actual['data'])
-        assert actual['success'] is True and actual['number'] == 1
+        assert actual['number'] == 1
         assert data['publisher']['name'] == 'test'
         assert data['extensions'] == ['a', 'b']
         assert len(data['releases']) == spider.MAX_RELEASES_PER_PACKAGE
@@ -274,7 +189,7 @@ def test_parse_zipfile_release_package():
         for item in spider.parse_zipfile(response, None, file_format='release_package'):
             total = total + 1
             data = json.loads(item['data'])
-            assert item['success'] is True and item['number'] == total
+            assert item['number'] == total
             assert len(data['releases']) == spider.MAX_SAMPLE
         assert total == 1

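For reference, the .fileinfo sidecar written by the extension holds only the metadata, so with the fixture values from the removed tests above its parsed contents would be roughly:

fileinfo = {
    'url': 'https://example.com/remote.json',
    'data_type': 'release_package',
    'encoding': 'iso-8859-1',
}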
