docs: Add link to ActiveSupport, and justify single subdirectory
jpmckinney committed Sep 26, 2023
1 parent de00b42 · commit 5b2bf19
Showing 4 changed files with 22 additions and 19 deletions.
6 changes: 3 additions & 3 deletions kingfisher_scrapy/extensions/database_store.py
@@ -140,9 +140,9 @@ def create_table(self, table):

     def yield_items_from_directory(self, crawl_directory, prefix=''):
         for root, dirs, files in os.walk(crawl_directory):
-            for file_name in files:
-                if file_name.endswith('.json'):
-                    with open(os.path.join(root, file_name), 'rb') as f:
+            for name in files:
+                if name.endswith('.json'):
+                    with open(os.path.join(root, name), 'rb') as f:
                         yield from ijson.items(f, prefix)

     # Copied from kingfisher-summarize
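As context for the `yield from ijson.items(f, prefix)` call above: `ijson` streams the objects matching a prefix without loading the whole file into memory. A minimal sketch, assuming a hypothetical release-package file and the `releases.item` prefix (neither appears in this diff):

```python
import ijson

# Stream each element of the top-level "releases" array, one at a time,
# instead of parsing the whole document with json.load().
with open('release_package.json', 'rb') as f:  # hypothetical file
    for release in ijson.items(f, 'releases.item'):
        print(release['ocid'])  # assumes each release carries an 'ocid'
```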
13 changes: 8 additions & 5 deletions kingfisher_scrapy/extensions/files_store.py
@@ -1,6 +1,6 @@
 import math
 import os
-from zlib import adler32
+import zlib

 from scrapy import signals
 from scrapy.exceptions import NotConfigured
@@ -79,15 +79,18 @@ def item_scraped(self, item, spider):
         name, extension = util.get_file_name_and_extension(file_name)
         file_name = f"{name}-{item['number']}.{extension}"

-        path = os.path.join(self.relative_crawl_directory(spider), self._get_hashed_path(file_name), file_name)
+        path = os.path.join(self.relative_crawl_directory(spider), self._get_subdirectory(file_name), file_name)
         self._write_file(path, item['data'])

         item['path'] = path

+    # https://github.com/rails/rails/blob/05ed261/activesupport/lib/active_support/cache/file_store.rb#L150-L175
     @staticmethod
-    def _get_hashed_path(file_name):
-        hash_dir = adler32(file_name.encode())
-        hash_dir, dir_1 = divmod(hash_dir, 0x1000)
+    def _get_subdirectory(file_name):
+        checksum = zlib.adler32(file_name.encode())
+        checksum, dir_1 = divmod(checksum, 0x1000)
+        # 0x1000 is 4096, which should be sufficient, without another level.
+        # dir_2 = checksum % 0x1000
         return "%03X" % dir_1

     def _write_file(self, path, data):
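To see what the renamed `_get_subdirectory` produces, here is a standalone sketch of the same logic (not an import from the project). It reproduces the values hardcoded in the updated tests below: 'file.json' maps to '389' and 'file-1.json' to '3E7'.

```python
import zlib

def _get_subdirectory(file_name):
    # Adler-32 checksum of the file name; divmod by 0x1000 (4096) keeps
    # the low 12 bits as the bucket, rendered as 3 hex digits.
    checksum = zlib.adler32(file_name.encode())
    checksum, dir_1 = divmod(checksum, 0x1000)
    return "%03X" % dir_1

print(_get_subdirectory('file.json'))    # 389
print(_get_subdirectory('file-1.json'))  # 3E7
```

With 4096 buckets, even a crawl that writes a million files averages roughly 244 files per subdirectory, which is why the second level (`dir_2`) is left commented out.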
10 changes: 5 additions & 5 deletions tests/extensions/test_files_store.py
@@ -135,15 +135,15 @@ def test_item_scraped_with_build_file_from_response(sample, path, tmpdir):
     ('true', os.path.join('test_sample', '20010203_040506')),
 ])
 @pytest.mark.parametrize('data', [b'{"key": "value"}', {"key": "value"}])
-@pytest.mark.parametrize('item,expected_file_name', [
-    (File({'file_name': 'file.json'}), 'file.json'),
-    (FileItem({'number': 1, 'file_name': 'file.json'}), 'file-1.json')
+@pytest.mark.parametrize('item,subdirectory,expected_file_name', [
+    (File({'file_name': 'file.json'}), '389', 'file.json'),
+    (FileItem({'number': 1, 'file_name': 'file.json'}), '3E7', 'file-1.json')
 ])
-def test_item_scraped_with_file_and_file_item(sample, directory, data, item, expected_file_name, tmpdir):
+def test_item_scraped_with_file_and_file_item(sample, directory, data, item, subdirectory, expected_file_name, tmpdir):
     spider = spider_with_files_store(tmpdir, sample=sample)
     extension = FilesStore.from_crawler(spider.crawler)

-    path = os.path.join(directory, FilesStore._get_hashed_path(expected_file_name), expected_file_name)
+    path = os.path.join(directory, subdirectory, expected_file_name)
     original_file_name = item['file_name']
     item['data'] = data
     extension.item_scraped(item, spider)
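Note the design choice: the tests now hardcode the expected subdirectories ('389', '3E7') instead of calling the (now renamed) private helper, so an accidental change to the hashing scheme would fail the tests. With sampling enabled, the full expected path assembles as in this sketch, using the parametrized values above:

```python
import os

# directory='test_sample/20010203_040506' (sample mode),
# subdirectory='389', expected_file_name='file.json'
path = os.path.join('test_sample', '20010203_040506', '389', 'file.json')
# -> 'test_sample/20010203_040506/389/file.json' on POSIX
```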
12 changes: 6 additions & 6 deletions tests/extensions/test_kingfisher_process_api2.py
@@ -15,20 +15,20 @@
 SKIP_TEST_IF = not RABBIT_URL and ('CI' not in os.environ or 'CI_SKIP' in os.environ)

 items_scraped = [
-    ('build_file', 'file.json', {
+    ('build_file', '389', 'file.json', {
         'file_name': 'file.json',
         'url': 'https://example.com/remote.json',
         'data': b'{"key": "value"}',
         'data_type': 'release_package',
     }),
-    (FileItem, 'file-1.json', {
+    (FileItem, '3E7', 'file-1.json', {
         'number': 1,
         'file_name': 'file.json',
         'url': 'https://example.com/remote.json',
         'data': b'{"key": "value"}',
         'data_type': 'release_package',
     }),
-    (FileError, 'file.json', {
+    (FileError, '389', 'file.json', {
         'file_name': 'file.json',
         'url': 'https://example.com/remote.json',
         'errors': {'http_code': 500},
@@ -236,9 +236,9 @@ def test_spider_closed_missing_collection_id(tmpdir):


 @pytest.mark.skipif(SKIP_TEST_IF, reason='RABBIT_URL must be set')
-@pytest.mark.parametrize('initializer,filename,kwargs', items_scraped)
+@pytest.mark.parametrize('initializer,directory,filename,kwargs', items_scraped)
 @pytest.mark.parametrize('raises,infix', [(False, 'sent'), (True, 'failed')])
-def test_item_scraped(initializer, filename, kwargs, raises, infix, tmpdir, caplog):
+def test_item_scraped(initializer, directory, filename, kwargs, raises, infix, tmpdir, caplog):
     spider = spider_with_files_store(tmpdir, settings={
         'RABBIT_URL': RABBIT_URL,
         'RABBIT_EXCHANGE_NAME': 'kingfisher_process_test',
@@ -279,7 +279,7 @@ def test_item_scraped(initializer, filename, kwargs, raises, infix, tmpdir, capl
     if initializer is FileError:
         expected['errors'] = '{"http_code": 500}'
     else:
-        expected['path'] = os.path.join('test', '20010203_040506', FilesStore._get_hashed_path(filename), filename)
+        expected['path'] = os.path.join('test', '20010203_040506', directory, filename)

     if raises:
         extension._publish_to_rabbit.assert_called_once()
