Skip to content

Commit

Permalink
docs: Add link to ActiveSupport, and justify single subdirectory
Browse files (browse the repository at this point in the history)
  • Loading branch information
jpmckinney committed Sep 26, 2023
1 parent de00b42 commit 2fa447e
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 6 deletions.
6 changes: 3 additions & 3 deletions kingfisher_scrapy/extensions/database_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,9 +140,9 @@ def create_table(self, table):

def yield_items_from_directory(self, crawl_directory, prefix=''):
    """
    Yield the items parsed from every ``.json`` file under a directory tree.

    The tree rooted at ``crawl_directory`` is walked recursively with ``os.walk``. Each file whose name ends in
    ``.json`` is opened in binary mode and streamed through ``ijson.items``, so files are parsed lazily, one at a
    time, as the generator is consumed.

    :param crawl_directory: the root directory to walk
    :param str prefix: the prefix passed to ``ijson.items`` to select which values to yield
    :returns: a generator of the values produced by ``ijson.items`` for each matching file
    """
    for root, _, files in os.walk(crawl_directory):
        for name in files:
            if name.endswith('.json'):
                with open(os.path.join(root, name), 'rb') as f:
                    yield from ijson.items(f, prefix)

# Copied from kingfisher-summarize
Expand Down
9 changes: 6 additions & 3 deletions kingfisher_scrapy/extensions/files_store.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import math
import os
from zlib import adler32
import zlib

from scrapy import signals
from scrapy.exceptions import NotConfigured
Expand Down Expand Up @@ -84,10 +84,13 @@ def item_scraped(self, item, spider):

item['path'] = path

# https://github.com/rails/rails/blob/05ed261/activesupport/lib/active_support/cache/file_store.rb#L150-L175
@staticmethod
def _get_hashed_path(file_name):
    """
    Return the name of the subdirectory in which to store the file named ``file_name``.

    The name is the low 12 bits of the Adler-32 checksum of the encoded file name, formatted as three uppercase
    hexadecimal digits (``"000"`` to ``"FFF"``). The same file name always maps to the same subdirectory.

    :param str file_name: the name of the file to store
    :returns: a three-character uppercase hexadecimal directory name
    :rtype: str
    """
    checksum = zlib.adler32(file_name.encode())
    # 0x1000 is 4096 subdirectories, which should be sufficient without another level. (A second level would be
    # derived from the remaining bits of the checksum, i.e. (checksum // 0x1000) % 0x1000.)
    return "%03X" % (checksum % 0x1000)

def _write_file(self, path, data):
Expand Down

0 comments on commit 2fa447e

Please sign in to comment.