add local tree MD5 caching

This creates and maintains a cache (the HashCache) of each inode in
the local tree. It is used to avoid local disk I/O when calculating
a file's MD5 value: if the file's inode, mtime, and size are
unchanged, the cached MD5 is reused; if any of these values have
changed, the MD5 is recomputed from disk as before.

This introduces the command-line option --cache-file FILE. The file
is created if it does not exist, read at startup, and written on
close. Its contents are only valid for a given directory tree, so a
cache should not be reused across syncs of different directory trees.
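
A hypothetical invocation (bucket name and cache path made up): the
first sync populates the cache, and later syncs of the same tree skip
MD5 computation for files whose inode, mtime, and size are unchanged.

    s3cmd sync --cache-file ~/.s3cmd-hashcache ./photos/ s3://example-bucket/photos/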
commit 11e5755e38c464b29e4935642ef590b4c7813ce3 (1 parent: 7de0789)
@mdomsch authored · committed
Showing with 79 additions and 3 deletions.
  1. +1 −0  S3/Config.py
  2. +25 −3 S3/FileLists.py
  3. +52 −0 S3/HashCache.py
  4. +1 −0  s3cmd
1  S3/Config.py
@@ -87,6 +87,7 @@ class Config(object):
     website_error = ""
     website_endpoint = "http://%(bucket)s.s3-website-%(location)s.amazonaws.com/"
     additional_destinations = []
+    cache_file = ""
     ## Creating a singleton
     def __new__(self, configfile = None):
28 S3/FileLists.py
@@ -9,6 +9,7 @@
 from SortedDict import SortedDict
 from Utils import *
 from Exceptions import ParameterError
+from HashCache import HashCache
 from logging import debug, info, warning, error
@@ -137,7 +138,7 @@ def handle_exclude_include_walk(root, dirs, files):
             debug(u"PASS: %s" % (file))
 def fetch_local_list(args, recursive = None):
-    def _get_filelist_local(loc_list, local_uri):
+    def _get_filelist_local(loc_list, local_uri, cache):
         info(u"Compiling list of local files...")
         if local_uri.isdir():
             local_base = deunicodise(local_uri.basename())
@@ -180,11 +181,30 @@ def _get_filelist_local(loc_list, local_uri):
                     ## TODO: Possibly more to save here...
                 }
                 if 'md5' in cfg.sync_checks:
-                    md5 = loc_list.get_md5(relative_file)
+                    md5 = cache.md5(sr.st_dev, sr.st_ino, sr.st_mtime, sr.st_size)
+                    if md5 is None:
+                        md5 = loc_list.get_md5(relative_file) # this does the file I/O
+                        cache.add(sr.st_dev, sr.st_ino, sr.st_mtime, sr.st_size, md5)
                     loc_list.record_hardlink(relative_file, sr.st_dev, sr.st_ino, md5)
         return loc_list, single_file

+    def _maintain_cache(cache, local_list):
+        if cfg.cache_file:
+            cache.mark_all_for_purge()
+            for i in local_list.keys():
+                cache.unmark_for_purge(local_list[i]['dev'], local_list[i]['inode'], local_list[i]['mtime'], local_list[i]['size'])
+            cache.purge()
+            cache.save(cfg.cache_file)
+
     cfg = Config()
+
+    cache = HashCache()
+    if cfg.cache_file:
+        try:
+            cache.load(cfg.cache_file)
+        except IOError:
+            info(u"No cache file found, creating it.")
+
     local_uris = []
     local_list = SortedDict(ignore_case = False)
     single_file = False
@@ -204,7 +224,7 @@ def _get_filelist_local(loc_list, local_uri):
             local_uris.append(uri)
     for uri in local_uris:
-        list_for_uri, single_file = _get_filelist_local(local_list, uri)
+        list_for_uri, single_file = _get_filelist_local(local_list, uri, cache)

     ## Single file is True if and only if the user
     ## specified one local URI and that URI represents
@@ -214,6 +234,8 @@ def _get_filelist_local(loc_list, local_uri):
     if len(local_list) > 1:
         single_file = False
+    _maintain_cache(cache, local_list)
+
     return local_list, single_file

 def fetch_remote_list(args, require_attribs = False, recursive = None):
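
In outline, the lookup added to _get_filelist_local() behaves like the
sketch below; compute_md5 is a hypothetical stand-in for the
loc_list.get_md5() call that actually reads the file.

    import os

    def cached_md5(cache, path, compute_md5):
        # Key the cache on (dev, inode, mtime, size); a change in any
        # of these is treated as a content change.
        sr = os.stat(path)
        md5 = cache.md5(sr.st_dev, sr.st_ino, sr.st_mtime, sr.st_size)
        if md5 is None:
            md5 = compute_md5(path)  # only a cache miss reads file contents
            cache.add(sr.st_dev, sr.st_ino, sr.st_mtime, sr.st_size, md5)
        return md5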
52 S3/HashCache.py
@@ -0,0 +1,52 @@
+import cPickle as pickle
+
+class HashCache(object):
+    def __init__(self):
+        self.inodes = dict()  # nested dict: dev -> inode -> mtime -> {md5, size}
+
+    def add(self, dev, inode, mtime, size, md5):
+        if dev not in self.inodes:
+            self.inodes[dev] = dict()
+        if inode not in self.inodes[dev]:
+            self.inodes[dev][inode] = dict()
+        self.inodes[dev][inode][mtime] = dict(md5=md5, size=size)
+
+    def md5(self, dev, inode, mtime, size):
+        try:
+            d = self.inodes[dev][inode][mtime]
+            if d['size'] != size:
+                return None
+        except KeyError:  # not in the cache
+            return None
+        return d['md5']
+
+    def mark_all_for_purge(self):
+        for d in self.inodes.keys():
+            for i in self.inodes[d].keys():
+                for c in self.inodes[d][i].keys():
+                    self.inodes[d][i][c]['purge'] = True
+
+    def unmark_for_purge(self, dev, inode, mtime, size):
+        d = self.inodes.get(dev, {}).get(inode, {}).get(mtime, {})
+        if d.get('size') == size and 'purge' in d:
+            del d['purge']
+
+    def purge(self):
+        for d in self.inodes.keys():
+            for i in self.inodes[d].keys():
+                for m in self.inodes[d][i].keys():
+                    if 'purge' in self.inodes[d][i][m]:
+                        del self.inodes[d][i][m]  # drop only the stale entry
+
+    def save(self, f):
+        d = dict(inodes=self.inodes, version=1)
+        f = open(f, 'wb')
+        pickle.dump(d, f)
+        f.close()
+
+    def load(self, f):
+        f = open(f, 'rb')
+        d = pickle.load(f)
+        f.close()
+        if d.get('version') == 1 and 'inodes' in d:
+            self.inodes = d['inodes']
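
A minimal round-trip of the class above, with made-up stat values
(Python 2, matching the module's use of cPickle):

    from S3.HashCache import HashCache

    cache = HashCache()
    cache.add(dev=2049, inode=131074, mtime=1325376000, size=4096,
              md5="d41d8cd98f00b204e9800998ecf8427e")

    # Hit: an identical (dev, inode, mtime, size) returns the stored MD5.
    assert cache.md5(2049, 131074, 1325376000, 4096) is not None
    # Miss: any changed value (here, size) returns None, forcing a recompute.
    assert cache.md5(2049, 131074, 1325376000, 8192) is None

    # Entries not seen in the current run are dropped before saving.
    cache.mark_all_for_purge()
    cache.unmark_for_purge(2049, 131074, 1325376000, 4096)
    cache.purge()
    cache.save("hashcache.pickle")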
1  s3cmd
@@ -1671,6 +1671,7 @@ def main():
     optparser.add_option("-d", "--debug", dest="verbosity", action="store_const", const=logging.DEBUG, help="Enable debug output.")
     optparser.add_option(      "--version", dest="show_version", action="store_true", help="Show s3cmd version (%s) and exit." % (PkgInfo.version))
     optparser.add_option("-F", "--follow-symlinks", dest="follow_symlinks", action="store_true", default=False, help="Follow symbolic links as if they are regular files")
+    optparser.add_option(      "--cache-file", dest="cache_file", action="store", default="", metavar="FILE", help="Cache FILE containing local source MD5 values")
     optparser.set_usage(optparser.usage + " COMMAND [parameters]")
     optparser.set_description('S3cmd is a tool for managing objects in '+