Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

handle remote->local transfers with local hardlink/copy if possible

Reworked some of the hardlink / same file detection code to be a
little more general purpose.  Now it can be used to detect duplicate
files on either remote or local side.

When transferring remote->local, if we already have a copy (same
md5sum) of a file locally that we would otherwise transfer, don't
transfer, but hardlink it.  Should hardlink not be available (e.g. on
Windows), use shutil.copy2() instead.  This lets us avoid the second
download completely.

_get_filelist_local() grew an initial list argument.  This lets us
avoid copying / merging / updating a bunch of different lists back
into one - it starts as one list and grows.  Much cleaner (and the
fact these were separate cost me several hours of debugging to track
down why something would get set, like the by_md5 hash, only to have
it be empty shortly thereafter).
  • Loading branch information...
commit f881b162d24c3e318a3e32bd1c4e11500c1d35b9 1 parent cdf25f9
Matt Domsch authored committed

Showing 3 changed files with 35 additions and 36 deletions. Show diff stats Hide diff stats

  1. +14 12 S3/FileLists.py
  2. +8 23 S3/SortedDict.py
  3. +13 1 s3cmd
26 S3/FileLists.py
@@ -137,7 +137,7 @@ def handle_exclude_include_walk(root, dirs, files):
137 137 debug(u"PASS: %s" % (file))
138 138
139 139 def fetch_local_list(args, recursive = None):
140   - def _get_filelist_local(local_uri):
  140 + def _get_filelist_local(loc_list, local_uri):
141 141 info(u"Compiling list of local files...")
142 142 if local_uri.isdir():
143 143 local_base = deunicodise(local_uri.basename())
@@ -149,7 +149,6 @@ def _get_filelist_local(local_uri):
149 149 local_path = deunicodise(local_uri.dirname())
150 150 filelist = [( local_path, [], [deunicodise(local_uri.basename())] )]
151 151 single_file = True
152   - loc_list = SortedDict(ignore_case = False)
153 152 for root, dirs, files in filelist:
154 153 rel_root = root.replace(local_path, local_base, 1)
155 154 for f in files:
@@ -173,7 +172,6 @@ def _get_filelist_local(local_uri):
173 172 'full_name' : full_name,
174 173 'size' : sr.st_size,
175 174 'mtime' : sr.st_mtime,
176   - 'nlink' : sr.st_nlink, # record hardlink information
177 175 'dev' : sr.st_dev,
178 176 'inode' : sr.st_ino,
179 177 'uid' : sr.st_uid,
@@ -181,7 +179,9 @@ def _get_filelist_local(local_uri):
181 179 'sr': sr # save it all, may need it in preserve_attrs_list
182 180 ## TODO: Possibly more to save here...
183 181 }
184   - loc_list.record_hardlink(relative_file, sr.st_dev, sr.st_ino)
  182 + if 'md5' in cfg.sync_checks:
  183 + md5 = loc_list.get_md5(relative_file)
  184 + loc_list.record_hardlink(relative_file, sr.st_dev, sr.st_ino, md5)
185 185 return loc_list, single_file
186 186
187 187 cfg = Config()
@@ -204,8 +204,7 @@ def _get_filelist_local(local_uri):
204 204 local_uris.append(uri)
205 205
206 206 for uri in local_uris:
207   - list_for_uri, single_file = _get_filelist_local(uri)
208   - local_list.update(list_for_uri)
  207 + list_for_uri, single_file = _get_filelist_local(local_list, uri)
209 208
210 209 ## Single file is True if and only if the user
211 210 ## specified one local URI and that URI represents
@@ -264,7 +263,6 @@ def _get_filelist_remote(remote_uri, recursive = True):
264 263 'object_key' : object['Key'],
265 264 'object_uri_str' : object_uri_str,
266 265 'base_uri' : remote_uri,
267   - 'nlink' : 1, # S3 doesn't support hardlinks itself
268 266 'dev' : None,
269 267 'inode' : None,
270 268 }
@@ -406,7 +404,7 @@ def _compare(src_list, dst_lst, src_remote, dst_remote, file):
406 404 debug("Comparing filelists (direction: %s -> %s)" % (__direction_str(src_remote), __direction_str(dst_remote)))
407 405
408 406 for relative_file in src_list.keys():
409   - debug(u"CHECK: %s: %s" % (relative_file, src_list.get_md5(relative_file)))
  407 + debug(u"CHECK: %s" % (relative_file))
410 408
411 409 if dst_list.has_key(relative_file):
412 410 ## Was --skip-existing requested?
@@ -416,7 +414,14 @@ def _compare(src_list, dst_lst, src_remote, dst_remote, file):
416 414 del(dst_list[relative_file])
417 415 continue
418 416
419   - if _compare(src_list, dst_list, src_remote, dst_remote, relative_file):
  417 + try:
  418 + compare_result = _compare(src_list, dst_list, src_remote, dst_remote, relative_file)
  419 + except (IOError,OSError), e:
  420 + del(src_list[relative_file])
  421 + del(dst_list[relative_file])
  422 + continue
  423 +
  424 + if compare_result:
420 425 debug(u"IGNR: %s (transfer not needed)" % relative_file)
421 426 del(src_list[relative_file])
422 427 del(dst_list[relative_file])
@@ -434,7 +439,6 @@ def _compare(src_list, dst_lst, src_remote, dst_remote, file):
434 439 else:
435 440 # record that we will get this file transferred to us (before all the copies), so if we come across it later again,
436 441 # we can copy from _this_ copy (e.g. we only upload it once, and copy thereafter).
437   - debug(u"REMOTE COPY src before")
438 442 dst_list.record_md5(relative_file, md5)
439 443 update_list[relative_file] = src_list[relative_file]
440 444 del src_list[relative_file]
@@ -448,7 +452,6 @@ def _compare(src_list, dst_lst, src_remote, dst_remote, file):
448 452 if dst1 is not None:
449 453 # Found one, we want to copy
450 454 debug(u"REMOTE COPY dst: %s -> %s" % (dst1, relative_file))
451   - # FIXME this blows up when dst1 is not in dst_list, because we added it below in record_md5 but it's not really in dst_list.
452 455 copy_pairs.append((dst1, relative_file))
453 456 del(src_list[relative_file])
454 457 else:
@@ -456,7 +459,6 @@ def _compare(src_list, dst_lst, src_remote, dst_remote, file):
456 459 # record that we will get this file transferred to us (before all the copies), so if we come across it later again,
457 460 # we can copy from _this_ copy (e.g. we only upload it once, and copy thereafter).
458 461 dst_list.record_md5(relative_file, md5)
459   - debug(u"REMOTE COPY dst before")
460 462
461 463 for f in dst_list.keys():
462 464 if not src_list.has_key(f) and not update_list.has_key(f):
31 S3/SortedDict.py
@@ -27,7 +27,7 @@ def __init__(self, mapping = {}, ignore_case = True, **kwargs):
27 27 """
28 28 dict.__init__(self, mapping, **kwargs)
29 29 self.ignore_case = ignore_case
30   - self.hardlinks = dict()
  30 + self.hardlinks = dict() # { dev: { inode : {'md5':, 'relative_files':}}}
31 31 self.by_md5 = dict() # {md5: set(relative_files)}
32 32
33 33 def keys(self):
@@ -60,39 +60,24 @@ def find_md5_one(self, md5):
60 60 except:
61 61 return None
62 62
63   -
64 63 def get_md5(self, relative_file):
65 64 md5 = None
66 65 if 'md5' in self[relative_file]:
67 66 return self[relative_file]['md5']
68   - if self.is_hardlinked(relative_file): # speedup by getting it from one of the hardlinks already processed
69   - md5 = self.get_hardlink_md5(relative_file)
70   - if md5 is None:
71   - md5 = Utils.hash_file_md5(self[relative_file]['full_name'])
72   - self.record_md5(relative_file, md5)
73   - self.set_hardlink_md5(relative_file, md5)
74   - else:
75   - md5 = Utils.hash_file_md5(self[relative_file]['full_name'])
76   - self[relative_file]['md5'] = md5
77   - l.record_md5(relative_file, md5)
  67 + md5 = self.get_hardlink_md5(relative_file)
  68 + if md5 is None:
  69 + md5 = Utils.hash_file_md5(self[relative_file]['full_name'])
  70 + self.record_md5(relative_file, md5)
  71 + self[relative_file]['md5'] = md5
78 72 return md5
79 73
80   - def record_hardlink(self, relative_file, dev, inode):
  74 + def record_hardlink(self, relative_file, dev, inode, md5):
81 75 if dev not in self.hardlinks:
82 76 self.hardlinks[dev] = dict()
83 77 if inode not in self.hardlinks[dev]:
84   - self.hardlinks[dev][inode] = dict(md5=None, relative_files=set())
  78 + self.hardlinks[dev][inode] = dict(md5=md5, relative_files=set())
85 79 self.hardlinks[dev][inode]['relative_files'].add(relative_file)
86 80
87   - def set_hardlink_md5(self, relative_file, md5):
88   - dev = self[relative_file]['dev']
89   - inode = self[relative_file]['inode']
90   - self.record_hardlink(relative_file, dev, inode)
91   - self.hardlinks[dev][inode]['md5'] = md5
92   -
93   - def is_hardlinked(self, relative_file):
94   - return self[relative_file]['nlink'] > 1
95   -
96 81 def get_hardlink_md5(self, relative_file):
97 82 md5 = None
98 83 dev = self[relative_file]['dev']
14 s3cmd
@@ -23,6 +23,7 @@ import locale
23 23 import subprocess
24 24 import htmlentitydefs
25 25 import socket
  26 +import shutil
26 27
27 28 from copy import copy
28 29 from optparse import OptionParser, Option, OptionValueError, IndentedHelpFormatter
@@ -700,8 +701,9 @@ def cmd_sync_remote2local(args):
700 701 local_count = len(local_list)
701 702 remote_count = len(remote_list)
702 703 update_count = len(update_list)
  704 + copy_pairs_count = len(copy_pairs)
703 705
704   - info(u"Summary: %d remote files to download, %d local files to delete" % (remote_count + update_count, local_count))
  706 + info(u"Summary: %d remote files to download, %d local files to delete, %d local files to hardlink" % (remote_count + update_count, local_count, copy_pairs_count))
705 707
706 708 def _set_local_filename(remote_list, destination_base):
707 709 if not os.path.isdir(destination_base):
@@ -826,6 +828,7 @@ def cmd_sync_remote2local(args):
826 828 seq = 0
827 829 seq, total_size = _download(remote_list, seq, remote_count + update_count, total_size, dir_cache)
828 830 seq, total_size = _download(update_list, seq, remote_count + update_count, total_size, dir_cache)
  831 + local_hardlink(copy_pairs, destination_base)
829 832
830 833 total_elapsed = time.time() - timestamp_start
831 834 speed_fmt = formatSize(total_size/total_elapsed, human_readable = True, floating_point = True)
@@ -841,6 +844,15 @@ def cmd_sync_remote2local(args):
841 844 if cfg.delete_removed and cfg.delete_after:
842 845 _do_deletes(local_list)
843 846
def local_hardlink(copy_pairs, destination_base):
    """Create local duplicates of files that are already present locally.

    For every (dst1, dst2) pair, make destination_base + dst2 a hardlink
    to destination_base + dst1.  When hardlinking is not possible, fall
    back to shutil.copy2() so the duplicate is still created.

    copy_pairs       -- iterable of (existing_relative_name, new_relative_name)
    destination_base -- prefix joined to both names by plain string
                        concatenation (presumably ends with a path
                        separator -- confirm against the caller)
    """
    for (dst1, dst2) in copy_pairs:
        src_path = destination_base + dst1
        dst_path = destination_base + dst2
        try:
            os.link(src_path, dst_path)
        except (AttributeError, OSError):
            # AttributeError: os.link does not exist on this platform.
            # OSError: the link call itself failed.
            # Only these trigger the copy fallback; anything else (e.g.
            # KeyboardInterrupt) propagates instead of being swallowed.
            shutil.copy2(src_path, dst_path)
            debug(u"Hardlinking unavailable, copying %s to %s" % (src_path, dst_path))
        else:
            # Log outside the try body so a logging failure cannot be
            # mistaken for a failed hardlink and cause a needless copy.
            debug(u"Hardlinking %s to %s" % (src_path, dst_path))
  855 +
844 856 def remote_copy(s3, copy_pairs, destination_base):
845 857 saved_bytes = 0
846 858 for (dst1, dst2) in copy_pairs:

0 comments on commit f881b16

Please sign in to comment.
Something went wrong with that request. Please try again.