This repository has been archived by the owner on Nov 4, 2018. It is now read-only.

* s3cmd: New [fixbucket] command for fixing invalid object
  names in a given Bucket. For instance names with &#x08; in
  them (not sure how people manage to upload them but they do).
* S3/S3.py, S3/Utils.py, S3/Config.py: Support methods for 
  the above, plus advise user to run 'fixbucket' when XML parsing 
  fails.
* NEWS: Updated.



git-svn-id: https://s3tools.svn.sourceforge.net/svnroot/s3tools/s3cmd/trunk@395 830e0280-6d2a-0410-9c65-932aecc39d9d
mludvig committed Jun 2, 2009
1 parent b40dd81 commit 3c07424
Showing 6 changed files with 117 additions and 22 deletions.
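
The invalid object names matter because a control character, or a numeric character reference to one, is not well-formed XML 1.0, so a bucket listing containing such a key cannot be parsed. A minimal illustration of the failure, independent of the s3cmd code and using only the standard library (the 2009-era code catches expat's ExpatError; modern ElementTree raises ParseError):

import xml.etree.ElementTree as ET

# A listing fragment whose key contains a reference to backspace (0x08).
listing = '<ListBucketResult><Contents><Key>report&#x8;2009.txt</Key></Contents></ListBucketResult>'
try:
    ET.fromstring(listing)
except ET.ParseError as e:
    print("Listing cannot be parsed: %s" % e)   # "reference to invalid character number ..."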
10 changes: 10 additions & 0 deletions ChangeLog
@@ -1,3 +1,13 @@
+2009-06-02 Michal Ludvig <michal@logix.cz>
+
+	* s3cmd: New [fixbucket] command for fixing invalid object
+	  names in a given Bucket. For instance names with &#x08; in
+	  them (not sure how people manage to upload them but they do).
+	* S3/S3.py, S3/Utils.py, S3/Config.py: Support methods for
+	  the above, plus advise user to run 'fixbucket' when XML parsing
+	  fails.
+	* NEWS: Updated.
+
 2009-05-29 Michal Ludvig <michal@logix.cz>
 
 	* S3/Utils.py: New function replace_nonprintables()
3 changes: 3 additions & 0 deletions NEWS
@@ -10,6 +10,9 @@ s3cmd 1.0.0
 * Added --exclude/--include and --dry-run for [del], [setacl].
 * Neutralise characters that are invalid in XML to avoid ExpatErrors.
   http://boodebr.org/main/python/all-about-python-and-unicode
+* New command [fixbucket] for fixing invalid object names
+  in a given Bucket. For instance names with &#x08; in them
+  (not sure how people manage to upload them but they do).
 
 s3cmd 0.9.9 - 2009-02-17
 ===========
2 changes: 1 addition & 1 deletion S3/Config.py
@@ -68,7 +68,7 @@ class Config(object):
     debug_exclude = {}
     debug_include = {}
     encoding = "utf-8"
-    verbatim = False
+    urlencoding_mode = "normal"
 
     ## Creating a singleton
     def __new__(self, configfile = None):
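
Read together with the other hunks in this commit, the old boolean verbatim becomes a three-valued urlencoding_mode. A purely illustrative summary of the three values and where each is set (the dict below is not s3cmd code; the values and their origins are taken from the diffs in this commit):

URLENCODING_MODES = {
    "normal":    "default (S3/Config.py); non-printable characters are replaced, with a logged warning",
    "verbatim":  "set by the --verbatim option; object names are sent exactly as given",
    "fixbucket": "set internally by cmd_fixbucket(); non-printables are percent-encoded instead",
}
for mode, origin in URLENCODING_MODES.items():
    print("%-10s %s" % (mode, origin))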
37 changes: 23 additions & 14 deletions S3/S3.py
@@ -174,26 +174,29 @@ def _get_common_prefixes(data):
             return getListFromXml(data, "CommonPrefixes")
 
         uri_params = {}
-        if prefix:
-            uri_params['prefix'] = self.urlencode_string(prefix)
-        if not self.config.recursive and not recursive:
-            uri_params['delimiter'] = "/"
-        request = self.create_request("BUCKET_LIST", bucket = bucket, **uri_params)
-        response = self.send_request(request)
-        #debug(response)
+        response = self.bucket_list_noparse(bucket, prefix, recursive, uri_params)
         list = _get_contents(response["data"])
         prefixes = _get_common_prefixes(response["data"])
         while _list_truncated(response["data"]):
             uri_params['marker'] = self.urlencode_string(list[-1]["Key"])
             debug("Listing continues after '%s'" % uri_params['marker'])
-            request = self.create_request("BUCKET_LIST", bucket = bucket, **uri_params)
-            response = self.send_request(request)
+            response = self.bucket_list_noparse(bucket, prefix, recursive, uri_params)
             list += _get_contents(response["data"])
             prefixes += _get_common_prefixes(response["data"])
         response['list'] = list
         response['common_prefixes'] = prefixes
         return response
 
+    def bucket_list_noparse(self, bucket, prefix = None, recursive = None, uri_params = {}):
+        if prefix:
+            uri_params['prefix'] = self.urlencode_string(prefix)
+        if not self.config.recursive and not recursive:
+            uri_params['delimiter'] = "/"
+        request = self.create_request("BUCKET_LIST", bucket = bucket, **uri_params)
+        response = self.send_request(request)
+        #debug(response)
+        return response
+
     def bucket_create(self, bucket, bucket_location = None):
         headers = SortedDict(ignore_case = True)
         body = ""
@@ -320,11 +323,14 @@ def set_acl(self, uri, acl):
         return response
 
     ## Low level methods
-    def urlencode_string(self, string):
+    def urlencode_string(self, string, urlencoding_mode = None):
         if type(string) == unicode:
             string = string.encode("utf-8")
 
-        if self.config.verbatim:
+        if urlencoding_mode is None:
+            urlencoding_mode = self.config.urlencoding_mode
+
+        if urlencoding_mode == "verbatim":
             ## Don't do any pre-processing
             return string
 
@@ -345,9 +351,12 @@ def urlencode_string(self, string):
             # [hope that sounds reassuring ;-)]
             o = ord(c)
             if (o < 0x20 or o == 0x7f):
-                error(u"Non-printable character 0x%02x in: %s" % (o, string))
-                error(u"Please report it to s3tools-bugs@lists.sourceforge.net")
-                encoded += replace_nonprintables(c)
+                if urlencoding_mode == "fixbucket":
+                    encoded += "%%%02X" % o
+                else:
+                    error(u"Non-printable character 0x%02x in: %s" % (o, string))
+                    error(u"Please report it to s3tools-bugs@lists.sourceforge.net")
+                    encoded += replace_nonprintables(c)
             elif (o == 0x20 or    # Space and below
                   o == 0x22 or    # "
                   o == 0x23 or    # #
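
Pulled out of urlencode_string() for clarity, a standalone sketch of how the three modes treat a single non-printable character; the underscore used in the "normal" case stands in for whatever replace_nonprintables() substitutes and is an assumption, not taken from this diff:

def encode_char(c, urlencoding_mode="normal"):
    o = ord(c)
    if urlencoding_mode == "verbatim":
        return c                      # --verbatim: no pre-processing at all
    if o < 0x20 or o == 0x7f:
        if urlencoding_mode == "fixbucket":
            return "%%%02X" % o       # percent-encode so the broken key stays addressable
        return "_"                    # "normal": replaced via replace_nonprintables() (character assumed)
    return c

print(encode_char("\x08", "fixbucket"))   # -> %08
print(encode_char("\x08", "normal"))      # -> _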
16 changes: 11 additions & 5 deletions S3/Utils.py
@@ -21,11 +21,13 @@
 from logging import debug, info, warning, error
 
 import Config
+import Exceptions
 
 try:
     import xml.etree.ElementTree as ET
 except ImportError:
     import elementtree.ElementTree as ET
+from xml.parsers.expat import ExpatError
 
 def parseNodes(nodes):
     ## WARNING: Ignores text nodes from mixed xml/text.
@@ -57,10 +59,14 @@ def stripNameSpace(xml):
 
 def getTreeFromXml(xml):
     xml, xmlns = stripNameSpace(xml)
-    tree = ET.fromstring(xml)
-    if xmlns:
-        tree.attrib['xmlns'] = xmlns
-    return tree
+    try:
+        tree = ET.fromstring(xml)
+        if xmlns:
+            tree.attrib['xmlns'] = xmlns
+        return tree
+    except ExpatError, e:
+        error(e)
+        raise Exceptions.ParameterError("Bucket contains invalid filenames. Please run: s3cmd fixbucket s3://your-bucket/")
 
 def getListFromXml(xml, node):
     tree = getTreeFromXml(xml)
@@ -275,7 +281,7 @@ def replace_nonprintables(string):
             modified += 1
         else:
             new_string += c
-    if modified:
+    if modified and Config.Config().urlencoding_mode != "fixbucket":
         warning("%d non-printable characters replaced in: %s" % (modified, new_string))
     return new_string
 
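
The getTreeFromXml() change is the "advise the user" half of the commit: a low-level parse failure becomes an actionable message. A self-contained sketch of the same pattern, with illustrative names rather than s3cmd's (modern ElementTree raises ParseError where the 2009 code catches expat's ExpatError):

import xml.etree.ElementTree as ET

class ParameterError(Exception):
    pass

def parse_listing(xml_text):
    try:
        return ET.fromstring(xml_text)
    except ET.ParseError:
        # Surface the remedy instead of a bare parser traceback.
        raise ParameterError("Bucket contains invalid filenames. Please run: s3cmd fixbucket s3://your-bucket/")

try:
    parse_listing('<ListBucketResult><Contents><Key>file&#x8;name</Key></Contents></ListBucketResult>')
except ParameterError as e:
    print(e)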
71 changes: 69 additions & 2 deletions s3cmd
@@ -21,6 +21,7 @@ import traceback
 import codecs
 import locale
 import subprocess
+import htmlentitydefs
 
 from copy import copy
 from optparse import OptionParser, Option, OptionValueError, IndentedHelpFormatter
@@ -638,7 +639,7 @@ def _get_filelist_local(local_uri):
                 ## for now skip over
                 continue
             relative_file = unicodise(os.path.join(rel_root, f))
-            if not cfg.verbatim:
+            if cfg.urlencoding_mode == "normal":
                 relative_file = replace_nonprintables(relative_file)
             if relative_file.startswith('./'):
                 relative_file = relative_file[2:]
@@ -1117,6 +1118,71 @@ def cmd_sign(args):
     signature = Utils.sign_string(string_to_sign)
     output("Signature: %s" % signature)
 
+def cmd_fixbucket(args):
+    def _unescape(text):
+        ##
+        # Removes HTML or XML character references and entities from a text string.
+        #
+        # @param text The HTML (or XML) source text.
+        # @return The plain text, as a Unicode string, if necessary.
+        #
+        # From: http://effbot.org/zone/re-sub.htm#unescape-html
+        def _unescape_fixup(m):
+            text = m.group(0)
+            if not htmlentitydefs.name2codepoint.has_key('apos'):
+                htmlentitydefs.name2codepoint['apos'] = ord("'")
+            if text[:2] == "&#":
+                # character reference
+                try:
+                    if text[:3] == "&#x":
+                        return unichr(int(text[3:-1], 16))
+                    else:
+                        return unichr(int(text[2:-1]))
+                except ValueError:
+                    pass
+            else:
+                # named entity
+                try:
+                    text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
+                except KeyError:
+                    pass
+            return text # leave as is
+        return re.sub("&#?\w+;", _unescape_fixup, text)
+
+    cfg.urlencoding_mode = "fixbucket"
+    s3 = S3(cfg)
+
+    count = 0
+    for arg in args:
+        culprit = S3Uri(arg)
+        if culprit.type != "s3":
+            raise ParameterError("Expecting S3Uri instead of: %s" % arg)
+        response = s3.bucket_list_noparse(culprit.bucket(), culprit.object(), recursive = True)
+        r_xent = re.compile("&#x[\da-fA-F]+;")
+        keys = re.findall("<Key>(.*?)</Key>", response['data'], re.MULTILINE)
+        debug("Keys: %r" % keys)
+        for key in keys:
+            if r_xent.search(key):
+                info("Fixing: %s" % key)
+                debug("Step 1: Transforming %s" % key)
+                key_bin = _unescape(key)
+                debug("Step 2: ... to %s" % key_bin)
+                key_new = replace_nonprintables(key_bin)
+                debug("Step 3: ... then to %s" % key_new)
+                src = S3Uri("s3://%s/%s" % (culprit.bucket(), key_bin))
+                dst = S3Uri("s3://%s/%s" % (culprit.bucket(), key_new))
+                resp_move = s3.object_move(src, dst)
+                if resp_move['status'] == 200:
+                    output("File %r renamed to %s" % (key_bin, key_new))
+                    count += 1
+                else:
+                    error("Something went wrong for: %r" % key)
+                    error("Please report the problem to s3tools-bugs@lists.sourceforge.net")
+    if count > 0:
+        warning("Fixed %d files' names. Their ACL were reset to Private." % count)
+        warning("Use 's3cmd setacl --acl-public s3://...' to make")
+        warning("them publicly readable if required.")
+
 def resolve_list(lst, args):
     retval = []
     for item in lst:
@@ -1351,6 +1417,7 @@ def get_commands_list():
     {"cmd":"mv", "label":"Move object", "param":"s3://BUCKET1/OBJECT1 s3://BUCKET2[/OBJECT2]", "func":cmd_mv, "argc":2},
     {"cmd":"setacl", "label":"Modify Access control list for Bucket or Files", "param":"s3://BUCKET[/OBJECT]", "func":cmd_setacl, "argc":1},
     {"cmd":"sign", "label":"Sign arbitrary string using the secret key", "param":"STRING-TO-SIGN", "func":cmd_sign, "argc":1},
+    {"cmd":"fixbucket", "label":"Fix invalid file names in a bucket", "param":"s3://BUCKET[/PREFIX]", "func":cmd_fixbucket, "argc":1},
 
     ## CloudFront commands
     {"cmd":"cflist", "label":"List CloudFront distribution points", "param":"", "func":CfCmd.info, "argc":0},
@@ -1445,7 +1512,7 @@ def main():
     optparser.add_option( "--add-header", dest="add_header", action="append", metavar="NAME:VALUE", help="Add a given HTTP header to the upload request. Can be used multiple times. For instance set 'Expires' or 'Cache-Control' headers (or both) using this options if you like.")
 
     optparser.add_option( "--encoding", dest="encoding", metavar="ENCODING", help="Override autodetected terminal and filesystem encoding (character set). Autodetected: %s" % preferred_encoding)
-    optparser.add_option( "--verbatim", dest="verbatim", action="store_true", help="Use the S3 name as given on the command line. No pre-processing, encoding, etc. Use with caution!")
+    optparser.add_option( "--verbatim", dest="urlencoding_mode", action="store_const", const="verbatim", help="Use the S3 name as given on the command line. No pre-processing, encoding, etc. Use with caution!")
 
     optparser.add_option( "--list-md5", dest="list_md5", action="store_true", help="Include MD5 sums in bucket listings (only for 'ls' command).")
     optparser.add_option("-H", "--human-readable-sizes", dest="human_readable_sizes", action="store_true", help="Print sizes in human readable form (eg 1kB instead of 1234).")
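
For readers who just want the gist of cmd_fixbucket, a rough sketch of the three logged steps for one broken key; the helper below is illustrative, not the s3cmd implementation, and the underscore replacement character is an assumption:

import re

def fixed_key_name(key):
    # Step 1: turn numeric character references such as "&#x08;" back into raw characters
    key_bin = re.sub(r"&#x([0-9a-fA-F]+);", lambda m: chr(int(m.group(1), 16)), key)
    # Step 2: replace the now-raw non-printables, as replace_nonprintables() would
    key_new = re.sub(r"[\x00-\x1f\x7f]", "_", key_bin)
    # Step 3: the real command then moves the object to the repaired name (resetting its ACL to private)
    return key_new

print(fixed_key_name("report&#x08;2009.txt"))   # -> report_2009.txt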
