Skip to content

Commit

Permalink
* s3cmd: New [fixbucket] command for fixing invalid object
Browse files Browse the repository at this point in the history
  names in a given Bucket. For instance names with &#x08; in
  them (not sure how people manage to upload them but they do).
* S3/S3.py, S3/Utils.py, S3/Config.py: Support methods for 
  the above, plus advise user to run 'fixbucket' when XML parsing 
  fails.
* NEWS: Updated.



git-svn-id: http://s3tools.svn.sourceforge.net/svnroot/s3tools/s3cmd/trunk@395 830e0280-6d2a-0410-9c65-932aecc39d9d
  • Loading branch information
ludvigm committed Jun 2, 2009
1 parent 3572723 commit 738388a
Show file tree
Hide file tree
Showing 6 changed files with 117 additions and 22 deletions.
10 changes: 10 additions & 0 deletions ChangeLog
@@ -1,3 +1,13 @@
2009-06-02 Michal Ludvig <michal@logix.cz>

* s3cmd: New [fixbucket] command for fixing invalid object
names in a given Bucket. For instance names with &#x08; in
them (not sure how people manage to upload them but they do).
* S3/S3.py, S3/Utils.py, S3/Config.py: Support methods for
the above, plus advise user to run 'fixbucket' when XML parsing
fails.
* NEWS: Updated.

2009-05-29 Michal Ludvig <michal@logix.cz>

* S3/Utils.py: New function replace_nonprintables()
Expand Down
3 changes: 3 additions & 0 deletions NEWS
Expand Up @@ -10,6 +10,9 @@ s3cmd 1.0.0
* Added --exclude/--include and --dry-run for [del], [setacl].
* Neutralise characters that are invalid in XML to avoid ExpatErrors.
http://boodebr.org/main/python/all-about-python-and-unicode
* New command [fixbucket] for fixing invalid object names
in a given Bucket. For instance names with &#x08; in them
(not sure how people manage to upload them but they do).

s3cmd 0.9.9 - 2009-02-17
===========
Expand Down
2 changes: 1 addition & 1 deletion S3/Config.py
Expand Up @@ -68,7 +68,7 @@ class Config(object):
debug_exclude = {}
debug_include = {}
encoding = "utf-8"
verbatim = False
urlencoding_mode = "normal"

## Creating a singleton
def __new__(self, configfile = None):
Expand Down
37 changes: 23 additions & 14 deletions S3/S3.py
Expand Up @@ -174,26 +174,29 @@ def _get_common_prefixes(data):
return getListFromXml(data, "CommonPrefixes")

uri_params = {}
if prefix:
uri_params['prefix'] = self.urlencode_string(prefix)
if not self.config.recursive and not recursive:
uri_params['delimiter'] = "/"
request = self.create_request("BUCKET_LIST", bucket = bucket, **uri_params)
response = self.send_request(request)
#debug(response)
response = self.bucket_list_noparse(bucket, prefix, recursive, uri_params)
list = _get_contents(response["data"])
prefixes = _get_common_prefixes(response["data"])
while _list_truncated(response["data"]):
uri_params['marker'] = self.urlencode_string(list[-1]["Key"])
debug("Listing continues after '%s'" % uri_params['marker'])
request = self.create_request("BUCKET_LIST", bucket = bucket, **uri_params)
response = self.send_request(request)
response = self.bucket_list_noparse(bucket, prefix, recursive, uri_params)
list += _get_contents(response["data"])
prefixes += _get_common_prefixes(response["data"])
response['list'] = list
response['common_prefixes'] = prefixes
return response

def bucket_list_noparse(self, bucket, prefix = None, recursive = None, uri_params = None):
    """
    Issue a single BUCKET_LIST request and return the raw (unparsed)
    response dict as produced by send_request().

    @param bucket: bucket name to list
    @param prefix: optional key prefix; URL-encoded before use
    @param recursive: when false (and config.recursive is false) a '/'
           delimiter is added so the listing stops at "directory" level
    @param uri_params: optional extra query parameters (e.g. 'marker'
           for continuation); the caller's dict is never modified
    """
    ## Work on a copy: the previous `uri_params = {}` default was a
    ## shared mutable dict, so 'prefix'/'delimiter' set by one call
    ## leaked into every subsequent call that used the default.
    uri_params = dict(uri_params) if uri_params else {}
    if prefix:
        uri_params['prefix'] = self.urlencode_string(prefix)
    if not self.config.recursive and not recursive:
        uri_params['delimiter'] = "/"
    request = self.create_request("BUCKET_LIST", bucket = bucket, **uri_params)
    response = self.send_request(request)
    #debug(response)
    return response

def bucket_create(self, bucket, bucket_location = None):
headers = SortedDict(ignore_case = True)
body = ""
Expand Down Expand Up @@ -320,11 +323,14 @@ def set_acl(self, uri, acl):
return response

## Low level methods
def urlencode_string(self, string):
def urlencode_string(self, string, urlencoding_mode = None):
if type(string) == unicode:
string = string.encode("utf-8")

if self.config.verbatim:
if urlencoding_mode is None:
urlencoding_mode = self.config.urlencoding_mode

if urlencoding_mode == "verbatim":
## Don't do any pre-processing
return string

Expand All @@ -345,9 +351,12 @@ def urlencode_string(self, string):
# [hope that sounds reassuring ;-)]
o = ord(c)
if (o < 0x20 or o == 0x7f):
error(u"Non-printable character 0x%02x in: %s" % (o, string))
error(u"Please report it to s3tools-bugs@lists.sourceforge.net")
encoded += replace_nonprintables(c)
if urlencoding_mode == "fixbucket":
encoded += "%%%02X" % o
else:
error(u"Non-printable character 0x%02x in: %s" % (o, string))
error(u"Please report it to s3tools-bugs@lists.sourceforge.net")
encoded += replace_nonprintables(c)
elif (o == 0x20 or # Space and below
o == 0x22 or # "
o == 0x23 or # #
Expand Down
16 changes: 11 additions & 5 deletions S3/Utils.py
Expand Up @@ -21,11 +21,13 @@
from logging import debug, info, warning, error

import Config
import Exceptions

try:
import xml.etree.ElementTree as ET
except ImportError:
import elementtree.ElementTree as ET
from xml.parsers.expat import ExpatError

def parseNodes(nodes):
## WARNING: Ignores text nodes from mixed xml/text.
Expand Down Expand Up @@ -57,10 +59,14 @@ def stripNameSpace(xml):

def getTreeFromXml(xml):
xml, xmlns = stripNameSpace(xml)
tree = ET.fromstring(xml)
if xmlns:
tree.attrib['xmlns'] = xmlns
return tree
try:
tree = ET.fromstring(xml)
if xmlns:
tree.attrib['xmlns'] = xmlns
return tree
except ExpatError, e:
error(e)
raise Exceptions.ParameterError("Bucket contains invalid filenames. Please run: s3cmd fixbucket s3://your-bucket/")

def getListFromXml(xml, node):
tree = getTreeFromXml(xml)
Expand Down Expand Up @@ -275,7 +281,7 @@ def replace_nonprintables(string):
modified += 1
else:
new_string += c
if modified:
if modified and Config.Config().urlencoding_mode != "fixbucket":
warning("%d non-printable characters replaced in: %s" % (modified, new_string))
return new_string

Expand Down
71 changes: 69 additions & 2 deletions s3cmd
Expand Up @@ -21,6 +21,7 @@ import traceback
import codecs
import locale
import subprocess
import htmlentitydefs

from copy import copy
from optparse import OptionParser, Option, OptionValueError, IndentedHelpFormatter
Expand Down Expand Up @@ -638,7 +639,7 @@ def _get_filelist_local(local_uri):
## for now skip over
continue
relative_file = unicodise(os.path.join(rel_root, f))
if not cfg.verbatim:
if cfg.urlencoding_mode == "normal":
relative_file = replace_nonprintables(relative_file)
if relative_file.startswith('./'):
relative_file = relative_file[2:]
Expand Down Expand Up @@ -1117,6 +1118,71 @@ def cmd_sign(args):
signature = Utils.sign_string(string_to_sign)
output("Signature: %s" % signature)

def cmd_fixbucket(args):
    """
    Handler for the 'fixbucket' command: rename S3 objects whose names
    contain raw control characters (which break the XML bucket listing).

    For each s3:// URI in 'args' the bucket is listed with XML parsing
    disabled, keys containing numeric character references (&#x..;) are
    decoded back to their raw form, the non-printable characters are
    percent-escaped via replace_nonprintables(), and the object is moved
    to the sanitised name.

    @param args: list of command-line arguments; each must be an s3:// URI
    """
    def _unescape(text):
        ##
        # Removes HTML or XML character references and entities from a text string.
        #
        # @param text The HTML (or XML) source text.
        # @return The plain text, as a Unicode string, if necessary.
        #
        # From: http://effbot.org/zone/re-sub.htm#unescape-html
        def _unescape_fixup(m):
            text = m.group(0)
            # htmlentitydefs (Python 2) lacks 'apos'; add it so &apos; decodes
            if not htmlentitydefs.name2codepoint.has_key('apos'):
                htmlentitydefs.name2codepoint['apos'] = ord("'")
            if text[:2] == "&#":
                # character reference
                try:
                    if text[:3] == "&#x":
                        return unichr(int(text[3:-1], 16))
                    else:
                        return unichr(int(text[2:-1]))
                except ValueError:
                    pass
            else:
                # named entity
                try:
                    text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
                except KeyError:
                    pass
            return text # leave as is
        return re.sub("&#?\w+;", _unescape_fixup, text)

    # Must be set before constructing S3() so non-printables in key names
    # are percent-encoded (see urlencode_string) instead of rejected.
    cfg.urlencoding_mode = "fixbucket"
    s3 = S3(cfg)

    count = 0
    for arg in args:
        culprit = S3Uri(arg)
        if culprit.type != "s3":
            raise ParameterError("Expecting S3Uri instead of: %s" % arg)
        # Raw listing on purpose: the parsed variant would raise on the
        # very invalid XML this command exists to repair.
        response = s3.bucket_list_noparse(culprit.bucket(), culprit.object(), recursive = True)
        # Keys needing a fix carry hex character references like &#x08;
        r_xent = re.compile("&#x[\da-fA-F]+;")
        keys = re.findall("<Key>(.*?)</Key>", response['data'], re.MULTILINE)
        debug("Keys: %r" % keys)
        for key in keys:
            if r_xent.search(key):
                info("Fixing: %s" % key)
                debug("Step 1: Transforming %s" % key)
                key_bin = _unescape(key)    # entity refs -> raw control chars
                debug("Step 2: ... to %s" % key_bin)
                key_new = replace_nonprintables(key_bin)    # control chars -> %XX
                debug("Step 3: ... then to %s" % key_new)
                src = S3Uri("s3://%s/%s" % (culprit.bucket(), key_bin))
                dst = S3Uri("s3://%s/%s" % (culprit.bucket(), key_new))
                resp_move = s3.object_move(src, dst)
                if resp_move['status'] == 200:
                    output("File %r renamed to %s" % (key_bin, key_new))
                    count += 1
                else:
                    error("Something went wrong for: %r" % key)
                    error("Please report the problem to s3tools-bugs@lists.sourceforge.net")
    if count > 0:
        # object_move copies without preserving the ACL, hence the warning.
        # NOTE(review): presumably COPY resets ACL to private — confirm in S3.object_move
        warning("Fixed %d files' names. Their ACL were reset to Private." % count)
        warning("Use 's3cmd setacl --acl-public s3://...' to make")
        warning("them publicly readable if required.")

def resolve_list(lst, args):
retval = []
for item in lst:
Expand Down Expand Up @@ -1351,6 +1417,7 @@ def get_commands_list():
{"cmd":"mv", "label":"Move object", "param":"s3://BUCKET1/OBJECT1 s3://BUCKET2[/OBJECT2]", "func":cmd_mv, "argc":2},
{"cmd":"setacl", "label":"Modify Access control list for Bucket or Files", "param":"s3://BUCKET[/OBJECT]", "func":cmd_setacl, "argc":1},
{"cmd":"sign", "label":"Sign arbitrary string using the secret key", "param":"STRING-TO-SIGN", "func":cmd_sign, "argc":1},
{"cmd":"fixbucket", "label":"Fix invalid file names in a bucket", "param":"s3://BUCKET[/PREFIX]", "func":cmd_fixbucket, "argc":1},

## CloudFront commands
{"cmd":"cflist", "label":"List CloudFront distribution points", "param":"", "func":CfCmd.info, "argc":0},
Expand Down Expand Up @@ -1445,7 +1512,7 @@ def main():
optparser.add_option( "--add-header", dest="add_header", action="append", metavar="NAME:VALUE", help="Add a given HTTP header to the upload request. Can be used multiple times. For instance set 'Expires' or 'Cache-Control' headers (or both) using this options if you like.")

optparser.add_option( "--encoding", dest="encoding", metavar="ENCODING", help="Override autodetected terminal and filesystem encoding (character set). Autodetected: %s" % preferred_encoding)
optparser.add_option( "--verbatim", dest="verbatim", action="store_true", help="Use the S3 name as given on the command line. No pre-processing, encoding, etc. Use with caution!")
optparser.add_option( "--verbatim", dest="urlencoding_mode", action="store_const", const="verbatim", help="Use the S3 name as given on the command line. No pre-processing, encoding, etc. Use with caution!")

optparser.add_option( "--list-md5", dest="list_md5", action="store_true", help="Include MD5 sums in bucket listings (only for 'ls' command).")
optparser.add_option("-H", "--human-readable-sizes", dest="human_readable_sizes", action="store_true", help="Print sizes in human readable form (eg 1kB instead of 1234).")
Expand Down

0 comments on commit 738388a

Please sign in to comment.