Permalink
Browse files

Merge pull request #15 from vzima/unicode

Resolve unicode problems
  • Loading branch information...
2 parents e660b4b + 5095fd1 commit b19e82287349c71b2a9b05d99d5bdfe807c680b7 @willnorris willnorris committed Jul 4, 2012
View
31 openid/message.py
@@ -273,11 +273,12 @@ def toPostArgs(self):
ns_key = 'openid.ns'
else:
ns_key = 'openid.ns.' + alias
- args[ns_key] = ns_uri
+ args[ns_key] = oidutil.toUnicode(ns_uri).encode('UTF-8')
for (ns_uri, ns_key), value in self.args.iteritems():
key = self.getKey(ns_uri, ns_key)
- args[key] = value.encode('UTF-8')
+ # Ensure the resulting value is a UTF-8 encoded bytestring.
+ args[key] = oidutil.toUnicode(value).encode('UTF-8')
return args
@@ -298,7 +299,7 @@ def toArgs(self):
return kvargs
def toFormMarkup(self, action_url, form_tag_attrs=None,
- submit_text="Continue"):
+ submit_text=u"Continue"):
"""Generate HTML form markup that contains the values in this
message, to be HTTP POSTed as x-www-form-urlencoded UTF-8.
@@ -324,28 +325,28 @@ def toFormMarkup(self, action_url, form_tag_attrs=None,
assert action_url is not None
- form = ElementTree.Element('form')
+ form = ElementTree.Element(u'form')
if form_tag_attrs:
for name, attr in form_tag_attrs.iteritems():
form.attrib[name] = attr
- form.attrib['action'] = action_url
- form.attrib['method'] = 'post'
- form.attrib['accept-charset'] = 'UTF-8'
- form.attrib['enctype'] = 'application/x-www-form-urlencoded'
+ form.attrib[u'action'] = oidutil.toUnicode(action_url)
+ form.attrib[u'method'] = u'post'
+ form.attrib[u'accept-charset'] = u'UTF-8'
+ form.attrib[u'enctype'] = u'application/x-www-form-urlencoded'
for name, value in self.toPostArgs().iteritems():
- attrs = {'type': 'hidden',
- 'name': name,
- 'value': value}
- form.append(ElementTree.Element('input', attrs))
+ attrs = {u'type': u'hidden',
+ u'name': oidutil.toUnicode(name),
+ u'value': oidutil.toUnicode(value)}
+ form.append(ElementTree.Element(u'input', attrs))
- submit = ElementTree.Element(
- 'input', {'type':'submit', 'value':submit_text})
+ submit = ElementTree.Element(u'input',
+ {u'type':'submit', u'value':oidutil.toUnicode(submit_text)})
form.append(submit)
- return ElementTree.tostring(form)
+ return ElementTree.tostring(form, encoding='utf-8')
def toURL(self, base_url):
"""Generate a GET URL with the parameters in this message
View
14 openid/oidutil.py
@@ -5,7 +5,7 @@
interesting.
"""
-__all__ = ['log', 'appendArgs', 'toBase64', 'fromBase64', 'autoSubmitHTML']
+__all__ = ['log', 'appendArgs', 'toBase64', 'fromBase64', 'autoSubmitHTML', 'toUnicode']
import binascii
import sys
@@ -21,6 +21,18 @@
'elementtree.ElementTree',
]
+def toUnicode(value):
+ """Returns the given argument as a unicode object.
+
+ @param value: A UTF-8 encoded string or a unicode (coercible) object
+ @type value: str or unicode
+
+ @returns: Unicode object representing the input value.
+ """
+ if isinstance(value, str):
+ return value.decode('utf-8')
+ return unicode(value)
+
def autoSubmitHTML(form, title='OpenID transaction in progress'):
return """
<html>
View
9 openid/test/data/test_discover/unicode.html
@@ -0,0 +1,9 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html>
+ <head>
+ <title param="ěščřžýáíé &raquo;">Title with param that needs decoding</title>
+ </head>
+ <body>
+ <p>This page can be properly decoded and everything will be fine</p>
+ </body>
+</html>
View
17 openid/test/data/test_discover/unicode2.html
@@ -0,0 +1,17 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html>
+ <head>
+ <title param="¿¿¿¿¿ýáíé &raquo;">Title with param which raises UnicodeError</title>
+ <meta http-equiv="X-XRDS-Location" content="http://someuser.unittest/xrds" />
+ </head>
+ <body>
+ <p>
+ weird sign Å to prevent successful decoding
+ </p>
+ <p>
+ This page can not be properly decoded so its content will be passed to HTML parser
+ encoded but title raises UnicodeError because x-xrds-location is not found on time
+ </p>
+ </body>
+</html>
+
View
17 openid/test/data/test_discover/unicode3.html
@@ -0,0 +1,17 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html>
+ <head>
+ <meta http-equiv="X-XRDS-Location" content="http://someuser.unittest/xrds" />
+ <title param="¿¿¿¿¿ýáíé &raquo;">Title with param which raises UnicodeError</title>
+ </head>
+ <body>
+ <p>
+ weird sign Å to prevent successful decoding
+ </p>
+ <p>
+ This page can not be properly decoded so its content will be passed to HTML parser
+ encoded but service will be found because x-xrds-location is found on time
+ </p>
+ </body>
+</html>
+
View
12 openid/test/oidutil.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
import unittest
import codecs
import string
@@ -53,7 +54,17 @@ def runTest(self):
def shortDescription(self):
return self.desc
+class TestUnicodeConversion(unittest.TestCase):
+ def test_toUnicode(self):
+ # Unicode objects pass through
+ self.failUnless(isinstance(oidutil.toUnicode(u'fööbär'), unicode))
+ self.assertEquals(oidutil.toUnicode(u'fööbär'), u'fööbär')
+ # UTF-8 encoded strings are decoded
+ self.failUnless(isinstance(oidutil.toUnicode('fööbär'), unicode))
+ self.assertEquals(oidutil.toUnicode('fööbär'), u'fööbär')
+ # Other encodings raise exceptions
+ self.assertRaises(UnicodeDecodeError, lambda: oidutil.toUnicode(u'fööbär'.encode('latin-1')))
class TestSymbol(unittest.TestCase):
def testCopyHash(self):
@@ -154,6 +165,7 @@ def buildAppendTests():
def pyUnitTests():
some = buildAppendTests()
some.addTest(unittest.defaultTestLoader.loadTestsFromTestCase(TestSymbol))
+ some.addTest(unittest.defaultTestLoader.loadTestsFromTestCase(TestUnicodeConversion))
return some
def test_appendArgs():
View
32 openid/test/test_discover.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
import sys
import unittest
import datadriven
@@ -248,6 +249,37 @@ def test_404(self):
self.failUnlessRaises(DiscoveryFailure,
discover.discover, self.id_url + '/404')
+ def test_unicode(self):
+ """
+ Check page with unicode and HTML entities
+ """
+ self._discover(
+ content_type='text/html;charset=utf-8',
+ data=readDataFile('unicode.html'),
+ expected_services=0)
+
+ def test_unicode_undecodable_html(self):
+ """
+ Check page with unicode and HTML entities that can not be decoded
+ """
+ data = readDataFile('unicode2.html')
+ self.failUnlessRaises(UnicodeDecodeError, data.decode, 'utf-8')
+ self._discover(content_type='text/html;charset=utf-8',
+ data=data, expected_services=0)
+
+ def test_unicode_undecodable_html2(self):
+ """
+ Check page with unicode and HTML entities that can not be decoded
+ but xrds document is found before it matters
+ """
+ self.documents[self.id_url + 'xrds'] = (
+ 'application/xrds+xml', readDataFile('yadis_idp.xml'))
+
+ data = readDataFile('unicode3.html')
+ self.failUnlessRaises(UnicodeDecodeError, data.decode, 'utf-8')
+ self._discover(content_type='text/html;charset=utf-8',
+ data=data, expected_services=1)
+
def test_noOpenID(self):
services = self._discover(content_type='text/plain',
data="junk",
View
39 openid/test/test_message.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
from openid import message
from openid import oidutil
from openid.extensions import sreg
@@ -445,7 +446,6 @@ def test_toURL(self):
def test_isOpenID1(self):
self.failUnless(self.msg.isOpenID1())
-
class OpenID2MessageTest(unittest.TestCase):
def setUp(self):
self.msg = message.Message.fromPostArgs({'openid.mode':'error',
@@ -462,6 +462,20 @@ def test_toPostArgs(self):
'xey': 'value',
})
+ def test_toPostArgs_bug_with_utf8_encoded_values(self):
+ msg = message.Message.fromPostArgs({'openid.mode':'error',
+ 'openid.error':'unit test',
+ 'openid.ns':message.OPENID2_NS
+ })
+ msg.setArg(message.BARE_NS, 'ünicöde_key', 'ünicöde_välüe')
+ self.failUnlessEqual(msg.toPostArgs(),
+ {'openid.mode':'error',
+ 'openid.error':'unit test',
+ 'openid.ns':message.OPENID2_NS,
+ 'ünicöde_key': 'ünicöde_välüe',
+ })
+
+
def test_toArgs(self):
# This method can't tolerate BARE_NS.
self.msg.delArg(message.BARE_NS, "xey")
@@ -846,6 +860,29 @@ def test_toFormMarkup(self):
self._checkForm(html, m, self.action_url,
self.form_tag_attrs, self.submit_text)
+ def test_toFormMarkup_bug_with_utf8_values(self):
+ postargs = {
+ 'openid.ns': message.OPENID2_NS,
+ 'openid.mode': 'checkid_setup',
+ 'openid.identity': 'http://bogus.example.invalid:port/',
+ 'openid.assoc_handle': 'FLUB',
+ 'openid.return_to': 'Neverland',
+ 'ünicöde_key' : 'ünicöde_välüe',
+ }
+ m = message.Message.fromPostArgs(postargs)
+ # Calling m.toFormMarkup with lxml used for ElementTree will throw
+ # a ValueError.
+ html = m.toFormMarkup(self.action_url, self.form_tag_attrs,
+ self.submit_text)
+ # Using the (c)ElementTree from stdlib will result in the UTF-8
+ # encoded strings to be converted to XML character references,
+ # "ünicöde_key" becomes "&#195;&#188;nic&#195;&#182;de_key" and
+ # "ünicöde_välüe" becomes "&#195;&#188;nic&#195;&#182;de_v&#195;&#164;l&#195;&#188;e"
+ self.failIf('&#195;&#188;nic&#195;&#182;de_key' in html,
+ 'UTF-8 bytes should not convert to XML character references')
+ self.failIf('&#195;&#188;nic&#195;&#182;de_v&#195;&#164;l&#195;&#188;e' in html,
+ 'UTF-8 bytes should not convert to XML character references')
+
def test_overrideMethod(self):
"""Be sure that caller cannot change form method to GET."""
m = message.Message.fromPostArgs(self.postargs)
View
26 openid/yadis/discover.py
@@ -1,7 +1,7 @@
# -*- test-case-name: openid.test.test_yadis_discover -*-
__all__ = ['discover', 'DiscoveryResult', 'DiscoveryFailure']
-from cStringIO import StringIO
+from StringIO import StringIO
from openid import fetchers
@@ -45,6 +45,8 @@ def __init__(self, request_uri):
def usedYadisLocation(self):
"""Was the Yadis protocol's indirection used?"""
+ if self.xrds_uri is None:
+ return False
return self.normalized_uri != self.xrds_uri
def isXRDS(self):
@@ -126,9 +128,27 @@ def whereIsYadis(resp):
# XXX: do we want to do something with content-type, like
# have a whitelist or a blacklist (for detecting that it's
# HTML)?
+
+ # Decode the body using the charset named in the Content-Type value (default: UTF-8)
+ content_type = content_type or ''
+ encoding = content_type.rsplit(';', 1)
+ if len(encoding) == 2 and encoding[1].strip().startswith('charset='):
+ encoding = encoding[1].split('=', 1)[1].strip()
+ else:
+ encoding = 'UTF-8'
+
+ try:
+ content = resp.body.decode(encoding)
+ except UnicodeError:
+ # Keep the still-encoded body: the Yadis location may be found before the undecodable bytes cause a failure.
+ # Any error that does occur is caught below.
+ content = resp.body
+
try:
- yadis_loc = findHTMLMeta(StringIO(resp.body))
- except MetaNotFound:
+ yadis_loc = findHTMLMeta(StringIO(content))
+ except (MetaNotFound, UnicodeError):
+ # UnicodeError: the response body could not be decoded and the XRDS location
+ # was not found before the undecodable content caused an error.
pass
return yadis_loc

0 comments on commit b19e822

Please sign in to comment.