diff --git a/lib/openid/consumer/html_parse.rb b/lib/openid/consumer/html_parse.rb index fca39456..e127dbef 100644 --- a/lib/openid/consumer/html_parse.rb +++ b/lib/openid/consumer/html_parse.rb @@ -34,7 +34,15 @@ def OpenID.unescape_hash(h) def OpenID.parse_link_attrs(html) - stripped = html.gsub(REMOVED_RE,'') + begin + stripped = html.gsub(REMOVED_RE,'') + rescue ArgumentError + begin + stripped = html.encode('UTF-8', 'binary', :invalid => :replace, :undef => :replace, :replace => '').gsub(REMOVED_RE,'') + rescue Encoding::UndefinedConversionError #needed for a problem in JRuby where it can't handle the conversion + stripped = html.encode('UTF-8', 'ASCII', :invalid => :replace, :undef => :replace, :replace => '').gsub(REMOVED_RE,'') + end + end parser = HTMLTokenizer.new(stripped) links = [] diff --git a/test/test_linkparse.rb b/test/test_linkparse.rb index 6360d507..9a504290 100644 --- a/test/test_linkparse.rb +++ b/test/test_linkparse.rb @@ -84,6 +84,7 @@ def test_linkparse assert(false, "datafile parsing error: bad header #{h}") end } + html = html.force_encoding('UTF-8') if html.respond_to? :force_encoding links = OpenID::parse_link_attrs(html) found = links.dup @@ -97,5 +98,16 @@ def test_linkparse end } assert_equal(numtests, testnum, "Number of tests") + + # test handling of invalid UTF-8 byte sequences + if "".respond_to? :force_encoding + html = "
hello joel\255".force_encoding('UTF-8') + else + html = "hello joel\255" + end + assert_nothing_raised do + OpenID::parse_link_attrs(html) + end + end end