From 0f46921a97677b83b106366c805063105c5e9f20 Mon Sep 17 00:00:00 2001 From: Zaid Zawaideh Date: Mon, 11 Feb 2013 14:17:32 -0500 Subject: [PATCH 1/6] added handling of invalid UTF-8 byte sequence exceptions --- lib/openid/consumer/html_parse.rb | 6 +++++- test/test_linkparse.rb | 10 +++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/lib/openid/consumer/html_parse.rb b/lib/openid/consumer/html_parse.rb index fca39456..222fc0b9 100644 --- a/lib/openid/consumer/html_parse.rb +++ b/lib/openid/consumer/html_parse.rb @@ -34,7 +34,11 @@ def OpenID.unescape_hash(h) def OpenID.parse_link_attrs(html) - stripped = html.gsub(REMOVED_RE,'') + begin + stripped = html.gsub(REMOVED_RE,'') + rescue ArgumentError + stripped = html.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '').gsub(REMOVED_RE,'') + end parser = HTMLTokenizer.new(stripped) links = [] diff --git a/test/test_linkparse.rb b/test/test_linkparse.rb index 6360d507..ef19a9c2 100644 --- a/test/test_linkparse.rb +++ b/test/test_linkparse.rb @@ -84,7 +84,8 @@ def test_linkparse assert(false, "datafile parsing error: bad header #{h}") end } - links = OpenID::parse_link_attrs(html) + + links = OpenID::parse_link_attrs(html.force_encoding('UTF-8')) found = links.dup expected = expected_links.dup @@ -97,5 +98,12 @@ def test_linkparse end } assert_equal(numtests, testnum, "Number of tests") + + # test handling of invalid UTF-8 byte sequences + html = "hello joel\255".force_encoding("UTF-8") + assert_nothing_raised do + OpenID::parse_link_attrs(html) + end + end end From a647c12316e859dfbf2a10eb812f3d1d585baddb Mon Sep 17 00:00:00 2001 From: Zaid Date: Tue, 12 Feb 2013 14:06:58 -0500 Subject: [PATCH 2/6] Update to use 1.8 style hash --- lib/openid/consumer/html_parse.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/openid/consumer/html_parse.rb b/lib/openid/consumer/html_parse.rb index 222fc0b9..559cd4fe 100644 --- a/lib/openid/consumer/html_parse.rb +++ 
b/lib/openid/consumer/html_parse.rb @@ -37,7 +37,7 @@ def OpenID.parse_link_attrs(html) begin stripped = html.gsub(REMOVED_RE,'') rescue ArgumentError - stripped = html.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '').gsub(REMOVED_RE,'') + stripped = html.encode('UTF-8', 'binary', :invalid => :replace, :undef => :replace, :replace => '').gsub(REMOVED_RE,'') end parser = HTMLTokenizer.new(stripped) From abdcf65e1e7c6cc58aded36c86db866384ae639b Mon Sep 17 00:00:00 2001 From: Zaid Date: Tue, 12 Feb 2013 14:34:48 -0500 Subject: [PATCH 3/6] fix problem in force_encoding in tests force_encoding doesn't exist in ruby 1.8. Pass string as is in 1.8 and only force_encoding if string responds to it --- test/test_linkparse.rb | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/test/test_linkparse.rb b/test/test_linkparse.rb index ef19a9c2..dc9d394a 100644 --- a/test/test_linkparse.rb +++ b/test/test_linkparse.rb @@ -84,8 +84,8 @@ def test_linkparse assert(false, "datafile parsing error: bad header #{h}") end } - - links = OpenID::parse_link_attrs(html.force_encoding('UTF-8')) + html = html.force_encoding('UTF-8') if html.respond_to? :force_encoding + links = OpenID::parse_link_attrs(html) found = links.dup expected = expected_links.dup @@ -100,7 +100,8 @@ def test_linkparse assert_equal(numtests, testnum, "Number of tests") # test handling of invalid UTF-8 byte sequences - html = "hello joel\255".force_encoding("UTF-8") + html = "hello joel\255" + html = html.force_encoding('UTF-8') if html.respond_to? 
:force_encoding assert_nothing_raised do OpenID::parse_link_attrs(html) end From 542cac428d93aed3101677a591334650b8db1f4e Mon Sep 17 00:00:00 2001 From: Zaid Zawaideh Date: Tue, 12 Feb 2013 14:54:58 -0500 Subject: [PATCH 4/6] catch Encoding::UndefinedConversionError for compatibility with JRuby 1.9 mode --- lib/openid/consumer/html_parse.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/openid/consumer/html_parse.rb b/lib/openid/consumer/html_parse.rb index 559cd4fe..2e12bb6c 100644 --- a/lib/openid/consumer/html_parse.rb +++ b/lib/openid/consumer/html_parse.rb @@ -36,7 +36,7 @@ def OpenID.unescape_hash(h) def OpenID.parse_link_attrs(html) begin stripped = html.gsub(REMOVED_RE,'') - rescue ArgumentError + rescue ArgumentError, Encoding::UndefinedConversionError stripped = html.encode('UTF-8', 'binary', :invalid => :replace, :undef => :replace, :replace => '').gsub(REMOVED_RE,'') end parser = HTMLTokenizer.new(stripped) From b1d0c38fe8dd6d64b58ce417ac20e76900807781 Mon Sep 17 00:00:00 2001 From: Zaid Zawaideh Date: Tue, 12 Feb 2013 15:23:36 -0500 Subject: [PATCH 5/6] jruby 1.9 mode still complaining about string encoding, try forcing it immediately --- test/test_linkparse.rb | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/test/test_linkparse.rb b/test/test_linkparse.rb index dc9d394a..9a504290 100644 --- a/test/test_linkparse.rb +++ b/test/test_linkparse.rb @@ -100,8 +100,11 @@ def test_linkparse assert_equal(numtests, testnum, "Number of tests") # test handling of invalid UTF-8 byte sequences - html = "hello joel\255" - html = html.force_encoding('UTF-8') if html.respond_to? :force_encoding + if "".respond_to? 
:force_encoding + html = "hello joel\255".force_encoding('UTF-8') + else + html = "hello joel\255" + end assert_nothing_raised do OpenID::parse_link_attrs(html) end From d3dca2faa653695cdaf2823ee6b9c4622a83ece3 Mon Sep 17 00:00:00 2001 From: Zaid Zawaideh Date: Tue, 12 Feb 2013 23:09:04 -0500 Subject: [PATCH 6/6] fixed issue with jruby in 1.9 mode not handling string encoding from binary properly. Now falling back to using ASCII as source --- lib/openid/consumer/html_parse.rb | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/lib/openid/consumer/html_parse.rb b/lib/openid/consumer/html_parse.rb index 2e12bb6c..e127dbef 100644 --- a/lib/openid/consumer/html_parse.rb +++ b/lib/openid/consumer/html_parse.rb @@ -36,8 +36,12 @@ def OpenID.unescape_hash(h) def OpenID.parse_link_attrs(html) begin stripped = html.gsub(REMOVED_RE,'') - rescue ArgumentError, Encoding::UndefinedConversionError - stripped = html.encode('UTF-8', 'binary', :invalid => :replace, :undef => :replace, :replace => '').gsub(REMOVED_RE,'') + rescue ArgumentError + begin + stripped = html.encode('UTF-8', 'binary', :invalid => :replace, :undef => :replace, :replace => '').gsub(REMOVED_RE,'') + rescue Encoding::UndefinedConversionError #needed for a problem in JRuby where it can't handle the conversion + stripped = html.encode('UTF-8', 'ASCII', :invalid => :replace, :undef => :replace, :replace => '').gsub(REMOVED_RE,'') + end end parser = HTMLTokenizer.new(stripped)