Permalink
Browse files

Ruby 1.9 Compatibility

Completely removed the html5lib sanitizer.
Fixed the string-handling to work in both
Ruby 1.8.x and 1.9.2. There are still,
inexplicably, two functional tests that
fail. But the rest seems to work quite well.
  • Loading branch information...
distler committed Nov 30, 2009
1 parent 79c8572 commit a6429f8c222f11071c7664d6c98514a30cd09d12
Showing with 501 additions and 825 deletions.
  1. +251 −196 attic/lib/sanitize.rb
  2. +140 −138 attic/test/unit/sanitize_test.rb
  3. 0 { → attic}/vendor/plugins/HTML5lib/History.txt
  4. 0 { → attic}/vendor/plugins/HTML5lib/LICENSE
  5. 0 { → attic}/vendor/plugins/HTML5lib/Manifest.txt
  6. 0 { → attic}/vendor/plugins/HTML5lib/README
  7. 0 { → attic}/vendor/plugins/HTML5lib/Rakefile.rb
  8. 0 { → attic}/vendor/plugins/HTML5lib/bin/html5
  9. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5.rb
  10. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/cli.rb
  11. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/constants.rb
  12. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/filters/base.rb
  13. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/filters/inject_meta_charset.rb
  14. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/filters/iso639codes.rb
  15. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/filters/optionaltags.rb
  16. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/filters/rfc2046.rb
  17. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/filters/rfc3987.rb
  18. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/filters/sanitizer.rb
  19. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/filters/validator.rb
  20. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/filters/whitespace.rb
  21. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/html5parser.rb
  22. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/html5parser/after_body_phase.rb
  23. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/html5parser/after_frameset_phase.rb
  24. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/html5parser/after_head_phase.rb
  25. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/html5parser/before_head_phase.rb
  26. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/html5parser/in_body_phase.rb
  27. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/html5parser/in_caption_phase.rb
  28. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/html5parser/in_cell_phase.rb
  29. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/html5parser/in_column_group_phase.rb
  30. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/html5parser/in_frameset_phase.rb
  31. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/html5parser/in_head_phase.rb
  32. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/html5parser/in_row_phase.rb
  33. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/html5parser/in_select_phase.rb
  34. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/html5parser/in_table_body_phase.rb
  35. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/html5parser/in_table_phase.rb
  36. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/html5parser/initial_phase.rb
  37. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/html5parser/phase.rb
  38. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/html5parser/root_element_phase.rb
  39. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/html5parser/trailing_end_phase.rb
  40. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/inputstream.rb
  41. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/liberalxmlparser.rb
  42. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/sanitizer.rb
  43. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/serializer.rb
  44. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/serializer/htmlserializer.rb
  45. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/serializer/xhtmlserializer.rb
  46. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/sniffer.rb
  47. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/tokenizer.rb
  48. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/treebuilders.rb
  49. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/treebuilders/base.rb
  50. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/treebuilders/hpricot.rb
  51. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/treebuilders/rexml.rb
  52. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/treebuilders/simpletree.rb
  53. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/treewalkers.rb
  54. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/treewalkers/base.rb
  55. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/treewalkers/hpricot.rb
  56. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/treewalkers/rexml.rb
  57. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/treewalkers/simpletree.rb
  58. 0 { → attic}/vendor/plugins/HTML5lib/lib/html5/version.rb
  59. 0 { → attic}/vendor/plugins/HTML5lib/test/preamble.rb
  60. 0 { → attic}/vendor/plugins/HTML5lib/test/test_cli.rb
  61. 0 { → attic}/vendor/plugins/HTML5lib/test/test_encoding.rb
  62. 0 { → attic}/vendor/plugins/HTML5lib/test/test_input_stream.rb
  63. 0 { → attic}/vendor/plugins/HTML5lib/test/test_lxp.rb
  64. 0 { → attic}/vendor/plugins/HTML5lib/test/test_parser.rb
  65. 0 { → attic}/vendor/plugins/HTML5lib/test/test_sanitizer.rb
  66. 0 { → attic}/vendor/plugins/HTML5lib/test/test_serializer.rb
  67. 0 { → attic}/vendor/plugins/HTML5lib/test/test_sniffer.rb
  68. 0 { → attic}/vendor/plugins/HTML5lib/test/test_stream.rb
  69. 0 { → attic}/vendor/plugins/HTML5lib/test/test_tokenizer.rb
  70. 0 { → attic}/vendor/plugins/HTML5lib/test/test_treewalkers.rb
  71. 0 { → attic}/vendor/plugins/HTML5lib/test/test_validator.rb
  72. 0 { → attic}/vendor/plugins/HTML5lib/test/tokenizer_test_parser.rb
  73. 0 { → attic}/vendor/plugins/HTML5lib/test19.rb
  74. 0 { → attic}/vendor/plugins/HTML5lib/testdata/encoding/chardet/test_big5.txt
  75. 0 { → attic}/vendor/plugins/HTML5lib/testdata/encoding/test-yahoo-jp.dat
  76. 0 { → attic}/vendor/plugins/HTML5lib/testdata/encoding/tests1.dat
  77. 0 { → attic}/vendor/plugins/HTML5lib/testdata/encoding/tests2.dat
  78. 0 { → attic}/vendor/plugins/HTML5lib/testdata/sanitizer/tests1.dat
  79. 0 { → attic}/vendor/plugins/HTML5lib/testdata/serializer/core.test
  80. 0 { → attic}/vendor/plugins/HTML5lib/testdata/serializer/injectmeta.test
  81. 0 { → attic}/vendor/plugins/HTML5lib/testdata/serializer/optionaltags.test
  82. 0 { → attic}/vendor/plugins/HTML5lib/testdata/serializer/options.test
  83. 0 { → attic}/vendor/plugins/HTML5lib/testdata/serializer/whitespace.test
  84. 0 { → attic}/vendor/plugins/HTML5lib/testdata/sites/google-results.htm
  85. 0 { → attic}/vendor/plugins/HTML5lib/testdata/sites/python-ref-import.htm
  86. 0 { → attic}/vendor/plugins/HTML5lib/testdata/sites/web-apps-old.htm
  87. 0 { → attic}/vendor/plugins/HTML5lib/testdata/sites/web-apps.htm
  88. 0 { → attic}/vendor/plugins/HTML5lib/testdata/sniffer/htmlOrFeed.json
  89. 0 { → attic}/vendor/plugins/HTML5lib/testdata/tokenizer/contentModelFlags.test
  90. 0 { → attic}/vendor/plugins/HTML5lib/testdata/tokenizer/entities.test
  91. 0 { → attic}/vendor/plugins/HTML5lib/testdata/tokenizer/escapeFlag.test
  92. 0 { → attic}/vendor/plugins/HTML5lib/testdata/tokenizer/test1.test
  93. 0 { → attic}/vendor/plugins/HTML5lib/testdata/tokenizer/test2.test
  94. 0 { → attic}/vendor/plugins/HTML5lib/testdata/tokenizer/test3.test
  95. 0 { → attic}/vendor/plugins/HTML5lib/testdata/tokenizer/test4.test
  96. 0 { → attic}/vendor/plugins/HTML5lib/testdata/tree-construction/tests1.dat
  97. 0 { → attic}/vendor/plugins/HTML5lib/testdata/tree-construction/tests2.dat
  98. 0 { → attic}/vendor/plugins/HTML5lib/testdata/tree-construction/tests3.dat
  99. 0 { → attic}/vendor/plugins/HTML5lib/testdata/tree-construction/tests4.dat
  100. 0 { → attic}/vendor/plugins/HTML5lib/testdata/tree-construction/tests5.dat
  101. 0 { → attic}/vendor/plugins/HTML5lib/testdata/tree-construction/tests6.dat
  102. 0 { → attic}/vendor/plugins/HTML5lib/testdata/validator/attributes.test
  103. 0 { → attic}/vendor/plugins/HTML5lib/testdata/validator/base-href-attribute.test
  104. 0 { → attic}/vendor/plugins/HTML5lib/testdata/validator/base-target-attribute.test
  105. 0 { → attic}/vendor/plugins/HTML5lib/testdata/validator/blockquote-cite-attribute.test
  106. 0 { → attic}/vendor/plugins/HTML5lib/testdata/validator/classattribute.test
  107. 0 { → attic}/vendor/plugins/HTML5lib/testdata/validator/contenteditableattribute.test
  108. 0 { → attic}/vendor/plugins/HTML5lib/testdata/validator/contextmenuattribute.test
  109. 0 { → attic}/vendor/plugins/HTML5lib/testdata/validator/dirattribute.test
  110. 0 { → attic}/vendor/plugins/HTML5lib/testdata/validator/draggableattribute.test
  111. 0 { → attic}/vendor/plugins/HTML5lib/testdata/validator/html-xmlns-attribute.test
  112. 0 { → attic}/vendor/plugins/HTML5lib/testdata/validator/idattribute.test
  113. 0 { → attic}/vendor/plugins/HTML5lib/testdata/validator/inputattributes.test
  114. 0 { → attic}/vendor/plugins/HTML5lib/testdata/validator/irrelevantattribute.test
  115. 0 { → attic}/vendor/plugins/HTML5lib/testdata/validator/langattribute.test
  116. 0 { → attic}/vendor/plugins/HTML5lib/testdata/validator/li-value-attribute.test
  117. 0 { → attic}/vendor/plugins/HTML5lib/testdata/validator/link-href-attribute.test
  118. 0 { → attic}/vendor/plugins/HTML5lib/testdata/validator/link-hreflang-attribute.test
  119. 0 { → attic}/vendor/plugins/HTML5lib/testdata/validator/link-rel-attribute.test
  120. 0 { → attic}/vendor/plugins/HTML5lib/testdata/validator/ol-start-attribute.test
  121. 0 { → attic}/vendor/plugins/HTML5lib/testdata/validator/starttags.test
  122. 0 { → attic}/vendor/plugins/HTML5lib/testdata/validator/style-scoped-attribute.test
  123. 0 { → attic}/vendor/plugins/HTML5lib/testdata/validator/tabindexattribute.test
  124. +6 −0 lib/chunks/engines.rb
  125. +3 −3 lib/chunks/nowiki.rb
  126. +0 −262 lib/sanitize.rb
  127. +20 −1 lib/sanitizer.rb
  128. +28 −4 lib/stringsupport.rb
  129. +1 −1 test/functional/application_test.rb
  130. +1 −1 test/functional/file_controller_test.rb
  131. +1 −1 test/functional/routes_test.rb
  132. +4 −3 test/functional/wiki_controller_test.rb
  133. +1 −1 test/unit/chunks/category_test.rb
  134. +5 −5 test/unit/chunks/nowiki_test.rb
  135. +1 −1 test/unit/chunks/wiki_test.rb
  136. +3 −3 test/unit/page_renderer_test.rb
  137. +0 −189 test/unit/sanitize_test.rb
  138. +3 −3 test/unit/sanitizer_test.rb
  139. +1 −1 test/unit/uri_test.rb
  140. +1 −1 test/unit/web_test.rb
  141. +1 −1 test/unit/wiki_file_test.rb
  142. +30 −10 vendor/rails/activesupport/lib/active_support/message_verifier.rb
View

Large diffs are not rendered by default.

Oops, something went wrong.

Large diffs are not rendered by default.

Oops, something went wrong.
View
@@ -30,6 +30,7 @@ def initialize(content)
class Textile < AbstractEngine
def mask
+ @content.as_utf8
redcloth = RedCloth.new(@content, [:hard_breaks] + @content.options[:engine_opts])
redcloth.filter_html = false
redcloth.no_span_caps = false
@@ -39,6 +40,7 @@ def mask
class Markdown < AbstractEngine
def mask
+ @content.as_utf8
# If the request is for S5, call Maruku accordingly (without math)
if @content.options[:mode] == :s5
my_content = Maruku.new(@content.delete("\r").to_utf8,
@@ -56,6 +58,7 @@ def mask
class MarkdownMML < AbstractEngine
def mask
+ @content.as_utf8
# If the request is for S5, call Maruku accordingly
if @content.options[:mode] == :s5
my_content = Maruku.new(@content.delete("\r").to_utf8,
@@ -77,6 +80,7 @@ def mask
class MarkdownPNG < AbstractEngine
def mask
+ @content.as_utf8
# If the request is for S5, call Maruku accordingly
if @content.options[:mode] == :s5
my_content = Maruku.new(@content.delete("\r").to_utf8,
@@ -108,6 +112,7 @@ def mask
class Mixed < AbstractEngine
def mask
+ @content.as_utf8
redcloth = RedCloth.new(@content, @content.options[:engine_opts])
redcloth.filter_html = false
redcloth.no_span_caps = false
@@ -117,6 +122,7 @@ def mask
class RDoc < AbstractEngine
def mask
+ @content.as_utf8
html = RDocSupport::RDocFormatter.new(@content).to_html
end
end
View
@@ -1,5 +1,5 @@
require 'chunks/chunk'
-require 'sanitize'
+require 'sanitizer'
# This chunk allows certain parts of a wiki page to be hidden from the
# rest of the rendering pipeline. It should be run at the beginning
@@ -17,7 +17,7 @@
class NoWiki < Chunk::Abstract
- include Sanitize
+ include Sanitizer
NOWIKI_PATTERN = Regexp.new('<nowiki>(.*?)</nowiki>', Regexp::MULTILINE)
def self.pattern() NOWIKI_PATTERN end
@@ -26,7 +26,7 @@ def self.pattern() NOWIKI_PATTERN end
def initialize(match_data, content)
super
- @plain_text = @unmask_text = safe_sanitize_xhtml(match_data[1])
+ @plain_text = @unmask_text = safe_xhtml_sanitize(match_data[1])
end
end
View
@@ -1,262 +0,0 @@
-# == Introduction
-#
-# This module provides sanitization of XHTML+MathML+SVG
-# and of inline style attributes. Its genesis is {described here}[http://golem.ph.utexas.edu/~distler/blog/archives/001181.html].
-#
-# Uses the {HTML5lib parser}[http://code.google.com/p/html5lib/], so that the parsing behaviour should
-# resemble that of browsers.
-#
-# sanitize_xhtml() is a case-sensitive sanitizer, suitable for XHTML
-# sanitize_html() is a case-insensitive sanitizer suitable for HTML
-# sanitize_rexml() sanitizes a REXML tree, returning a string
-# safe_sanitize_xhtml() makes extra-sure that the result is well-formed XHTML
-# by running the output of sanitize_xhtml() through REXML
-#
-# == Files
-#
-# {sanitize.rb}[http://golem.ph.utexas.edu/~distler/code/instiki/svn/lib/sanitize.rb],
-# {HTML5lib}[http://golem.ph.utexas.edu/~distler/code/instiki/svn/vendor/plugins/HTML5lib/]
-#
-# == Author
-#
-# {Jacques Distler}[http://golem.ph.utexas.edu/~distler/]
-#
-# == License
-#
-# Ruby License
-
-module Sanitize
-
- require 'html5/html5parser'
- require 'html5/liberalxmlparser'
- require 'html5/treewalkers'
- require 'html5/treebuilders'
- require 'html5/serializer'
- require 'html5/sanitizer'
- require 'stringsupport.rb'
-
- include HTML5
-
-# Sanitize a string, parsed using XHTML parsing rules.
-#
-# :call-seq:
-# sanitize_xhtml(string) -> string
-# sanitize_xhtml(string, {:encoding => 'iso-8859-1', :to_tree => true}) -> REXML::Document
-#
-# Unless otherwise specified, the string is assumed to be utf-8 encoded.
-# By default, the output is a string. But, optionally, you can return a REXML tree.
-#
-# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
-# (REXML trees are always utf-8 encoded.)
- def sanitize_xhtml(html, options = {})
- @encoding = 'utf-8'
- @treebuilder = TreeBuilders::REXML::TreeBuilder
- @to_tree = false
- options.each do |name, value|
- next unless %w(encoding treebuilder to_tree).include? name.to_s
- if name.to_s == 'treebuilder'
- @treebuilder = HTML5lib::TreeBuilders.get_tree_builder(value)
- else
- instance_variable_set("@#{name}", value)
- end
- end
- if @encoding == 'utf-8'
- parsed = XHTMLParser.parse_fragment(html.to_utf8, {:tokenizer => HTMLSanitizer,
- :lowercase_element_name => false, :lowercase_attr_name => false,
- :encoding => @encoding, :tree => @treebuilder })
- else
- parsed = XHTMLParser.parse_fragment(html.to_ncr, {:tokenizer => HTMLSanitizer,
- :lowercase_element_name => false, :lowercase_attr_name => false,
- :encoding => @encoding, :tree => @treebuilder })
- end
- return parsed if @to_tree
- return parsed.to_s
- end
-
-# Sanitize a string, parsed using XHTML parsing rules. Reparse the result to
-# ensure well-formedness.
-#
-# :call-seq:
-# safe_sanitize_xhtml(string) -> string
-#
-# Unless otherwise specified, the string is assumed to be utf-8 encoded.
-#
-# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
-# (REXML trees are always utf-8 encoded.)
- def safe_sanitize_xhtml(html, options = {})
- options[:to_tree] = false
- sanitized = sanitize_xhtml(html, options)
- doc = REXML::Document.new("<div xmlns='http://www.w3.org/1999/xhtml'>#{sanitized}</div>")
- sanitized = doc.to_s.gsub(/\A<div xmlns='http:\/\/www.w3.org\/1999\/xhtml'>(.*)<\/div>\Z/m, '\1')
- rescue REXML::ParseException
- sanitized = sanitized.escapeHTML
- end
-
-# Sanitize a string, parsed using HTML parsing rules.
-#
-# :call-seq:
-# sanitize_html( string ) -> string
-# sanitize_html( string, {:encoding => 'iso-8859-1', :to_tree => true} ) -> REXML::Document
-#
-# Unless otherwise specified, the string is assumed to be utf-8 encoded.
-# By default, the output is a string. But, optionally, you can return a REXML tree.
-#
-# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
-# (REXML trees are always utf-8 encoded.)
- def sanitize_html(html, options = {})
- @encoding = 'utf-8'
- @treebuilder = TreeBuilders::REXML::TreeBuilder
- @to_tree = false
- options.each do |name, value|
- next unless %w(encoding treebuilder to_tree).include? name.to_s
- if name.to_s == 'treebuilder'
- @treebuilder = HTML5lib::TreeBuilders.get_tree_builder(value)
- else
- instance_variable_set("@#{name}", value)
- end
- end
- if @encoding == 'utf-8'
- parsed = HTMLParser.parse_fragment(html.to_utf8, {:tokenizer => HTMLSanitizer,
- :encoding => @encoding, :tree => @treebuilder })
- else
- parsed = HTMLParser.parse_fragment(html.to_ncr, {:tokenizer => HTMLSanitizer,
- :encoding => @encoding, :tree => @treebuilder })
- end
- return parsed if @to_tree
- return parsed.to_s
- end
-
-# Sanitize a REXML tree. The output is a string.
-#
-# :call-seq:
-# sanitize_rexml(tree) -> string
-#
- def sanitize_rexml(tree)
- tokens = TreeWalkers.get_tree_walker('rexml2').new(tree)
- XHTMLSerializer.serialize(tokens, {:encoding=>'utf-8',
- :space_before_trailing_solidus => true,
- :inject_meta_charset => false,
- :sanitize => true})
- end
-end
-
-require 'rexml/element'
-module REXML #:nodoc:
- class Element
-
-# Convert XHTML+MathML Named Entities in a REXML::Element to Numeric Character References
-#
-# :call-seq:
-# tree.to_ncr -> REXML::Element
-#
-# REXML, typically, converts NCRs to utf-8 characters, which is what you'll see when you
-# access the resulting REXML document.
-#
-# Note that this method needs to traverse the entire tree, converting text nodes and attributes
-# for each element. This can be SLOW. It will often be faster to serialize to a string and then
-# use String.to_ncr instead.
-#
- def to_ncr
- self.each_element { |el|
- el.texts.each_index {|i|
- el.texts[i].value = el.texts[i].to_s.to_ncr
- }
- el.attributes.each { |name,val|
- el.attributes[name] = val.to_ncr
- }
- el.to_ncr if el.has_elements?
- }
- return self
- end
-
-# Convert XHTML+MathML Named Entities in a REXML::Element to UTF-8
-#
-# :call-seq:
-# tree.to_utf8 -> REXML::Element
-#
-# Note that this method needs to traverse the entire tree, converting text nodes and attributes
-# for each element. This can be SLOW. It will often be faster to serialize to a string and then
-# use String.to_utf8 instead.
-#
- def to_utf8
- self.each_element { |el|
- el.texts.each_index {|i|
- el.texts[i].value = el.texts[i].to_s.to_utf8
- }
- el.attributes.each { |name,val|
- el.attributes[name] = val.to_utf8
- }
- el.to_utf8 if el.has_elements?
- }
- return self
- end
-
- end
-end
-
-module HTML5 #:nodoc: all
- module TreeWalkers
-
- private
-
- class << self
- def [](name)
- case name.to_s.downcase
- when 'rexml'
- require 'html5/treewalkers/rexml'
- REXML::TreeWalker
- when 'rexml2'
- REXML2::TreeWalker
- else
- raise "Unknown TreeWalker #{name}"
- end
- end
-
- alias :get_tree_walker :[]
- end
-
- module REXML2
- class TreeWalker < HTML5::TreeWalkers::NonRecursiveTreeWalker
-
- private
-
- def node_details(node)
- case node
- when ::REXML::Document
- [:DOCUMENT]
- when ::REXML::Element
- if !node.name
- [:DOCUMENT_FRAGMENT]
- else
- [:ELEMENT, node.name,
- node.attributes.map {|name,value| [name,value.to_utf8]},
- node.has_elements? || node.has_text?]
- end
- when ::REXML::Text
- [:TEXT, node.value.to_utf8]
- when ::REXML::Comment
- [:COMMENT, node.string]
- when ::REXML::DocType
- [:DOCTYPE, node.name, node.public, node.system]
- when ::REXML::XMLDecl
- [nil]
- else
- [:UNKNOWN, node.class.inspect]
- end
- end
-
- def first_child(node)
- node.children.first
- end
-
- def next_sibling(node)
- node.next_sibling
- end
-
- def parent(node)
- node.parent
- end
- end
- end
- end
-end
View
@@ -169,7 +169,7 @@ def process_attributes_for(node)
node.attributes.delete attr; next
end
if ATTR_VAL_IS_URI.include?(attr)
- val_unescaped = val.unescapeHTML.gsub(/`|[\000-\040\177\s]+|\302[\200-\240]/,'').downcase
+ val_unescaped = val.unescapeHTML.as_bytes.gsub(/`|[\000-\040\177\s]+|\302[\200-\240]/,'').downcase
if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && !ALLOWED_PROTOCOLS.include?(val_unescaped.split(':')[0])
node.attributes.delete attr; next
end
@@ -206,4 +206,23 @@ def sanitize_css(style)
clean.join(' ')
end
+
+# Sanitize a string, parsed using XHTML parsing rules. Reparse the result to
+# ensure well-formedness.
+#
+# :call-seq:
+# safe_xhtml_sanitize(string) -> string
+#
+# The string is assumed to be utf-8 encoded (the options argument is accepted but currently ignored).
+#
+# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
+# (REXML trees are always utf-8 encoded.)
+ def safe_xhtml_sanitize(html, options = {})
+ sanitized = xhtml_sanitize(html.purify)
+ doc = REXML::Document.new("<div xmlns='http://www.w3.org/1999/xhtml'>#{sanitized}</div>")
+ sanitized = doc.to_s.gsub(/\A<div xmlns='http:\/\/www.w3.org\/1999\/xhtml'>(.*)<\/div>\Z/m, '\1')
+ rescue REXML::ParseException
+ sanitized = sanitized.escapeHTML
+ end
+
end
Oops, something went wrong.

0 comments on commit a6429f8

Please sign in to comment.