diff --git a/lib/htmlpurifier/HTMLPurifier.php b/lib/htmlpurifier/HTMLPurifier.php index 914ba25ae6d63..e599e1c0c9b56 100644 --- a/lib/htmlpurifier/HTMLPurifier.php +++ b/lib/htmlpurifier/HTMLPurifier.php @@ -19,7 +19,7 @@ */ /* - HTML Purifier 4.3.0 - Standards Compliant HTML Filtering + HTML Purifier 4.4.0 - Standards Compliant HTML Filtering Copyright (C) 2006-2008 Edward Z. Yang This library is free software; you can redistribute it and/or @@ -55,10 +55,10 @@ class HTMLPurifier { /** Version of HTML Purifier */ - public $version = '4.3.0'; + public $version = '4.4.0'; /** Constant with version of HTML Purifier */ - const VERSION = '4.3.0'; + const VERSION = '4.4.0'; /** Global configuration object */ public $config; diff --git a/lib/htmlpurifier/HTMLPurifier.safe-includes.php b/lib/htmlpurifier/HTMLPurifier.safe-includes.php index a5c0d5bb80cd7..d49b196c429ea 100644 --- a/lib/htmlpurifier/HTMLPurifier.safe-includes.php +++ b/lib/htmlpurifier/HTMLPurifier.safe-includes.php @@ -67,6 +67,7 @@ require_once $__dir . '/HTMLPurifier/VarParser.php'; require_once $__dir . '/HTMLPurifier/VarParserException.php'; require_once $__dir . '/HTMLPurifier/AttrDef/CSS.php'; +require_once $__dir . '/HTMLPurifier/AttrDef/Clone.php'; require_once $__dir . '/HTMLPurifier/AttrDef/Enum.php'; require_once $__dir . '/HTMLPurifier/AttrDef/Integer.php'; require_once $__dir . '/HTMLPurifier/AttrDef/Lang.php'; @@ -84,6 +85,7 @@ require_once $__dir . '/HTMLPurifier/AttrDef/CSS/Filter.php'; require_once $__dir . '/HTMLPurifier/AttrDef/CSS/Font.php'; require_once $__dir . '/HTMLPurifier/AttrDef/CSS/FontFamily.php'; +require_once $__dir . '/HTMLPurifier/AttrDef/CSS/Ident.php'; require_once $__dir . '/HTMLPurifier/AttrDef/CSS/ImportantDecorator.php'; require_once $__dir . '/HTMLPurifier/AttrDef/CSS/Length.php'; require_once $__dir . '/HTMLPurifier/AttrDef/CSS/ListStyle.php'; @@ -124,10 +126,12 @@ require_once $__dir . '/HTMLPurifier/AttrTransform/SafeObject.php'; require_once $__dir . '/HTMLPurifier/AttrTransform/SafeParam.php'; require_once $__dir . '/HTMLPurifier/AttrTransform/ScriptRequired.php'; +require_once $__dir . '/HTMLPurifier/AttrTransform/TargetBlank.php'; require_once $__dir . '/HTMLPurifier/AttrTransform/Textarea.php'; require_once $__dir . '/HTMLPurifier/ChildDef/Chameleon.php'; require_once $__dir . '/HTMLPurifier/ChildDef/Custom.php'; require_once $__dir . '/HTMLPurifier/ChildDef/Empty.php'; +require_once $__dir . '/HTMLPurifier/ChildDef/List.php'; require_once $__dir . '/HTMLPurifier/ChildDef/Required.php'; require_once $__dir . '/HTMLPurifier/ChildDef/Optional.php'; require_once $__dir . '/HTMLPurifier/ChildDef/StrictBlockquote.php'; @@ -142,6 +146,7 @@ require_once $__dir . '/HTMLPurifier/HTMLModule/Edit.php'; require_once $__dir . '/HTMLPurifier/HTMLModule/Forms.php'; require_once $__dir . '/HTMLPurifier/HTMLModule/Hypertext.php'; +require_once $__dir . '/HTMLPurifier/HTMLModule/Iframe.php'; require_once $__dir . '/HTMLPurifier/HTMLModule/Image.php'; require_once $__dir . '/HTMLPurifier/HTMLModule/Legacy.php'; require_once $__dir . '/HTMLPurifier/HTMLModule/List.php'; @@ -158,6 +163,7 @@ require_once $__dir . '/HTMLPurifier/HTMLModule/StyleAttribute.php'; require_once $__dir . '/HTMLPurifier/HTMLModule/Tables.php'; require_once $__dir . '/HTMLPurifier/HTMLModule/Target.php'; +require_once $__dir . '/HTMLPurifier/HTMLModule/TargetBlank.php'; require_once $__dir . '/HTMLPurifier/HTMLModule/Text.php'; require_once $__dir . '/HTMLPurifier/HTMLModule/Tidy.php'; require_once $__dir . '/HTMLPurifier/HTMLModule/XMLCommonAttributes.php'; @@ -196,6 +202,7 @@ require_once $__dir . '/HTMLPurifier/URIFilter/HostBlacklist.php'; require_once $__dir . '/HTMLPurifier/URIFilter/MakeAbsolute.php'; require_once $__dir . '/HTMLPurifier/URIFilter/Munge.php'; +require_once $__dir . '/HTMLPurifier/URIFilter/SafeIframe.php'; require_once $__dir . '/HTMLPurifier/URIScheme/data.php'; require_once $__dir . '/HTMLPurifier/URIScheme/file.php'; require_once $__dir . '/HTMLPurifier/URIScheme/ftp.php'; diff --git a/lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/Ident.php b/lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/Ident.php new file mode 100644 index 0000000000000..779794a0b3d27 --- /dev/null +++ b/lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/Ident.php @@ -0,0 +1,24 @@ +clone = $clone; + } + + public function validate($v, $config, $context) { + return $this->clone->validate($v, $config, $context); + } + + public function make($string) { + return clone $this->clone; + } + +} + +// vim: et sw=4 sts=4 diff --git a/lib/htmlpurifier/HTMLPurifier/AttrDef/HTML/Color.php b/lib/htmlpurifier/HTMLPurifier/AttrDef/HTML/Color.php index d01e20454ea9f..00d865723b399 100644 --- a/lib/htmlpurifier/HTMLPurifier/AttrDef/HTML/Color.php +++ b/lib/htmlpurifier/HTMLPurifier/AttrDef/HTML/Color.php @@ -14,7 +14,7 @@ public function validate($string, $config, $context) { $string = trim($string); if (empty($string)) return false; - if (isset($colors[$string])) return $colors[$string]; + if (isset($colors[strtolower($string)])) return $colors[$string]; if ($string[0] === '#') $hex = substr($string, 1); else $hex = $string; diff --git a/lib/htmlpurifier/HTMLPurifier/AttrDef/HTML/ID.php b/lib/htmlpurifier/HTMLPurifier/AttrDef/HTML/ID.php index 81d03762dea58..0015fa1ebb641 100644 --- a/lib/htmlpurifier/HTMLPurifier/AttrDef/HTML/ID.php +++ b/lib/htmlpurifier/HTMLPurifier/AttrDef/HTML/ID.php @@ -12,12 +12,22 @@ class HTMLPurifier_AttrDef_HTML_ID extends HTMLPurifier_AttrDef { - // ref functionality disabled, since we also have to verify - // whether or not the ID it refers to exists + // selector is NOT a valid thing to use for IDREFs, because IDREFs + // *must* target IDs that exist, whereas selector #ids do not. + + /** + * Determines whether or not we're validating an ID in a CSS + * selector context. + */ + protected $selector; + + public function __construct($selector = false) { + $this->selector = $selector; + } public function validate($id, $config, $context) { - if (!$config->get('Attr.EnableID')) return false; + if (!$this->selector && !$config->get('Attr.EnableID')) return false; $id = trim($id); // trim it first @@ -33,10 +43,10 @@ public function validate($id, $config, $context) { '%Attr.IDPrefix is set', E_USER_WARNING); } - //if (!$this->ref) { + if (!$this->selector) { $id_accumulator =& $context->get('IDAccumulator'); if (isset($id_accumulator->ids[$id])) return false; - //} + } // we purposely avoid using regex, hopefully this is faster @@ -56,7 +66,7 @@ public function validate($id, $config, $context) { return false; } - if (/*!$this->ref && */$result) $id_accumulator->add($id); + if (!$this->selector && $result) $id_accumulator->add($id); // if no change was made to the ID, return the result // else, return the new id if stripping whitespace made it diff --git a/lib/htmlpurifier/HTMLPurifier/AttrDef/URI.php b/lib/htmlpurifier/HTMLPurifier/AttrDef/URI.php index 01a6d83e9526f..c2b6846712ba3 100644 --- a/lib/htmlpurifier/HTMLPurifier/AttrDef/URI.php +++ b/lib/htmlpurifier/HTMLPurifier/AttrDef/URI.php @@ -19,7 +19,7 @@ public function __construct($embeds_resource = false) { } public function make($string) { - $embeds = (bool) $string; + $embeds = ($string === 'embedded'); return new HTMLPurifier_AttrDef_URI($embeds); } diff --git a/lib/htmlpurifier/HTMLPurifier/AttrDef/URI/Host.php b/lib/htmlpurifier/HTMLPurifier/AttrDef/URI/Host.php index feca469d7030d..125decb2df4b1 100644 --- a/lib/htmlpurifier/HTMLPurifier/AttrDef/URI/Host.php +++ b/lib/htmlpurifier/HTMLPurifier/AttrDef/URI/Host.php @@ -44,9 +44,8 @@ public function validate($string, $config, $context) { // A regular domain name. - // This breaks I18N domain names, but we don't have proper IRI support, - // so force users to insert Punycode. If there's complaining we'll - // try to fix things into an international friendly form. + // This doesn't match I18N domain names, but we don't have proper IRI support, + // so force users to insert Punycode. // The productions describing this are: $a = '[a-z]'; // alpha @@ -57,10 +56,44 @@ public function validate($string, $config, $context) { // toplabel = alpha | alpha *( alphanum | "-" ) alphanum $toplabel = "$a($and*$an)?"; // hostname = *( domainlabel "." ) toplabel [ "." ] - $match = preg_match("/^($domainlabel\.)*$toplabel\.?$/i", $string); - if (!$match) return false; + if (preg_match("/^($domainlabel\.)*$toplabel\.?$/i", $string)) { + return $string; + } + + // If we have Net_IDNA2 support, we can support IRIs by + // punycoding them. (This is the most portable thing to do, + // since otherwise we have to assume browsers support + + if ($config->get('Core.EnableIDNA')) { + $idna = new Net_IDNA2(array('encoding' => 'utf8', 'overlong' => false, 'strict' => true)); + // we need to encode each period separately + $parts = explode('.', $string); + try { + $new_parts = array(); + foreach ($parts as $part) { + $encodable = false; + for ($i = 0, $c = strlen($part); $i < $c; $i++) { + if (ord($part[$i]) > 0x7a) { + $encodable = true; + break; + } + } + if (!$encodable) { + $new_parts[] = $part; + } else { + $new_parts[] = $idna->encode($part); + } + } + $string = implode('.', $new_parts); + if (preg_match("/^($domainlabel\.)*$toplabel\.?$/i", $string)) { + return $string; + } + } catch (Exception $e) { + // XXX error reporting + } + } - return $string; + return false; } } diff --git a/lib/htmlpurifier/HTMLPurifier/AttrTransform/Nofollow.php b/lib/htmlpurifier/HTMLPurifier/AttrTransform/Nofollow.php index 573b42c9c5a1b..f7fb1209b3be7 100644 --- a/lib/htmlpurifier/HTMLPurifier/AttrTransform/Nofollow.php +++ b/lib/htmlpurifier/HTMLPurifier/AttrTransform/Nofollow.php @@ -24,9 +24,13 @@ public function transform($attr, $config, $context) { $url = $this->parser->parse($attr['href']); $scheme = $url->getSchemeObj($config, $context); - if (!is_null($url->host) && $scheme !== false && $scheme->browsable) { + if ($scheme->browsable && !$url->isLocal($config, $context)) { if (isset($attr['rel'])) { - $attr['rel'] .= ' nofollow'; + $rels = explode(' ', $attr); + if (!in_array('nofollow', $rels)) { + $rels[] = 'nofollow'; + } + $attr['rel'] = implode(' ', $rels); } else { $attr['rel'] = 'nofollow'; } diff --git a/lib/htmlpurifier/HTMLPurifier/AttrTransform/TargetBlank.php b/lib/htmlpurifier/HTMLPurifier/AttrTransform/TargetBlank.php new file mode 100644 index 0000000000000..deba8b40fdf48 --- /dev/null +++ b/lib/htmlpurifier/HTMLPurifier/AttrTransform/TargetBlank.php @@ -0,0 +1,38 @@ +parser = new HTMLPurifier_URIParser(); + } + + public function transform($attr, $config, $context) { + + if (!isset($attr['href'])) { + return $attr; + } + + // XXX Kind of inefficient + $url = $this->parser->parse($attr['href']); + $scheme = $url->getSchemeObj($config, $context); + + if ($scheme->browsable && !$url->isBenign($config, $context)) { + $attr['target'] = '_blank'; + } + + return $attr; + + } + +} + +// vim: et sw=4 sts=4 diff --git a/lib/htmlpurifier/HTMLPurifier/AttrTypes.php b/lib/htmlpurifier/HTMLPurifier/AttrTypes.php index fc2ea4e5881bf..6f985ff934750 100644 --- a/lib/htmlpurifier/HTMLPurifier/AttrTypes.php +++ b/lib/htmlpurifier/HTMLPurifier/AttrTypes.php @@ -15,6 +15,13 @@ class HTMLPurifier_AttrTypes * types. */ public function __construct() { + // XXX This is kind of poor, since we don't actually /clone/ + // instances; instead, we use the supplied make() attribute. So, + // the underlying class must know how to deal with arguments. + // With the old implementation of Enum, that ignored its + // arguments when handling a make dispatch, the IAlign + // definition wouldn't work. + // pseudo-types, must be instantiated via shorthand $this->info['Enum'] = new HTMLPurifier_AttrDef_Enum(); $this->info['Bool'] = new HTMLPurifier_AttrDef_HTML_Bool(); @@ -29,6 +36,9 @@ public function __construct() { $this->info['URI'] = new HTMLPurifier_AttrDef_URI(); $this->info['LanguageCode'] = new HTMLPurifier_AttrDef_Lang(); $this->info['Color'] = new HTMLPurifier_AttrDef_HTML_Color(); + $this->info['IAlign'] = self::makeEnum('top,middle,bottom,left,right'); + $this->info['LAlign'] = self::makeEnum('top,bottom,left,right'); + $this->info['FrameTarget'] = new HTMLPurifier_AttrDef_HTML_FrameTarget(); // unimplemented aliases $this->info['ContentType'] = new HTMLPurifier_AttrDef_Text(); @@ -44,6 +54,10 @@ public function __construct() { $this->info['Number'] = new HTMLPurifier_AttrDef_Integer(false, false, true); } + private static function makeEnum($in) { + return new HTMLPurifier_AttrDef_Clone(new HTMLPurifier_AttrDef_Enum(explode(',', $in))); + } + /** * Retrieves a type * @param $type String type name diff --git a/lib/htmlpurifier/HTMLPurifier/ChildDef/List.php b/lib/htmlpurifier/HTMLPurifier/ChildDef/List.php new file mode 100644 index 0000000000000..cdaa2893a7e2d --- /dev/null +++ b/lib/htmlpurifier/HTMLPurifier/ChildDef/List.php @@ -0,0 +1,120 @@ + true, 'ul' => true, 'ol' => true); + public function validateChildren($tokens_of_children, $config, $context) { + // Flag for subclasses + $this->whitespace = false; + + // if there are no tokens, delete parent node + if (empty($tokens_of_children)) return false; + + // the new set of children + $result = array(); + + // current depth into the nest + $nesting = 0; + + // a little sanity check to make sure it's not ALL whitespace + $all_whitespace = true; + + $seen_li = false; + $need_close_li = false; + + foreach ($tokens_of_children as $token) { + if (!empty($token->is_whitespace)) { + $result[] = $token; + continue; + } + $all_whitespace = false; // phew, we're not talking about whitespace + + if ($nesting == 1 && $need_close_li) { + $result[] = new HTMLPurifier_Token_End('li'); + $nesting--; + $need_close_li = false; + } + + $is_child = ($nesting == 0); + + if ($token instanceof HTMLPurifier_Token_Start) { + $nesting++; + } elseif ($token instanceof HTMLPurifier_Token_End) { + $nesting--; + } + + if ($is_child) { + if ($token->name === 'li') { + // good + $seen_li = true; + } elseif ($token->name === 'ul' || $token->name === 'ol') { + // we want to tuck this into the previous li + $need_close_li = true; + $nesting++; + if (!$seen_li) { + // create a new li element + $result[] = new HTMLPurifier_Token_Start('li'); + } else { + // backtrack until found + while(true) { + $t = array_pop($result); + if ($t instanceof HTMLPurifier_Token_End) { + // XXX actually, these invariants could very plausibly be violated + // if we are doing silly things with modifying the set of allowed elements. + // FORTUNATELY, it doesn't make a difference, since the allowed + // elements are hard-coded here! + if ($t->name !== 'li') { + trigger_error("Only li present invariant violated in List ChildDef", E_USER_ERROR); + return false; + } + break; + } elseif ($t instanceof HTMLPurifier_Token_Empty) { // bleagh + if ($t->name !== 'li') { + trigger_error("Only li present invariant violated in List ChildDef", E_USER_ERROR); + return false; + } + // XXX this should have a helper for it... + $result[] = new HTMLPurifier_Token_Start('li', $t->attr, $t->line, $t->col, $t->armor); + break; + } else { + if (!$t->is_whitespace) { + trigger_error("Only whitespace present invariant violated in List ChildDef", E_USER_ERROR); + return false; + } + } + } + } + } else { + // start wrapping (this doesn't precisely mimic + // browser behavior, but what browsers do is kind of + // hard to mimic in a standards compliant way + // XXX Actually, this has no impact in practice, + // because this gets handled earlier. Arguably, + // we should rip out all of that processing + $result[] = new HTMLPurifier_Token_Start('li'); + $nesting++; + $seen_li = true; + $need_close_li = true; + } + } + $result[] = $token; + } + if ($need_close_li) { + $result[] = new HTMLPurifier_Token_End('li'); + } + if (empty($result)) return false; + if ($all_whitespace) { + return false; + } + if ($tokens_of_children == $result) return true; + return $result; + } +} + +// vim: et sw=4 sts=4 diff --git a/lib/htmlpurifier/HTMLPurifier/ChildDef/Table.php b/lib/htmlpurifier/HTMLPurifier/ChildDef/Table.php index 34f0227dd2cc5..9a93421a1a3e2 100644 --- a/lib/htmlpurifier/HTMLPurifier/ChildDef/Table.php +++ b/lib/htmlpurifier/HTMLPurifier/ChildDef/Table.php @@ -1,7 +1,33 @@ s with a . foreach ($tokens_of_children as $token) { $is_child = ($nesting == 0); @@ -51,8 +79,9 @@ public function validateChildren($tokens_of_children, $config, $context) { // okay, let's stash the tokens away // first token tells us the type of the collection switch ($collection[$tag_index]->name) { - case 'tr': case 'tbody': + $tbody_mode = true; + case 'tr': $content[] = $collection; break; case 'caption': @@ -61,13 +90,28 @@ public function validateChildren($tokens_of_children, $config, $context) { break; case 'thead': case 'tfoot': + $tbody_mode = true; + // XXX This breaks rendering properties with + // Firefox, which never floats a to + // the top. Ever. (Our scheme will float the + // first to the top.) So maybe + // s that are not first should be + // turned into ? Very tricky, indeed. + // access the appropriate variable, $thead or $tfoot $var = $collection[$tag_index]->name; if ($$var === false) { $$var = $collection; } else { - // transmutate the first and less entries into - // tbody tags, and then put into content + // Oops, there's a second one! What + // should we do? Current behavior is to + // transmutate the first and last entries into + // tbody tags, and then put into content. + // Maybe a better idea is to *attach + // it* to the existing thead or tfoot? + // We don't do this, because Firefox + // doesn't float an extra tfoot to the + // bottom like it does for the first one. $collection[$tag_index]->name = 'tbody'; $collection[count($collection)-1]->name = 'tbody'; $content[] = $collection; @@ -126,7 +170,48 @@ public function validateChildren($tokens_of_children, $config, $context) { if ($cols !== false) foreach ($cols as $token_array) $ret = array_merge($ret, $token_array); if ($thead !== false) $ret = array_merge($ret, $thead); if ($tfoot !== false) $ret = array_merge($ret, $tfoot); - foreach ($content as $token_array) $ret = array_merge($ret, $token_array); + + if ($tbody_mode) { + // a little tricky, since the start of the collection may be + // whitespace + $inside_tbody = false; + foreach ($content as $token_array) { + // find the starting token + foreach ($token_array as $t) { + if ($t->name === 'tr' || $t->name === 'tbody') { + break; + } + } // iterator variable carries over + if ($t->name === 'tr') { + if ($inside_tbody) { + $ret = array_merge($ret, $token_array); + } else { + $ret[] = new HTMLPurifier_Token_Start('tbody'); + $ret = array_merge($ret, $token_array); + $inside_tbody = true; + } + } elseif ($t->name === 'tbody') { + if ($inside_tbody) { + $ret[] = new HTMLPurifier_Token_End('tbody'); + $inside_tbody = false; + $ret = array_merge($ret, $token_array); + } else { + $ret = array_merge($ret, $token_array); + } + } else { + trigger_error("tr/tbody in content invariant failed in Table ChildDef", E_USER_ERROR); + } + } + if ($inside_tbody) { + $ret[] = new HTMLPurifier_Token_End('tbody'); + } + } else { + foreach ($content as $token_array) { + // invariant: everything in here is s + $ret = array_merge($ret, $token_array); + } + } + if (!empty($collection) && $is_collecting == false){ // grab the trailing space $ret = array_merge($ret, $collection); diff --git a/lib/htmlpurifier/HTMLPurifier/Config.php b/lib/htmlpurifier/HTMLPurifier/Config.php index b6551398f8440..554980f2225a0 100644 --- a/lib/htmlpurifier/HTMLPurifier/Config.php +++ b/lib/htmlpurifier/HTMLPurifier/Config.php @@ -20,7 +20,7 @@ class HTMLPurifier_Config /** * HTML Purifier's version */ - public $version = '4.3.0'; + public $version = '4.4.0'; /** * Bool indicator whether or not to automatically finalize @@ -44,7 +44,7 @@ class HTMLPurifier_Config /** * Parser for variables */ - protected $parser; + protected $parser = null; /** * Reference HTMLPurifier_ConfigSchema for value checking @@ -668,7 +668,7 @@ public function autoFinalize() { */ public function finalize() { $this->finalized = true; - unset($this->parser); + $this->parser = null; } /** diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema.ser b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema.ser index 245ba5d2d09c7..b106bcf798a89 100644 Binary files a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema.ser and b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema.ser differ diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.ColorKeywords.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.ColorKeywords.txt index 08b381d34c1ae..c572c14ec1779 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.ColorKeywords.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.ColorKeywords.txt @@ -24,5 +24,6 @@ array ( --DESCRIPTION-- Lookup array of color names to six digit hexadecimal number corresponding -to color, with preceding hash mark. Used when parsing colors. +to color, with preceding hash mark. Used when parsing colors. The lookup +is done in a case-insensitive manner. --# vim: et sw=4 sts=4 diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.EnableIDNA.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.EnableIDNA.txt new file mode 100644 index 0000000000000..ce243c35dc051 --- /dev/null +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.EnableIDNA.txt @@ -0,0 +1,9 @@ +Core.EnableIDNA +TYPE: bool +DEFAULT: false +VERSION: 4.4.0 +--DESCRIPTION-- +Allows international domain names in URLs. This configuration option +requires the PEAR Net_IDNA2 module to be installed. It operates by +punycoding any internationalized host names for maximum portability. +--# vim: et sw=4 sts=4 diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.AllowedComments.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.AllowedComments.txt new file mode 100644 index 0000000000000..140e21423e591 --- /dev/null +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.AllowedComments.txt @@ -0,0 +1,10 @@ +HTML.AllowedComments +TYPE: lookup +VERSION: 4.4.0 +DEFAULT: array() +--DESCRIPTION-- +A whitelist which indicates what explicit comment bodies should be +allowed, modulo leading and trailing whitespace. See also %HTML.AllowedCommentsRegexp +(these directives are union'ed together, so a comment is considered +valid if any directive deems it valid.) +--# vim: et sw=4 sts=4 diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.AllowedCommentsRegexp.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.AllowedCommentsRegexp.txt new file mode 100644 index 0000000000000..f22e977d4384b --- /dev/null +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.AllowedCommentsRegexp.txt @@ -0,0 +1,15 @@ +HTML.AllowedCommentsRegexp +TYPE: string/null +VERSION: 4.4.0 +DEFAULT: NULL +--DESCRIPTION-- +A regexp, which if it matches the body of a comment, indicates that +it should be allowed. Trailing and leading spaces are removed prior +to running this regular expression. +Warning: Make sure you specify +correct anchor metacharacters ^regex$, otherwise you may accept +comments that you did not mean to! In particular, the regex /foo|bar/ +is probably not sufficiently strict, since it also allows foobar. +See also %HTML.AllowedComments (these directives are union'ed together, +so a comment is considered valid if any directive deems it valid.) +--# vim: et sw=4 sts=4 diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.SafeIframe.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.SafeIframe.txt new file mode 100644 index 0000000000000..5eb6ec2b5a001 --- /dev/null +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.SafeIframe.txt @@ -0,0 +1,13 @@ +HTML.SafeIframe +TYPE: bool +VERSION: 4.4.0 +DEFAULT: false +--DESCRIPTION-- +

+ Whether or not to permit iframe tags in untrusted documents. This + directive must be accompanied by a whitelist of permitted iframes, + such as %URI.SafeIframeRegexp, otherwise it will fatally error. + This directive has no effect on strict doctypes, as iframes are not + valid. +

+--# vim: et sw=4 sts=4 diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.TargetBlank.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.TargetBlank.txt new file mode 100644 index 0000000000000..587a16778b252 --- /dev/null +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.TargetBlank.txt @@ -0,0 +1,8 @@ +HTML.TargetBlank +TYPE: bool +VERSION: 4.4.0 +DEFAULT: FALSE +--DESCRIPTION-- +If enabled, target=blank attributes are added to all outgoing links. +(This includes links from an HTTPS version of a page to an HTTP version.) +--# vim: et sw=4 sts=4 diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.SafeIframeRegexp.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.SafeIframeRegexp.txt new file mode 100644 index 0000000000000..79084832be397 --- /dev/null +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.SafeIframeRegexp.txt @@ -0,0 +1,22 @@ +URI.SafeIframeRegexp +TYPE: string/null +VERSION: 4.4.0 +DEFAULT: NULL +--DESCRIPTION-- +

+ A PCRE regular expression that will be matched against an iframe URI. This is + a relatively inflexible scheme, but works well enough for the most common + use-case of iframes: embedded video. This directive only has an effect if + %HTML.SafeIframe is enabled. Here are some example values: +

+ +

+ Note that this directive does not give you enough granularity to, say, disable + all autoplay videos. Pipe up on the HTML Purifier forums if this + is a capability you want. +

+--# vim: et sw=4 sts=4 diff --git a/lib/htmlpurifier/HTMLPurifier/Encoder.php b/lib/htmlpurifier/HTMLPurifier/Encoder.php index 2b3140caaf575..9fa76bd18c84b 100644 --- a/lib/htmlpurifier/HTMLPurifier/Encoder.php +++ b/lib/htmlpurifier/HTMLPurifier/Encoder.php @@ -19,6 +19,68 @@ private function __construct() { */ public static function muteErrorHandler() {} + /** + * iconv wrapper which mutes errors, but doesn't work around bugs. + */ + public static function unsafeIconv($in, $out, $text) { + set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler')); + $r = iconv($in, $out, $text); + restore_error_handler(); + return $r; + } + + /** + * iconv wrapper which mutes errors and works around bugs. + */ + public static function iconv($in, $out, $text, $max_chunk_size = 8000) { + $code = self::testIconvTruncateBug(); + if ($code == self::ICONV_OK) { + return self::unsafeIconv($in, $out, $text); + } elseif ($code == self::ICONV_TRUNCATES) { + // we can only work around this if the input character set + // is utf-8 + if ($in == 'utf-8') { + if ($max_chunk_size < 4) { + trigger_error('max_chunk_size is too small', E_USER_WARNING); + return false; + } + // split into 8000 byte chunks, but be careful to handle + // multibyte boundaries properly + if (($c = strlen($text)) <= $max_chunk_size) { + return self::unsafeIconv($in, $out, $text); + } + $r = ''; + $i = 0; + while (true) { + if ($i + $max_chunk_size >= $c) { + $r .= self::unsafeIconv($in, $out, substr($text, $i)); + break; + } + // wibble the boundary + if (0x80 != (0xC0 & ord($text[$i + $max_chunk_size]))) { + $chunk_size = $max_chunk_size; + } elseif (0x80 != (0xC0 & ord($text[$i + $max_chunk_size - 1]))) { + $chunk_size = $max_chunk_size - 1; + } elseif (0x80 != (0xC0 & ord($text[$i + $max_chunk_size - 2]))) { + $chunk_size = $max_chunk_size - 2; + } elseif (0x80 != (0xC0 & ord($text[$i + $max_chunk_size - 3]))) { + $chunk_size = $max_chunk_size - 3; + } else { + return false; // rather confusing UTF-8... + } + $chunk = substr($text, $i, $chunk_size); // substr doesn't mind overlong lengths + $r .= self::unsafeIconv($in, $out, $chunk); + $i += $chunk_size; + } + return $r; + } else { + return false; + } + } else { + return false; + } + } + /** * Cleans a UTF-8 string for well-formedness and SGML validity * @@ -260,6 +322,14 @@ public static function unichr($code) { return $ret; } + public static function iconvAvailable() { + static $iconv = null; + if ($iconv === null) { + $iconv = function_exists('iconv') && self::testIconvTruncateBug() != self::ICONV_UNUSABLE; + } + return $iconv; + } + /** * Converts a string to UTF-8 based on configuration. */ @@ -267,25 +337,22 @@ public static function convertToUTF8($str, $config, $context) { $encoding = $config->get('Core.Encoding'); if ($encoding === 'utf-8') return $str; static $iconv = null; - if ($iconv === null) $iconv = function_exists('iconv'); - set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler')); + if ($iconv === null) $iconv = self::iconvAvailable(); if ($iconv && !$config->get('Test.ForceNoIconv')) { - $str = iconv($encoding, 'utf-8//IGNORE', $str); + // unaffected by bugs, since UTF-8 support all characters + $str = self::unsafeIconv($encoding, 'utf-8//IGNORE', $str); if ($str === false) { // $encoding is not a valid encoding - restore_error_handler(); trigger_error('Invalid encoding ' . $encoding, E_USER_ERROR); return ''; } // If the string is bjorked by Shift_JIS or a similar encoding // that doesn't support all of ASCII, convert the naughty // characters to their true byte-wise ASCII/UTF-8 equivalents. - $str = strtr($str, HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding)); - restore_error_handler(); + $str = strtr($str, self::testEncodingSupportsASCII($encoding)); return $str; } elseif ($encoding === 'iso-8859-1') { $str = utf8_encode($str); - restore_error_handler(); return $str; } trigger_error('Encoding not supported, please install iconv', E_USER_ERROR); @@ -298,16 +365,15 @@ public static function convertToUTF8($str, $config, $context) { */ public static function convertFromUTF8($str, $config, $context) { $encoding = $config->get('Core.Encoding'); - if ($encoding === 'utf-8') return $str; - static $iconv = null; - if ($iconv === null) $iconv = function_exists('iconv'); if ($escape = $config->get('Core.EscapeNonASCIICharacters')) { - $str = HTMLPurifier_Encoder::convertToASCIIDumbLossless($str); + $str = self::convertToASCIIDumbLossless($str); } - set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler')); + if ($encoding === 'utf-8') return $str; + static $iconv = null; + if ($iconv === null) $iconv = self::iconvAvailable(); if ($iconv && !$config->get('Test.ForceNoIconv')) { // Undo our previous fix in convertToUTF8, otherwise iconv will barf - $ascii_fix = HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding); + $ascii_fix = self::testEncodingSupportsASCII($encoding); if (!$escape && !empty($ascii_fix)) { $clear_fix = array(); foreach ($ascii_fix as $utf8 => $native) $clear_fix[$utf8] = ''; @@ -315,15 +381,17 @@ public static function convertFromUTF8($str, $config, $context) { } $str = strtr($str, array_flip($ascii_fix)); // Normal stuff - $str = iconv('utf-8', $encoding . '//IGNORE', $str); - restore_error_handler(); + $str = self::iconv('utf-8', $encoding . '//IGNORE', $str); return $str; } elseif ($encoding === 'iso-8859-1') { $str = utf8_decode($str); - restore_error_handler(); return $str; } trigger_error('Encoding not supported', E_USER_ERROR); + // You might be tempted to assume that the ASCII representation + // might be OK, however, this is *not* universally true over all + // encodings. So we take the conservative route here, rather + // than forcibly turn on %Core.EscapeNonASCIICharacters } /** @@ -373,6 +441,49 @@ public static function convertToASCIIDumbLossless($str) { return $result; } + /** No bugs detected in iconv. */ + const ICONV_OK = 0; + + /** Iconv truncates output if converting from UTF-8 to another + * character set with //IGNORE, and a non-encodable character is found */ + const ICONV_TRUNCATES = 1; + + /** Iconv does not support //IGNORE, making it unusable for + * transcoding purposes */ + const ICONV_UNUSABLE = 2; + + /** + * glibc iconv has a known bug where it doesn't handle the magic + * //IGNORE stanza correctly. In particular, rather than ignore + * characters, it will return an EILSEQ after consuming some number + * of characters, and expect you to restart iconv as if it were + * an E2BIG. Old versions of PHP did not respect the errno, and + * returned the fragment, so as a result you would see iconv + * mysteriously truncating output. We can work around this by + * manually chopping our input into segments of about 8000 + * characters, as long as PHP ignores the error code. If PHP starts + * paying attention to the error code, iconv becomes unusable. + * + * @returns Error code indicating severity of bug. + */ + public static function testIconvTruncateBug() { + static $code = null; + if ($code === null) { + // better not use iconv, otherwise infinite loop! + $r = self::unsafeIconv('utf-8', 'ascii//IGNORE', "\xCE\xB1" . str_repeat('a', 9000)); + if ($r === false) { + $code = self::ICONV_UNUSABLE; + } elseif (($c = strlen($r)) < 9000) { + $code = self::ICONV_TRUNCATES; + } elseif ($c > 9000) { + trigger_error('Your copy of iconv is extremely buggy. Please notify HTML Purifier maintainers: include your iconv version as per phpversion()', E_USER_ERROR); + } else { + $code = self::ICONV_OK; + } + } + return $code; + } + /** * This expensive function tests whether or not a given character * encoding supports ASCII. 7/8-bit encodings like Shift_JIS will @@ -385,6 +496,11 @@ public static function convertToASCIIDumbLossless($str) { * which can be used to "undo" any overzealous iconv action. */ public static function testEncodingSupportsASCII($encoding, $bypass = false) { + // All calls to iconv here are unsafe, proof by case analysis: + // If ICONV_OK, no difference. + // If ICONV_TRUNCATE, all calls involve one character inputs, + // so bug is not triggered. + // If ICONV_UNUSABLE, this call is irrelevant static $encodings = array(); if (!$bypass) { if (isset($encodings[$encoding])) return $encodings[$encoding]; @@ -398,24 +514,22 @@ public static function testEncodingSupportsASCII($encoding, $bypass = false) { if (strpos($lenc, 'iso-8859-') === 0) return array(); } $ret = array(); - set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler')); - if (iconv('UTF-8', $encoding, 'a') === false) return false; + if (self::unsafeIconv('UTF-8', $encoding, 'a') === false) return false; for ($i = 0x20; $i <= 0x7E; $i++) { // all printable ASCII chars $c = chr($i); // UTF-8 char - $r = iconv('UTF-8', "$encoding//IGNORE", $c); // initial conversion + $r = self::unsafeIconv('UTF-8', "$encoding//IGNORE", $c); // initial conversion if ( $r === '' || // This line is needed for iconv implementations that do not // omit characters that do not exist in the target character set - ($r === $c && iconv($encoding, 'UTF-8//IGNORE', $r) !== $c) + ($r === $c && self::unsafeIconv($encoding, 'UTF-8//IGNORE', $r) !== $c) ) { // Reverse engineer: what's the UTF-8 equiv of this byte // sequence? This assumes that there's no variable width // encoding that doesn't support ASCII. - $ret[iconv($encoding, 'UTF-8//IGNORE', $c)] = $c; + $ret[self::unsafeIconv($encoding, 'UTF-8//IGNORE', $c)] = $c; } } - restore_error_handler(); $encodings[$encoding] = $ret; return $ret; } diff --git a/lib/htmlpurifier/HTMLPurifier/Filter/ExtractStyleBlocks.php b/lib/htmlpurifier/HTMLPurifier/Filter/ExtractStyleBlocks.php index bbf78a6630a6b..320aa4f16e0af 100644 --- a/lib/htmlpurifier/HTMLPurifier/Filter/ExtractStyleBlocks.php +++ b/lib/htmlpurifier/HTMLPurifier/Filter/ExtractStyleBlocks.php @@ -1,5 +1,11 @@ blocks from input HTML, cleans them up * using CSSTidy, and then places them in $purifier->context->get('StyleBlocks') @@ -21,8 +27,15 @@ class HTMLPurifier_Filter_ExtractStyleBlocks extends HTMLPurifier_Filter private $_styleMatches = array(); private $_tidy; + private $_id_attrdef; + private $_class_attrdef; + private $_enum_attrdef; + public function __construct() { $this->_tidy = new csstidy(); + $this->_id_attrdef = new HTMLPurifier_AttrDef_HTML_ID(true); + $this->_class_attrdef = new HTMLPurifier_AttrDef_CSS_Ident(); + $this->_enum_attrdef = new HTMLPurifier_AttrDef_Enum(array('first-child', 'link', 'visited', 'active', 'hover', 'focus')); } /** @@ -77,27 +90,166 @@ public function cleanCSS($css, $config, $context) { $css = substr($css, 0, -3); } $css = trim($css); + set_error_handler('htmlpurifier_filter_extractstyleblocks_muteerrorhandler'); $this->_tidy->parse($css); + restore_error_handler(); $css_definition = $config->getDefinition('CSS'); + $html_definition = $config->getDefinition('HTML'); + $new_css = array(); foreach ($this->_tidy->css as $k => $decls) { // $decls are all CSS declarations inside an @ selector $new_decls = array(); foreach ($decls as $selector => $style) { $selector = trim($selector); if ($selector === '') continue; // should not happen - if ($selector[0] === '+') { - if ($selector !== '' && $selector[0] === '+') continue; - } - if (!empty($scopes)) { - $new_selector = array(); // because multiple ones are possible - $selectors = array_map('trim', explode(',', $selector)); - foreach ($scopes as $s1) { - foreach ($selectors as $s2) { - $new_selector[] = "$s1 $s2"; + // Parse the selector + // Here is the relevant part of the CSS grammar: + // + // ruleset + // : selector [ ',' S* selector ]* '{' ... + // selector + // : simple_selector [ combinator selector | S+ [ combinator? selector ]? ]? + // combinator + // : '+' S* + // : '>' S* + // simple_selector + // : element_name [ HASH | class | attrib | pseudo ]* + // | [ HASH | class | attrib | pseudo ]+ + // element_name + // : IDENT | '*' + // ; + // class + // : '.' IDENT + // ; + // attrib + // : '[' S* IDENT S* [ [ '=' | INCLUDES | DASHMATCH ] S* + // [ IDENT | STRING ] S* ]? ']' + // ; + // pseudo + // : ':' [ IDENT | FUNCTION S* [IDENT S*]? ')' ] + // ; + // + // For reference, here are the relevant tokens: + // + // HASH #{name} + // IDENT {ident} + // INCLUDES == + // DASHMATCH |= + // STRING {string} + // FUNCTION {ident}\( + // + // And the lexical scanner tokens + // + // name {nmchar}+ + // nmchar [_a-z0-9-]|{nonascii}|{escape} + // nonascii [\240-\377] + // escape {unicode}|\\[^\r\n\f0-9a-f] + // unicode \\{h}}{1,6}(\r\n|[ \t\r\n\f])? + // ident -?{nmstart}{nmchar*} + // nmstart [_a-z]|{nonascii}|{escape} + // string {string1}|{string2} + // string1 \"([^\n\r\f\\"]|\\{nl}|{escape})*\" + // string2 \'([^\n\r\f\\"]|\\{nl}|{escape})*\' + // + // We'll implement a subset (in order to reduce attack + // surface); in particular: + // + // - No Unicode support + // - No escapes support + // - No string support (by proxy no attrib support) + // - element_name is matched against allowed + // elements (some people might find this + // annoying...) + // - Pseudo-elements one of :first-child, :link, + // :visited, :active, :hover, :focus + + // handle ruleset + $selectors = array_map('trim', explode(',', $selector)); + $new_selectors = array(); + foreach ($selectors as $sel) { + // split on +, > and spaces + $basic_selectors = preg_split('/\s*([+> ])\s*/', $sel, -1, PREG_SPLIT_DELIM_CAPTURE); + // even indices are chunks, odd indices are + // delimiters + $nsel = null; + $delim = null; // guaranteed to be non-null after + // two loop iterations + for ($i = 0, $c = count($basic_selectors); $i < $c; $i++) { + $x = $basic_selectors[$i]; + if ($i % 2) { + // delimiter + if ($x === ' ') { + $delim = ' '; + } else { + $delim = ' ' . $x . ' '; + } + } else { + // simple selector + $components = preg_split('/([#.:])/', $x, -1, PREG_SPLIT_DELIM_CAPTURE); + $sdelim = null; + $nx = null; + for ($j = 0, $cc = count($components); $j < $cc; $j ++) { + $y = $components[$j]; + if ($j === 0) { + if ($y === '*' || isset($html_definition->info[$y = strtolower($y)])) { + $nx = $y; + } else { + // $nx stays null; this matters + // if we don't manage to find + // any valid selector content, + // in which case we ignore the + // outer $delim + } + } elseif ($j % 2) { + // set delimiter + $sdelim = $y; + } else { + $attrdef = null; + if ($sdelim === '#') { + $attrdef = $this->_id_attrdef; + } elseif ($sdelim === '.') { + $attrdef = $this->_class_attrdef; + } elseif ($sdelim === ':') { + $attrdef = $this->_enum_attrdef; + } else { + throw new HTMLPurifier_Exception('broken invariant sdelim and preg_split'); + } + $r = $attrdef->validate($y, $config, $context); + if ($r !== false) { + if ($r !== true) { + $y = $r; + } + if ($nx === null) { + $nx = ''; + } + $nx .= $sdelim . $y; + } + } + } + if ($nx !== null) { + if ($nsel === null) { + $nsel = $nx; + } else { + $nsel .= $delim . $nx; + } + } else { + // delimiters to the left of invalid + // basic selector ignored + } + } + } + if ($nsel !== null) { + if (!empty($scopes)) { + foreach ($scopes as $s) { + $new_selectors[] = "$s $nsel"; + } + } else { + $new_selectors[] = $nsel; } } - $selector = implode(', ', $new_selector); // now it's a string } + if (empty($new_selectors)) continue; + $selector = implode(', ', $new_selectors); foreach ($style as $name => $value) { if (!isset($css_definition->info[$name])) { unset($style[$name]); @@ -110,10 +262,11 @@ public function cleanCSS($css, $config, $context) { } $new_decls[$selector] = $style; } - $this->_tidy->css[$k] = $new_decls; + $new_css[$k] = $new_decls; } // remove stuff that shouldn't be used, could be reenabled // after security risks are analyzed + $this->_tidy->css = $new_css; $this->_tidy->import = array(); $this->_tidy->charset = null; $this->_tidy->namespace = null; diff --git a/lib/htmlpurifier/HTMLPurifier/HTMLDefinition.php b/lib/htmlpurifier/HTMLPurifier/HTMLDefinition.php index 33bb38ac5f43c..b079d44c1349b 100644 --- a/lib/htmlpurifier/HTMLPurifier/HTMLDefinition.php +++ b/lib/htmlpurifier/HTMLPurifier/HTMLDefinition.php @@ -147,7 +147,7 @@ public function getAnonymousModule() { return $this->_anonModule; } - private $_anonModule; + private $_anonModule = null; // PUBLIC BUT INTERNAL VARIABLES -------------------------------------- diff --git a/lib/htmlpurifier/HTMLPurifier/HTMLModule/Forms.php b/lib/htmlpurifier/HTMLPurifier/HTMLModule/Forms.php index 44c22f6f8b532..b963529a77a42 100644 --- a/lib/htmlpurifier/HTMLPurifier/HTMLModule/Forms.php +++ b/lib/htmlpurifier/HTMLPurifier/HTMLModule/Forms.php @@ -35,7 +35,7 @@ public function setup($config) { 'name' => 'CDATA', 'readonly' => 'Bool#readonly', 'size' => 'Number', - 'src' => 'URI#embeds', + 'src' => 'URI#embedded', 'tabindex' => 'Number', 'type' => 'Enum#text,password,checkbox,button,radio,submit,reset,file,hidden,image', 'value' => 'CDATA', @@ -84,7 +84,8 @@ public function setup($config) { $button->excludes = $this->makeLookup( 'form', 'fieldset', // Form 'input', 'select', 'textarea', 'label', 'button', // Formctrl - 'a' // as per HTML 4.01 spec, this is omitted by modularization + 'a', // as per HTML 4.01 spec, this is omitted by modularization + 'isindex', 'iframe' // legacy items ); // Extra exclusion: img usemap="" is not permitted within this element. diff --git a/lib/htmlpurifier/HTMLPurifier/HTMLModule/Iframe.php b/lib/htmlpurifier/HTMLPurifier/HTMLModule/Iframe.php new file mode 100644 index 0000000000000..287071edf2293 --- /dev/null +++ b/lib/htmlpurifier/HTMLPurifier/HTMLModule/Iframe.php @@ -0,0 +1,38 @@ +get('HTML.SafeIframe')) { + $this->safe = true; + } + $this->addElement( + 'iframe', 'Inline', 'Flow', 'Common', + array( + 'src' => 'URI#embedded', + 'width' => 'Length', + 'height' => 'Length', + 'name' => 'ID', + 'scrolling' => 'Enum#yes,no,auto', + 'frameborder' => 'Enum#0,1', + 'longdesc' => 'URI', + 'marginheight' => 'Pixels', + 'marginwidth' => 'Pixels', + ) + ); + } + +} + +// vim: et sw=4 sts=4 diff --git a/lib/htmlpurifier/HTMLPurifier/HTMLModule/Legacy.php b/lib/htmlpurifier/HTMLPurifier/HTMLModule/Legacy.php index df33927ba6b27..f278eeced29ad 100644 --- a/lib/htmlpurifier/HTMLPurifier/HTMLModule/Legacy.php +++ b/lib/htmlpurifier/HTMLPurifier/HTMLModule/Legacy.php @@ -89,7 +89,7 @@ public function setup($config) { $hr->attr['width'] = 'Length'; $img = $this->addBlankElement('img'); - $img->attr['align'] = 'Enum#top,middle,bottom,left,right'; + $img->attr['align'] = 'IAlign'; $img->attr['border'] = 'Pixels'; $img->attr['hspace'] = 'Pixels'; $img->attr['vspace'] = 'Pixels'; @@ -136,6 +136,22 @@ public function setup($config) { $ul->attr['compact'] = 'Bool#compact'; $ul->attr['type'] = 'Enum#square,disc,circle'; + // "safe" modifications to "unsafe" elements + // WARNING: If you want to add support for an unsafe, legacy + // attribute, make a new TrustedLegacy module with the trusted + // bit set appropriately + + $form = $this->addBlankElement('form'); + $form->content_model = 'Flow | #PCDATA'; + $form->content_model_type = 'optional'; + $form->attr['target'] = 'FrameTarget'; + + $input = $this->addBlankElement('input'); + $input->attr['align'] = 'IAlign'; + + $legend = $this->addBlankElement('legend'); + $legend->attr['align'] = 'LAlign'; + } } diff --git a/lib/htmlpurifier/HTMLPurifier/HTMLModule/List.php b/lib/htmlpurifier/HTMLPurifier/HTMLModule/List.php index 74d4522f4e24f..79ccefafd963d 100644 --- a/lib/htmlpurifier/HTMLPurifier/HTMLModule/List.php +++ b/lib/htmlpurifier/HTMLPurifier/HTMLModule/List.php @@ -20,10 +20,16 @@ class HTMLPurifier_HTMLModule_List extends HTMLPurifier_HTMLModule public $content_sets = array('Flow' => 'List'); public function setup($config) { - $ol = $this->addElement('ol', 'List', 'Required: li', 'Common'); - $ol->wrap = "li"; - $ul = $this->addElement('ul', 'List', 'Required: li', 'Common'); - $ul->wrap = "li"; + $ol = $this->addElement('ol', 'List', new HTMLPurifier_ChildDef_List(), 'Common'); + $ul = $this->addElement('ul', 'List', new HTMLPurifier_ChildDef_List(), 'Common'); + // XXX The wrap attribute is handled by MakeWellFormed. This is all + // quite unsatisfactory, because we generated this + // *specifically* for lists, and now a big chunk of the handling + // is done properly by the List ChildDef. So actually, we just + // want enough information to make autoclosing work properly, + // and then hand off the tricky stuff to the ChildDef. + $ol->wrap = 'li'; + $ul->wrap = 'li'; $this->addElement('dl', 'List', 'Required: dt | dd', 'Common'); $this->addElement('li', false, 'Flow', 'Common'); diff --git a/lib/htmlpurifier/HTMLPurifier/HTMLModule/Tables.php b/lib/htmlpurifier/HTMLPurifier/HTMLModule/Tables.php index f314ced3f82ea..45c42bb3e44b9 100644 --- a/lib/htmlpurifier/HTMLPurifier/HTMLModule/Tables.php +++ b/lib/htmlpurifier/HTMLPurifier/HTMLModule/Tables.php @@ -37,6 +37,9 @@ public function setup($config) { 'abbr' => 'Text', 'colspan' => 'Number', 'rowspan' => 'Number', + // Apparently, as of HTML5 this attribute only applies + // to 'th' elements. + 'scope' => 'Enum#row,col,rowgroup,colgroup', ), $cell_align ); diff --git a/lib/htmlpurifier/HTMLPurifier/HTMLModule/TargetBlank.php b/lib/htmlpurifier/HTMLPurifier/HTMLModule/TargetBlank.php new file mode 100644 index 0000000000000..e1305ec5db0db --- /dev/null +++ b/lib/htmlpurifier/HTMLPurifier/HTMLModule/TargetBlank.php @@ -0,0 +1,19 @@ +addBlankElement('a'); + $a->attr_transform_post[] = new HTMLPurifier_AttrTransform_TargetBlank(); + } + +} + +// vim: et sw=4 sts=4 diff --git a/lib/htmlpurifier/HTMLPurifier/HTMLModuleManager.php b/lib/htmlpurifier/HTMLPurifier/HTMLModuleManager.php index 362e3b78db595..7a06fc02290af 100644 --- a/lib/htmlpurifier/HTMLPurifier/HTMLModuleManager.php +++ b/lib/htmlpurifier/HTMLPurifier/HTMLModuleManager.php @@ -65,11 +65,11 @@ public function __construct() { 'Presentation', 'Edit', 'Bdo', 'Tables', 'Image', 'StyleAttribute', // Unsafe: - 'Scripting', 'Object', 'Forms', + 'Scripting', 'Object', 'Forms', // Sorta legacy, but present in strict: 'Name', ); - $transitional = array('Legacy', 'Target'); + $transitional = array('Legacy', 'Target', 'Iframe'); $xml = array('XMLCommonAttributes'); $non_xml = array('NonXMLCommonAttributes'); @@ -112,7 +112,9 @@ public function __construct() { $this->doctypes->register( 'XHTML 1.1', true, - array_merge($common, $xml, array('Ruby')), + // Iframe is a real XHTML 1.1 module, despite being + // "transitional"! + array_merge($common, $xml, array('Ruby', 'Iframe')), array('Tidy_Strict', 'Tidy_XHTML', 'Tidy_Proprietary', 'Tidy_Strict', 'Tidy_Name'), // Tidy_XHTML1_1 array(), '-//W3C//DTD XHTML 1.1//EN', @@ -229,6 +231,9 @@ public function setup($config) { if ($config->get('HTML.Nofollow')) { $modules[] = 'Nofollow'; } + if ($config->get('HTML.TargetBlank')) { + $modules[] = 'TargetBlank'; + } // merge in custom modules $modules = array_merge($modules, $this->userModules); @@ -364,6 +369,13 @@ public function getElement($name, $trusted = null) { // :TODO: // non-standalone definitions that don't have a standalone // to merge into could be deferred to the end + // HOWEVER, it is perfectly valid for a non-standalone + // definition to lack a standalone definition, even + // after all processing: this allows us to safely + // specify extra attributes for elements that may not be + // enabled all in one place. In particular, this might + // be the case for trusted elements. WARNING: care must + // be taken that the /extra/ definitions are all safe. continue; } diff --git a/lib/htmlpurifier/HTMLPurifier/Lexer/PEARSax3.php b/lib/htmlpurifier/HTMLPurifier/Lexer/PEARSax3.php deleted file mode 100644 index 1d358c7b6bea2..0000000000000 --- a/lib/htmlpurifier/HTMLPurifier/Lexer/PEARSax3.php +++ /dev/null @@ -1,139 +0,0 @@ -tokens = array(); - $this->last_token_was_empty = false; - - $string = $this->normalize($string, $config, $context); - - $this->parent_handler = set_error_handler(array($this, 'muteStrictErrorHandler')); - - $parser = new XML_HTMLSax3(); - $parser->set_object($this); - $parser->set_element_handler('openHandler','closeHandler'); - $parser->set_data_handler('dataHandler'); - $parser->set_escape_handler('escapeHandler'); - - // doesn't seem to work correctly for attributes - $parser->set_option('XML_OPTION_ENTITIES_PARSED', 1); - - $parser->parse($string); - - restore_error_handler(); - - return $this->tokens; - - } - - /** - * Open tag event handler, interface is defined by PEAR package. - */ - public function openHandler(&$parser, $name, $attrs, $closed) { - // entities are not resolved in attrs - foreach ($attrs as $key => $attr) { - $attrs[$key] = $this->parseData($attr); - } - if ($closed) { - $this->tokens[] = new HTMLPurifier_Token_Empty($name, $attrs); - $this->last_token_was_empty = true; - } else { - $this->tokens[] = new HTMLPurifier_Token_Start($name, $attrs); - } - $this->stack[] = $name; - return true; - } - - /** - * Close tag event handler, interface is defined by PEAR package. - */ - public function closeHandler(&$parser, $name) { - // HTMLSax3 seems to always send empty tags an extra close tag - // check and ignore if you see it: - // [TESTME] to make sure it doesn't overreach - if ($this->last_token_was_empty) { - $this->last_token_was_empty = false; - return true; - } - $this->tokens[] = new HTMLPurifier_Token_End($name); - if (!empty($this->stack)) array_pop($this->stack); - return true; - } - - /** - * Data event handler, interface is defined by PEAR package. - */ - public function dataHandler(&$parser, $data) { - $this->last_token_was_empty = false; - $this->tokens[] = new HTMLPurifier_Token_Text($data); - return true; - } - - /** - * Escaped text handler, interface is defined by PEAR package. - */ - public function escapeHandler(&$parser, $data) { - if (strpos($data, '--') === 0) { - // remove trailing and leading double-dashes - $data = substr($data, 2); - if (strlen($data) >= 2 && substr($data, -2) == "--") { - $data = substr($data, 0, -2); - } - if (isset($this->stack[sizeof($this->stack) - 1]) && - $this->stack[sizeof($this->stack) - 1] == "style") { - $this->tokens[] = new HTMLPurifier_Token_Text($data); - } else { - $this->tokens[] = new HTMLPurifier_Token_Comment($data); - } - $this->last_token_was_empty = false; - } - // CDATA is handled elsewhere, but if it was handled here: - //if (strpos($data, '[CDATA[') === 0) { - // $this->tokens[] = new HTMLPurifier_Token_Text( - // substr($data, 7, strlen($data) - 9) ); - //} - return true; - } - - /** - * An error handler that mutes strict errors - */ - public function muteStrictErrorHandler($errno, $errstr, $errfile=null, $errline=null, $errcontext=null) { - if ($errno == E_STRICT) return; - return call_user_func($this->parent_handler, $errno, $errstr, $errfile, $errline, $errcontext); - } - -} - -// vim: et sw=4 sts=4 diff --git a/lib/htmlpurifier/HTMLPurifier/Lexer/PH5P.php b/lib/htmlpurifier/HTMLPurifier/Lexer/PH5P.php index 6f3a13702d28e..faf00b829136b 100644 --- a/lib/htmlpurifier/HTMLPurifier/Lexer/PH5P.php +++ b/lib/htmlpurifier/HTMLPurifier/Lexer/PH5P.php @@ -1113,7 +1113,7 @@ private function entity() { $entity = $this->character($start, $this->char); $cond = strlen($e_name) > 0; - // The rest of the parsing happens below. + // The rest of the parsing happens bellow. break; // Anything else @@ -1140,7 +1140,7 @@ private function entity() { } $cond = isset($entity); - // The rest of the parsing happens below. + // The rest of the parsing happens bellow. break; } diff --git a/lib/htmlpurifier/HTMLPurifier/Strategy/Composite.php b/lib/htmlpurifier/HTMLPurifier/Strategy/Composite.php index 816490b7996ad..92aefd33e2742 100644 --- a/lib/htmlpurifier/HTMLPurifier/Strategy/Composite.php +++ b/lib/htmlpurifier/HTMLPurifier/Strategy/Composite.php @@ -11,8 +11,6 @@ abstract class HTMLPurifier_Strategy_Composite extends HTMLPurifier_Strategy */ protected $strategies = array(); - abstract public function __construct(); - public function execute($tokens, $config, $context) { foreach ($this->strategies as $strategy) { $tokens = $strategy->execute($tokens, $config, $context); diff --git a/lib/htmlpurifier/HTMLPurifier/Strategy/RemoveForeignElements.php b/lib/htmlpurifier/HTMLPurifier/Strategy/RemoveForeignElements.php index cf3a33e406eac..bccaf14d3c53e 100644 --- a/lib/htmlpurifier/HTMLPurifier/Strategy/RemoveForeignElements.php +++ b/lib/htmlpurifier/HTMLPurifier/Strategy/RemoveForeignElements.php @@ -21,6 +21,9 @@ public function execute($tokens, $config, $context) { // currently only used to determine if comments should be kept $trusted = $config->get('HTML.Trusted'); + $comment_lookup = $config->get('HTML.AllowedComments'); + $comment_regexp = $config->get('HTML.AllowedCommentsRegexp'); + $check_comments = $comment_lookup !== array() || $comment_regexp !== null; $remove_script_contents = $config->get('Core.RemoveScriptContents'); $hidden_elements = $config->get('Core.HiddenElements'); @@ -128,23 +131,37 @@ public function execute($tokens, $config, $context) { if ($textify_comments !== false) { $data = $token->data; $token = new HTMLPurifier_Token_Text($data); - } elseif ($trusted) { - // keep, but perform comment cleaning + } elseif ($trusted || $check_comments) { + // always cleanup comments + $trailing_hyphen = false; if ($e) { // perform check whether or not there's a trailing hyphen if (substr($token->data, -1) == '-') { - $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Trailing hyphen in comment removed'); + $trailing_hyphen = true; } } $token->data = rtrim($token->data, '-'); $found_double_hyphen = false; while (strpos($token->data, '--') !== false) { - if ($e && !$found_double_hyphen) { - $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Hyphens in comment collapsed'); - } - $found_double_hyphen = true; // prevent double-erroring + $found_double_hyphen = true; $token->data = str_replace('--', '-', $token->data); } + if ($trusted || !empty($comment_lookup[trim($token->data)]) || ($comment_regexp !== NULL && preg_match($comment_regexp, trim($token->data)))) { + // OK good + if ($e) { + if ($trailing_hyphen) { + $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Trailing hyphen in comment removed'); + } + if ($found_double_hyphen) { + $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Hyphens in comment collapsed'); + } + } + } else { + if ($e) { + $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Comment removed'); + } + continue; + } } else { // strip comments if ($e) $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Comment removed'); diff --git a/lib/htmlpurifier/HTMLPurifier/URI.php b/lib/htmlpurifier/HTMLPurifier/URI.php index efdfb2c680bee..f158ef5e304c5 100644 --- a/lib/htmlpurifier/HTMLPurifier/URI.php +++ b/lib/htmlpurifier/HTMLPurifier/URI.php @@ -40,7 +40,7 @@ public function getSchemeObj($config, $context) { } else { // no scheme: retrieve the default one $def = $config->getDefinition('URI'); - $scheme_obj = $registry->getScheme($def->defaultScheme, $config, $context); + $scheme_obj = $def->getDefaultScheme($config, $context); if (!$scheme_obj) { // something funky happened to the default scheme object trigger_error( @@ -199,6 +199,44 @@ public function toString() { return $result; } + /** + * Returns true if this URL might be considered a 'local' URL given + * the current context. This is true when the host is null, or + * when it matches the host supplied to the configuration. + * + * Note that this does not do any scheme checking, so it is mostly + * only appropriate for metadata that doesn't care about protocol + * security. isBenign is probably what you actually want. + */ + public function isLocal($config, $context) { + if ($this->host === null) return true; + $uri_def = $config->getDefinition('URI'); + if ($uri_def->host === $this->host) return true; + return false; + } + + /** + * Returns true if this URL should be considered a 'benign' URL, + * that is: + * + * - It is a local URL (isLocal), and + * - It has a equal or better level of security + */ + public function isBenign($config, $context) { + if (!$this->isLocal($config, $context)) return false; + + $scheme_obj = $this->getSchemeObj($config, $context); + if (!$scheme_obj) return false; // conservative approach + + $current_scheme_obj = $config->getDefinition('URI')->getDefaultScheme($config, $context); + if ($current_scheme_obj->secure) { + if (!$scheme_obj->secure) { + return false; + } + } + return true; + } + } // vim: et sw=4 sts=4 diff --git a/lib/htmlpurifier/HTMLPurifier/URIDefinition.php b/lib/htmlpurifier/HTMLPurifier/URIDefinition.php index ea2b8fe245877..40e57bb7d8162 100644 --- a/lib/htmlpurifier/HTMLPurifier/URIDefinition.php +++ b/lib/htmlpurifier/HTMLPurifier/URIDefinition.php @@ -27,6 +27,7 @@ public function __construct() { $this->registerFilter(new HTMLPurifier_URIFilter_DisableExternal()); $this->registerFilter(new HTMLPurifier_URIFilter_DisableExternalResources()); $this->registerFilter(new HTMLPurifier_URIFilter_HostBlacklist()); + $this->registerFilter(new HTMLPurifier_URIFilter_SafeIframe()); $this->registerFilter(new HTMLPurifier_URIFilter_MakeAbsolute()); $this->registerFilter(new HTMLPurifier_URIFilter_Munge()); } @@ -52,9 +53,13 @@ protected function doSetup($config) { protected function setupFilters($config) { foreach ($this->registeredFilters as $name => $filter) { - $conf = $config->get('URI.' . $name); - if ($conf !== false && $conf !== null) { + if ($filter->always_load) { $this->addFilter($filter, $config); + } else { + $conf = $config->get('URI.' . $name); + if ($conf !== false && $conf !== null) { + $this->addFilter($filter, $config); + } } } unset($this->registeredFilters); @@ -72,6 +77,10 @@ protected function setupMemberVariables($config) { if (is_null($this->defaultScheme)) $this->defaultScheme = $config->get('URI.DefaultScheme'); } + public function getDefaultScheme($config, $context) { + return HTMLPurifier_URISchemeRegistry::instance()->getScheme($this->defaultScheme, $config, $context); + } + public function filter(&$uri, $config, $context) { foreach ($this->filters as $name => $f) { $result = $f->filter($uri, $config, $context); diff --git a/lib/htmlpurifier/HTMLPurifier/URIFilter.php b/lib/htmlpurifier/HTMLPurifier/URIFilter.php index c116f93dffc02..6a1b0b08e4b26 100644 --- a/lib/htmlpurifier/HTMLPurifier/URIFilter.php +++ b/lib/htmlpurifier/HTMLPurifier/URIFilter.php @@ -4,7 +4,21 @@ * Chainable filters for custom URI processing. * * These filters can perform custom actions on a URI filter object, - * including transformation or blacklisting. + * including transformation or blacklisting. A filter named Foo + * must have a corresponding configuration directive %URI.Foo, + * unless always_load is specified to be true. + * + * The following contexts may be available while URIFilters are being + * processed: + * + * - EmbeddedURI: true if URI is an embedded resource that will + * be loaded automatically on page load + * - CurrentToken: a reference to the token that is currently + * being processed + * - CurrentAttr: the name of the attribute that is currently being + * processed + * - CurrentCSSProperty: the name of the CSS property that is + * currently being processed (if applicable) * * @warning This filter is called before scheme object validation occurs. * Make sure, if you require a specific scheme object, you @@ -25,7 +39,15 @@ abstract class HTMLPurifier_URIFilter public $post = false; /** - * Performs initialization for the filter + * True if this filter should always be loaded (this permits + * a filter to be named Foo without the corresponding %URI.Foo + * directive existing.) + */ + public $always_load = false; + + /** + * Performs initialization for the filter. If the filter returns + * false, this means that it shouldn't be considered active. */ public function prepare($config) {return true;} diff --git a/lib/htmlpurifier/HTMLPurifier/URIFilter/HostBlacklist.php b/lib/htmlpurifier/HTMLPurifier/URIFilter/HostBlacklist.php index 045aa0992c662..55fde3bf4d37c 100644 --- a/lib/htmlpurifier/HTMLPurifier/URIFilter/HostBlacklist.php +++ b/lib/htmlpurifier/HTMLPurifier/URIFilter/HostBlacklist.php @@ -1,5 +1,9 @@ getSchemeObj($config, $context); if (!$scheme_obj) return true; // ignore unknown schemes, maybe another postfilter did it - if (is_null($uri->host) || empty($scheme_obj->browsable)) { - return true; - } - // don't redirect if target host is our host - if ($uri->host === $config->getDefinition('URI')->host) { - return true; - } + if (!$scheme_obj->browsable) return true; // ignore non-browseable schemes, since we can't munge those in a reasonable way + if ($uri->isBenign($config, $context)) return true; // don't redirect if a benign URL $this->makeReplace($uri, $config, $context); $this->replace = array_map('rawurlencode', $this->replace); diff --git a/lib/htmlpurifier/HTMLPurifier/URIFilter/SafeIframe.php b/lib/htmlpurifier/HTMLPurifier/URIFilter/SafeIframe.php new file mode 100644 index 0000000000000..284bb13de29b2 --- /dev/null +++ b/lib/htmlpurifier/HTMLPurifier/URIFilter/SafeIframe.php @@ -0,0 +1,35 @@ +regexp = $config->get('URI.SafeIframeRegexp'); + return true; + } + public function filter(&$uri, $config, $context) { + // check if filter not applicable + if (!$config->get('HTML.SafeIframe')) return true; + // check if the filter should actually trigger + if (!$context->get('EmbeddedURI', true)) return true; + $token = $context->get('CurrentToken', true); + if (!($token && $token->name == 'iframe')) return true; + // check if we actually have some whitelists enabled + if ($this->regexp === null) return false; + // actually check the whitelists + return preg_match($this->regexp, $uri->toString()); + } +} + +// vim: et sw=4 sts=4 diff --git a/lib/htmlpurifier/HTMLPurifier/URIScheme.php b/lib/htmlpurifier/HTMLPurifier/URIScheme.php index 25eb8410b4f41..7be958143ae37 100644 --- a/lib/htmlpurifier/HTMLPurifier/URIScheme.php +++ b/lib/htmlpurifier/HTMLPurifier/URIScheme.php @@ -19,6 +19,12 @@ abstract class HTMLPurifier_URIScheme */ public $browsable = false; + /** + * Whether or not data transmitted over this scheme is encrypted. + * https is secure, http is not. + */ + public $secure = false; + /** * Whether or not the URI always uses , resolves edge cases * with making relative URIs absolute diff --git a/lib/htmlpurifier/HTMLPurifier/URIScheme/https.php b/lib/htmlpurifier/HTMLPurifier/URIScheme/https.php index 29e380919f0d7..159c2874eaf43 100644 --- a/lib/htmlpurifier/HTMLPurifier/URIScheme/https.php +++ b/lib/htmlpurifier/HTMLPurifier/URIScheme/https.php @@ -6,6 +6,7 @@ class HTMLPurifier_URIScheme_https extends HTMLPurifier_URIScheme_http { public $default_port = 443; + public $secure = true; } diff --git a/lib/htmlpurifier/readme_moodle.txt b/lib/htmlpurifier/readme_moodle.txt index a71fd43bc5398..7d4c4e8393d13 100644 --- a/lib/htmlpurifier/readme_moodle.txt +++ b/lib/htmlpurifier/readme_moodle.txt @@ -1,4 +1,5 @@ -Description of HTML Purifier v4.3.0 library import into Moodle +Description of HTML Purifier v4.4.0+ library import into Moodle +(Includes _blank fix 8c9d461a6259d7b8ac4ae5c9c9e95ab176cfcda3.) No Changes.. diff --git a/lib/thirdpartylibs.xml b/lib/thirdpartylibs.xml index 456ce2711269c..59e4a5195d47f 100644 --- a/lib/thirdpartylibs.xml +++ b/lib/thirdpartylibs.xml @@ -67,7 +67,7 @@ htmlpurifier HTML Purifier LGPL - 4.3.0 + 4.4.0 2.1+