Skip to content
Browse files

Merge branch 'w10_MDL-31785_m21_htmlpurifier' of git://github.com/sko…

…dak/moodle into MOODLE_21_STABLE
  • Loading branch information...
2 parents 7083c09 + 4c05d4a commit 0b9c121a8c5fb69a575cb724d556e7986c358d9a @stronk7 stronk7 committed Mar 6, 2012
Showing with 1,176 additions and 378 deletions.
  1. +3 −3 lib/htmlpurifier/HTMLPurifier.php
  2. +7 −0 lib/htmlpurifier/HTMLPurifier.safe-includes.php
  3. +24 −0 lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/Ident.php
  4. +28 −0 lib/htmlpurifier/HTMLPurifier/AttrDef/Clone.php
  5. +1 −1 lib/htmlpurifier/HTMLPurifier/AttrDef/HTML/Color.php
  6. +16 −6 lib/htmlpurifier/HTMLPurifier/AttrDef/HTML/ID.php
  7. +1 −1 lib/htmlpurifier/HTMLPurifier/AttrDef/URI.php
  8. +39 −6 lib/htmlpurifier/HTMLPurifier/AttrDef/URI/Host.php
  9. +6 −2 lib/htmlpurifier/HTMLPurifier/AttrTransform/Nofollow.php
  10. +38 −0 lib/htmlpurifier/HTMLPurifier/AttrTransform/TargetBlank.php
  11. +14 −0 lib/htmlpurifier/HTMLPurifier/AttrTypes.php
  12. +120 −0 lib/htmlpurifier/HTMLPurifier/ChildDef/List.php
  13. +90 −5 lib/htmlpurifier/HTMLPurifier/ChildDef/Table.php
  14. +3 −3 lib/htmlpurifier/HTMLPurifier/Config.php
  15. BIN lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema.ser
  16. +2 −1 lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.ColorKeywords.txt
  17. +9 −0 lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.EnableIDNA.txt
  18. +10 −0 lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.AllowedComments.txt
  19. +15 −0 lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.AllowedCommentsRegexp.txt
  20. +13 −0 lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.SafeIframe.txt
  21. +8 −0 lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.TargetBlank.txt
  22. +22 −0 lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.SafeIframeRegexp.txt
  23. +136 −22 lib/htmlpurifier/HTMLPurifier/Encoder.php
  24. +164 −11 lib/htmlpurifier/HTMLPurifier/Filter/ExtractStyleBlocks.php
  25. +1 −1 lib/htmlpurifier/HTMLPurifier/HTMLDefinition.php
  26. +3 −2 lib/htmlpurifier/HTMLPurifier/HTMLModule/Forms.php
  27. +38 −0 lib/htmlpurifier/HTMLPurifier/HTMLModule/Iframe.php
  28. +17 −1 lib/htmlpurifier/HTMLPurifier/HTMLModule/Legacy.php
  29. +10 −4 lib/htmlpurifier/HTMLPurifier/HTMLModule/List.php
  30. +3 −0 lib/htmlpurifier/HTMLPurifier/HTMLModule/Tables.php
  31. +19 −0 lib/htmlpurifier/HTMLPurifier/HTMLModule/TargetBlank.php
  32. +15 −3 lib/htmlpurifier/HTMLPurifier/HTMLModuleManager.php
  33. +0 −139 lib/htmlpurifier/HTMLPurifier/Lexer/PEARSax3.php
  34. +2 −2 lib/htmlpurifier/HTMLPurifier/Lexer/PH5P.php
  35. +0 −2 lib/htmlpurifier/HTMLPurifier/Strategy/Composite.php
  36. +24 −7 lib/htmlpurifier/HTMLPurifier/Strategy/RemoveForeignElements.php
  37. +39 −1 lib/htmlpurifier/HTMLPurifier/URI.php
  38. +11 −2 lib/htmlpurifier/HTMLPurifier/URIDefinition.php
  39. +24 −2 lib/htmlpurifier/HTMLPurifier/URIFilter.php
  40. +4 −0 lib/htmlpurifier/HTMLPurifier/URIFilter/HostBlacklist.php
  41. +2 −7 lib/htmlpurifier/HTMLPurifier/URIFilter/Munge.php
  42. +35 −0 lib/htmlpurifier/HTMLPurifier/URIFilter/SafeIframe.php
  43. +6 −0 lib/htmlpurifier/HTMLPurifier/URIScheme.php
  44. +1 −0 lib/htmlpurifier/HTMLPurifier/URIScheme/https.php
  45. +2 −1 lib/htmlpurifier/readme_moodle.txt
  46. +150 −9 lib/simpletest/testhtmlpurifier.php
  47. +0 −133 lib/simpletest/testpurifier.php
  48. +1 −1 lib/thirdpartylibs.xml
View
6 lib/htmlpurifier/HTMLPurifier.php
@@ -19,7 +19,7 @@
*/
/*
- HTML Purifier 4.3.0 - Standards Compliant HTML Filtering
+ HTML Purifier 4.4.0 - Standards Compliant HTML Filtering
Copyright (C) 2006-2008 Edward Z. Yang
This library is free software; you can redistribute it and/or
@@ -55,10 +55,10 @@ class HTMLPurifier
{
/** Version of HTML Purifier */
- public $version = '4.3.0';
+ public $version = '4.4.0';
/** Constant with version of HTML Purifier */
- const VERSION = '4.3.0';
+ const VERSION = '4.4.0';
/** Global configuration object */
public $config;
View
7 lib/htmlpurifier/HTMLPurifier.safe-includes.php
@@ -67,6 +67,7 @@
require_once $__dir . '/HTMLPurifier/VarParser.php';
require_once $__dir . '/HTMLPurifier/VarParserException.php';
require_once $__dir . '/HTMLPurifier/AttrDef/CSS.php';
+require_once $__dir . '/HTMLPurifier/AttrDef/Clone.php';
require_once $__dir . '/HTMLPurifier/AttrDef/Enum.php';
require_once $__dir . '/HTMLPurifier/AttrDef/Integer.php';
require_once $__dir . '/HTMLPurifier/AttrDef/Lang.php';
@@ -84,6 +85,7 @@
require_once $__dir . '/HTMLPurifier/AttrDef/CSS/Filter.php';
require_once $__dir . '/HTMLPurifier/AttrDef/CSS/Font.php';
require_once $__dir . '/HTMLPurifier/AttrDef/CSS/FontFamily.php';
+require_once $__dir . '/HTMLPurifier/AttrDef/CSS/Ident.php';
require_once $__dir . '/HTMLPurifier/AttrDef/CSS/ImportantDecorator.php';
require_once $__dir . '/HTMLPurifier/AttrDef/CSS/Length.php';
require_once $__dir . '/HTMLPurifier/AttrDef/CSS/ListStyle.php';
@@ -124,10 +126,12 @@
require_once $__dir . '/HTMLPurifier/AttrTransform/SafeObject.php';
require_once $__dir . '/HTMLPurifier/AttrTransform/SafeParam.php';
require_once $__dir . '/HTMLPurifier/AttrTransform/ScriptRequired.php';
+require_once $__dir . '/HTMLPurifier/AttrTransform/TargetBlank.php';
require_once $__dir . '/HTMLPurifier/AttrTransform/Textarea.php';
require_once $__dir . '/HTMLPurifier/ChildDef/Chameleon.php';
require_once $__dir . '/HTMLPurifier/ChildDef/Custom.php';
require_once $__dir . '/HTMLPurifier/ChildDef/Empty.php';
+require_once $__dir . '/HTMLPurifier/ChildDef/List.php';
require_once $__dir . '/HTMLPurifier/ChildDef/Required.php';
require_once $__dir . '/HTMLPurifier/ChildDef/Optional.php';
require_once $__dir . '/HTMLPurifier/ChildDef/StrictBlockquote.php';
@@ -142,6 +146,7 @@
require_once $__dir . '/HTMLPurifier/HTMLModule/Edit.php';
require_once $__dir . '/HTMLPurifier/HTMLModule/Forms.php';
require_once $__dir . '/HTMLPurifier/HTMLModule/Hypertext.php';
+require_once $__dir . '/HTMLPurifier/HTMLModule/Iframe.php';
require_once $__dir . '/HTMLPurifier/HTMLModule/Image.php';
require_once $__dir . '/HTMLPurifier/HTMLModule/Legacy.php';
require_once $__dir . '/HTMLPurifier/HTMLModule/List.php';
@@ -158,6 +163,7 @@
require_once $__dir . '/HTMLPurifier/HTMLModule/StyleAttribute.php';
require_once $__dir . '/HTMLPurifier/HTMLModule/Tables.php';
require_once $__dir . '/HTMLPurifier/HTMLModule/Target.php';
+require_once $__dir . '/HTMLPurifier/HTMLModule/TargetBlank.php';
require_once $__dir . '/HTMLPurifier/HTMLModule/Text.php';
require_once $__dir . '/HTMLPurifier/HTMLModule/Tidy.php';
require_once $__dir . '/HTMLPurifier/HTMLModule/XMLCommonAttributes.php';
@@ -196,6 +202,7 @@
require_once $__dir . '/HTMLPurifier/URIFilter/HostBlacklist.php';
require_once $__dir . '/HTMLPurifier/URIFilter/MakeAbsolute.php';
require_once $__dir . '/HTMLPurifier/URIFilter/Munge.php';
+require_once $__dir . '/HTMLPurifier/URIFilter/SafeIframe.php';
require_once $__dir . '/HTMLPurifier/URIScheme/data.php';
require_once $__dir . '/HTMLPurifier/URIScheme/file.php';
require_once $__dir . '/HTMLPurifier/URIScheme/ftp.php';
View
24 lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/Ident.php
@@ -0,0 +1,24 @@
+<?php
+
+/**
+ * Validates a value against the {ident} production of the CSS grammar.
+ */
+class HTMLPurifier_AttrDef_CSS_Ident extends HTMLPurifier_AttrDef
+{
+
+    /**
+     * @param $string Candidate identifier; surrounding whitespace is ignored
+     * @param $config Configuration object (not consulted here)
+     * @param $context Context object (not consulted here)
+     * @return The trimmed identifier when it is well-formed, false otherwise
+     */
+    public function validate($string, $config, $context) {
+
+        $ident = trim($string);
+
+        // Values that are falsy as strings ('' and '0') can never be identifiers.
+        if (!$ident) {
+            return false;
+        }
+
+        // Optional leading hyphen, then a letter or underscore, then any
+        // run of letters, digits, underscores and hyphens.
+        return preg_match('/^(-?[A-Za-z_][A-Za-z_\-0-9]*)$/', $ident) ? $ident : false;
+
+    }
+
+}
+
+// vim: et sw=4 sts=4
View
28 lib/htmlpurifier/HTMLPurifier/AttrDef/Clone.php
@@ -0,0 +1,28 @@
+<?php
+
+/**
+ * Stand-in AttrDef that forwards validation to a wrapped AttrDef, BUT
+ * hands out a fresh clone of that wrapped definition from make().
+ */
+class HTMLPurifier_AttrDef_Clone extends HTMLPurifier_AttrDef
+{
+    /**
+     * The definition being mimicked and cloned
+     */
+    protected $clone;
+
+    /**
+     * @param $clone Definition instance to wrap
+     */
+    public function __construct($clone) {
+        $this->clone = $clone;
+    }
+
+    /**
+     * Delegates to the wrapped definition and returns its result unchanged.
+     */
+    public function validate($v, $config, $context) {
+        $wrapped = $this->clone;
+        return $wrapped->validate($v, $config, $context);
+    }
+
+    /**
+     * Returns a copy of the wrapped definition; $string is ignored.
+     */
+    public function make($string) {
+        $copy = clone $this->clone;
+        return $copy;
+    }
+
+}
+
+// vim: et sw=4 sts=4
View
2 lib/htmlpurifier/HTMLPurifier/AttrDef/HTML/Color.php
@@ -14,7 +14,7 @@ public function validate($string, $config, $context) {
$string = trim($string);
if (empty($string)) return false;
- if (isset($colors[$string])) return $colors[$string];
+ // Keyword lookup is case-insensitive: the same lowercased key must be used
+ // for both the existence test and the fetch, otherwise "Red" passes the
+ // test and then reads an undefined index.
+ $lower = strtolower($string);
+ if (isset($colors[$lower])) return $colors[$lower];
if ($string[0] === '#') $hex = substr($string, 1);
else $hex = $string;
View
22 lib/htmlpurifier/HTMLPurifier/AttrDef/HTML/ID.php
@@ -12,12 +12,22 @@
class HTMLPurifier_AttrDef_HTML_ID extends HTMLPurifier_AttrDef
{
- // ref functionality disabled, since we also have to verify
- // whether or not the ID it refers to exists
+ // selector is NOT a valid thing to use for IDREFs, because IDREFs
+ // *must* target IDs that exist, whereas selector #ids do not.
+
+ /**
+ * Determines whether or not we're validating an ID in a CSS
+ * selector context.
+ */
+ protected $selector;
+
+ public function __construct($selector = false) {
+ $this->selector = $selector;
+ }
public function validate($id, $config, $context) {
- if (!$config->get('Attr.EnableID')) return false;
+ if (!$this->selector && !$config->get('Attr.EnableID')) return false;
$id = trim($id); // trim it first
@@ -33,10 +43,10 @@ public function validate($id, $config, $context) {
'%Attr.IDPrefix is set', E_USER_WARNING);
}
- //if (!$this->ref) {
+ if (!$this->selector) {
$id_accumulator =& $context->get('IDAccumulator');
if (isset($id_accumulator->ids[$id])) return false;
- //}
+ }
// we purposely avoid using regex, hopefully this is faster
@@ -56,7 +66,7 @@ public function validate($id, $config, $context) {
return false;
}
- if (/*!$this->ref && */$result) $id_accumulator->add($id);
+ if (!$this->selector && $result) $id_accumulator->add($id);
// if no change was made to the ID, return the result
// else, return the new id if stripping whitespace made it
View
2 lib/htmlpurifier/HTMLPurifier/AttrDef/URI.php
@@ -19,7 +19,7 @@ public function __construct($embeds_resource = false) {
}
public function make($string) {
- $embeds = (bool) $string;
+ $embeds = ($string === 'embedded');
return new HTMLPurifier_AttrDef_URI($embeds);
}
View
45 lib/htmlpurifier/HTMLPurifier/AttrDef/URI/Host.php
@@ -44,9 +44,8 @@ public function validate($string, $config, $context) {
// A regular domain name.
- // This breaks I18N domain names, but we don't have proper IRI support,
- // so force users to insert Punycode. If there's complaining we'll
- // try to fix things into an international friendly form.
+ // This doesn't match I18N domain names, but we don't have proper IRI support,
+ // so force users to insert Punycode.
// The productions describing this are:
$a = '[a-z]'; // alpha
@@ -57,10 +56,44 @@ public function validate($string, $config, $context) {
// toplabel = alpha | alpha *( alphanum | "-" ) alphanum
$toplabel = "$a($and*$an)?";
// hostname = *( domainlabel "." ) toplabel [ "." ]
- $match = preg_match("/^($domainlabel\.)*$toplabel\.?$/i", $string);
- if (!$match) return false;
+ if (preg_match("/^($domainlabel\.)*$toplabel\.?$/i", $string)) {
+ return $string;
+ }
+
+ // If we have Net_IDNA2 support, we can support IRIs by
+ // punycoding them. (This is the most portable thing to do,
+ // since otherwise we have to assume browsers support
+
+ if ($config->get('Core.EnableIDNA')) {
+ $idna = new Net_IDNA2(array('encoding' => 'utf8', 'overlong' => false, 'strict' => true));
+ // we need to encode each period separately
+ $parts = explode('.', $string);
+ try {
+ $new_parts = array();
+ foreach ($parts as $part) {
+ $encodable = false;
+ for ($i = 0, $c = strlen($part); $i < $c; $i++) {
+ if (ord($part[$i]) > 0x7a) {
+ $encodable = true;
+ break;
+ }
+ }
+ if (!$encodable) {
+ $new_parts[] = $part;
+ } else {
+ $new_parts[] = $idna->encode($part);
+ }
+ }
+ $string = implode('.', $new_parts);
+ if (preg_match("/^($domainlabel\.)*$toplabel\.?$/i", $string)) {
+ return $string;
+ }
+ } catch (Exception $e) {
+ // XXX error reporting
+ }
+ }
- return $string;
+ return false;
}
}
View
8 lib/htmlpurifier/HTMLPurifier/AttrTransform/Nofollow.php
@@ -24,9 +24,13 @@ public function transform($attr, $config, $context) {
$url = $this->parser->parse($attr['href']);
$scheme = $url->getSchemeObj($config, $context);
- if (!is_null($url->host) && $scheme !== false && $scheme->browsable) {
+ if ($scheme->browsable && !$url->isLocal($config, $context)) {
if (isset($attr['rel'])) {
- $attr['rel'] .= ' nofollow';
+ // Tokenize the existing rel value (not the whole attribute array) so
+ // that nofollow is appended only when it is not already present.
+ $rels = explode(' ', $attr['rel']);
+ if (!in_array('nofollow', $rels, true)) {
+     $rels[] = 'nofollow';
+ }
+ $attr['rel'] = implode(' ', $rels);
} else {
$attr['rel'] = 'nofollow';
}
View
38 lib/htmlpurifier/HTMLPurifier/AttrTransform/TargetBlank.php
@@ -0,0 +1,38 @@
+<?php
+
+// must be called POST validation
+
+/**
+ * Adds target="_blank" to all outbound links.  This transform is
+ * only attached if %HTML.TargetBlank is TRUE.  It is meant to run after
+ * validation (see the note above), so presumably the added target is
+ * kept regardless of what %Attr.AllowedFrameTargets permits.
+ */
+class HTMLPurifier_AttrTransform_TargetBlank extends HTMLPurifier_AttrTransform
+{
+    /**
+     * URI parser used to inspect each href
+     */
+    private $parser;
+
+    public function __construct() {
+        $this->parser = new HTMLPurifier_URIParser();
+    }
+
+    /**
+     * @param $attr Assoc array of attributes of the element being transformed
+     * @param $config Configuration object, passed through to the URI helpers
+     * @param $context Context object, passed through to the URI helpers
+     * @return The attribute array, with 'target' forced to '_blank' when the
+     *         href has a browsable scheme and is not benign; untouched otherwise
+     */
+    public function transform($attr, $config, $context) {
+
+        // nothing to do for elements without a link target
+        if (!isset($attr['href'])) {
+            return $attr;
+        }
+
+        // XXX Kind of inefficient
+        $url = $this->parser->parse($attr['href']);
+        $scheme = $url->getSchemeObj($config, $context);
+
+        // NOTE(review): assumes getSchemeObj() returns false when the scheme
+        // cannot be resolved -- confirm against HTMLPurifier_URI.  Guarding
+        // here avoids reading a property of a non-object; such links are
+        // left unchanged, exactly as before.
+        if ($scheme && $scheme->browsable && !$url->isBenign($config, $context)) {
+            $attr['target'] = '_blank';
+        }
+
+        return $attr;
+
+    }
+
+}
+
+// vim: et sw=4 sts=4
View
14 lib/htmlpurifier/HTMLPurifier/AttrTypes.php
@@ -15,6 +15,13 @@ class HTMLPurifier_AttrTypes
* types.
*/
public function __construct() {
+ // XXX This is kind of poor, since we don't actually /clone/
+ // instances; instead, we use the supplied make() attribute. So,
+ // the underlying class must know how to deal with arguments.
+ // With the old implementation of Enum, that ignored its
+ // arguments when handling a make dispatch, the IAlign
+ // definition wouldn't work.
+
// pseudo-types, must be instantiated via shorthand
$this->info['Enum'] = new HTMLPurifier_AttrDef_Enum();
$this->info['Bool'] = new HTMLPurifier_AttrDef_HTML_Bool();
@@ -29,6 +36,9 @@ public function __construct() {
$this->info['URI'] = new HTMLPurifier_AttrDef_URI();
$this->info['LanguageCode'] = new HTMLPurifier_AttrDef_Lang();
$this->info['Color'] = new HTMLPurifier_AttrDef_HTML_Color();
+ $this->info['IAlign'] = self::makeEnum('top,middle,bottom,left,right');
+ $this->info['LAlign'] = self::makeEnum('top,bottom,left,right');
+ $this->info['FrameTarget'] = new HTMLPurifier_AttrDef_HTML_FrameTarget();
// unimplemented aliases
$this->info['ContentType'] = new HTMLPurifier_AttrDef_Text();
@@ -44,6 +54,10 @@ public function __construct() {
$this->info['Number'] = new HTMLPurifier_AttrDef_Integer(false, false, true);
}
+ /**
+  * Builds a clone-producing wrapper around an Enum of the given values.
+  * @param $in Comma-separated list of allowed values
+  */
+ private static function makeEnum($in) {
+     $values = explode(',', $in);
+     $enum = new HTMLPurifier_AttrDef_Enum($values);
+     return new HTMLPurifier_AttrDef_Clone($enum);
+ }
+
/**
* Retrieves a type
* @param $type String type name
View
120 lib/htmlpurifier/HTMLPurifier/ChildDef/List.php
@@ -0,0 +1,120 @@
+<?php
+
+/**
+ * Definition for list containers ul and ol.
+ *
+ * Besides li, direct ul/ol children and other stray content are accepted
+ * and repaired: a nested list is tucked into the preceding li (or into a
+ * newly created one if no li has been seen yet), and any other stray
+ * child is wrapped in a new li.
+ */
+class HTMLPurifier_ChildDef_List extends HTMLPurifier_ChildDef
+{
+ public $type = 'list';
+ // lying a little bit, so that we can handle ul and ol ourselves
+ // XXX: This whole business with 'wrap' is all a bit unsatisfactory
+ public $elements = array('li' => true, 'ul' => true, 'ol' => true);
+ /**
+  * Validates and, where needed, rewrites the child tokens of a list.
+  * @param $tokens_of_children Array of tokens that are the list's children
+  * @param $config Configuration object (not consulted in this method)
+  * @param $context Context object (not consulted in this method)
+  * @return false if there are no children, only whitespace, or nothing
+  *         survives; true if the rebuilt token list equals the input;
+  *         otherwise the rebuilt array of tokens
+  */
+ public function validateChildren($tokens_of_children, $config, $context) {
+ // Flag for subclasses
+ $this->whitespace = false;
+
+ // if there are no tokens, delete parent node
+ if (empty($tokens_of_children)) return false;
+
+ // the new set of children
+ $result = array();
+
+ // current depth into the nest
+ $nesting = 0;
+
+ // a little sanity check to make sure it's not ALL whitespace
+ $all_whitespace = true;
+
+ // whether a direct li child (real or synthesized) has been encountered
+ $seen_li = false;
+ // whether an li that we opened (or re-opened) ourselves still needs its end tag
+ $need_close_li = false;
+
+ foreach ($tokens_of_children as $token) {
+ // whitespace is passed through untouched and does not affect any state
+ if (!empty($token->is_whitespace)) {
+ $result[] = $token;
+ continue;
+ }
+ $all_whitespace = false; // phew, we're not talking about whitespace
+
+ // the content placed inside our pending li has ended (we are back at
+ // depth one), so emit its end tag before handling the current token
+ if ($nesting == 1 && $need_close_li) {
+ $result[] = new HTMLPurifier_Token_End('li');
+ $nesting--;
+ $need_close_li = false;
+ }
+
+ // a token is a direct child if we are at depth zero BEFORE it is counted
+ $is_child = ($nesting == 0);
+
+ if ($token instanceof HTMLPurifier_Token_Start) {
+ $nesting++;
+ } elseif ($token instanceof HTMLPurifier_Token_End) {
+ $nesting--;
+ }
+
+ if ($is_child) {
+ if ($token->name === 'li') {
+ // good
+ $seen_li = true;
+ } elseif ($token->name === 'ul' || $token->name === 'ol') {
+ // we want to tuck this into the previous li
+ $need_close_li = true;
+ // extra level for the li that will enclose this nested list
+ $nesting++;
+ if (!$seen_li) {
+ // create a new li element
+ $result[] = new HTMLPurifier_Token_Start('li');
+ } else {
+ // backtrack until </li> found
+ // (tokens popped here are dropped from the output: trailing
+ // whitespace and the previous li's end tag, which re-opens that li)
+ while(true) {
+ $t = array_pop($result);
+ if ($t instanceof HTMLPurifier_Token_End) {
+ // XXX actually, these invariants could very plausibly be violated
+ // if we are doing silly things with modifying the set of allowed elements.
+ // FORTUNATELY, it doesn't make a difference, since the allowed
+ // elements are hard-coded here!
+ if ($t->name !== 'li') {
+ trigger_error("Only li present invariant violated in List ChildDef", E_USER_ERROR);
+ return false;
+ }
+ break;
+ } elseif ($t instanceof HTMLPurifier_Token_Empty) { // bleagh
+ if ($t->name !== 'li') {
+ trigger_error("Only li present invariant violated in List ChildDef", E_USER_ERROR);
+ return false;
+ }
+ // an empty <li /> cannot hold children: swap it for a start tag
+ // carrying the same attributes and position information
+ // XXX this should have a helper for it...
+ $result[] = new HTMLPurifier_Token_Start('li', $t->attr, $t->line, $t->col, $t->armor);
+ break;
+ } else {
+ if (!$t->is_whitespace) {
+ trigger_error("Only whitespace present invariant violated in List ChildDef", E_USER_ERROR);
+ return false;
+ }
+ }
+ }
+ }
+ } else {
+ // start wrapping (this doesn't precisely mimic
+ // browser behavior, but what browsers do is kind of
+ // hard to mimic in a standards compliant way
+ // XXX Actually, this has no impact in practice,
+ // because this gets handled earlier. Arguably,
+ // we should rip out all of that processing
+ $result[] = new HTMLPurifier_Token_Start('li');
+ $nesting++;
+ $seen_li = true;
+ $need_close_li = true;
+ }
+ }
+ $result[] = $token;
+ }
+ // the input ended while an li we opened was still pending
+ if ($need_close_li) {
+ $result[] = new HTMLPurifier_Token_End('li');
+ }
+ if (empty($result)) return false;
+ if ($all_whitespace) {
+ return false;
+ }
+ // nothing was rewritten: report the children as valid as-is
+ if ($tokens_of_children == $result) return true;
+ return $result;
+ }
+}
+
+// vim: et sw=4 sts=4
View
95 lib/htmlpurifier/HTMLPurifier/ChildDef/Table.php
@@ -1,7 +1,33 @@
<?php
/**
- * Definition for tables
+ * Definition for tables. The general idea is to extract out all of the
+ * essential bits, and then reconstruct it later.
+ *
+ * This is a bit confusing, because the DTDs and the W3C
+ * validators seem to disagree on the appropriate definition. The
+ * DTD claims:
+ *
+ * (CAPTION?, (COL*|COLGROUP*), THEAD?, TFOOT?, TBODY+)
+ *
+ * But actually, the HTML4 spec then has this to say:
+ *
+ * The TBODY start tag is always required except when the table
+ * contains only one table body and no table head or foot sections.
+ * The TBODY end tag may always be safely omitted.
+ *
+ * So the DTD is kind of wrong. The validator is, unfortunately, kind
+ * of on crack.
+ *
+ * The definition changed again in XHTML1.1; and in my opinion, this
+ * formulation makes the most sense.
+ *
+ * caption?, ( col* | colgroup* ), (( thead?, tfoot?, tbody+ ) | ( tr+ ))
+ *
+ * Essentially, we have two modes: thead/tfoot/tbody mode, and tr mode.
+ * If we encounter a thead, tfoot or tbody, we are placed in the former
+ * mode, and we *must* wrap any stray tr segments with a tbody. But if
+ * we don't run into any of them, just have tr tags is OK.
*/
class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
{
@@ -33,6 +59,8 @@ public function validateChildren($tokens_of_children, $config, $context) {
$collection = array(); // collected nodes
$tag_index = 0; // the first node might be whitespace,
// so this tells us where the start tag is
+ $tbody_mode = false; // if true, then we need to wrap any stray
+ // <tr>s with a <tbody>.
foreach ($tokens_of_children as $token) {
$is_child = ($nesting == 0);
@@ -51,8 +79,9 @@ public function validateChildren($tokens_of_children, $config, $context) {
// okay, let's stash the tokens away
// first token tells us the type of the collection
switch ($collection[$tag_index]->name) {
- case 'tr':
case 'tbody':
+ $tbody_mode = true;
+ case 'tr':
$content[] = $collection;
break;
case 'caption':
@@ -61,13 +90,28 @@ public function validateChildren($tokens_of_children, $config, $context) {
break;
case 'thead':
case 'tfoot':
+ $tbody_mode = true;
+ // XXX This breaks rendering properties with
+ // Firefox, which never floats a <thead> to
+ // the top. Ever. (Our scheme will float the
+ // first <thead> to the top.) So maybe
+ // <thead>s that are not first should be
+ // turned into <tbody>? Very tricky, indeed.
+
// access the appropriate variable, $thead or $tfoot
$var = $collection[$tag_index]->name;
if ($$var === false) {
$$var = $collection;
} else {
- // transmutate the first and less entries into
- // tbody tags, and then put into content
+ // Oops, there's a second one! What
+ // should we do? Current behavior is to
+ // transmutate the first and last entries into
+ // tbody tags, and then put into content.
+ // Maybe a better idea is to *attach
+ // it* to the existing thead or tfoot?
+ // We don't do this, because Firefox
+ // doesn't float an extra tfoot to the
+ // bottom like it does for the first one.
$collection[$tag_index]->name = 'tbody';
$collection[count($collection)-1]->name = 'tbody';
$content[] = $collection;
@@ -126,7 +170,48 @@ public function validateChildren($tokens_of_children, $config, $context) {
if ($cols !== false) foreach ($cols as $token_array) $ret = array_merge($ret, $token_array);
if ($thead !== false) $ret = array_merge($ret, $thead);
if ($tfoot !== false) $ret = array_merge($ret, $tfoot);
- foreach ($content as $token_array) $ret = array_merge($ret, $token_array);
+
+ if ($tbody_mode) {
+ // a little tricky, since the start of the collection may be
+ // whitespace
+ $inside_tbody = false;
+ foreach ($content as $token_array) {
+ // find the starting token
+ foreach ($token_array as $t) {
+ if ($t->name === 'tr' || $t->name === 'tbody') {
+ break;
+ }
+ } // iterator variable carries over
+ if ($t->name === 'tr') {
+ if ($inside_tbody) {
+ $ret = array_merge($ret, $token_array);
+ } else {
+ $ret[] = new HTMLPurifier_Token_Start('tbody');
+ $ret = array_merge($ret, $token_array);
+ $inside_tbody = true;
+ }
+ } elseif ($t->name === 'tbody') {
+ if ($inside_tbody) {
+ $ret[] = new HTMLPurifier_Token_End('tbody');
+ $inside_tbody = false;
+ $ret = array_merge($ret, $token_array);
+ } else {
+ $ret = array_merge($ret, $token_array);
+ }
+ } else {
+ trigger_error("tr/tbody in content invariant failed in Table ChildDef", E_USER_ERROR);
+ }
+ }
+ if ($inside_tbody) {
+ $ret[] = new HTMLPurifier_Token_End('tbody');
+ }
+ } else {
+ foreach ($content as $token_array) {
+ // invariant: everything in here is <tr>s
+ $ret = array_merge($ret, $token_array);
+ }
+ }
+
if (!empty($collection) && $is_collecting == false){
// grab the trailing space
$ret = array_merge($ret, $collection);
View
6 lib/htmlpurifier/HTMLPurifier/Config.php
@@ -20,7 +20,7 @@ class HTMLPurifier_Config
/**
* HTML Purifier's version
*/
- public $version = '4.3.0';
+ public $version = '4.4.0';
/**
* Bool indicator whether or not to automatically finalize
@@ -44,7 +44,7 @@ class HTMLPurifier_Config
/**
* Parser for variables
*/
- protected $parser;
+ protected $parser = null;
/**
* Reference HTMLPurifier_ConfigSchema for value checking
@@ -668,7 +668,7 @@ public function autoFinalize() {
*/
public function finalize() {
$this->finalized = true;
- unset($this->parser);
+ $this->parser = null;
}
/**
View
BIN lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema.ser
Binary file not shown.
View
3 lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.ColorKeywords.txt
@@ -24,5 +24,6 @@ array (
--DESCRIPTION--
Lookup array of color names to six digit hexadecimal number corresponding
-to color, with preceding hash mark. Used when parsing colors.
+to color, with preceding hash mark. Used when parsing colors. The lookup
+is done in a case-insensitive manner.
--# vim: et sw=4 sts=4
View
9 lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.EnableIDNA.txt
@@ -0,0 +1,9 @@
+Core.EnableIDNA
+TYPE: bool
+DEFAULT: false
+VERSION: 4.4.0
+--DESCRIPTION--
+Allows international domain names in URLs. This configuration option
+requires the PEAR Net_IDNA2 module to be installed. It operates by
+punycoding any internationalized host names for maximum portability.
+--# vim: et sw=4 sts=4
View
10 lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.AllowedComments.txt
@@ -0,0 +1,10 @@
+HTML.AllowedComments
+TYPE: lookup
+VERSION: 4.4.0
+DEFAULT: array()
+--DESCRIPTION--
+A whitelist which indicates what explicit comment bodies should be
+allowed, modulo leading and trailing whitespace. See also %HTML.AllowedCommentsRegexp
+(these directives are union'ed together, so a comment is considered
+valid if any directive deems it valid.)
+--# vim: et sw=4 sts=4
View
15 lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.AllowedCommentsRegexp.txt
@@ -0,0 +1,15 @@
+HTML.AllowedCommentsRegexp
+TYPE: string/null
+VERSION: 4.4.0
+DEFAULT: NULL
+--DESCRIPTION--
+A regexp, which if it matches the body of a comment, indicates that
+it should be allowed. Trailing and leading spaces are removed prior
+to running this regular expression.
+<strong>Warning:</strong> Make sure you specify
+correct anchor metacharacters <code>^regex$</code>, otherwise you may accept
+comments that you did not mean to! In particular, the regex <code>/foo|bar/</code>
+is probably not sufficiently strict, since it also allows <code>foobar</code>.
+See also %HTML.AllowedComments (these directives are union'ed together,
+so a comment is considered valid if any directive deems it valid.)
+--# vim: et sw=4 sts=4
View
13 lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.SafeIframe.txt
@@ -0,0 +1,13 @@
+HTML.SafeIframe
+TYPE: bool
+VERSION: 4.4.0
+DEFAULT: false
+--DESCRIPTION--
+<p>
+ Whether or not to permit iframe tags in untrusted documents. This
+ directive must be accompanied by a whitelist of permitted iframes,
+ such as %URI.SafeIframeRegexp, otherwise it will fatally error.
+ This directive has no effect on strict doctypes, as iframes are not
+ valid.
+</p>
+--# vim: et sw=4 sts=4
View
8 lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.TargetBlank.txt
@@ -0,0 +1,8 @@
+HTML.TargetBlank
+TYPE: bool
+VERSION: 4.4.0
+DEFAULT: FALSE
+--DESCRIPTION--
+If enabled, <code>target="_blank"</code> attributes are added to all outgoing links.
+(This includes links from an HTTPS version of a page to an HTTP version.)
+--# vim: et sw=4 sts=4
View
22 lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.SafeIframeRegexp.txt
@@ -0,0 +1,22 @@
+URI.SafeIframeRegexp
+TYPE: string/null
+VERSION: 4.4.0
+DEFAULT: NULL
+--DESCRIPTION--
+<p>
+ A PCRE regular expression that will be matched against an iframe URI. This is
+ a relatively inflexible scheme, but works well enough for the most common
+ use-case of iframes: embedded video. This directive only has an effect if
+ %HTML.SafeIframe is enabled. Here are some example values:
+</p>
+<ul>
+ <li><code>%^http://www.youtube.com/embed/%</code> - Allow YouTube videos</li>
+ <li><code>%^http://player.vimeo.com/video/%</code> - Allow Vimeo videos</li>
+ <li><code>%^http://(www.youtube.com/embed/|player.vimeo.com/video/)%</code> - Allow both</li>
+</ul>
+<p>
+ Note that this directive does not give you enough granularity to, say, disable
+ all <code>autoplay</code> videos. Pipe up on the HTML Purifier forums if this
+ is a capability you want.
+</p>
+--# vim: et sw=4 sts=4
View
158 lib/htmlpurifier/HTMLPurifier/Encoder.php
@@ -20,6 +20,68 @@ private function __construct() {
public static function muteErrorHandler() {}
/**
+ * iconv wrapper which mutes errors, but doesn't work around bugs.
+ */
+ public static function unsafeIconv($in, $out, $text) {
+     // Route anything iconv raises to the no-op handler for the duration
+     // of this one call, then put the previous handler back.
+     $mute = array('HTMLPurifier_Encoder', 'muteErrorHandler');
+     set_error_handler($mute);
+     $converted = iconv($in, $out, $text);
+     restore_error_handler();
+     return $converted;
+ }
+
+ /**
+ * iconv wrapper which mutes errors and works around bugs.
+ */
+ public static function iconv($in, $out, $text, $max_chunk_size = 8000) {
+     $code = self::testIconvTruncateBug();
+     if ($code == self::ICONV_OK) {
+         // healthy iconv: no workaround required
+         return self::unsafeIconv($in, $out, $text);
+     }
+     // The only bug we can work around is truncation, and only when the
+     // input character set is utf-8 (so character boundaries can be found).
+     if ($code != self::ICONV_TRUNCATES || $in != 'utf-8') {
+         return false;
+     }
+     if ($max_chunk_size < 4) {
+         trigger_error('max_chunk_size is too small', E_USER_WARNING);
+         return false;
+     }
+     $length = strlen($text);
+     if ($length <= $max_chunk_size) {
+         // short enough to convert in one go
+         return self::unsafeIconv($in, $out, $text);
+     }
+     // Convert in chunks of at most $max_chunk_size bytes, being careful
+     // never to cut a multibyte character in half.
+     $converted = '';
+     $offset = 0;
+     while ($offset + $max_chunk_size < $length) {
+         // Pull the cut point back, by at most three bytes, until the byte
+         // that would start the next chunk is not a continuation byte (10xxxxxx).
+         $chunk_size = false;
+         for ($back = 0; $back < 4; $back++) {
+             if (0x80 != (0xC0 & ord($text[$offset + $max_chunk_size - $back]))) {
+                 $chunk_size = $max_chunk_size - $back;
+                 break;
+             }
+         }
+         if ($chunk_size === false) {
+             return false; // rather confusing UTF-8...
+         }
+         $converted .= self::unsafeIconv($in, $out, substr($text, $offset, $chunk_size));
+         $offset += $chunk_size;
+     }
+     // whatever remains fits in a single chunk
+     $converted .= self::unsafeIconv($in, $out, substr($text, $offset));
+     return $converted;
+ }
+
+ /**
* Cleans a UTF-8 string for well-formedness and SGML validity
*
* It will parse according to UTF-8 and return a valid UTF8 string, with
@@ -260,32 +322,37 @@ public static function unichr($code) {
return $ret;
}
+ public static function iconvAvailable() {
+     // Answer is worked out on the first call and served from the static
+     // cache afterwards: iconv must exist and must not be flagged unusable.
+     static $available = null;
+     if ($available !== null) {
+         return $available;
+     }
+     $available = function_exists('iconv') && self::testIconvTruncateBug() != self::ICONV_UNUSABLE;
+     return $available;
+ }
+
/**
* Converts a string to UTF-8 based on configuration.
*/
public static function convertToUTF8($str, $config, $context) {
$encoding = $config->get('Core.Encoding');
if ($encoding === 'utf-8') return $str;
static $iconv = null;
- if ($iconv === null) $iconv = function_exists('iconv');
- set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
+ if ($iconv === null) $iconv = self::iconvAvailable();
if ($iconv && !$config->get('Test.ForceNoIconv')) {
- $str = iconv($encoding, 'utf-8//IGNORE', $str);
+ // unaffected by bugs, since UTF-8 support all characters
+ $str = self::unsafeIconv($encoding, 'utf-8//IGNORE', $str);
if ($str === false) {
// $encoding is not a valid encoding
- restore_error_handler();
trigger_error('Invalid encoding ' . $encoding, E_USER_ERROR);
return '';
}
// If the string is bjorked by Shift_JIS or a similar encoding
// that doesn't support all of ASCII, convert the naughty
// characters to their true byte-wise ASCII/UTF-8 equivalents.
- $str = strtr($str, HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding));
- restore_error_handler();
+ $str = strtr($str, self::testEncodingSupportsASCII($encoding));
return $str;
} elseif ($encoding === 'iso-8859-1') {
$str = utf8_encode($str);
- restore_error_handler();
return $str;
}
trigger_error('Encoding not supported, please install iconv', E_USER_ERROR);
@@ -298,32 +365,33 @@ public static function convertToUTF8($str, $config, $context) {
*/
public static function convertFromUTF8($str, $config, $context) {
$encoding = $config->get('Core.Encoding');
- if ($encoding === 'utf-8') return $str;
- static $iconv = null;
- if ($iconv === null) $iconv = function_exists('iconv');
if ($escape = $config->get('Core.EscapeNonASCIICharacters')) {
- $str = HTMLPurifier_Encoder::convertToASCIIDumbLossless($str);
+ $str = self::convertToASCIIDumbLossless($str);
}
- set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
+ if ($encoding === 'utf-8') return $str;
+ static $iconv = null;
+ if ($iconv === null) $iconv = self::iconvAvailable();
if ($iconv && !$config->get('Test.ForceNoIconv')) {
// Undo our previous fix in convertToUTF8, otherwise iconv will barf
- $ascii_fix = HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding);
+ $ascii_fix = self::testEncodingSupportsASCII($encoding);
if (!$escape && !empty($ascii_fix)) {
$clear_fix = array();
foreach ($ascii_fix as $utf8 => $native) $clear_fix[$utf8] = '';
$str = strtr($str, $clear_fix);
}
$str = strtr($str, array_flip($ascii_fix));
// Normal stuff
- $str = iconv('utf-8', $encoding . '//IGNORE', $str);
- restore_error_handler();
+ $str = self::iconv('utf-8', $encoding . '//IGNORE', $str);
return $str;
} elseif ($encoding === 'iso-8859-1') {
$str = utf8_decode($str);
- restore_error_handler();
return $str;
}
trigger_error('Encoding not supported', E_USER_ERROR);
+ // You might be tempted to assume that the ASCII representation
+ // might be OK, however, this is *not* universally true over all
+ // encodings. So we take the conservative route here, rather
+ // than forcibly turn on %Core.EscapeNonASCIICharacters
}
/**
@@ -373,6 +441,49 @@ public static function convertToASCIIDumbLossless($str) {
return $result;
}
+ /** No bugs detected in iconv. */
+ const ICONV_OK = 0;
+
+ /** Iconv truncates output if converting from UTF-8 to another
+ * character set with //IGNORE, and a non-encodable character is found */
+ const ICONV_TRUNCATES = 1;
+
+ /** Iconv does not support //IGNORE, making it unusable for
+ * transcoding purposes */
+ const ICONV_UNUSABLE = 2;
+
+ /**
+ * glibc iconv has a known bug where it doesn't handle the magic
+ * //IGNORE stanza correctly. In particular, rather than ignore
+ * characters, it will return an EILSEQ after consuming some number
+ * of characters, and expect you to restart iconv as if it were
+ * an E2BIG. Old versions of PHP did not respect the errno, and
+ * returned the fragment, so as a result you would see iconv
+ * mysteriously truncating output. We can work around this by
+ * manually chopping our input into segments of about 8000
+ * characters, as long as PHP ignores the error code. If PHP starts
+ * paying attention to the error code, iconv becomes unusable.
+ *
+ * @return Error code indicating severity of bug.
+ */
+ public static function testIconvTruncateBug() {
+ static $code = null;
+ if ($code === null) {
+ // better not use iconv, otherwise infinite loop!
+ $r = self::unsafeIconv('utf-8', 'ascii//IGNORE', "\xCE\xB1" . str_repeat('a', 9000));
+ if ($r === false) {
+ $code = self::ICONV_UNUSABLE;
+ } elseif (($c = strlen($r)) < 9000) {
+ $code = self::ICONV_TRUNCATES;
+ } elseif ($c > 9000) {
+ trigger_error('Your copy of iconv is extremely buggy. Please notify HTML Purifier maintainers: include your iconv version as per phpversion()', E_USER_ERROR);
+ } else {
+ $code = self::ICONV_OK;
+ }
+ }
+ return $code;
+ }
+
/**
* This expensive function tests whether or not a given character
* encoding supports ASCII. 7/8-bit encodings like Shift_JIS will
@@ -385,6 +496,11 @@ public static function convertToASCIIDumbLossless($str) {
* which can be used to "undo" any overzealous iconv action.
*/
public static function testEncodingSupportsASCII($encoding, $bypass = false) {
+ // All calls to iconv here go through unsafeIconv (no truncation
+ // workaround); this is acceptable, proof by case analysis:
+ // If ICONV_OK, no difference.
+ // If ICONV_TRUNCATES, all calls involve one-character inputs,
+ // so the bug is not triggered.
+ // If ICONV_UNUSABLE, this call is irrelevant
static $encodings = array();
if (!$bypass) {
if (isset($encodings[$encoding])) return $encodings[$encoding];
@@ -398,24 +514,22 @@ public static function testEncodingSupportsASCII($encoding, $bypass = false) {
if (strpos($lenc, 'iso-8859-') === 0) return array();
}
$ret = array();
- set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
- if (iconv('UTF-8', $encoding, 'a') === false) return false;
+ if (self::unsafeIconv('UTF-8', $encoding, 'a') === false) return false;
for ($i = 0x20; $i <= 0x7E; $i++) { // all printable ASCII chars
$c = chr($i); // UTF-8 char
- $r = iconv('UTF-8', "$encoding//IGNORE", $c); // initial conversion
+ $r = self::unsafeIconv('UTF-8', "$encoding//IGNORE", $c); // initial conversion
if (
$r === '' ||
// This line is needed for iconv implementations that do not
// omit characters that do not exist in the target character set
- ($r === $c && iconv($encoding, 'UTF-8//IGNORE', $r) !== $c)
+ ($r === $c && self::unsafeIconv($encoding, 'UTF-8//IGNORE', $r) !== $c)
) {
// Reverse engineer: what's the UTF-8 equiv of this byte
// sequence? This assumes that there's no variable width
// encoding that doesn't support ASCII.
- $ret[iconv($encoding, 'UTF-8//IGNORE', $c)] = $c;
+ $ret[self::unsafeIconv($encoding, 'UTF-8//IGNORE', $c)] = $c;
}
}
- restore_error_handler();
$encodings[$encoding] = $ret;
return $ret;
}
View
175 lib/htmlpurifier/HTMLPurifier/Filter/ExtractStyleBlocks.php
@@ -1,5 +1,11 @@
<?php
+// why is this a top level function? Because PHP 5.2.0 doesn't seem to
+// understand how to interpret this filter if it's a static method.
+// It's all really silly, but if we go this route it might be reasonable
+// to coalesce all of these methods into one.
+function htmlpurifier_filter_extractstyleblocks_muteerrorhandler() {}
+
/**
* This filter extracts <style> blocks from input HTML, cleans them up
* using CSSTidy, and then places them in $purifier->context->get('StyleBlocks')
@@ -21,8 +27,15 @@ class HTMLPurifier_Filter_ExtractStyleBlocks extends HTMLPurifier_Filter
private $_styleMatches = array();
private $_tidy;
+ private $_id_attrdef;
+ private $_class_attrdef;
+ private $_enum_attrdef;
+
public function __construct() {
$this->_tidy = new csstidy();
+ $this->_id_attrdef = new HTMLPurifier_AttrDef_HTML_ID(true);
+ $this->_class_attrdef = new HTMLPurifier_AttrDef_CSS_Ident();
+ $this->_enum_attrdef = new HTMLPurifier_AttrDef_Enum(array('first-child', 'link', 'visited', 'active', 'hover', 'focus'));
}
/**
@@ -77,27 +90,166 @@ public function cleanCSS($css, $config, $context) {
$css = substr($css, 0, -3);
}
$css = trim($css);
+ set_error_handler('htmlpurifier_filter_extractstyleblocks_muteerrorhandler');
$this->_tidy->parse($css);
+ restore_error_handler();
$css_definition = $config->getDefinition('CSS');
+ $html_definition = $config->getDefinition('HTML');
+ $new_css = array();
foreach ($this->_tidy->css as $k => $decls) {
// $decls are all CSS declarations inside an @ selector
$new_decls = array();
foreach ($decls as $selector => $style) {
$selector = trim($selector);
if ($selector === '') continue; // should not happen
- if ($selector[0] === '+') {
- if ($selector !== '' && $selector[0] === '+') continue;
- }
- if (!empty($scopes)) {
- $new_selector = array(); // because multiple ones are possible
- $selectors = array_map('trim', explode(',', $selector));
- foreach ($scopes as $s1) {
- foreach ($selectors as $s2) {
- $new_selector[] = "$s1 $s2";
+ // Parse the selector
+ // Here is the relevant part of the CSS grammar:
+ //
+ // ruleset
+ // : selector [ ',' S* selector ]* '{' ...
+ // selector
+ // : simple_selector [ combinator selector | S+ [ combinator? selector ]? ]?
+ // combinator
+ // : '+' S*
+ // : '>' S*
+ // simple_selector
+ // : element_name [ HASH | class | attrib | pseudo ]*
+ // | [ HASH | class | attrib | pseudo ]+
+ // element_name
+ // : IDENT | '*'
+ // ;
+ // class
+ // : '.' IDENT
+ // ;
+ // attrib
+ // : '[' S* IDENT S* [ [ '=' | INCLUDES | DASHMATCH ] S*
+ // [ IDENT | STRING ] S* ]? ']'
+ // ;
+ // pseudo
+ // : ':' [ IDENT | FUNCTION S* [IDENT S*]? ')' ]
+ // ;
+ //
+ // For reference, here are the relevant tokens:
+ //
+ // HASH #{name}
+ // IDENT {ident}
+ // INCLUDES ~=
+ // DASHMATCH |=
+ // STRING {string}
+ // FUNCTION {ident}\(
+ //
+ // And the lexical scanner tokens
+ //
+ // name {nmchar}+
+ // nmchar [_a-z0-9-]|{nonascii}|{escape}
+ // nonascii [\240-\377]
+ // escape {unicode}|\\[^\r\n\f0-9a-f]
+ // unicode \\{h}{1,6}(\r\n|[ \t\r\n\f])?
+ // ident -?{nmstart}{nmchar}*
+ // nmstart [_a-z]|{nonascii}|{escape}
+ // string {string1}|{string2}
+ // string1 \"([^\n\r\f\\"]|\\{nl}|{escape})*\"
+ // string2 \'([^\n\r\f\\']|\\{nl}|{escape})*\'
+ //
+ // We'll implement a subset (in order to reduce attack
+ // surface); in particular:
+ //
+ // - No Unicode support
+ // - No escapes support
+ // - No string support (by proxy no attrib support)
+ // - element_name is matched against allowed
+ // elements (some people might find this
+ // annoying...)
+ // - Pseudo-classes restricted to one of :first-child, :link,
+ // :visited, :active, :hover, :focus
+
+ // handle ruleset
+ $selectors = array_map('trim', explode(',', $selector));
+ $new_selectors = array();
+ foreach ($selectors as $sel) {
+ // split on +, > and spaces
+ $basic_selectors = preg_split('/\s*([+> ])\s*/', $sel, -1, PREG_SPLIT_DELIM_CAPTURE);
+ // even indices are chunks, odd indices are
+ // delimiters
+ $nsel = null;
+ $delim = null; // guaranteed to be non-null after
+ // two loop iterations
+ for ($i = 0, $c = count($basic_selectors); $i < $c; $i++) {
+ $x = $basic_selectors[$i];
+ if ($i % 2) {
+ // delimiter
+ if ($x === ' ') {
+ $delim = ' ';
+ } else {
+ $delim = ' ' . $x . ' ';
+ }
+ } else {
+ // simple selector
+ $components = preg_split('/([#.:])/', $x, -1, PREG_SPLIT_DELIM_CAPTURE);
+ $sdelim = null;
+ $nx = null;
+ for ($j = 0, $cc = count($components); $j < $cc; $j ++) {
+ $y = $components[$j];
+ if ($j === 0) {
+ if ($y === '*' || isset($html_definition->info[$y = strtolower($y)])) {
+ $nx = $y;
+ } else {
+ // $nx stays null; this matters
+ // if we don't manage to find
+ // any valid selector content,
+ // in which case we ignore the
+ // outer $delim
+ }
+ } elseif ($j % 2) {
+ // set delimiter
+ $sdelim = $y;
+ } else {
+ $attrdef = null;
+ if ($sdelim === '#') {
+ $attrdef = $this->_id_attrdef;
+ } elseif ($sdelim === '.') {
+ $attrdef = $this->_class_attrdef;
+ } elseif ($sdelim === ':') {
+ $attrdef = $this->_enum_attrdef;
+ } else {
+ throw new HTMLPurifier_Exception('broken invariant sdelim and preg_split');
+ }
+ $r = $attrdef->validate($y, $config, $context);
+ if ($r !== false) {
+ if ($r !== true) {
+ $y = $r;
+ }
+ if ($nx === null) {
+ $nx = '';
+ }
+ $nx .= $sdelim . $y;
+ }
+ }
+ }
+ if ($nx !== null) {
+ if ($nsel === null) {
+ $nsel = $nx;
+ } else {
+ $nsel .= $delim . $nx;
+ }
+ } else {
+ // delimiters to the left of invalid
+ // basic selector ignored
+ }
+ }
+ }
+ if ($nsel !== null) {
+ if (!empty($scopes)) {
+ foreach ($scopes as $s) {
+ $new_selectors[] = "$s $nsel";
+ }
+ } else {
+ $new_selectors[] = $nsel;
}
}
- $selector = implode(', ', $new_selector); // now it's a string
}
+ if (empty($new_selectors)) continue;
+ $selector = implode(', ', $new_selectors);
foreach ($style as $name => $value) {
if (!isset($css_definition->info[$name])) {
unset($style[$name]);
@@ -110,10 +262,11 @@ public function cleanCSS($css, $config, $context) {
}
$new_decls[$selector] = $style;
}
- $this->_tidy->css[$k] = $new_decls;
+ $new_css[$k] = $new_decls;
}
// remove stuff that shouldn't be used, could be reenabled
// after security risks are analyzed
+ $this->_tidy->css = $new_css;
$this->_tidy->import = array();
$this->_tidy->charset = null;
$this->_tidy->namespace = null;
View
2 lib/htmlpurifier/HTMLPurifier/HTMLDefinition.php
@@ -147,7 +147,7 @@ public function getAnonymousModule() {
return $this->_anonModule;
}
- private $_anonModule;
+ private $_anonModule = null;
// PUBLIC BUT INTERNAL VARIABLES --------------------------------------
View
5 lib/htmlpurifier/HTMLPurifier/HTMLModule/Forms.php
@@ -35,7 +35,7 @@ public function setup($config) {
'name' => 'CDATA',
'readonly' => 'Bool#readonly',
'size' => 'Number',
- 'src' => 'URI#embeds',
+ 'src' => 'URI#embedded',
'tabindex' => 'Number',
'type' => 'Enum#text,password,checkbox,button,radio,submit,reset,file,hidden,image',
'value' => 'CDATA',
@@ -84,7 +84,8 @@ public function setup($config) {
$button->excludes = $this->makeLookup(
'form', 'fieldset', // Form
'input', 'select', 'textarea', 'label', 'button', // Formctrl
- 'a' // as per HTML 4.01 spec, this is omitted by modularization
+ 'a', // as per HTML 4.01 spec, this is omitted by modularization
+ 'isindex', 'iframe' // legacy items
);
// Extra exclusion: img usemap="" is not permitted within this element.
View
38 lib/htmlpurifier/HTMLPurifier/HTMLModule/Iframe.php
@@ -0,0 +1,38 @@
+<?php
+
+/**
+ * XHTML 1.1 Iframe Module provides inline frames.
+ *
+ * @note This module is not considered safe unless an Iframe
+ * whitelisting mechanism is specified. Currently, the only
+ * such mechanism is %URI.SafeIframeRegexp
+ */
+class HTMLPurifier_HTMLModule_Iframe extends HTMLPurifier_HTMLModule
+{
+
+ public $name = 'Iframe';
+ public $safe = false;
+
+ public function setup($config) {
+ if ($config->get('HTML.SafeIframe')) {
+ $this->safe = true;
+ }
+ $this->addElement(
+ 'iframe', 'Inline', 'Flow', 'Common',
+ array(
+ 'src' => 'URI#embedded',
+ 'width' => 'Length',
+ 'height' => 'Length',
+ 'name' => 'ID',
+ 'scrolling' => 'Enum#yes,no,auto',
+ 'frameborder' => 'Enum#0,1',
+ 'longdesc' => 'URI',
+ 'marginheight' => 'Pixels',
+ 'marginwidth' => 'Pixels',
+ )
+ );
+ }
+
+}
+
+// vim: et sw=4 sts=4
View
18 lib/htmlpurifier/HTMLPurifier/HTMLModule/Legacy.php
@@ -89,7 +89,7 @@ public function setup($config) {
$hr->attr['width'] = 'Length';
$img = $this->addBlankElement('img');
- $img->attr['align'] = 'Enum#top,middle,bottom,left,right';
+ $img->attr['align'] = 'IAlign';
$img->attr['border'] = 'Pixels';
$img->attr['hspace'] = 'Pixels';
$img->attr['vspace'] = 'Pixels';
@@ -136,6 +136,22 @@ public function setup($config) {
$ul->attr['compact'] = 'Bool#compact';
$ul->attr['type'] = 'Enum#square,disc,circle';
+ // "safe" modifications to "unsafe" elements
+ // WARNING: If you want to add support for an unsafe, legacy
+ // attribute, make a new TrustedLegacy module with the trusted
+ // bit set appropriately
+
+ $form = $this->addBlankElement('form');
+ $form->content_model = 'Flow | #PCDATA';
+ $form->content_model_type = 'optional';
+ $form->attr['target'] = 'FrameTarget';
+
+ $input = $this->addBlankElement('input');
+ $input->attr['align'] = 'IAlign';
+
+ $legend = $this->addBlankElement('legend');
+ $legend->attr['align'] = 'LAlign';
+
}
}
View
14 lib/htmlpurifier/HTMLPurifier/HTMLModule/List.php
@@ -20,10 +20,16 @@ class HTMLPurifier_HTMLModule_List extends HTMLPurifier_HTMLModule
public $content_sets = array('Flow' => 'List');
public function setup($config) {
- $ol = $this->addElement('ol', 'List', 'Required: li', 'Common');
- $ol->wrap = "li";
- $ul = $this->addElement('ul', 'List', 'Required: li', 'Common');
- $ul->wrap = "li";
+ $ol = $this->addElement('ol', 'List', new HTMLPurifier_ChildDef_List(), 'Common');
+ $ul = $this->addElement('ul', 'List', new HTMLPurifier_ChildDef_List(), 'Common');
+ // XXX The wrap attribute is handled by MakeWellFormed. This is all
+ // quite unsatisfactory, because we generated this
+ // *specifically* for lists, and now a big chunk of the handling
+ // is done properly by the List ChildDef. So actually, we just
+ // want enough information to make autoclosing work properly,
+ // and then hand off the tricky stuff to the ChildDef.
+ $ol->wrap = 'li';
+ $ul->wrap = 'li';
$this->addElement('dl', 'List', 'Required: dt | dd', 'Common');
$this->addElement('li', false, 'Flow', 'Common');
View
3 lib/htmlpurifier/HTMLPurifier/HTMLModule/Tables.php
@@ -37,6 +37,9 @@ public function setup($config) {
'abbr' => 'Text',
'colspan' => 'Number',
'rowspan' => 'Number',
+ // Apparently, as of HTML5 this attribute only applies
+ // to 'th' elements.
+ 'scope' => 'Enum#row,col,rowgroup,colgroup',
),
$cell_align
);
View
19 lib/htmlpurifier/HTMLPurifier/HTMLModule/TargetBlank.php
@@ -0,0 +1,19 @@
+<?php
+
+/**
+ * Module adds the TargetBlank attribute transformation to <a> tags. It
+ * is enabled by %HTML.TargetBlank
+ */
+class HTMLPurifier_HTMLModule_TargetBlank extends HTMLPurifier_HTMLModule
+{
+
+ public $name = 'TargetBlank';
+
+ public function setup($config) {
+ $a = $this->addBlankElement('a');
+ $a->attr_transform_post[] = new HTMLPurifier_AttrTransform_TargetBlank();
+ }
+
+}
+
+// vim: et sw=4 sts=4
View
18 lib/htmlpurifier/HTMLPurifier/HTMLModuleManager.php
@@ -65,11 +65,11 @@ public function __construct() {
'Presentation', 'Edit', 'Bdo', 'Tables', 'Image',
'StyleAttribute',
// Unsafe:
- 'Scripting', 'Object', 'Forms',
+ 'Scripting', 'Object', 'Forms',
// Sorta legacy, but present in strict:
'Name',
);
- $transitional = array('Legacy', 'Target');
+ $transitional = array('Legacy', 'Target', 'Iframe');
$xml = array('XMLCommonAttributes');
$non_xml = array('NonXMLCommonAttributes');
@@ -112,7 +112,9 @@ public function __construct() {
$this->doctypes->register(
'XHTML 1.1', true,
- array_merge($common, $xml, array('Ruby')),
+ // Iframe is a real XHTML 1.1 module, despite being
+ // "transitional"!
+ array_merge($common, $xml, array('Ruby', 'Iframe')),
array('Tidy_Strict', 'Tidy_XHTML', 'Tidy_Proprietary', 'Tidy_Strict', 'Tidy_Name'), // Tidy_XHTML1_1
array(),
'-//W3C//DTD XHTML 1.1//EN',
@@ -229,6 +231,9 @@ public function setup($config) {
if ($config->get('HTML.Nofollow')) {
$modules[] = 'Nofollow';
}
+ if ($config->get('HTML.TargetBlank')) {
+ $modules[] = 'TargetBlank';
+ }
// merge in custom modules
$modules = array_merge($modules, $this->userModules);
@@ -364,6 +369,13 @@ public function getElement($name, $trusted = null) {
// :TODO:
// non-standalone definitions that don't have a standalone
// to merge into could be deferred to the end
+ // HOWEVER, it is perfectly valid for a non-standalone
+ // definition to lack a standalone definition, even
+ // after all processing: this allows us to safely
+ // specify extra attributes for elements that may not be
+ // enabled all in one place. In particular, this might
+ // be the case for trusted elements. WARNING: care must
+ // be taken that the /extra/ definitions are all safe.
continue;
}
View
139 lib/htmlpurifier/HTMLPurifier/Lexer/PEARSax3.php
@@ -1,139 +0,0 @@
-<?php
-
-/**
- * Proof-of-concept lexer that uses the PEAR package XML_HTMLSax3 to parse HTML.
- *
- * PEAR, not suprisingly, also has a SAX parser for HTML. I don't know
- * very much about implementation, but it's fairly well written. However, that
- * abstraction comes at a price: performance. You need to have it installed,
- * and if the API changes, it might break our adapter. Not sure whether or not
- * it's UTF-8 aware, but it has some entity parsing trouble (in all areas,
- * text and attributes).
- *
- * Quite personally, I don't recommend using the PEAR class, and the defaults
- * don't use it. The unit tests do perform the tests on the SAX parser too, but
- * whatever it does for poorly formed HTML is up to it.
- *
- * @todo Generalize so that XML_HTMLSax is also supported.
- *
- * @warning Entity-resolution inside attributes is broken.
- */
-
-class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
-{
-
- /**
- * Internal accumulator array for SAX parsers.
- */
- protected $tokens = array();
- protected $last_token_was_empty;
-
- private $parent_handler;
- private $stack = array();
-
- public function tokenizeHTML($string, $config, $context) {
-
- $this->tokens = array();
- $this->last_token_was_empty = false;
-
- $string = $this->normalize($string, $config, $context);
-
- $this->parent_handler = set_error_handler(array($this, 'muteStrictErrorHandler'));
-
- $parser = new XML_HTMLSax3();
- $parser->set_object($this);
- $parser->set_element_handler('openHandler','closeHandler');
- $parser->set_data_handler('dataHandler');
- $parser->set_escape_handler('escapeHandler');
-
- // doesn't seem to work correctly for attributes
- $parser->set_option('XML_OPTION_ENTITIES_PARSED', 1);
-
- $parser->parse($string);
-
- restore_error_handler();
-
- return $this->tokens;
-
- }
-
- /**
- * Open tag event handler, interface is defined by PEAR package.
- */
- public function openHandler(&$parser, $name, $attrs, $closed) {
- // entities are not resolved in attrs
- foreach ($attrs as $key => $attr) {
- $attrs[$key] = $this->parseData($attr);
- }
- if ($closed) {
- $this->tokens[] = new HTMLPurifier_Token_Empty($name, $attrs);
- $this->last_token_was_empty = true;
- } else {
- $this->tokens[] = new HTMLPurifier_Token_Start($name, $attrs);
- }
- $this->stack[] = $name;
- return true;
- }
-
- /**
- * Close tag event handler, interface is defined by PEAR package.
- */
- public function closeHandler(&$parser, $name) {
- // HTMLSax3 seems to always send empty tags an extra close tag
- // check and ignore if you see it:
- // [TESTME] to make sure it doesn't overreach
- if ($this->last_token_was_empty) {
- $this->last_token_was_empty = false;
- return true;
- }
- $this->tokens[] = new HTMLPurifier_Token_End($name);
- if (!empty($this->stack)) array_pop($this->stack);
- return true;
- }
-
- /**
- * Data event handler, interface is defined by PEAR package.
- */
- public function dataHandler(&$parser, $data) {
- $this->last_token_was_empty = false;
- $this->tokens[] = new HTMLPurifier_Token_Text($data);
- return true;
- }
-
- /**
- * Escaped text handler, interface is defined by PEAR package.
- */
- public function escapeHandler(&$parser, $data) {
- if (strpos($data, '--') === 0) {
- // remove trailing and leading double-dashes
- $data = substr($data, 2);
- if (strlen($data) >= 2 && substr($data, -2) == "--") {
- $data = substr($data, 0, -2);
- }
- if (isset($this->stack[sizeof($this->stack) - 1]) &&
- $this->stack[sizeof($this->stack) - 1] == "style") {
- $this->tokens[] = new HTMLPurifier_Token_Text($data);
- } else {
- $this->tokens[] = new HTMLPurifier_Token_Comment($data);
- }
- $this->last_token_was_empty = false;
- }
- // CDATA is handled elsewhere, but if it was handled here:
- //if (strpos($data, '[CDATA[') === 0) {
- // $this->tokens[] = new HTMLPurifier_Token_Text(
- // substr($data, 7, strlen($data) - 9) );
- //}
- return true;
- }
-
- /**
- * An error handler that mutes strict errors
- */
- public function muteStrictErrorHandler($errno, $errstr, $errfile=null, $errline=null, $errcontext=null) {
- if ($errno == E_STRICT) return;
- return call_user_func($this->parent_handler, $errno, $errstr, $errfile, $errline, $errcontext);
- }
-
-}
-
-// vim: et sw=4 sts=4
View
4 lib/htmlpurifier/HTMLPurifier/Lexer/PH5P.php
@@ -1113,7 +1113,7 @@ private function entity() {
$entity = $this->character($start, $this->char);
$cond = strlen($e_name) > 0;
- // The rest of the parsing happens below.
+ // The rest of the parsing happens bellow.
break;
// Anything else
@@ -1140,7 +1140,7 @@ private function entity() {
}
$cond = isset($entity);
- // The rest of the parsing happens below.
+ // The rest of the parsing happens bellow.
break;
}
View
2 lib/htmlpurifier/HTMLPurifier/Strategy/Composite.php
@@ -11,8 +11,6 @@
*/
protected $strategies = array();
- abstract public function __construct();
-
public function execute($tokens, $config, $context) {
foreach ($this->strategies as $strategy) {
$tokens = $strategy->execute($tokens, $config, $context);
View
31 lib/htmlpurifier/HTMLPurifier/Strategy/RemoveForeignElements.php
@@ -21,6 +21,9 @@ public function execute($tokens, $config, $context) {
// currently only used to determine if comments should be kept
$trusted = $config->get('HTML.Trusted');
+ $comment_lookup = $config->get('HTML.AllowedComments');
+ $comment_regexp = $config->get('HTML.AllowedCommentsRegexp');
+ $check_comments = $comment_lookup !== array() || $comment_regexp !== null;
$remove_script_contents = $config->get('Core.RemoveScriptContents');
$hidden_elements = $config->get('Core.HiddenElements');
@@ -128,23 +131,37 @@ public function execute($tokens, $config, $context) {
if ($textify_comments !== false) {
$data = $token->data;
$token = new HTMLPurifier_Token_Text($data);
- } elseif ($trusted) {
- // keep, but perform comment cleaning
+ } elseif ($trusted || $check_comments) {
+ // always cleanup comments
+ $trailing_hyphen = false;
if ($e) {
// perform check whether or not there's a trailing hyphen
if (substr($token->data, -1) == '-') {
- $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Trailing hyphen in comment removed');
+ $trailing_hyphen = true;
}
}
$token->data = rtrim($token->data, '-');
$found_double_hyphen = false;
while (strpos($token->data, '--') !== false) {
- if ($e && !$found_double_hyphen) {
- $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Hyphens in comment collapsed');
- }
- $found_double_hyphen = true; // prevent double-erroring
+ $found_double_hyphen = true;
$token->data = str_replace('--', '-', $token->data);
}
+ if ($trusted || !empty($comment_lookup[trim($token->data)]) || ($comment_regexp !== NULL && preg_match($comment_regexp, trim($token->data)))) {
+ // OK good
+ if ($e) {
+ if ($trailing_hyphen) {
+ $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Trailing hyphen in comment removed');
+ }
+ if ($found_double_hyphen) {
+ $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Hyphens in comment collapsed');
+ }
+ }
+ } else {
+ if ($e) {
+ $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Comment removed');
+ }
+ continue;
+ }
} else {
// strip comments
if ($e) $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Comment removed');
View
40 lib/htmlpurifier/HTMLPurifier/URI.php
@@ -40,7 +40,7 @@ public function getSchemeObj($config, $context) {
} else {
// no scheme: retrieve the default one
$def = $config->getDefinition('URI');
- $scheme_obj = $registry->getScheme($def->defaultScheme, $config, $context);
+ $scheme_obj = $def->getDefaultScheme($config, $context);
if (!$scheme_obj) {
// something funky happened to the default scheme object
trigger_error(
@@ -199,6 +199,44 @@ public function toString() {
return $result;
}
+ /**
+ * Returns true if this URL might be considered a 'local' URL given
+ * the current context. This is true when the host is null, or
+ * when it matches the host supplied to the configuration.
+ *
+ * Note that this does not do any scheme checking, so it is mostly
+ * only appropriate for metadata that doesn't care about protocol
+ * security. isBenign is probably what you actually want.
+ */
+ public function isLocal($config, $context) {
+ if ($this->host === null) return true;
+ $uri_def = $config->getDefinition('URI');
+ if ($uri_def->host === $this->host) return true;
+ return false;
+ }
+
+ /**
+ * Returns true if this URL should be considered a 'benign' URL,
+ * that is:
+ *
+ * - It is a local URL (isLocal), and
+ * - It has an equal or better level of security
+ */
+ public function isBenign($config, $context) {
+ if (!$this->isLocal($config, $context)) return false;
+
+ $scheme_obj = $this->getSchemeObj($config, $context);
+ if (!$scheme_obj) return false; // conservative approach
+
+ $current_scheme_obj = $config->getDefinition('URI')->getDefaultScheme($config, $context);
+ if ($current_scheme_obj->secure) {
+ if (!$scheme_obj->secure) {
+ return false;
+ }
+ }
+ return true;
+ }
+
}
// vim: et sw=4 sts=4
View
13 lib/htmlpurifier/HTMLPurifier/URIDefinition.php
@@ -27,6 +27,7 @@ public function __construct() {
$this->registerFilter(new HTMLPurifier_URIFilter_DisableExternal());
$this->registerFilter(new HTMLPurifier_URIFilter_DisableExternalResources());
$this->registerFilter(new HTMLPurifier_URIFilter_HostBlacklist());
+ $this->registerFilter(new HTMLPurifier_URIFilter_SafeIframe());
$this->registerFilter(new HTMLPurifier_URIFilter_MakeAbsolute());
$this->registerFilter(new HTMLPurifier_URIFilter_Munge());
}
@@ -52,9 +53,13 @@ protected function doSetup($config) {
protected function setupFilters($config) {
foreach ($this->registeredFilters as $name => $filter) {
- $conf = $config->get('URI.' . $name);
- if ($conf !== false && $conf !== null) {
+ if ($filter->always_load) {
$this->addFilter($filter, $config);
+ } else {
+ $conf = $config->get('URI.' . $name);
+ if ($conf !== false && $conf !== null) {
+ $this->addFilter($filter, $config);
+ }
}
}
unset($this->registeredFilters);
@@ -72,6 +77,10 @@ protected function setupMemberVariables($config) {
if (is_null($this->defaultScheme)) $this->defaultScheme = $config->get('URI.DefaultScheme');
}
+ public function getDefaultScheme($config, $context) {
+ return HTMLPurifier_URISchemeRegistry::instance()->getScheme($this->defaultScheme, $config, $context);
+ }
+
public function filter(&$uri, $config, $context) {
foreach ($this->filters as $name => $f) {
$result = $f->filter($uri, $config, $context);
View
26 lib/htmlpurifier/HTMLPurifier/URIFilter.php
@@ -4,7 +4,21 @@
* Chainable filters for custom URI processing.
*
* These filters can perform custom actions on a URI filter object,
- * including transformation or blacklisting.
+ * including transformation or blacklisting. A filter named Foo
+ * must have a corresponding configuration directive %URI.Foo,
+ * unless always_load is specified to be true.