Merge pull request #356 from willkg/fix-entities
Fix url sanitizing
willkg committed Mar 5, 2018
2 parents e7f83b8 + 61bf0e6 commit c5df578
Showing 2 changed files with 210 additions and 22 deletions.
134 changes: 116 additions & 18 deletions bleach/sanitizer.py
@@ -4,6 +4,7 @@
import string

import six
from six.moves.urllib.parse import urlparse
from xml.sax.saxutils import unescape

import html5lib
@@ -27,8 +28,11 @@
from bleach.utils import alphabetize_attributes, force_unicode


#: Map of entity name to expanded entity
ENTITIES = entities

#: Trie of html entity string -> character representation
ENTITIES_TRIE = Trie(entities)
ENTITIES_TRIE = Trie(ENTITIES)

#: List of allowed tags
ALLOWED_TAGS = [
@@ -79,13 +83,61 @@
INVISIBLE_REPLACEMENT_CHAR = '?'


def convert_entity(value):
"""Convert an entity (minus the & and ; part) into what it represents
This handles numeric, hex, and text entities.
:arg value: the string (minus the ``&`` and ``;`` part) to convert
:returns: unicode character
"""
if value[0] == '#':
if value[1] in ('x', 'X'):
return six.unichr(int(value[2:], 16))
return six.unichr(int(value[1:], 10))

return ENTITIES[value]

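For illustration, a minimal sketch of how the three entity forms named in the docstring resolve, assuming convert_entity is importable from bleach.sanitizer as of this commit and that html5lib's entities table maps 'amp' to '&':

    from bleach.sanitizer import convert_entity

    # Named, decimal, and hex spellings of the same character all
    # come back as the character itself.
    assert convert_entity('amp') == u'&'    # text entity, via the ENTITIES map
    assert convert_entity('#38') == u'&'    # numeric entity, base 10
    assert convert_entity('#x26') == u'&'   # hex entity, base 16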

def convert_entities(text):
"""Converts all found entities in the text
:arg text: the text to convert entities in
:returns: unicode text with converted entities
"""
if '&' not in text:
return text

new_text = []
for part in next_possible_entity(text):
if not part:
continue

if part.startswith('&'):
entity = match_entity(part)
if entity is not None:
new_text.append(convert_entity(entity))
remainder = part[len(entity) + 2:]
if part:
new_text.append(remainder)
continue

new_text.append(part)

return u''.join(new_text)

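A short usage sketch, assuming the same import path: recognized entities are converted in place, and an '&' that doesn't start a known entity passes through untouched:

    from bleach.sanitizer import convert_entities

    assert convert_entities('AT&amp;T &#38; Jane &#x26; co') == u'AT&T & Jane & co'
    assert convert_entities('fish & chips') == u'fish & chips'  # bare & is left alone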

class BleachHTMLTokenizer(HTMLTokenizer):
def consumeEntity(self, allowedChar=None, fromAttribute=False):
# We don't want to consume and convert entities, so this overrides the
# html5lib tokenizer's consumeEntity so that it's now a no-op.
#
# However, when that gets called, it's consumed an &, so we put that in
# the steam.
# the stream.
if fromAttribute:
self.currentToken['data'][-1][1] += '&'

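The visible effect of this no-op, as the entity tests below exercise it, is that clean() never unescapes entities it encounters, while a bare '&' the tokenizer consumed is put back in the stream and escaped downstream. A sketch:

    import bleach

    # Entities in the input survive as-is
    assert bleach.clean('&#39;&#34;') == '&#39;&#34;'

    # A bare ampersand is re-emitted and then escaped
    assert bleach.clean('an & entity') == 'an &amp; entity'
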
@@ -479,15 +531,69 @@ def sanitize_characters(self, token):
new_tokens.append({'type': 'Entity', 'name': entity})
# Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
part = part[len(entity) + 2:]
if part:
new_tokens.append({'type': 'Characters', 'data': part})
remainder = part[len(entity) + 2:]
if remainder:
new_tokens.append({'type': 'Characters', 'data': remainder})
continue

new_tokens.append({'type': 'Characters', 'data': part})

return new_tokens

def sanitize_uri_value(self, value, allowed_protocols):
"""Checks a uri value to see if it's allowed
:arg value: the uri value to sanitize
:arg allowed_protocols: list of allowed protocols
:returns: allowed value or None
"""
# NOTE(willkg): This transforms the value into one that's easier to
# match and verify, but shouldn't get returned since it's vastly
# different than the original value.

# Convert all character entities in the value
new_value = convert_entities(value)

# Nix backtick, space characters, and control characters
new_value = re.sub(
"[`\000-\040\177-\240\s]+",
'',
new_value
)

# Remove REPLACEMENT characters
new_value = new_value.replace('\ufffd', '')

# Lowercase it--this breaks the value, but makes it easier to match
# against
new_value = new_value.lower()

# Drop attributes with uri values that have protocols that aren't
# allowed
parsed = urlparse(new_value)
if parsed.scheme:
# If urlparse found a scheme, check that
if parsed.scheme in allowed_protocols:
return value

else:
# Allow uris that are just an anchor
if new_value.startswith('#'):
return value

# Handle protocols that urlparse doesn't recognize like "myprotocol"
if ':' in new_value and new_value.split(':')[0] in allowed_protocols:
return value

# If there's no protocol/scheme specified, then assume it's "http"
# and see if that's allowed
if 'http' in allowed_protocols:
return value

return None

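To make the flow concrete, here is the normalization applied step by step to one of the "sneaky" values from the tests below; the imports mirror the ones at the top of this file:

    import re
    from six.moves.urllib.parse import urlparse
    from bleach.sanitizer import convert_entities

    value = 'javas&#x09;cript:alert(1)'

    # Step 1: character entities become real characters ('&#x09;' is a tab)
    new_value = convert_entities(value)        # 'javas\tcript:alert(1)'

    # Step 2: backticks, whitespace, and control characters are stripped,
    # so the split-up scheme reassembles
    new_value = re.sub("[`\000-\040\177-\240\s]+", '', new_value)

    # Step 3: drop REPLACEMENT characters and lowercase for matching
    new_value = new_value.replace('\ufffd', '').lower()

    assert new_value == 'javascript:alert(1)'
    assert urlparse(new_value).scheme == 'javascript'   # not allowed, so the method returns None
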
def allow_token(self, token):
"""Handles the case where we're allowing the tag"""
if 'data' in token:
@@ -508,21 +614,13 @@ def allow_token(self, token):
if not self.attr_filter(token['name'], name, val):
continue

# Look at attributes that have uri values
# Drop attributes with uri values that use a disallowed protocol
# Sanitize attributes with uri values
if namespaced_name in self.attr_val_is_uri:
val_unescaped = re.sub(
"[`\000-\040\177-\240\s]+",
'',
unescape(val)).lower()

# Remove replacement characters from unescaped characters.
val_unescaped = val_unescaped.replace("\ufffd", "")

# Drop attributes with uri values that have protocols that
# aren't allowed
if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped) and
(val_unescaped.split(':')[0] not in self.allowed_protocols)):
new_value = self.sanitize_uri_value(val, self.allowed_protocols)
if new_value is None:
continue
val = new_value

# Drop values in svg attrs with non-local IRIs
if namespaced_name in self.svg_attr_val_allows_ref:
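
End to end, the effect of the new sanitize_uri_value path is that a disallowed or entity-masked scheme drops the attribute while the tag itself survives. A quick sketch against the public API; the behavior shown is the same one the tests below exercise:

    import bleach

    # javascript: is not in the default ALLOWED_PROTOCOLS, so href gets dropped
    assert bleach.clean('<a href="javas&#x09;cript:alert(1)">alert</a>') == '<a>alert</a>'

    # An allowed scheme passes through with the original value intact
    assert bleach.clean('<a href="http://example.com">ok</a>') == '<a href="http://example.com">ok</a>'
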
98 changes: 94 additions & 4 deletions tests/test_clean.py
@@ -213,7 +213,7 @@ def test_nested_script_tag():
('an < entity', 'an &lt; entity'),
('tag < <em>and</em> entity', 'tag &lt; <em>and</em> entity'),
])
def test_bare_entities(text, expected):
def test_bare_entities_get_escaped_correctly(text, expected):
assert clean(text) == expected


@@ -277,7 +277,7 @@ def test_bare_entities(text, expected):
# Verify that clean() doesn't unescape entities.
('&#39;&#34;', '&#39;&#34;'),
])
def test_character_entities(text, expected):
def test_character_entities_handling(text, expected):
assert clean(text) == expected


@@ -534,10 +534,100 @@ def test_attributes_list():
# Unspecified protocols are not allowed
(
'<a href="http://xx.com">invalid href</a>',
'<a href="http://example.com">invalid href</a>',
{'protocols': ['myprotocol']},
'<a>invalid href</a>'
)
),
# Anchors are ok
(
'<a href="#example.com">foo</a>',
{'protocols': []},
'<a href="#example.com">foo</a>'
),
# Allow implicit http if allowed
(
'<a href="example.com">valid</a>',
{'protocols': ['http']},
'<a href="example.com">valid</a>'
),
(
'<a href="example.com:8000">valid</a>',
{'protocols': ['http']},
'<a href="example.com:8000">valid</a>'
),
(
'<a href="localhost">valid</a>',
{'protocols': ['http']},
'<a href="localhost">valid</a>'
),
(
'<a href="localhost:8000">valid</a>',
{'protocols': ['http']},
'<a href="localhost:8000">valid</a>'
),
(
'<a href="192.168.100.100">valid</a>',
{'protocols': ['http']},
'<a href="192.168.100.100">valid</a>'
),
(
'<a href="192.168.100.100:8000">valid</a>',
{'protocols': ['http']},
'<a href="192.168.100.100:8000">valid</a>'
),
# Disallow implicit http if disallowed
(
'<a href="example.com">foo</a>',
{'protocols': []},
'<a>foo</a>'
),
(
'<a href="example.com:8000">foo</a>',
{'protocols': []},
'<a>foo</a>'
),
(
'<a href="localhost">foo</a>',
{'protocols': []},
'<a>foo</a>'
),
(
'<a href="localhost:8000">foo</a>',
{'protocols': []},
'<a>foo</a>'
),
(
'<a href="192.168.100.100">foo</a>',
{'protocols': []},
'<a>foo</a>'
),
(
'<a href="192.168.100.100:8000">foo</a>',
{'protocols': []},
'<a>foo</a>'
),
# Disallowed protocols with sneaky character entities
(
'<a href="javas&#x09;cript:alert(1)">alert</a>',
{},
'<a>alert</a>'
),
(
'<a href="&#14;javascript:alert(1)">alert</a>',
{},
'<a>alert</a>'
),
    # Checking the uri shouldn't change it at all
(
'<a href="http://example.com/?foo&nbsp;bar">foo</a>',
{},
'<a href="http://example.com/?foo&nbsp;bar">foo</a>'
),
])
def test_uri_value_allowed_protocols(data, kwargs, expected):
assert clean(data, **kwargs) == expected
