Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix for issue #369 #460

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
61 changes: 58 additions & 3 deletions bleach/html5lib_shim.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
constants.tokenTypes['EndTag'],
constants.tokenTypes['EmptyTag']
}
# Token-type value that `constants.tokenTypes` assigns to 'StartTag' tokens.
TAG_TOKEN_TYPE_START = constants.tokenTypes['StartTag']
# Token-type value that `constants.tokenTypes` assigns to 'Characters' tokens.
CHARACTERS_TYPE = constants.tokenTypes['Characters']
# Token-type value that `constants.tokenTypes` assigns to 'ParseError' tokens.
PARSEERROR_TYPE = constants.tokenTypes['ParseError']

Expand Down Expand Up @@ -164,6 +165,45 @@
]


#: Block-level HTML tags, taken from MDN on 2019.07.11, as per
#: https://github.com/mozilla/bleach/issues/369
#: https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements#Elements
#:
#: Kept as a frozenset because the tokenizer only tests tag names for
#: membership; this gives constant-time lookups and prevents accidental
#: mutation of a module-level constant.
HTML_TAGS__BLOCK_LEVEL = frozenset((
    'address',
    'article',
    'aside',
    'blockquote',
    'details',
    'dialog',
    'dd',
    'div',
    'dl',
    'dt',
    'fieldset',
    'figcaption',
    'figure',
    'footer',
    'form',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'header',
    'hgroup',
    'hr',
    'li',
    'main',
    'nav',
    'ol',
    'p',
    'pre',
    'section',
    'table',
    'ul',
))


class InputStreamWithMemory(object):
"""Wraps an HTMLInputStream to remember characters since last <

Expand Down Expand Up @@ -236,6 +276,9 @@ def __init__(self, consume_entities=False, **kwargs):
# Wrap the stream with one that remembers the history
self.stream = InputStreamWithMemory(self.stream)

# we need to remember the last token emitted, so we don't add too many spaces
_emittedLastToken = None

def __iter__(self):
last_error_token = None

Expand Down Expand Up @@ -335,9 +378,20 @@ def emitCurrentToken(self):
# cases it gets converted to a Characters token.
if self.parser.strip:
# If we're stripping the token, we just throw in an empty
# string token.
# string token
new_data = ''

if ((self._emittedLastToken and
token['type'] == TAG_TOKEN_TYPE_START and
token['name'].lower() in HTML_TAGS__BLOCK_LEVEL
)):
_token_data = self._emittedLastToken.get('data', None)
if ((_token_data and
isinstance(_token_data, six.text_type) and
_token_data[-1] not in (' ', '\n', '\t')
)):
# BUT, if this is the START of a block level tag, then we
# want to insert a space for accessibility.
new_data = ' '
else:
# If we're escaping the token, we want to escape the exact
# original string. Since tokenizing also normalizes data
Expand All @@ -351,11 +405,12 @@ def emitCurrentToken(self):
'data': new_data
}

self.currentToken = new_token
self.currentToken = self._emittedLastToken = new_token
self.tokenQueue.append(new_token)
self.state = self.dataState
return

self._emittedLastToken = self.currentToken
super(BleachHTMLTokenizer, self).emitCurrentToken()


Expand Down
9 changes: 9 additions & 0 deletions tests/test_clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -848,3 +848,12 @@ def __iter__(self):
cleaner.clean(dirty) ==
'this is cute! <img rel="moo" src="moo">'
)


def test_strip_respects_block_level_elements():
    """Stripped block-level elements stay separated by at least one space.

    See https://github.com/mozilla/bleach/issues/369
    """
    dirty = '<p>Te<b>st</b>!</p><p>Hello</p>'
    expected = 'Test! Hello'

    # With no tags allowed and strip=True, all markup is removed; the
    # boundary between the two paragraphs must still show up as a space.
    cleaned = clean(dirty, tags=[], strip=True)
    assert cleaned == expected