diff --git a/bleach/html5lib_shim.py b/bleach/html5lib_shim.py
index e2dc66f0..92df8dec 100644
--- a/bleach/html5lib_shim.py
+++ b/bleach/html5lib_shim.py
@@ -42,6 +42,7 @@
     constants.tokenTypes['EndTag'],
     constants.tokenTypes['EmptyTag']
 }
+TAG_TOKEN_TYPE_START = constants.tokenTypes['StartTag']
 CHARACTERS_TYPE = constants.tokenTypes['Characters']
 PARSEERROR_TYPE = constants.tokenTypes['ParseError']
 
@@ -164,6 +165,45 @@
 ]
 
 
+#: Block-level HTML tags, taken from MDN on 2019-07-11 (see mozilla/bleach#369):
+#: https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements#Elements
+HTML_TAGS__BLOCK_LEVEL = [
+    'address',
+    'article',
+    'aside',
+    'blockquote',
+    'details',
+    'dialog',
+    'dd',
+    'div',
+    'dl',
+    'dt',
+    'fieldset',
+    'figcaption',
+    'figure',
+    'footer',
+    'form',
+    'h1',
+    'h2',
+    'h3',
+    'h4',
+    'h5',
+    'h6',
+    'header',
+    'hgroup',
+    'hr',
+    'li',
+    'main',
+    'nav',
+    'ol',
+    'p',
+    'pre',
+    'section',
+    'table',
+    'ul',
+]
+
+
 class InputStreamWithMemory(object):
     """Wraps an HTMLInputStream to remember characters since last <
 
@@ -236,6 +276,10 @@ def __init__(self, consume_entities=False, **kwargs):
         # Wrap the stream with one that remembers the history
         self.stream = InputStreamWithMemory(self.stream)
 
+        # Remember the last token emitted so that strip mode does not
+        # insert redundant spaces.
+        self._emittedLastToken = None
+
     def __iter__(self):
         last_error_token = None
 
@@ -335,9 +379,25 @@ def emitCurrentToken(self):
             # cases it gets converted to a Characters token.
             if self.parser.strip:
                 # If we're stripping the token, we just throw in an empty
                 # string token.
                 new_data = ''
+                # However, if this is the start of a block-level tag and the
+                # previously emitted token's text does not already end in
+                # whitespace, emit a single space instead so the text on
+                # either side of the stripped tag does not run together.
+                #
+                # NOTE(review): within this patch _emittedLastToken is only
+                # assigned in emitCurrentToken(); confirm it actually
+                # reflects the character data that precedes this tag.
+                if ((self._emittedLastToken and
+                     token['type'] == TAG_TOKEN_TYPE_START and
+                     token['name'].lower() in HTML_TAGS__BLOCK_LEVEL)):
+                    _token_data = self._emittedLastToken.get('data')
+                    if ((_token_data and
+                         isinstance(_token_data, six.text_type) and
+                         _token_data[-1] not in constants.spaceCharacters)):
+                        new_data = ' '
 
             else:
                 # If we're escaping the token, we want to escape the exact
                 # original string. Since tokenizing also normalizes data
@@ -351,11 +411,12 @@ def emitCurrentToken(self):
                 'data': new_data
             }
 
-            self.currentToken = new_token
+            self.currentToken = self._emittedLastToken = new_token
             self.tokenQueue.append(new_token)
             self.state = self.dataState
             return
 
+        self._emittedLastToken = self.currentToken
         super(BleachHTMLTokenizer, self).emitCurrentToken()
 
 
diff --git a/tests/test_clean.py b/tests/test_clean.py
index e306cc50..b1c08882 100644
--- a/tests/test_clean.py
+++ b/tests/test_clean.py
@@ -848,3 +848,12 @@ def __iter__(self):
         cleaner.clean(dirty) ==
         'this is cute! '
     )
+
+
+def test_strip_respects_block_level_elements():
+    """Stripping block-level tags keeps their text separated by a space.
+
+    See https://github.com/mozilla/bleach/issues/369
+    """
+    text = '<div>Test!</div><div>Hello</div>'
+    assert clean(text, tags=[], strip=True) == 'Test! Hello'