From 46c7334ac20d106d9b7e0d1906692eedc9186eae Mon Sep 17 00:00:00 2001 From: jonathan vanasco Date: Thu, 11 Jul 2019 16:36:33 -0400 Subject: [PATCH 1/4] fix for issue #369 --- bleach/html5lib_shim.py | 56 ++++++++++++++++++++++++++++++++++++++--- tests/test_clean.py | 9 +++++++ 2 files changed, 62 insertions(+), 3 deletions(-) diff --git a/bleach/html5lib_shim.py b/bleach/html5lib_shim.py index e2dc66f0..e93f7abd 100644 --- a/bleach/html5lib_shim.py +++ b/bleach/html5lib_shim.py @@ -42,6 +42,7 @@ constants.tokenTypes['EndTag'], constants.tokenTypes['EmptyTag'] } +TAG_TOKEN_TYPE_START = constants.tokenTypes['StartTag'] CHARACTERS_TYPE = constants.tokenTypes['Characters'] PARSEERROR_TYPE = constants.tokenTypes['ParseError'] @@ -164,6 +165,45 @@ ] +#: List of block level HTML tags, from mozilla on 2019.07.11, as per https://github.com/mozilla/bleach/issues/369 +#: https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements#Elements +HTML_TAGS__BLOCK_LEVEL = [ + 'address', + 'article', + 'aside', + 'blockquote', + 'details', + 'dialog', + 'dd', + 'div', + 'dl', + 'dt', + 'fieldset', + 'figcaption', + 'figure', + 'footer', + 'form', + 'h1', + 'h2', + 'h3', + 'h4', + 'h5', + 'h6', + 'header', + 'hgroup', + 'hr', + 'li', + 'main', + 'nav', + 'ol', + 'p', + 'pre', + 'section', + 'table', + 'ul', +] + + class InputStreamWithMemory(object): """Wraps an HTMLInputStream to remember characters since last < @@ -236,6 +276,9 @@ def __init__(self, consume_entities=False, **kwargs): # Wrap the stream with one that remembers the history self.stream = InputStreamWithMemory(self.stream) + # we need to remember the last token emitted, so we don't add too many spaces + _emittedLastToken = None + def __iter__(self): last_error_token = None @@ -335,9 +378,15 @@ def emitCurrentToken(self): # cases it gets converted to a Characters token. if self.parser.strip: # If we're stripping the token, we just throw in an empty - # string token. 
+ # string token new_data = '' - + if ((self._emittedLastToken and + token['type'] == TAG_TOKEN_TYPE_START and + token['name'].lower() in HTML_TAGS__BLOCK_LEVEL and + not self._emittedLastToken.get('data', '').endswith(' '))): + # BUT, if this is the START of a block level tag, then we + # want to insert a space for accessibility. + new_data = ' ' else: # If we're escaping the token, we want to escape the exact # original string. Since tokenizing also normalizes data @@ -351,11 +400,12 @@ def emitCurrentToken(self): 'data': new_data } - self.currentToken = new_token + self.currentToken = self._emittedLastToken = new_token self.tokenQueue.append(new_token) self.state = self.dataState return + self._emittedLastToken = self.currentToken super(BleachHTMLTokenizer, self).emitCurrentToken() diff --git a/tests/test_clean.py b/tests/test_clean.py index e306cc50..b1c08882 100644 --- a/tests/test_clean.py +++ b/tests/test_clean.py @@ -848,3 +848,12 @@ def __iter__(self): cleaner.clean(dirty) == 'this is cute! ' ) + + +def test_strip_respects_block_level_elements(): + """ + We should at least have a space between block level elements + https://github.com/mozilla/bleach/issues/369 + """ + text = '

Test!

Hello

' + assert clean(text, tags=[], strip=True) == 'Test! Hello' From 7f7a90b71ceedc7f12310b3ce3a5ca4343835d0e Mon Sep 17 00:00:00 2001 From: jonathan vanasco Date: Fri, 12 Jul 2019 13:36:44 -0400 Subject: [PATCH 2/4] handling case where token has attributes in data, not text --- bleach/html5lib_shim.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/bleach/html5lib_shim.py b/bleach/html5lib_shim.py index e93f7abd..19dcf030 100644 --- a/bleach/html5lib_shim.py +++ b/bleach/html5lib_shim.py @@ -382,11 +382,14 @@ def emitCurrentToken(self): new_data = '' if ((self._emittedLastToken and token['type'] == TAG_TOKEN_TYPE_START and - token['name'].lower() in HTML_TAGS__BLOCK_LEVEL and - not self._emittedLastToken.get('data', '').endswith(' '))): - # BUT, if this is the START of a block level tag, then we - # want to insert a space for accessibility. - new_data = ' ' + token['name'].lower() in HTML_TAGS__BLOCK_LEVEL + )): + _token_data = self._emittedLastToken.get('data', '') + if ((isinstance(_token_data, six.text_type) and + not _token_data.endswith(' '))): + # BUT, if this is the START of a block level tag, then we + # want to insert a space for accessibility. + new_data = ' ' else: # If we're escaping the token, we want to escape the exact # original string. 
Since tokenizing also normalizes data From 01fb08c695dfde7c519bb8727b39deb7c446a5c9 Mon Sep 17 00:00:00 2001 From: jonathan vanasco Date: Fri, 12 Jul 2019 13:40:40 -0400 Subject: [PATCH 3/4] * extending last-character check to include newline and tab (was just space) * adjusting if-condition to have a lower-cpu check for `_token_data` first, before checking for its type via `isinstance` --- bleach/html5lib_shim.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/bleach/html5lib_shim.py b/bleach/html5lib_shim.py index 19dcf030..7eb71cde 100644 --- a/bleach/html5lib_shim.py +++ b/bleach/html5lib_shim.py @@ -385,8 +385,10 @@ def emitCurrentToken(self): token['name'].lower() in HTML_TAGS__BLOCK_LEVEL )): _token_data = self._emittedLastToken.get('data', '') - if ((isinstance(_token_data, six.text_type) and - not _token_data.endswith(' '))): + if ((_token_data and + isinstance(_token_data, six.text_type) and + _token_data[-1] not in (' ', '\n', '\t') + )): # BUT, if this is the START of a block level tag, then we # want to insert a space for accessibility. new_data = ' ' From 45bb0c6b9ad226eb5e9fa8d79bee054fcd36773e Mon Sep 17 00:00:00 2001 From: jonathan vanasco Date: Fri, 12 Jul 2019 13:42:27 -0400 Subject: [PATCH 4/4] casting _token_data to None by default as there is now a check against text types --- bleach/html5lib_shim.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bleach/html5lib_shim.py b/bleach/html5lib_shim.py index 7eb71cde..92df8dec 100644 --- a/bleach/html5lib_shim.py +++ b/bleach/html5lib_shim.py @@ -384,7 +384,7 @@ def emitCurrentToken(self): token['type'] == TAG_TOKEN_TYPE_START and token['name'].lower() in HTML_TAGS__BLOCK_LEVEL )): - _token_data = self._emittedLastToken.get('data', '') + _token_data = self._emittedLastToken.get('data', None) if ((_token_data and isinstance(_token_data, six.text_type) and _token_data[-1] not in (' ', '\n', '\t')