From 46c7334ac20d106d9b7e0d1906692eedc9186eae Mon Sep 17 00:00:00 2001
From: jonathan vanasco <jonathan@2xlp.com>
Date: Thu, 11 Jul 2019 16:36:33 -0400
Subject: [PATCH 1/4] fix for issue #369

---
 bleach/html5lib_shim.py | 56 ++++++++++++++++++++++++++++++++++++++---
 tests/test_clean.py     |  9 +++++++
 2 files changed, 62 insertions(+), 3 deletions(-)
diff --git a/bleach/html5lib_shim.py b/bleach/html5lib_shim.py
index e2dc66f0..e93f7abd 100644
--- a/bleach/html5lib_shim.py
+++ b/bleach/html5lib_shim.py
@@ -42,6 +42,7 @@
     constants.tokenTypes['EndTag'],
     constants.tokenTypes['EmptyTag']
 }
+TAG_TOKEN_TYPE_START = constants.tokenTypes['StartTag']
 CHARACTERS_TYPE = constants.tokenTypes['Characters']
 PARSEERROR_TYPE = constants.tokenTypes['ParseError']
 
@@ -164,6 +165,45 @@
 ]
 
 
+#: List of block-level HTML tags, taken from MDN on 2019-07-11, as per https://github.com/mozilla/bleach/issues/369
+#: https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements#Elements
+HTML_TAGS__BLOCK_LEVEL = [
+    'address',
+    'article',
+    'aside',
+    'blockquote',
+    'details',
+    'dialog',
+    'dd',
+    'div',
+    'dl',
+    'dt',
+    'fieldset',
+    'figcaption',
+    'figure',
+    'footer',
+    'form',
+    'h1',
+    'h2',
+    'h3',
+    'h4',
+    'h5',
+    'h6',
+    'header',
+    'hgroup',
+    'hr',
+    'li',
+    'main',
+    'nav',
+    'ol',
+    'p',
+    'pre',
+    'section',
+    'table',
+    'ul',
+]
+
+
 class InputStreamWithMemory(object):
     """Wraps an HTMLInputStream to remember characters since last <
 
@@ -236,6 +276,9 @@ def __init__(self, consume_entities=False, **kwargs):
         # Wrap the stream with one that remembers the history
         self.stream = InputStreamWithMemory(self.stream)
 
+    # we need to remember the last token emitted, so we don't add too many spaces
+    _emittedLastToken = None
+
     def __iter__(self):
         last_error_token = None
 
@@ -335,9 +378,15 @@ def emitCurrentToken(self):
             # cases it gets converted to a Characters token.
             if self.parser.strip:
                 # If we're stripping the token, we just throw in an empty
-                # string token.
+                # string token
                 new_data = ''
-
+                if ((self._emittedLastToken and
+                     token['type'] == TAG_TOKEN_TYPE_START and
+                     token['name'].lower() in HTML_TAGS__BLOCK_LEVEL and
+                     not self._emittedLastToken.get('data', '').endswith(' '))):
+                    # BUT, if this is the START of a block level tag, then we
+                    # want to insert a space for accessibility.
+                    new_data = ' '
             else:
                 # If we're escaping the token, we want to escape the exact
                 # original string. Since tokenizing also normalizes data
@@ -351,11 +400,12 @@ def emitCurrentToken(self):
                 'data': new_data
             }
 
-            self.currentToken = new_token
+            self.currentToken = self._emittedLastToken = new_token
             self.tokenQueue.append(new_token)
             self.state = self.dataState
             return
 
+        self._emittedLastToken = self.currentToken
         super(BleachHTMLTokenizer, self).emitCurrentToken()
 
 
diff --git a/tests/test_clean.py b/tests/test_clean.py
index e306cc50..b1c08882 100644
--- a/tests/test_clean.py
+++ b/tests/test_clean.py
@@ -848,3 +848,12 @@ def __iter__(self):
             cleaner.clean(dirty) ==
             'this is cute! <img rel="moo" src="moo">'
         )
+
+
+def test_strip_respects_block_level_elements():
+    """
+    We should at least have a space between block level elements
+    https://github.com/mozilla/bleach/issues/369
+    """
+    text = '<p>Te<b>st</b>!</p><p>Hello</p>'
+    assert clean(text, tags=[], strip=True) == 'Test! Hello'

From 7f7a90b71ceedc7f12310b3ce3a5ca4343835d0e Mon Sep 17 00:00:00 2001
From: jonathan vanasco <jonathan@2xlp.com>
Date: Fri, 12 Jul 2019 13:36:44 -0400
Subject: [PATCH 2/4] handling case where token has attributes in data, not
 text

---
 bleach/html5lib_shim.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/bleach/html5lib_shim.py b/bleach/html5lib_shim.py
index e93f7abd..19dcf030 100644
--- a/bleach/html5lib_shim.py
+++ b/bleach/html5lib_shim.py
@@ -382,11 +382,14 @@ def emitCurrentToken(self):
                 new_data = ''
                 if ((self._emittedLastToken and
                      token['type'] == TAG_TOKEN_TYPE_START and
-                     token['name'].lower() in HTML_TAGS__BLOCK_LEVEL and
-                     not self._emittedLastToken.get('data', '').endswith(' '))):
-                    # BUT, if this is the START of a block level tag, then we
-                    # want to insert a space for accessibility.
-                    new_data = ' '
+                     token['name'].lower() in HTML_TAGS__BLOCK_LEVEL
+                     )):
+                    _token_data = self._emittedLastToken.get('data', '')
+                    if ((isinstance(_token_data, six.text_type) and
+                         not _token_data.endswith(' '))):
+                        # BUT, if this is the START of a block level tag, then we
+                        # want to insert a space for accessibility.
+                        new_data = ' '
             else:
                 # If we're escaping the token, we want to escape the exact
                 # original string. Since tokenizing also normalizes data

From 01fb08c695dfde7c519bb8727b39deb7c446a5c9 Mon Sep 17 00:00:00 2001
From: jonathan vanasco <jonathan@2xlp.com>
Date: Fri, 12 Jul 2019 13:40:40 -0400
Subject: [PATCH 3/4] * extending last-character check to include newline and
 tab (was just space) * adjusting if-condition to have a lower-cpu check for
 `_token_data` first, before checking for its type via `isinstance`

---
 bleach/html5lib_shim.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/bleach/html5lib_shim.py b/bleach/html5lib_shim.py
index 19dcf030..7eb71cde 100644
--- a/bleach/html5lib_shim.py
+++ b/bleach/html5lib_shim.py
@@ -385,8 +385,10 @@ def emitCurrentToken(self):
                      token['name'].lower() in HTML_TAGS__BLOCK_LEVEL
                      )):
                     _token_data = self._emittedLastToken.get('data', '')
-                    if ((isinstance(_token_data, six.text_type) and
-                         not _token_data.endswith(' '))):
+                    if ((_token_data and
+                         isinstance(_token_data, six.text_type) and
+                         _token_data[-1] not in (' ', '\n', '\t')
+                         )):
                         # BUT, if this is the START of a block level tag, then we
                         # want to insert a space for accessibility.
                         new_data = ' '

From 45bb0c6b9ad226eb5e9fa8d79bee054fcd36773e Mon Sep 17 00:00:00 2001
From: jonathan vanasco <jonathan@2xlp.com>
Date: Fri, 12 Jul 2019 13:42:27 -0400
Subject: [PATCH 4/4] defaulting _token_data to None, as there is now a
 check against text types

---
 bleach/html5lib_shim.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bleach/html5lib_shim.py b/bleach/html5lib_shim.py
index 7eb71cde..92df8dec 100644
--- a/bleach/html5lib_shim.py
+++ b/bleach/html5lib_shim.py
@@ -384,7 +384,7 @@ def emitCurrentToken(self):
                      token['type'] == TAG_TOKEN_TYPE_START and
                      token['name'].lower() in HTML_TAGS__BLOCK_LEVEL
                      )):
-                    _token_data = self._emittedLastToken.get('data', '')
+                    _token_data = self._emittedLastToken.get('data', None)
                     if ((_token_data and
                          isinstance(_token_data, six.text_type) and
                          _token_data[-1] not in (' ', '\n', '\t')