mozilla · jvanasco · Feb 11, 2022 · Feb 11, 2022 · Feb 11, 2022 · Feb 11, 2022
diff --git a/bleach/html5lib_shim.py b/bleach/html5lib_shim.py
@@ -68,6 +68,8 @@
     constants.tokenTypes["EndTag"],
     constants.tokenTypes["EmptyTag"],
 }
+TAG_TOKEN_TYPE_START = constants.tokenTypes["StartTag"]
+TAG_TOKEN_TYPE_END = constants.tokenTypes["EndTag"]
 CHARACTERS_TYPE = constants.tokenTypes["Characters"]
 PARSEERROR_TYPE = constants.tokenTypes["ParseError"]
 
@@ -190,6 +192,46 @@
 ]
 
 
+#: Block-level HTML tag names, see https://github.com/mozilla/bleach/issues/369.
+#: A stripped start tag for one of these becomes a newline unless it is the first token.
+#: As listed by MDN on 2019-07-11: https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements#Elements
+HTML_TAGS__BLOCK_LEVEL = [
+    "address",
+    "article",
+    "aside",
+    "blockquote",
+    "details",
+    "dialog",
+    "dd",
+    "div",
+    "dl",
+    "dt",
+    "fieldset",
+    "figcaption",
+    "figure",
+    "footer",
+    "form",
+    "h1",
+    "h2",
+    "h3",
+    "h4",
+    "h5",
+    "h6",
+    "header",
+    "hgroup",
+    "hr",
+    "li",
+    "main",
+    "nav",
+    "ol",
+    "p",
+    "pre",
+    "section",
+    "table",
+    "ul",
+]
+
+
 class InputStreamWithMemory:
     """Wraps an HTMLInputStream to remember characters since last <
 
@@ -256,6 +298,9 @@ def start_tag(self):
 class BleachHTMLTokenizer(HTMLTokenizer):
     """Tokenizer that doesn't consume character entities"""
 
+    # remember the last token emitted, needed for block element spacing
+    _emittedLastToken = None
+
     def __init__(self, consume_entities=False, **kwargs):
         super().__init__(**kwargs)
 
@@ -379,6 +424,12 @@ def emitCurrentToken(self):
                 # If we're stripping the token, we just throw in an empty
                 # string token.
                 new_data = ""
+                if (
+                    self._emittedLastToken
+                    and token["type"] == TAG_TOKEN_TYPE_START
+                    and token["name"].lower() in HTML_TAGS__BLOCK_LEVEL
+                ):
+                    new_data = "\n"
 
             else:
                 # If we're escaping the token, we want to escape the exact
@@ -390,11 +441,12 @@ def emitCurrentToken(self):
 
             new_token = {"type": CHARACTERS_TYPE, "data": new_data}
 
-            self.currentToken = new_token
+            self.currentToken = self._emittedLastToken = new_token
             self.tokenQueue.append(new_token)
             self.state = self.dataState
             return
 
+        self._emittedLastToken = self.currentToken
         super().emitCurrentToken()
 
 

diff --git a/tests/test_clean.py b/tests/test_clean.py
@@ -1075,6 +1075,31 @@ def test_html_comments_escaped(namespace_tag, end_tag, eject_tag, data, expected
     )
 
 
+def test_strip_respects_block_level_elements():
+    """
+    Insert a newline between block level elements
+    https://github.com/mozilla/bleach/issues/369
+    """
+    # simplest case: the leading <p> is the first token, so only the second <p> adds a newline
+    text = "<p>Te<b>st</b>!</p><p>Hello</p>"
+    assert clean(text, tags=[], strip=True) == "Test!\nHello"
+
+    # interior whitespace and an escaped entity must pass through unchanged
+    text = "<p>This is our <b>description!</b> &amp;</p><p>nice!</p>"
+    assert clean(text, tags=[], strip=True) == "This is our description! &amp;\nnice!"
+
+    # known limitation: a block nested in a leading block (<div><p>) yields a leading newline, which the current design can't avoid
+    text = "<div><p>This is our <b>description!</b> &amp;</p></div><p>nice!</p>"
+    assert clean(text, tags=[], strip=True) == "\nThis is our description! &amp;\nnice!"
+
+    # every stripped block-level start tag adds its own newline, so <ul><li> gives two in a row
+    text = "<div><p>This is our <b>description!</b> &amp;</p><p>1</p><ul><li>a</li><li>b</li><li>c</li></ul></div><p>nice!</p>"
+    assert (
+        clean(text, tags=[], strip=True)
+        == "\nThis is our description! &amp;\n1\n\na\nb\nc\nnice!"
+    )
+
+
 def get_ids_and_tests():
     """Retrieves regression tests from data/ directory