Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Redo of #603 against main / 4.x #642

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
54 changes: 53 additions & 1 deletion bleach/html5lib_shim.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@
constants.tokenTypes["EndTag"],
constants.tokenTypes["EmptyTag"],
}
# Token-type codes for individual token kinds, looked up by name in
# constants.tokenTypes so tokenizer code can compare token["type"] against a
# named constant instead of repeating the dictionary lookup.
TAG_TOKEN_TYPE_START = constants.tokenTypes["StartTag"]
TAG_TOKEN_TYPE_END = constants.tokenTypes["EndTag"]
CHARACTERS_TYPE = constants.tokenTypes["Characters"]
PARSEERROR_TYPE = constants.tokenTypes["ParseError"]

Expand Down Expand Up @@ -190,6 +192,46 @@
]


#: Names of block-level HTML elements, added for
#: https://github.com/mozilla/bleach/issues/369
#:
#: Snapshot of the MDN list as of 2019-07-11:
#: https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements#Elements
HTML_TAGS__BLOCK_LEVEL = [
"address",
"article",
"aside",
"blockquote",
"details",
"dialog",
"dd",
"div",
"dl",
"dt",
"fieldset",
"figcaption",
"figure",
"footer",
"form",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"header",
"hgroup",
"hr",
"li",
"main",
"nav",
"ol",
"p",
"pre",
"section",
"table",
"ul",
]


class InputStreamWithMemory:
"""Wraps an HTMLInputStream to remember characters since last <

Expand Down Expand Up @@ -256,6 +298,9 @@ def start_tag(self):
class BleachHTMLTokenizer(HTMLTokenizer):
"""Tokenizer that doesn't consume character entities"""

# remember the last token emitted, needed for block element spacing
_emittedLastToken = None

def __init__(self, consume_entities=False, **kwargs):
super().__init__(**kwargs)

Expand Down Expand Up @@ -379,6 +424,12 @@ def emitCurrentToken(self):
# If we're stripping the token, we just throw in an empty
# string token.
new_data = ""
if (
self._emittedLastToken
and token["type"] == TAG_TOKEN_TYPE_START
and token["name"].lower() in HTML_TAGS__BLOCK_LEVEL
):
new_data = "\n"

else:
# If we're escaping the token, we want to escape the exact
Expand All @@ -390,11 +441,12 @@ def emitCurrentToken(self):

new_token = {"type": CHARACTERS_TYPE, "data": new_data}

self.currentToken = new_token
self.currentToken = self._emittedLastToken = new_token
self.tokenQueue.append(new_token)
self.state = self.dataState
return

self._emittedLastToken = self.currentToken
super().emitCurrentToken()


Expand Down
25 changes: 25 additions & 0 deletions tests/test_clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -1075,6 +1075,31 @@ def test_html_comments_escaped(namespace_tag, end_tag, eject_tag, data, expected
)


def test_strip_respects_block_level_elements():
    """Stripped block-level elements are separated by a newline.

    See https://github.com/mozilla/bleach/issues/369
    """
    # Each entry pairs the input markup with the text expected once every
    # tag has been stripped.
    cases = (
        # simple example
        (
            "<p>Te<b>st</b>!</p><p>Hello</p>",
            "Test!\nHello",
        ),
        # with an internal space and escaped character, just to be sure
        (
            "<p>This is our <b>description!</b> &amp;</p><p>nice!</p>",
            "This is our description! &amp;\nnice!",
        ),
        # a double-wrap causes an initial newline; this can't really be
        # handled under the current design
        (
            "<div><p>This is our <b>description!</b> &amp;</p></div><p>nice!</p>",
            "\nThis is our description! &amp;\nnice!",
        ),
        # newlines are used to keep lists and other elements readable
        (
            "<div><p>This is our <b>description!</b> &amp;</p><p>1</p><ul><li>a</li><li>b</li><li>c</li></ul></div><p>nice!</p>",
            "\nThis is our description! &amp;\n1\n\na\nb\nc\nnice!",
        ),
    )

    for markup, expected in cases:
        assert clean(markup, tags=[], strip=True) == expected


def get_ids_and_tests():
"""Retrieves regression tests from data/ directory

Expand Down