From ed06d4e56b70e08fae2dd8f13b6a1955cf106029 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Wed, 1 Jun 2022 20:56:47 -0400 Subject: [PATCH] Handle escaping < in edge cases where it doesn't start a tag (#544) (#667) The html5lib tokenizer kicks up a parse error token when there's a < that isn't the start of a tag. This adds some handling for that case and treats the < plus whatever is after it as characters data. --- bleach/html5lib_shim.py | 12 +++++++++++- tests/test_clean.py | 16 ++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/bleach/html5lib_shim.py b/bleach/html5lib_shim.py index 6fc90485..d121953b 100644 --- a/bleach/html5lib_shim.py +++ b/bleach/html5lib_shim.py @@ -385,7 +385,17 @@ def __iter__(self): yield token if last_error_token: - yield last_error_token + if last_error_token["data"] == "eof-in-tag-name": + # Handle the case where the text being parsed ends with < + # followed by a series of characters. It's treated as a tag + # name that abruptly ends, but we should treat that like + # character data + yield { + "type": TAG_TOKEN_TYPE_CHARACTERS, + "data": "<" + self.currentToken["name"], + } + else: + yield last_error_token def consumeEntity(self, allowedChar=None, fromAttribute=False): # If this tokenizer is set to consume entities, then we can let the diff --git a/tests/test_clean.py b/tests/test_clean.py index b9c262ab..ab112536 100644 --- a/tests/test_clean.py +++ b/tests/test_clean.py @@ -156,6 +156,22 @@ def test_bare_entities_get_escaped_correctly(text, expected): assert clean(text) == expected +@pytest.mark.parametrize( + "text, expected", + [ + ("x<y", "x&lt;y"), + ("<y", "&lt;y"), + ("<y>", "&lt;y&gt;"), + ], +) +def test_lessthan_escaping(text, expected): + # Tests whether < gets escaped correctly in a series of edge cases where + # the html5lib tokenizer hits an error because it's not the beginning of a + # tag. + assert clean(text) == expected + + @pytest.mark.parametrize( "text, expected", [