diff --git a/bleach/html5lib_shim.py b/bleach/html5lib_shim.py index 6fc90485..d121953b 100644 --- a/bleach/html5lib_shim.py +++ b/bleach/html5lib_shim.py @@ -385,7 +385,17 @@ def __iter__(self): yield token if last_error_token: - yield last_error_token + if last_error_token["data"] == "eof-in-tag-name": + # Handle the case where the text being parsed ends with < + # followed by a series of characters. It's treated as a tag + # name that abruptly ends, but we should treat that like + # character data + yield { + "type": TAG_TOKEN_TYPE_CHARACTERS, + "data": "<" + self.currentToken["name"], + } + else: + yield last_error_token def consumeEntity(self, allowedChar=None, fromAttribute=False): # If this tokenizer is set to consume entities, then we can let the diff --git a/tests/test_clean.py b/tests/test_clean.py index b9c262ab..ab112536 100644 --- a/tests/test_clean.py +++ b/tests/test_clean.py @@ -156,6 +156,22 @@ def test_bare_entities_get_escaped_correctly(text, expected): assert clean(text) == expected +@pytest.mark.parametrize( + "text, expected", + [ + ("x", "<y>"), + ], +) +def test_lessthan_escaping(text, expected): + # Tests whether < gets escaped correctly in a series of edge cases where + # the html5lib tokenizer hits an error because it's not the beginning of a + # tag. + assert clean(text) == expected + + @pytest.mark.parametrize( "text, expected", [