Skip to content

Commit

Permalink
Handle more cases with < followed by character data (#705)
Browse files Browse the repository at this point in the history
This adds handling for two more cases:

1. something like "<word word". This throws an eof-in-attribute-name
   parser error.
2. something like "<word word=word". This throws an
   eof-in-attribute-value-no-quotes error.

Both of these work correctly now.
  • Loading branch information
willkg committed Oct 6, 2023
1 parent b56aa7c commit 11d8c9b
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 4 deletions.
15 changes: 11 additions & 4 deletions bleach/html5lib_shim.py
Expand Up @@ -395,10 +395,17 @@ def __iter__(self):
# followed by a series of characters. It's treated as a tag
# name that abruptly ends, but we should treat that like
# character data
yield {
"type": TAG_TOKEN_TYPE_CHARACTERS,
"data": "<" + self.currentToken["name"],
}
yield {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": self.stream.get_tag()}
elif last_error_token["data"] in (
"eof-in-attribute-name",
"eof-in-attribute-value-no-quotes",
):
# Handle the case where the text being parsed ends with <
# followed by a series of characters and then space and then
# more characters. It's treated as a tag name followed by an
# attribute that abruptly ends, but we should treat that like
# character data.
yield {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": self.stream.get_tag()}
else:
yield last_error_token

Expand Down
4 changes: 4 additions & 0 deletions tests/test_clean.py
Expand Up @@ -163,6 +163,10 @@ def test_bare_entities_get_escaped_correctly(text, expected):
("<y", "&lt;y"),
("x < y", "x &lt; y"),
("<y>", "&lt;y&gt;"),
# this is an eof-in-attribute-name parser error
("<some thing", "&lt;some thing"),
# this is an eof-in-attribute-value-no-quotes parser error
("<some thing=foo", "&lt;some thing=foo"),
],
)
def test_lessthan_escaping(text, expected):
Expand Down

0 comments on commit 11d8c9b

Please sign in to comment.