From ed06d4e56b70e08fae2dd8f13b6a1955cf106029 Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene <willkg@users.noreply.github.com>
Date: Wed, 1 Jun 2022 20:56:47 -0400
Subject: [PATCH] Handle escaping < in edge cases where it doesn't start a tag
 (#544) (#667)

The html5lib tokenizer emits a parse error token when there's a < that
isn't the start of a tag. This adds handling for that case and treats
the < plus whatever follows it as character data.
---
 bleach/html5lib_shim.py | 12 +++++++++++-
 tests/test_clean.py     | 16 ++++++++++++++++
 2 files changed, 27 insertions(+), 1 deletion(-)
diff --git a/bleach/html5lib_shim.py b/bleach/html5lib_shim.py
index 6fc90485..d121953b 100644
--- a/bleach/html5lib_shim.py
+++ b/bleach/html5lib_shim.py
@@ -385,7 +385,17 @@ def __iter__(self):
             yield token
 
         if last_error_token:
-            yield last_error_token
+            if last_error_token["data"] == "eof-in-tag-name":
+                # Handle the case where the text being parsed ends with <
+                # followed by a series of characters. The tokenizer treats
+                # that as a tag name that abruptly ends, but we should
+                # treat it like character data instead.
+                yield {
+                    "type": TAG_TOKEN_TYPE_CHARACTERS,
+                    "data": "<" + self.currentToken["name"],
+                }
+            else:
+                yield last_error_token
 
     def consumeEntity(self, allowedChar=None, fromAttribute=False):
         # If this tokenizer is set to consume entities, then we can let the
diff --git a/tests/test_clean.py b/tests/test_clean.py
index b9c262ab..ab112536 100644
--- a/tests/test_clean.py
+++ b/tests/test_clean.py
@@ -156,6 +156,22 @@ def test_bare_entities_get_escaped_correctly(text, expected):
     assert clean(text) == expected
 
 
+@pytest.mark.parametrize(
+    "text, expected",
+    [
+        ("x<y", "x&lt;y"),
+        ("<y", "&lt;y"),
+        ("x < y", "x &lt; y"),
+        ("<y>", "&lt;y&gt;"),
+    ],
+)
+def test_lessthan_escaping(text, expected):
+    # Verify that < is escaped correctly in a series of edge cases where
+    # the html5lib tokenizer hits an error because the < is not the
+    # beginning of a tag.
+    assert clean(text) == expected
+
+
 @pytest.mark.parametrize(
     "text, expected",
     [