From a92f7b5828c613572e2deaf94c43f9e94f58f575 Mon Sep 17 00:00:00 2001
From: Patrick Cloke <patrickc@matrix.org>
Date: Tue, 19 Jan 2021 14:33:51 -0500
Subject: [PATCH 01/10] Try to recover from unknown character encodings.

---
 synapse/rest/media/v1/preview_url_resource.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index a632099167fc..0e1c003211ca 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -708,6 +708,11 @@ def decode_and_calc_og(
         parser = etree.HTMLParser(recover=True, encoding=request_encoding)
         tree = etree.fromstring(body.decode("utf-8", "ignore"), parser)
         og = _calc_og(tree, media_uri)
+    except LookupError:
+        # blindly try decoding the body as utf-8.
+        parser = etree.HTMLParser(recover=True, encoding="utf-8")
+        tree = etree.fromstring(body, parser)
+        og = _calc_og(tree, media_uri)
 
     return og
 

From 3dec5b83923cb27aee83e98406b3ca021869128f Mon Sep 17 00:00:00 2001
From: Patrick Cloke <patrickc@matrix.org>
Date: Tue, 19 Jan 2021 14:34:03 -0500
Subject: [PATCH 02/10] Share some common code.

---
 synapse/rest/media/v1/preview_url_resource.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index 0e1c003211ca..73897435621c 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -701,20 +701,17 @@ def decode_and_calc_og(
     try:
         parser = etree.HTMLParser(recover=True, encoding=request_encoding)
         tree = etree.fromstring(body, parser)
-        og = _calc_og(tree, media_uri)
     except UnicodeDecodeError:
         # blindly try decoding the body as utf-8, which seems to fix
         # the charset mismatches on https://google.com
         parser = etree.HTMLParser(recover=True, encoding=request_encoding)
         tree = etree.fromstring(body.decode("utf-8", "ignore"), parser)
-        og = _calc_og(tree, media_uri)
     except LookupError:
         # blindly try decoding the body as utf-8.
         parser = etree.HTMLParser(recover=True, encoding="utf-8")
         tree = etree.fromstring(body, parser)
-        og = _calc_og(tree, media_uri)
 
-    return og
+    return _calc_og(tree, media_uri)
 
 
 def _calc_og(tree, media_uri: str) -> Dict[str, Optional[str]]:

From 4d255b6b5be5d3604b03a861a699a5bcdf082528 Mon Sep 17 00:00:00 2001
From: Patrick Cloke <patrickc@matrix.org>
Date: Tue, 19 Jan 2021 14:37:45 -0500
Subject: [PATCH 03/10] Newsfragment

---
 changelog.d/9164.bugfix | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 changelog.d/9164.bugfix

diff --git a/changelog.d/9164.bugfix b/changelog.d/9164.bugfix
new file mode 100644
index 000000000000..48f95be97481
--- /dev/null
+++ b/changelog.d/9164.bugfix
@@ -0,0 +1 @@
+Fix a longstanding bug where an internal server error was raised when attempting to preview an HTML document in an unknown character encoding.

From 89a43fecf0dd0e852aec9e95d86c96b7129f3ed1 Mon Sep 17 00:00:00 2001
From: Patrick Cloke <patrickc@matrix.org>
Date: Thu, 21 Jan 2021 08:44:51 -0500
Subject: [PATCH 04/10] More error handling.

---
 synapse/rest/media/v1/preview_url_resource.py | 42 ++++++++++++++++---
 1 file changed, 36 insertions(+), 6 deletions(-)

diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index 73897435621c..093d0bd9bd28 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -692,24 +692,54 @@ async def _expire_url_cache_data(self) -> None:
 def decode_and_calc_og(
     body: bytes, media_uri: str, request_encoding: Optional[str] = None
 ) -> Dict[str, Optional[str]]:
+    """
+    Calculate metadata for an HTML document.
+
+    This uses lxml to parse the HTML document into the OG response. If errors
+    occur during processing of the document, an empty response is returned.
+
+    Params:
+        body: The HTML document, as bytes.
+        media_url: The URI used to download the body.
+        request_encoding: The character encoding of the body, as a string.
+
+    Returns:
+        The OG response as a dictionary.
+    """
     # If there's no body, nothing useful is going to be found.
     if not body:
         return {}
 
     from lxml import etree
 
+    # Create an HTML parser. If this fails, log and return no metadata.
     try:
         parser = etree.HTMLParser(recover=True, encoding=request_encoding)
+    except LookupError:
+        # blindly consider the encoding as utf-8.
+        try:
+            parser = etree.HTMLParser(recover=True, encoding="utf-8")
+        except Exception as e:
+            logger.warning("Unable to create fallback HTML parser: %s" % (e,))
+            return {}
+    except Exception as e:
+        logger.warning("Unable to create HTML parser: %s" % (e,))
+        return {}
+
+    # Attempt to parse the body. If this fails, log and return no metadata.
+    try:
         tree = etree.fromstring(body, parser)
     except UnicodeDecodeError:
         # blindly try decoding the body as utf-8, which seems to fix
         # the charset mismatches on https://google.com
-        parser = etree.HTMLParser(recover=True, encoding=request_encoding)
-        tree = etree.fromstring(body.decode("utf-8", "ignore"), parser)
-    except LookupError:
-        # blindly try decoding the body as utf-8.
-        parser = etree.HTMLParser(recover=True, encoding="utf-8")
-        tree = etree.fromstring(body, parser)
+        try:
+            tree = etree.fromstring(body.decode("utf-8", "ignore"), parser)
+        except Exception as e:
+            logger.warning("Failed to parse HTML body as UTF-8: %s" % (e,))
+            return {}
+    except Exception as e:
+        logger.warning("Failed to parse HTML body: %s" % (e,))
+        return {}
 
     return _calc_og(tree, media_uri)
 

From fd51df96e31c57a7280c570e2312632e137ad8f2 Mon Sep 17 00:00:00 2001
From: Patrick Cloke <patrickc@matrix.org>
Date: Thu, 21 Jan 2021 08:54:55 -0500
Subject: [PATCH 05/10] Add tests.

---
 synapse/rest/media/v1/preview_url_resource.py | 10 +++----
 tests/test_preview.py                         | 27 +++++++++++++++++++
 2 files changed, 30 insertions(+), 7 deletions(-)

diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index 093d0bd9bd28..f7b70098c2f3 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -717,11 +717,7 @@ def decode_and_calc_og(
         parser = etree.HTMLParser(recover=True, encoding=request_encoding)
     except LookupError:
         # blindly consider the encoding as utf-8.
-        try:
-            parser = etree.HTMLParser(recover=True, encoding="utf-8")
-        except Exception as e:
-            logger.warning("Unable to create fallback HTML parser: %s" % (e,))
-            return {}
+        parser = etree.HTMLParser(recover=True, encoding="utf-8")
     except Exception as e:
         logger.warning("Unable to create HTML parser: %s" % (e,))
         return {}
@@ -729,11 +725,13 @@ def decode_and_calc_og(
     # Attempt to parse the body. If this fails, log and return no metadata.
     try:
         tree = etree.fromstring(body, parser)
+        return _calc_og(tree, media_uri)
     except UnicodeDecodeError:
         # blindly try decoding the body as utf-8, which seems to fix
         # the charset mismatches on https://google.com
         try:
             tree = etree.fromstring(body.decode("utf-8", "ignore"), parser)
+            return _calc_og(tree, media_uri)
         except Exception as e:
             logger.warning("Failed to parse HTML body as UTF-8: %s" % (e,))
             return {}
@@ -741,8 +739,6 @@ def decode_and_calc_og(
         logger.warning("Failed to parse HTML body: %s" % (e,))
         return {}
 
-    return _calc_og(tree, media_uri)
-
 
 def _calc_og(tree, media_uri: str) -> Dict[str, Optional[str]]:
     # suck our tree into lxml and define our OG response.
diff --git a/tests/test_preview.py b/tests/test_preview.py
index c19facc1cb70..0bc324beca8d 100644
--- a/tests/test_preview.py
+++ b/tests/test_preview.py
@@ -261,3 +261,30 @@ def test_empty(self):
         html = ""
         og = decode_and_calc_og(html, "http://example.com/test.html")
         self.assertEqual(og, {})
+
+    def test_invalid_encoding(self):
+        """An invalid character encoding should be ignored and treated as UTF-8, if possible."""
+        html = """
+        <html>
+        <head><title>Foo</title></head>
+        <body>
+        Some text.
+        </body>
+        </html>
+        """
+        og = decode_and_calc_og(html, "http://example.com/test.html", "invalid-encoding")
+        self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
+
+    def test_invalid_encoding2(self):
+        """A body which doesn't match the sent character encoding."""
+        # Note that this contains an invalid UTF-8 sequence in the title.
+        html = b"""
+        <html>
+        <head><title>\xff\xff Foo</title></head>
+        <body>
+        Some text.
+        </body>
+        </html>
+        """
+        og = decode_and_calc_og(html, "http://example.com/test.html")
+        self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})

From 2b02f444419f4f8b674cdb3816d3972704d18dd4 Mon Sep 17 00:00:00 2001
From: Patrick Cloke <patrickc@matrix.org>
Date: Thu, 21 Jan 2021 08:55:45 -0500
Subject: [PATCH 06/10] Typo

---
 changelog.d/9164.bugfix | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/changelog.d/9164.bugfix b/changelog.d/9164.bugfix
index 48f95be97481..1c54a256c1b5 100644
--- a/changelog.d/9164.bugfix
+++ b/changelog.d/9164.bugfix
@@ -1 +1 @@
-Fix a longstanding bug where an internal server error was raised when attempting to preview an HTML document in an unknown character encoding.
+Fix a long-standing bug where an internal server error was raised when attempting to preview an HTML document in an unknown character encoding.

From 465c8e28a57ce0c3a1314ae819f4550e891fabaa Mon Sep 17 00:00:00 2001
From: Patrick Cloke <patrickc@matrix.org>
Date: Thu, 21 Jan 2021 11:34:51 -0500
Subject: [PATCH 07/10] Lint.

---
 tests/test_preview.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/test_preview.py b/tests/test_preview.py
index 0bc324beca8d..0c6cbbd92148 100644
--- a/tests/test_preview.py
+++ b/tests/test_preview.py
@@ -272,7 +272,9 @@ def test_invalid_encoding(self):
         </body>
         </html>
         """
-        og = decode_and_calc_og(html, "http://example.com/test.html", "invalid-encoding")
+        og = decode_and_calc_og(
+            html, "http://example.com/test.html", "invalid-encoding"
+        )
         self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
 
     def test_invalid_encoding2(self):

From 58abad9bbb0abf4e409ef286ea3cb63fc2653715 Mon Sep 17 00:00:00 2001
From: Patrick Cloke <patrickc@matrix.org>
Date: Thu, 21 Jan 2021 13:57:33 -0500
Subject: [PATCH 08/10] Params -> Args

---
 synapse/rest/media/v1/preview_url_resource.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index f7b70098c2f3..0dbbcead0990 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -386,7 +386,7 @@ def _get_oembed_url(self, url: str) -> Optional[str]:
         """
         Check whether the URL should be downloaded as oEmbed content instead.
 
-        Params:
+        Args:
             url: The URL to check.
 
         Returns:
@@ -403,7 +403,7 @@ async def _get_oembed_content(self, endpoint: str, url: str) -> OEmbedResult:
         """
         Request content from an oEmbed endpoint.
 
-        Params:
+        Args:
             endpoint: The oEmbed API endpoint.
             url: The URL to pass to the API.
 
@@ -698,7 +698,7 @@ def decode_and_calc_og(
     This uses lxml to parse the HTML document into the OG response. If errors
     occur during processing of the document, an empty response is returned.
 
-    Params:
+    Args:
         body: The HTML document, as bytes.
         media_url: The URI used to download the body.
         request_encoding: The character encoding of the body, as a string.

From 911eec628b6ace6884a544407dbc01eea6ebe90b Mon Sep 17 00:00:00 2001
From: Patrick Cloke <patrickc@matrix.org>
Date: Mon, 25 Jan 2021 14:37:15 -0500
Subject: [PATCH 09/10] Add a missing type hint.

---
 synapse/rest/media/v1/preview_url_resource.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index 0dbbcead0990..84a73422329e 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -740,7 +740,7 @@ def decode_and_calc_og(
         return {}
 
 
-def _calc_og(tree, media_uri: str) -> Dict[str, Optional[str]]:
+def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]:
     # suck our tree into lxml and define our OG response.
 
     # if we see any image URLs in the OG response, then spider them

From f629933cd040ae6534333eccc7a719404c7f5d3c Mon Sep 17 00:00:00 2001
From: Patrick Cloke <patrickc@matrix.org>
Date: Mon, 25 Jan 2021 14:44:11 -0500
Subject: [PATCH 10/10] Try to simplify code.

---
 synapse/rest/media/v1/preview_url_resource.py | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index 84a73422329e..bf3be653aa6c 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -722,22 +722,18 @@ def decode_and_calc_og(
         logger.warning("Unable to create HTML parser: %s" % (e,))
         return {}
 
+    def _attempt_calc_og(body_attempt: Union[bytes, str]) -> Dict[str, Optional[str]]:
+        # Attempt to parse the body. If this fails, log and return no metadata.
+        tree = etree.fromstring(body_attempt, parser)
+        return _calc_og(tree, media_uri)
+
     # Attempt to parse the body. If this fails, log and return no metadata.
     try:
-        tree = etree.fromstring(body, parser)
-        return _calc_og(tree, media_uri)
+        return _attempt_calc_og(body)
     except UnicodeDecodeError:
         # blindly try decoding the body as utf-8, which seems to fix
         # the charset mismatches on https://google.com
-        try:
-            tree = etree.fromstring(body.decode("utf-8", "ignore"), parser)
-            return _calc_og(tree, media_uri)
-        except Exception as e:
-            logger.warning("Failed to parse HTML body as UTF-8: %s" % (e,))
-            return {}
-    except Exception as e:
-        logger.warning("Failed to parse HTML body: %s" % (e,))
-        return {}
+        return _attempt_calc_og(body.decode("utf-8", "ignore"))
 
 
 def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]: