From a92f7b5828c613572e2deaf94c43f9e94f58f575 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Tue, 19 Jan 2021 14:33:51 -0500 Subject: [PATCH 01/10] Try to recover from unknown character encodings. --- synapse/rest/media/v1/preview_url_resource.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index a632099167fc..0e1c003211ca 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -708,6 +708,11 @@ def decode_and_calc_og( parser = etree.HTMLParser(recover=True, encoding=request_encoding) tree = etree.fromstring(body.decode("utf-8", "ignore"), parser) og = _calc_og(tree, media_uri) + except LookupError: + # blindly try decoding the body as utf-8. + parser = etree.HTMLParser(recover=True, encoding="utf-8") + tree = etree.fromstring(body, parser) + og = _calc_og(tree, media_uri) return og From 3dec5b83923cb27aee83e98406b3ca021869128f Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Tue, 19 Jan 2021 14:34:03 -0500 Subject: [PATCH 02/10] Share some common code. --- synapse/rest/media/v1/preview_url_resource.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 0e1c003211ca..73897435621c 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -701,20 +701,17 @@ def decode_and_calc_og( try: parser = etree.HTMLParser(recover=True, encoding=request_encoding) tree = etree.fromstring(body, parser) - og = _calc_og(tree, media_uri) except UnicodeDecodeError: # blindly try decoding the body as utf-8, which seems to fix # the charset mismatches on https://google.com parser = etree.HTMLParser(recover=True, encoding=request_encoding) tree = etree.fromstring(body.decode("utf-8", "ignore"), parser) - og = _calc_og(tree, media_uri) except LookupError: # blindly try decoding the body as utf-8. parser = etree.HTMLParser(recover=True, encoding="utf-8") tree = etree.fromstring(body, parser) - og = _calc_og(tree, media_uri) - return og + return _calc_og(tree, media_uri) def _calc_og(tree, media_uri: str) -> Dict[str, Optional[str]]: From 4d255b6b5be5d3604b03a861a699a5bcdf082528 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Tue, 19 Jan 2021 14:37:45 -0500 Subject: [PATCH 03/10] Newsfragment --- changelog.d/9164.bugfix | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/9164.bugfix diff --git a/changelog.d/9164.bugfix b/changelog.d/9164.bugfix new file mode 100644 index 000000000000..48f95be97481 --- /dev/null +++ b/changelog.d/9164.bugfix @@ -0,0 +1 @@ +Fix a longstanding bug where an internal server error was raised when attempting to preview an HTML document in an unknown character encoding. From 89a43fecf0dd0e852aec9e95d86c96b7129f3ed1 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Thu, 21 Jan 2021 08:44:51 -0500 Subject: [PATCH 04/10] More error handling. --- synapse/rest/media/v1/preview_url_resource.py | 42 ++++++++++++++++--- 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 73897435621c..093d0bd9bd28 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -692,24 +692,54 @@ async def _expire_url_cache_data(self) -> None: def decode_and_calc_og( body: bytes, media_uri: str, request_encoding: Optional[str] = None ) -> Dict[str, Optional[str]]: + """ + Calculate metadata for an HTML document. + + This uses lxml to parse the HTML document into the OG response. If errors + occur during processing of the document, an empty response is returned. + + Params: + body: The HTML document, as bytes. + media_url: The URI used to download the body. + request_encoding: The character encoding of the body, as a string. + + Returns: + The OG response as a dictionary. + """ # If there's no body, nothing useful is going to be found. if not body: return {} from lxml import etree + # Create an HTML parser. If this fails, log and return no metadata. try: parser = etree.HTMLParser(recover=True, encoding=request_encoding) + except LookupError: + # blindly consider the encoding as utf-8. + try: + parser = etree.HTMLParser(recover=True, encoding="utf-8") + except Exception as e: + logger.warning("Unable to create fallback HTML parser: %s" % (e,)) + return {} + except Exception as e: + logger.warning("Unable to create HTML parser: %s" % (e,)) + return {} + + # Attempt to parse the body. If this fails, log and return no metadata. + try: tree = etree.fromstring(body, parser) except UnicodeDecodeError: # blindly try decoding the body as utf-8, which seems to fix # the charset mismatches on https://google.com - parser = etree.HTMLParser(recover=True, encoding=request_encoding) - tree = etree.fromstring(body.decode("utf-8", "ignore"), parser) - except LookupError: - # blindly try decoding the body as utf-8. - parser = etree.HTMLParser(recover=True, encoding="utf-8") - tree = etree.fromstring(body, parser) + try: + tree = etree.fromstring(body.decode("utf-8", "ignore"), parser) + except Exception as e: + logger.warning("Failed to parse HTML body as UTF-8: %s" % (e,)) + return {} + except Exception as e: + logger.warning("Failed to parse HTML body: %s" % (e,)) + return {} return _calc_og(tree, media_uri) From fd51df96e31c57a7280c570e2312632e137ad8f2 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Thu, 21 Jan 2021 08:54:55 -0500 Subject: [PATCH 05/10] Add tests. --- synapse/rest/media/v1/preview_url_resource.py | 10 +++---- tests/test_preview.py | 27 +++++++++++++++++++ 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 093d0bd9bd28..f7b70098c2f3 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -717,11 +717,7 @@ def decode_and_calc_og( parser = etree.HTMLParser(recover=True, encoding=request_encoding) except LookupError: # blindly consider the encoding as utf-8. - try: - parser = etree.HTMLParser(recover=True, encoding="utf-8") - except Exception as e: - logger.warning("Unable to create fallback HTML parser: %s" % (e,)) - return {} + parser = etree.HTMLParser(recover=True, encoding="utf-8") except Exception as e: logger.warning("Unable to create HTML parser: %s" % (e,)) return {} @@ -729,11 +725,13 @@ def decode_and_calc_og( # Attempt to parse the body. If this fails, log and return no metadata. try: tree = etree.fromstring(body, parser) + return _calc_og(tree, media_uri) except UnicodeDecodeError: # blindly try decoding the body as utf-8, which seems to fix # the charset mismatches on https://google.com try: tree = etree.fromstring(body.decode("utf-8", "ignore"), parser) + return _calc_og(tree, media_uri) except Exception as e: logger.warning("Failed to parse HTML body as UTF-8: %s" % (e,)) return {} @@ -741,8 +739,6 @@ def decode_and_calc_og( logger.warning("Failed to parse HTML body: %s" % (e,)) return {} - return _calc_og(tree, media_uri) - def _calc_og(tree, media_uri: str) -> Dict[str, Optional[str]]: # suck our tree into lxml and define our OG response. diff --git a/tests/test_preview.py b/tests/test_preview.py index c19facc1cb70..0bc324beca8d 100644 --- a/tests/test_preview.py +++ b/tests/test_preview.py @@ -261,3 +261,30 @@ def test_empty(self): html = "" og = decode_and_calc_og(html, "http://example.com/test.html") self.assertEqual(og, {}) + + def test_invalid_encoding(self): + """An invalid character encoding should be ignored and treated as UTF-8, if possible.""" + html = """ + + Foo + + Some text. + + + """ + og = decode_and_calc_og(html, "http://example.com/test.html", "invalid-encoding") + self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) + + def test_invalid_encoding2(self): + """A body which doesn't match the sent character encoding.""" + # Note that this contains an invalid UTF-8 sequence in the title. + html = b""" + + \xff\xff Foo + + Some text. + + + """ + og = decode_and_calc_og(html, "http://example.com/test.html") + self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."}) From 2b02f444419f4f8b674cdb3816d3972704d18dd4 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Thu, 21 Jan 2021 08:55:45 -0500 Subject: [PATCH 06/10] Typo --- changelog.d/9164.bugfix | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changelog.d/9164.bugfix b/changelog.d/9164.bugfix index 48f95be97481..1c54a256c1b5 100644 --- a/changelog.d/9164.bugfix +++ b/changelog.d/9164.bugfix @@ -1 +1 @@ -Fix a longstanding bug where an internal server error was raised when attempting to preview an HTML document in an unknown character encoding. +Fix a long-standing bug where an internal server error was raised when attempting to preview an HTML document in an unknown character encoding. From 465c8e28a57ce0c3a1314ae819f4550e891fabaa Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Thu, 21 Jan 2021 11:34:51 -0500 Subject: [PATCH 07/10] Lint. --- tests/test_preview.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_preview.py b/tests/test_preview.py index 0bc324beca8d..0c6cbbd92148 100644 --- a/tests/test_preview.py +++ b/tests/test_preview.py @@ -272,7 +272,9 @@ def test_invalid_encoding(self): """ - og = decode_and_calc_og(html, "http://example.com/test.html", "invalid-encoding") + og = decode_and_calc_og( + html, "http://example.com/test.html", "invalid-encoding" + ) self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) def test_invalid_encoding2(self): From 58abad9bbb0abf4e409ef286ea3cb63fc2653715 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Thu, 21 Jan 2021 13:57:33 -0500 Subject: [PATCH 08/10] Params -> Args --- synapse/rest/media/v1/preview_url_resource.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index f7b70098c2f3..0dbbcead0990 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -386,7 +386,7 @@ def _get_oembed_url(self, url: str) -> Optional[str]: """ Check whether the URL should be downloaded as oEmbed content instead. - Params: + Args: url: The URL to check. Returns: @@ -403,7 +403,7 @@ async def _get_oembed_content(self, endpoint: str, url: str) -> OEmbedResult: """ Request content from an oEmbed endpoint. - Params: + Args: endpoint: The oEmbed API endpoint. url: The URL to pass to the API. @@ -698,7 +698,7 @@ def decode_and_calc_og( This uses lxml to parse the HTML document into the OG response. If errors occur during processing of the document, an empty response is returned. - Params: + Args: body: The HTML document, as bytes. media_url: The URI used to download the body. request_encoding: The character encoding of the body, as a string. From 911eec628b6ace6884a544407dbc01eea6ebe90b Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Mon, 25 Jan 2021 14:37:15 -0500 Subject: [PATCH 09/10] Add a missing type hint. --- synapse/rest/media/v1/preview_url_resource.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 0dbbcead0990..84a73422329e 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -740,7 +740,7 @@ def decode_and_calc_og( return {} -def _calc_og(tree, media_uri: str) -> Dict[str, Optional[str]]: +def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]: # suck our tree into lxml and define our OG response. # if we see any image URLs in the OG response, then spider them From f629933cd040ae6534333eccc7a719404c7f5d3c Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Mon, 25 Jan 2021 14:44:11 -0500 Subject: [PATCH 10/10] Try to simplify code. --- synapse/rest/media/v1/preview_url_resource.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 84a73422329e..bf3be653aa6c 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -722,22 +722,18 @@ def decode_and_calc_og( logger.warning("Unable to create HTML parser: %s" % (e,)) return {} + def _attempt_calc_og(body_attempt: Union[bytes, str]) -> Dict[str, Optional[str]]: + # Attempt to parse the body. If this fails, log and return no metadata. + tree = etree.fromstring(body_attempt, parser) + return _calc_og(tree, media_uri) + # Attempt to parse the body. If this fails, log and return no metadata. try: - tree = etree.fromstring(body, parser) - return _calc_og(tree, media_uri) + return _attempt_calc_og(body) except UnicodeDecodeError: # blindly try decoding the body as utf-8, which seems to fix # the charset mismatches on https://google.com - try: - tree = etree.fromstring(body.decode("utf-8", "ignore"), parser) - return _calc_og(tree, media_uri) - except Exception as e: - logger.warning("Failed to parse HTML body as UTF-8: %s" % (e,)) - return {} - except Exception as e: - logger.warning("Failed to parse HTML body: %s" % (e,)) - return {} + return _attempt_calc_og(body.decode("utf-8", "ignore")) def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]: