Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Try to recover from unknown media encodings. #9164

Merged
merged 11 commits into from Jan 26, 2021
Merged
1 change: 1 addition & 0 deletions changelog.d/9164.bugfix
@@ -0,0 +1 @@
Fix a long-standing bug where an internal server error was raised when attempting to preview an HTML document in an unknown character encoding.
44 changes: 34 additions & 10 deletions synapse/rest/media/v1/preview_url_resource.py
Expand Up @@ -386,7 +386,7 @@ def _get_oembed_url(self, url: str) -> Optional[str]:
"""
Check whether the URL should be downloaded as oEmbed content instead.

Params:
Args:
url: The URL to check.

Returns:
Expand All @@ -403,7 +403,7 @@ async def _get_oembed_content(self, endpoint: str, url: str) -> OEmbedResult:
"""
Request content from an oEmbed endpoint.

Params:
Args:
endpoint: The oEmbed API endpoint.
url: The URL to pass to the API.

Expand Down Expand Up @@ -692,27 +692,51 @@ async def _expire_url_cache_data(self) -> None:
def decode_and_calc_og(
body: bytes, media_uri: str, request_encoding: Optional[str] = None
) -> Dict[str, Optional[str]]:
"""
Calculate metadata for an HTML document.

This uses lxml to parse the HTML document into the OG response. If errors
occur during processing of the document, an empty response is returned.

Args:
body: The HTML document, as bytes.
media_url: The URI used to download the body.
request_encoding: The character encoding of the body, as a string.

Returns:
The OG response as a dictionary.
"""
# If there's no body, nothing useful is going to be found.
if not body:
return {}

from lxml import etree

# Create an HTML parser. If this fails, log and return no metadata.
try:
parser = etree.HTMLParser(recover=True, encoding=request_encoding)
tree = etree.fromstring(body, parser)
og = _calc_og(tree, media_uri)
except LookupError:
# blindly consider the encoding as utf-8.
parser = etree.HTMLParser(recover=True, encoding="utf-8")
except Exception as e:
logger.warning("Unable to create HTML parser: %s" % (e,))
return {}

def _attempt_calc_og(body_attempt: Union[bytes, str]) -> Dict[str, Optional[str]]:
# Attempt to parse the body. If this fails, log and return no metadata.
tree = etree.fromstring(body_attempt, parser)
return _calc_og(tree, media_uri)

# Attempt to parse the body. If this fails, log and return no metadata.
try:
return _attempt_calc_og(body)
except UnicodeDecodeError:
# blindly try decoding the body as utf-8, which seems to fix
# the charset mismatches on https://google.com
parser = etree.HTMLParser(recover=True, encoding=request_encoding)
tree = etree.fromstring(body.decode("utf-8", "ignore"), parser)
og = _calc_og(tree, media_uri)

return og
return _attempt_calc_og(body.decode("utf-8", "ignore"))


def _calc_og(tree, media_uri: str) -> Dict[str, Optional[str]]:
def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]:
# suck our tree into lxml and define our OG response.

# if we see any image URLs in the OG response, then spider them
Expand Down
29 changes: 29 additions & 0 deletions tests/test_preview.py
Expand Up @@ -261,3 +261,32 @@ def test_empty(self):
html = ""
og = decode_and_calc_og(html, "http://example.com/test.html")
self.assertEqual(og, {})

def test_invalid_encoding(self):
"""An invalid character encoding should be ignored and treated as UTF-8, if possible."""
html = """
<html>
<head><title>Foo</title></head>
<body>
Some text.
</body>
</html>
"""
og = decode_and_calc_og(
html, "http://example.com/test.html", "invalid-encoding"
)
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})

def test_invalid_encoding2(self):
"""A body which doesn't match the sent character encoding."""
# Note that this contains an invalid UTF-8 sequence in the title.
html = b"""
<html>
<head><title>\xff\xff Foo</title></head>
<body>
Some text.
</body>
</html>
"""
og = decode_and_calc_og(html, "http://example.com/test.html")
self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})