Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Clean-up logic for rebasing URLs #12219

Merged
merged 7 commits into from
Mar 16, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/12219.misc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Clean-up logic around rebasing URLs for URL image previews.
39 changes: 2 additions & 37 deletions synapse/rest/media/v1/preview_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
import logging
import re
from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Set, Union
from urllib import parse as urlparse

if TYPE_CHECKING:
from lxml import etree
Expand Down Expand Up @@ -144,9 +143,7 @@ def decode_body(
return etree.fromstring(body, parser)


def parse_html_to_open_graph(
tree: "etree.Element", media_uri: str
) -> Dict[str, Optional[str]]:
def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
"""
Parse the HTML document into an Open Graph response.

clokep marked this conversation as resolved.
Show resolved Hide resolved
Expand All @@ -155,7 +152,6 @@ def parse_html_to_open_graph(

Args:
tree: The parsed HTML document.
media_url: The URI used to download the body.

Returns:
The Open Graph response as a dictionary.
Expand Down Expand Up @@ -209,7 +205,7 @@ def parse_html_to_open_graph(
"//*/meta[translate(@itemprop, 'IMAGE', 'image')='image']/@content"
)
if meta_image:
og["og:image"] = rebase_url(meta_image[0], media_uri)
og["og:image"] = meta_image[0]
else:
# TODO: consider inlined CSS styles as well as width & height attribs
images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]")
Expand Down Expand Up @@ -320,37 +316,6 @@ def _iterate_over_text(
)


def rebase_url(url: str, base: str) -> str:
"""
Resolves a potentially relative `url` against an absolute `base` URL.

For example:

>>> rebase_url("subpage", "https://example.com/foo/")
'https://example.com/foo/subpage'
>>> rebase_url("sibling", "https://example.com/foo")
'https://example.com/sibling'
>>> rebase_url("/bar", "https://example.com/foo/")
'https://example.com/bar'
>>> rebase_url("https://alice.com/a/", "https://example.com/foo/")
'https://alice.com/a'
"""
base_parts = urlparse.urlparse(base)
# Convert the parsed URL to a list for (potential) modification.
url_parts = list(urlparse.urlparse(url))
# Add a scheme, if one does not exist.
if not url_parts[0]:
url_parts[0] = base_parts.scheme or "http"
# Fix up the hostname, if this is not a data URL.
if url_parts[0] != "data" and not url_parts[1]:
url_parts[1] = base_parts.netloc
# If the path does not start with a /, nest it under the base path's last
# directory.
if not url_parts[2].startswith("/"):
url_parts[2] = re.sub(r"/[^/]+$", "/", base_parts.path) + url_parts[2]
return urlparse.urlunparse(url_parts)


def summarize_paragraphs(
text_nodes: Iterable[str], min_size: int = 200, max_size: int = 500
) -> Optional[str]:
Expand Down
23 changes: 12 additions & 11 deletions synapse/rest/media/v1/preview_url_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
import sys
import traceback
from typing import TYPE_CHECKING, BinaryIO, Iterable, Optional, Tuple
from urllib import parse as urlparse
from urllib.parse import urljoin, urlparse, urlsplit
from urllib.request import urlopen

import attr
Expand All @@ -44,11 +44,7 @@
from synapse.rest.media.v1._base import get_filename_from_headers
from synapse.rest.media.v1.media_storage import MediaStorage
from synapse.rest.media.v1.oembed import OEmbedProvider
from synapse.rest.media.v1.preview_html import (
decode_body,
parse_html_to_open_graph,
rebase_url,
)
from synapse.rest.media.v1.preview_html import decode_body, parse_html_to_open_graph
from synapse.types import JsonDict, UserID
from synapse.util import json_encoder
from synapse.util.async_helpers import ObservableDeferred
Expand Down Expand Up @@ -187,7 +183,7 @@ async def _async_render_GET(self, request: SynapseRequest) -> None:
ts = self.clock.time_msec()

# XXX: we could move this into _do_preview if we wanted.
url_tuple = urlparse.urlsplit(url)
url_tuple = urlsplit(url)
for entry in self.url_preview_url_blacklist:
match = True
for attrib in entry:
Expand Down Expand Up @@ -322,7 +318,7 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes:

# Parse Open Graph information from the HTML in case the oEmbed
# response failed or is incomplete.
og_from_html = parse_html_to_open_graph(tree, media_info.uri)
og_from_html = parse_html_to_open_graph(tree)

# Compile the Open Graph response by using the scraped
# information from the HTML and overlaying any information
Expand Down Expand Up @@ -588,12 +584,17 @@ async def _precache_image_url(
if "og:image" not in og or not og["og:image"]:
return

# The image URL from the HTML might be relative to the previewed page,
# convert it to an URL which can be requested directly.
image_url = og["og:image"]
url_parts = urlparse(image_url)
if url_parts.scheme != "data":
image_url = urljoin(media_info.uri, image_url)

# FIXME: it might be cleaner to use the same flow as the main /preview_url
# request itself and benefit from the same caching etc. But for now we
# just rely on the caching on the master request to speed things up.
image_info = await self._handle_url(
rebase_url(og["og:image"], media_info.uri), user, allow_data_urls=True
)
image_info = await self._handle_url(image_url, user, allow_data_urls=True)

if _is_media(image_info.media_type):
# TODO: make sure we don't choke on white-on-transparent images
Expand Down
54 changes: 11 additions & 43 deletions tests/rest/media/v1/test_html_preview.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
_get_html_media_encodings,
decode_body,
parse_html_to_open_graph,
rebase_url,
summarize_paragraphs,
)

Expand Down Expand Up @@ -161,7 +160,7 @@ def test_simple(self) -> None:
"""

tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)

self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})

Expand All @@ -177,7 +176,7 @@ def test_comment(self) -> None:
"""

tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)

self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})

Expand All @@ -196,7 +195,7 @@ def test_comment2(self) -> None:
"""

tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)

self.assertEqual(
og,
Expand All @@ -218,7 +217,7 @@ def test_script(self) -> None:
"""

tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)

self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})

Expand All @@ -232,7 +231,7 @@ def test_missing_title(self) -> None:
"""

tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)

self.assertEqual(og, {"og:title": None, "og:description": "Some text."})

Expand All @@ -247,7 +246,7 @@ def test_h1_as_title(self) -> None:
"""

tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)

self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."})

Expand All @@ -262,7 +261,7 @@ def test_missing_title_and_broken_h1(self) -> None:
"""

tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)

self.assertEqual(og, {"og:title": None, "og:description": "Some text."})

Expand Down Expand Up @@ -290,7 +289,7 @@ def test_xml(self) -> None:
<head><title>Foo</title></head><body>Some text.</body></html>
""".strip()
tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})

def test_invalid_encoding(self) -> None:
Expand All @@ -304,7 +303,7 @@ def test_invalid_encoding(self) -> None:
</html>
"""
tree = decode_body(html, "http://example.com/test.html", "invalid-encoding")
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})

def test_invalid_encoding2(self) -> None:
Expand All @@ -319,7 +318,7 @@ def test_invalid_encoding2(self) -> None:
</html>
"""
tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)
self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})

def test_windows_1252(self) -> None:
Expand All @@ -333,7 +332,7 @@ def test_windows_1252(self) -> None:
</html>
"""
tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)
self.assertEqual(og, {"og:title": "ó", "og:description": "Some text."})


Expand Down Expand Up @@ -448,34 +447,3 @@ def test_unknown_invalid(self) -> None:
'text/html; charset="invalid"',
)
self.assertEqual(list(encodings), ["utf-8", "cp1252"])


class RebaseUrlTestCase(unittest.TestCase):
    def test_relative(self) -> None:
        """Relative URLs should be resolved based on the context of the base URL."""
        cases = [
            ("subpage", "https://example.com/foo/", "https://example.com/foo/subpage"),
            ("sibling", "https://example.com/foo", "https://example.com/sibling"),
            ("/bar", "https://example.com/foo/", "https://example.com/bar"),
        ]
        for url, base, expected in cases:
            self.assertEqual(rebase_url(url, base), expected)

    def test_absolute(self) -> None:
        """Absolute URLs should not be modified."""
        absolute = "https://alice.com/a/"
        self.assertEqual(
            rebase_url(absolute, "https://example.com/foo/"),
            absolute,
        )

    def test_data(self) -> None:
        """Data URLs should not be modified."""
        data_url = "data:,Hello%2C%20World%21"
        self.assertEqual(
            rebase_url(data_url, "https://example.com/foo/"),
            data_url,
        )