Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Improve URL previews for sites with only Twitter card information #13056

Merged
merged 6 commits into from Jun 16, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/13056.feature
@@ -0,0 +1 @@
Improve URL previews for sites which only provide Twitter Card metadata, e.g. LWN.net.
112 changes: 95 additions & 17 deletions synapse/rest/media/v1/preview_html.py
Expand Up @@ -15,7 +15,16 @@
import itertools
import logging
import re
from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Set, Union
from typing import (
TYPE_CHECKING,
Callable,
Dict,
Generator,
Iterable,
Optional,
Set,
Union,
)

if TYPE_CHECKING:
from lxml import etree
Expand Down Expand Up @@ -146,6 +155,70 @@ def decode_body(
return etree.fromstring(body, parser)


def _get_meta_tags(
    tree: "etree.Element",
    property: str,
    prefix: str,
    property_mapper: Optional[Callable[[str], Optional[str]]] = None,
) -> Dict[str, Optional[str]]:
    """
    Collect the non-empty ``<meta>`` tags whose naming attribute starts with
    ``<prefix>:``.

    Args:
        tree: The parsed HTML document.
        property: The attribute of the meta tag that holds the tag name, e.g.
            "property" for Open Graph.
        prefix: The namespace to look for (without the trailing colon), e.g.
            "og" for Open Graph.
        property_mapper: An optional callable applied to each tag name before
            it is stored, e.g. to translate it to its Open Graph form. If it
            returns None, that tag is dropped.

    Returns:
        A map of (possibly mapped) tag name to the tag's content. If a tag
        name occurs more than once the last occurrence wins. An empty map is
        returned if the page has too many matching tags.
    """
    # Only match tags which carry a non-empty content attribute.
    query = (
        f"//*/meta[starts-with(@{property}, '{prefix}:')]"
        "[@content][not(@content='')]"
    )

    found: Dict[str, Optional[str]] = {}
    for element in tree.xpath(query):
        # A page with this many tags is not worth trusting: discard everything
        # collected so far rather than returning a partial result.
        if len(found) >= 50:
            logger.warning(
                "Skipping parsing of Open Graph for page with too many '%s:' tags",
                prefix,
            )
            return {}

        name = element.attrib[property]
        mapped = property_mapper(name) if property_mapper else name
        # The mapper signals "ignore this tag" by returning None.
        if mapped is None:
            continue

        found[mapped] = element.attrib["content"]

    return found


def _map_twitter_to_open_graph(key: str) -> Optional[str]:
    """
    Translate a Twitter Card property name into its Open Graph equivalent.

    Args:
        key: The Twitter Card property (starts with "twitter:").

    Returns:
        The matching Open Graph property (starts with "og:"), or None if the
        property has no Open Graph counterpart and should be ignored.
    """
    # These properties have no analogous Open Graph property.
    if key in ("twitter:card", "twitter:creator"):
        return None

    # The site handle is the closest thing to Open Graph's site name; every
    # other property keeps its suffix and just swaps the leading "twitter"
    # (7 characters) for "og".
    return "og:site_name" if key == "twitter:site" else f"og{key[7:]}"


def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
"""
Parse the HTML document into an Open Graph response.
Expand All @@ -160,10 +233,8 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
The Open Graph response as a dictionary.
"""

# if we see any image URLs in the OG response, then spider them
# (although the client could choose to do this by asking for previews of those
# URLs to avoid DoSing the server)

# Search for Open Graph (og:) meta tags, e.g.:
#
# "og:type" : "video",
# "og:url" : "https://www.youtube.com/watch?v=LXDBoHyjmtw",
# "og:site_name" : "YouTube",
Expand All @@ -176,26 +247,33 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
# "og:video:height" : "720",
# "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3",

og: Dict[str, Optional[str]] = {}
for tag in tree.xpath(
"//*/meta[starts-with(@property, 'og:')][@content][not(@content='')]"
):
# if we've got more than 50 tags, someone is taking the piss
if len(og) >= 50:
logger.warning("Skipping OG for page with too many 'og:' tags")
return {}

og[tag.attrib["property"]] = tag.attrib["content"]

# TODO: grab article: meta tags too, e.g.:
og = _get_meta_tags(tree, "property", "og")

# TODO: Search for properties specific to the different Open Graph types,
# such as article: meta tags, e.g.:
#
# "article:publisher" : "https://www.facebook.com/thethudonline" />
# "article:author" content="https://www.facebook.com/thethudonline" />
# "article:tag" content="baby" />
# "article:section" content="Breaking News" />
# "article:published_time" content="2016-03-31T19:58:24+00:00" />
# "article:modified_time" content="2016-04-01T18:31:53+00:00" />

# Search for Twitter Card (twitter:) meta tags, e.g.:
#
# "twitter:site" : "@matrixdotorg"
# "twitter:creator" : "@matrixdotorg"
#
# Twitter Card tags also duplicate Open Graph tags.
#
# See https://developer.twitter.com/en/docs/twitter-for-websites/cards/guides/getting-started
twitter = _get_meta_tags(tree, "name", "twitter", _map_twitter_to_open_graph)
# Merge the Twitter values with the Open Graph values, but do not overwrite
# information from Open Graph tags.
for key, value in twitter.items():
if key not in og:
og[key] = value

if "og:title" not in og:
# Attempt to find a title from the title tag, or the biggest header on the page.
title = tree.xpath("((//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1])/text()")
Expand Down
41 changes: 41 additions & 0 deletions tests/rest/media/v1/test_html_preview.py
Expand Up @@ -370,6 +370,47 @@ def test_windows_1252(self) -> None:
og = parse_html_to_open_graph(tree)
self.assertEqual(og, {"og:title": "ó", "og:description": "Some text."})

def test_twitter_tag(self) -> None:
    """Twitter Card tags are used as a fallback, but never override Open Graph tags."""
    url = "http://example.com/test.html"

    # A page with only Twitter Card metadata: the description and site are
    # expected under their Open Graph names, and "twitter:card" is dropped.
    twitter_only = b"""
<html>
<meta name="twitter:card" content="summary">
<meta name="twitter:description" content="Description">
<meta name="twitter:site" content="@matrixdotorg">
</html>
"""
    expected_from_twitter = {
        "og:title": None,
        "og:description": "Description",
        "og:site_name": "@matrixdotorg",
    }
    self.assertEqual(
        parse_html_to_open_graph(decode_body(twitter_only, url)),
        expected_from_twitter,
    )

    # A page with both kinds of metadata: wherever the two overlap, the
    # Open Graph value is expected to win.
    twitter_and_og = b"""
<html>
<meta name="twitter:card" content="summary">
<meta name="twitter:description" content="Description">
<meta property="og:description" content="Real Description">
<meta name="twitter:site" content="@matrixdotorg">
<meta property="og:site_name" content="matrix.org">
</html>
"""
    expected_from_og = {
        "og:title": None,
        "og:description": "Real Description",
        "og:site_name": "matrix.org",
    }
    self.assertEqual(
        parse_html_to_open_graph(decode_body(twitter_and_og, url)),
        expected_from_og,
    )


class MediaEncodingTestCase(unittest.TestCase):
def test_meta_charset(self) -> None:
Expand Down