Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Make oEmbed globs configurable and extract more info from the response #10392

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions synapse/config/repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,8 @@ def read_config(self, config, **kwargs):
"url_preview_accept_language"
) or ["en"]

self.oembed_globs = config.get("oembed_globs", {})

def generate_config_section(self, data_dir_path, **kwargs):
media_store = os.path.join(data_dir_path, "media_store")

Expand Down Expand Up @@ -366,6 +368,24 @@ def generate_config_section(self, data_dir_path, **kwargs):
#
url_preview_accept_language:
# - en

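# A map of oEmbed API endpoints to the URL globs that should be previewed
# through that endpoint. Each glob must use the http or https scheme. A '*'
# in the host matches one or more letters, digits, underscores or hyphens;
# anywhere else in the URL it matches one or more characters.
#
# Defaults to an empty map (no oEmbed endpoints).
#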
# oembed_globs:
# "https://publish.twitter.com/oembed":
# - https://twitter.com/*/status/*
# - https://*.twitter.com/*/status/*
# - https://twitter.com/*/moments/*
# - https://*.twitter.com/*/moments/*
# # Include the HTTP versions too.
# - http://twitter.com/*/status/*
# - http://*.twitter.com/*/status/*
# - http://twitter.com/*/moments/*
# - http://*.twitter.com/*/moments/*
# "https://www.youtube.com/oembed":
# - https://*.youtube.com/watch*
# - https://*.youtube.com/v/*
# - https://youtu.be/*
# - https://*.youtube.com/playlist?list=*
"""
% locals()
)
102 changes: 56 additions & 46 deletions synapse/rest/media/v1/preview_url_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,56 +69,13 @@

# One hour, expressed in milliseconds.
ONE_HOUR = 60 * 60 * 1000

# Maps each oEmbed API endpoint to the URL globs that are served by it.
_oembed_globs = {
    # Twitter.
    "https://publish.twitter.com/oembed": [
        "https://twitter.com/*/status/*",
        "https://*.twitter.com/*/status/*",
        "https://twitter.com/*/moments/*",
        "https://*.twitter.com/*/moments/*",
        # Include the HTTP versions too.
        "http://twitter.com/*/status/*",
        "http://*.twitter.com/*/status/*",
        "http://twitter.com/*/moments/*",
        "http://*.twitter.com/*/moments/*",
    ],
}


def _glob_to_pattern(url_glob):
    """Compile a URL glob into a regular expression.

    The rules differ slightly between the domain and the rest of the URL:

    1. The scheme must be one of HTTP / HTTPS (and have no globs).
    2. The domain can have globs, but a glob is limited to characters that
       can reasonably be a domain part.
       TODO: This does not attempt to handle Unicode domain names.
    3. In the other parts of the URL a glob matches one or more characters.

    Args:
        url_glob: The glob to convert, e.g. "https://*.example.com/watch?v=*".

    Returns:
        The compiled pattern; use ``fullmatch`` to test a URL against it.

    Raises:
        ValueError: If the scheme of the glob is not "http" or "https".
    """
    results = urlparse.urlparse(url_glob)

    # Ensure the scheme does not have wildcards (and is a sane scheme).
    if results.scheme not in {"http", "https"}:
        raise ValueError("Insecure oEmbed glob scheme: %s" % (results.scheme,))

    netloc = re.escape(results.netloc).replace("\\*", "[a-zA-Z0-9_-]+")
    path, params, query, fragment = (
        re.escape(part).replace("\\*", ".+") for part in results[2:]
    )

    # Reassemble the URL by hand rather than with urlunparse: urlunparse would
    # re-insert the "?" query delimiter unescaped, and in a regular expression
    # that makes the preceding character optional instead of matching a
    # literal "?".
    pieces = [results.scheme, "://", netloc, path]
    for delimiter, component in ((";", params), ("?", query), ("#", fragment)):
        if component:
            pieces.append(re.escape(delimiter) + component)

    return re.compile("".join(pieces))


# Convert the globs to regular expressions, keyed by compiled pattern.
_oembed_patterns = {}
for endpoint, globs in _oembed_globs.items():
    for glob in globs:
        _oembed_patterns[_glob_to_pattern(glob)] = endpoint


@attr.s(slots=True)
class OEmbedResult:
# The pieces of an oEmbed response that the URL previewer keeps.
#
# The fields are declared in the order in which the constructor is called
# positionally, so inserting or reordering fields requires updating callers.
#
# Either HTML content or URL must be provided.
html = attr.ib(type=Optional[str])
url = attr.ib(type=Optional[str])
# Title reported by the oEmbed response; the caller passes the response's
# author_name instead when the response has no "title" key.
title = attr.ib(type=Optional[str])
# The "provider_name" of the oEmbed response; used as the preview's site name
# when the page itself supplies no og:site_name.
provider_name = attr.ib(type=Optional[str])
# Number of seconds to cache the content.
cache_age = attr.ib(type=int)

Expand Down Expand Up @@ -167,6 +124,34 @@ def __init__(
self.url_preview_url_blacklist = hs.config.url_preview_url_blacklist
self.url_preview_accept_language = hs.config.url_preview_accept_language

# Convert the globs to regular expressions.
self.oembed_patterns = {}
for endpoint, globs in hs.config.oembed_globs.items():
for glob in globs:
# Convert the glob into a sane regular expression to match against. The
# rules followed will be slightly different for the domain portion vs.
# the rest.
#
# 1. The scheme must be one of HTTP / HTTPS (and have no globs).
# 2. The domain can have globs, but we limit it to characters that can
# reasonably be a domain part.
# TODO: This does not attempt to handle Unicode domain names.
# 3. In the other parts of the URL a glob matches one or more characters.
results = urlparse.urlparse(glob)

# Ensure the scheme does not have wildcards (and is a sane scheme).
if results.scheme not in {"http", "https"}:
raise ValueError("Insecure oEmbed glob scheme: %s" % (results.scheme,))

pattern = urlparse.urlunparse(
[
results.scheme,
re.escape(results.netloc).replace("\\*", "[a-zA-Z0-9_-]+"),
]
+ [re.escape(part).replace("\\*", ".+") for part in results[2:]]
)
self.oembed_patterns[re.compile(pattern)] = endpoint

# memory cache mapping urls to an ObservableDeferred returning
# JSON-encoded OG metadata
self._cache = ExpiringCache(
Expand Down Expand Up @@ -340,6 +325,15 @@ async def _do_preview(self, url: str, user: str, ts: int) -> bytes:
logger.warning("Failed to find any OG data in %s", url)
og = {}

if not og.get("og:url"):
og["og:url"] = url

if not og.get("og:title") and media_info["title"]:
og["og:title"] = media_info["title"]

if not og.get("og:site_name") and media_info["site_name"]:
og["og:site_name"] = media_info["site_name"]

# filter out any stupidly long values
keys_to_remove = []
for k, v in og.items():
Expand Down Expand Up @@ -379,7 +373,7 @@ def _get_oembed_url(self, url: str) -> Optional[str]:
Returns:
A URL to use instead or None if the original URL should be used.
"""
for url_pattern, endpoint in _oembed_patterns.items():
for url_pattern, endpoint in self.oembed_patterns.items():
if url_pattern.fullmatch(url):
return endpoint

Expand Down Expand Up @@ -420,7 +414,13 @@ async def _get_oembed_content(self, endpoint: str, url: str) -> OEmbedResult:
if cache_age:
cache_age = int(cache_age)

oembed_result = OEmbedResult(None, None, result.get("title"), cache_age)
oembed_result = OEmbedResult(
None,
None,
result.get("title", result.get("author_name")),
result.get("provider_name"),
cache_age,
)

# HTML content.
if oembed_type == "rich":
Expand Down Expand Up @@ -462,6 +462,7 @@ async def _download_url(self, url: str, user: str) -> Dict[str, Any]:
# If this URL can be accessed via oEmbed, use that instead.
url_to_download = url # type: Optional[str]
oembed_url = self._get_oembed_url(url)
oembed_result = None
if oembed_url:
# The result might be a new URL to download, or it might be HTML content.
try:
Expand Down Expand Up @@ -542,6 +543,12 @@ async def _download_url(self, url: str, user: str) -> Dict[str, Any]:
code = 200
etag = None

title = None
site_name = None
if oembed_result:
title = oembed_result.title
site_name = oembed_result.provider_name

try:
time_now_ms = self.clock.time_msec()

Expand Down Expand Up @@ -573,6 +580,9 @@ async def _download_url(self, url: str, user: str) -> Dict[str, Any]:
"response_code": code,
"expires": expires,
"etag": etag,

"title": title,
"site_name": site_name,
}

def _start_expire_url_cache_data(self):
Expand Down