Join GitHub today
GitHub is home to over 20 million developers working together to host and review code, manage projects, and build software together.
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
Already on GitHub? Sign in to your account
URL previewing support #688
Conversation
ara4n
added some commits
Jan 24, 2016
ara4n
assigned
NegativeMjark
Apr 3, 2016
ara4n
referenced this pull request
in vector-im/riot-web
Apr 3, 2016
Closed
Embed URL previews using opengraph tags and similar #184
NegativeMjark
commented on an outdated diff
Apr 4, 2016
| + del og["og:image"] | ||
| + | ||
| + if 'og:description' not in og: | ||
| + meta_description = tree.xpath( | ||
| + "//*/meta" | ||
| + "[translate(@name, 'DESCRIPTION', 'description')='description']" | ||
| + "/@content") | ||
| + if meta_description: | ||
| + og['og:description'] = meta_description[0] | ||
| + else: | ||
| + # text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | " | ||
| + # "//p/text() | //div/text() | //span/text() | //a/text()") | ||
| + text_nodes = tree.xpath("//text()[not(ancestor::header | ancestor::nav | " | ||
| + "ancestor::aside | ancestor::footer | " | ||
| + "ancestor::script | ancestor::style)]" + | ||
| + "[ancestor::body]") |
|
|
NegativeMjark
and 1 other
commented on an outdated diff
Apr 4, 2016
| + | ||
| + # store OG in history-aware DB cache | ||
| + yield self.store.store_url_cache( | ||
| + url, | ||
| + media_info["response_code"], | ||
| + media_info["etag"], | ||
| + media_info["expires"], | ||
| + json.dumps(og), | ||
| + media_info["filesystem_id"], | ||
| + media_info["created_ts"], | ||
| + ) | ||
| + | ||
| + respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True) | ||
| + except: | ||
| + # XXX: if we don't explicitly respond here, the request never returns. | ||
| + # isn't this what server.py's wrapper is meant to be doing for us? |
ara4n
Owner
|
NegativeMjark
commented on the diff
Apr 4, 2016
| + def _rebase_url(self, url, base): | ||
| + base = list(urlparse(base)) | ||
| + url = list(urlparse(url)) | ||
| + if not url[0]: # fix up schema | ||
| + url[0] = base[0] or "http" | ||
| + if not url[1]: # fix up hostname | ||
| + url[1] = base[1] | ||
| + if not url[2].startswith('/'): | ||
| + url[2] = re.sub(r'/[^/]+$', '/', base[2]) + url[2] | ||
| + return urlunparse(url) | ||
| + | ||
| + @defer.inlineCallbacks | ||
| + def _download_url(self, url, user): | ||
| + # TODO: we should probably honour robots.txt... except in practice | ||
| + # we're most likely being explicitly triggered by a human rather than a | ||
| + # bot, so are we really a robot? |
NegativeMjark
Contributor
|
NegativeMjark
commented on an outdated diff
Apr 4, 2016
| @@ -50,6 +50,59 @@ def store_local_media(self, media_id, media_type, time_now_ms, upload_name, | ||
| desc="store_local_media", | ||
| ) | ||
| + def get_url_cache(self, url, ts): | ||
| + """Get the media_id and ts for a cached URL as of the given timestamp | ||
| + Returns: | ||
| + None if the URL isn't cached. | ||
| + """ | ||
| + def get_url_cache_txn(txn): | ||
| + # get the most recently cached result (relative to the given ts) | ||
| + sql = ( | ||
| + "SELECT response_code, etag, expires, og, media_id, max(download_ts)" |
NegativeMjark
Contributor
|
NegativeMjark
commented on an outdated diff
Apr 4, 2016
| + # get the most recently cached result (relative to the given ts) | ||
| + sql = ( | ||
| + "SELECT response_code, etag, expires, og, media_id, max(download_ts)" | ||
| + " FROM local_media_repository_url_cache" | ||
| + " WHERE url = ? AND download_ts <= ?" | ||
| + ) | ||
| + txn.execute(sql, (url, ts)) | ||
| + row = txn.fetchone() | ||
| + | ||
| + if not row[3]: | ||
| + # ...or if we've requested a timestamp older than the oldest | ||
| + # copy in the cache, return the oldest copy (if any) | ||
| + sql = ( | ||
| + "SELECT response_code, etag, expires, og, media_id, min(download_ts)" | ||
| + " FROM local_media_repository_url_cache" | ||
| + " WHERE url = ? AND download_ts > ?" |
NegativeMjark
Contributor
|
NegativeMjark
commented on an outdated diff
Apr 4, 2016
| +# you may not use this file except in compliance with the License. | ||
| +# You may obtain a copy of the License at | ||
| +# | ||
| +# http://www.apache.org/licenses/LICENSE-2.0 | ||
| +# | ||
| +# Unless required by applicable law or agreed to in writing, software | ||
| +# distributed under the License is distributed on an "AS IS" BASIS, | ||
| +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| +# See the License for the specific language governing permissions and | ||
| +# limitations under the License. | ||
| + | ||
| +from .base_resource import BaseMediaResource | ||
| + | ||
| +from twisted.web.server import NOT_DONE_YET | ||
| +from twisted.internet import defer | ||
| +from lxml import html |
NegativeMjark
Contributor
|
|
I think you need to run |
|
Can we make the entire thing optional somehow? We probably can't run it by default anyway given that it needs an IP blacklist. |
NegativeMjark
commented on an outdated diff
Apr 4, 2016
| + @defer.inlineCallbacks | ||
| + def _async_render_GET(self, request): | ||
| + | ||
| + try: | ||
| + # XXX: if get_user_by_req fails, what should we do in an async render? | ||
| + requester = yield self.auth.get_user_by_req(request) | ||
| + url = request.args.get("url")[0] | ||
| + if "ts" in request.args: | ||
| + ts = int(request.args.get("ts")[0]) | ||
| + else: | ||
| + ts = self.clock.time_msec() | ||
| + | ||
| + # first check the memory cache - good to handle all the clients on this | ||
| + # HS thundering away to preview the same URL at the same time. | ||
| + try: | ||
| + og = self.cache[url] |
|
|
NegativeMjark
commented on an outdated diff
Apr 4, 2016
| + | ||
| + if dims: | ||
| + og["og:image:width"] = dims['width'] | ||
| + og["og:image:height"] = dims['height'] | ||
| + else: | ||
| + logger.warn("Couldn't get dims for %s" % url) | ||
| + | ||
| + # define our OG response for this media | ||
| + elif self._is_html(media_info['media_type']): | ||
| + # TODO: somehow stop a big HTML tree from exploding synapse's RAM | ||
| + | ||
| + try: | ||
| + tree = html.parse(media_info['filename']) | ||
| + og = yield self._calc_og(tree, media_info, requester) | ||
| + except UnicodeDecodeError: | ||
| + # XXX: evil evil bodge |
|
|
ara4n
added some commits
Apr 7, 2016
|
incorporate all the PR feedback - @NegativeMjark PTAL |
ara4n
and others
added some commits
Apr 8, 2016
NegativeMjark
commented on an outdated diff
Apr 8, 2016
| +import ujson as json | ||
| + | ||
| +import logging | ||
| +logger = logging.getLogger(__name__) | ||
| + | ||
| +try: | ||
| + from lxml import html | ||
| +except ImportError: | ||
| + pass | ||
| + | ||
| + | ||
| +class PreviewUrlResource(BaseMediaResource): | ||
| + isLeaf = True | ||
| + | ||
| + def __init__(self, hs, filepaths): | ||
| + if not html: |
|
|
NegativeMjark
commented on an outdated diff
Apr 8, 2016
| +import logging | ||
| +logger = logging.getLogger(__name__) | ||
| + | ||
| +try: | ||
| + from lxml import html | ||
| +except ImportError: | ||
| + pass | ||
| + | ||
| + | ||
| +class PreviewUrlResource(BaseMediaResource): | ||
| + isLeaf = True | ||
| + | ||
| + def __init__(self, hs, filepaths): | ||
| + if not html: | ||
| + logger.warn("Disabling PreviewUrlResource as lxml not available") | ||
| + raise |
|
|
|
@NegativeMjark addressed these too, and now throwing sensible exceptions. PTAL |
NegativeMjark
commented on an outdated diff
Apr 11, 2016
| + isLeaf = True | ||
| + | ||
| + def __init__(self, hs, filepaths): | ||
| + try: | ||
| + if html: | ||
| + pass | ||
| + except: | ||
| + raise RunTimeError("Disabling PreviewUrlResource as lxml not available") | ||
| + | ||
| + if not hasattr(hs.config, "url_preview_ip_range_blacklist"): | ||
| + logger.warn( | ||
| + "For security, you must specify an explicit target IP address " | ||
| + "blacklist in url_preview_ip_range_blacklist for url previewing " | ||
| + "to work" | ||
| + ) | ||
| + raise RunTimeError( |
NegativeMjark
Contributor
|
NegativeMjark
commented on an outdated diff
Apr 11, 2016
| + self.cache[url] = og | ||
| + | ||
| + # store OG in history-aware DB cache | ||
| + yield self.store.store_url_cache( | ||
| + url, | ||
| + media_info["response_code"], | ||
| + media_info["etag"], | ||
| + media_info["expires"], | ||
| + json.dumps(og), | ||
| + media_info["filesystem_id"], | ||
| + media_info["created_ts"], | ||
| + ) | ||
| + | ||
| + respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True) | ||
| + except Exception as e: | ||
| + raise e |
|
|
|
Other than fixing the typos and style warnings, it LGTM. I'm slightly concerned by the lack of tests for it though. |
ara4n commented Apr 3, 2016
- `SpiderHttpClient` derived from `SimpleHttpClient`, which follows redirects and handles gzip CTE correctly
- `get_file` support added to `SimpleHttpClient`, knowingly duplicated for now from matrixfederationclient
- `preview_url_resource` to implement the new media/r0/preview_url API. This parses the page with `lxml`, returning the metadata as a JSON blob
- `local_media_repository_url_cache` table added to the DB for the on-disk URL cache
- `get_url_cache` and `store_url_cache` added to `media_repository.py` to wrap the new table

N.B. that following redirects will not work correctly until https://twistedmatrix.com/trac/ticket/8265 is merged. Unsure if it's worth maintaining our own Twisted fork until that happens.
Given I'm hardly a python/twisted expert, review would be particularly appreciated on:
This is part of a set of PRs spanning vector-web, matrix-react-sdk, matrix-js-sdk and synapse.
See also vector-im/vector-web#1343 and matrix-org/matrix-react-sdk#260 and matrix-org/matrix-js-sdk#122