Skip to content
This repository has been archived by the owner on Dec 13, 2023. It is now read-only.

Fix Content-Disposition in media repository #4176

Merged
merged 12 commits into from Nov 15, 2018
1 change: 1 addition & 0 deletions changelog.d/4176.bugfix
@@ -0,0 +1 @@
The media repository no longer fails to decode UTF-8 filenames when downloading remote media.
119 changes: 94 additions & 25 deletions synapse/rest/media/v1/_base.py
Expand Up @@ -16,6 +16,7 @@
import logging
import os

from six import PY3
from six.moves import urllib

from twisted.internet import defer
Expand Down Expand Up @@ -48,26 +49,21 @@ def parse_media_id(request):
return server_name, media_id, file_name
except Exception:
raise SynapseError(
404,
"Invalid media id token %r" % (request.postpath,),
Codes.UNKNOWN,
404, "Invalid media id token %r" % (request.postpath,), Codes.UNKNOWN
)


def respond_404(request):
respond_with_json(
request, 404,
cs_error(
"Not found %r" % (request.postpath,),
code=Codes.NOT_FOUND,
),
send_cors=True
request,
404,
cs_error("Not found %r" % (request.postpath,), code=Codes.NOT_FOUND),
send_cors=True,
)


@defer.inlineCallbacks
def respond_with_file(request, media_type, file_path,
file_size=None, upload_name=None):
def respond_with_file(request, media_type, file_path, file_size=None, upload_name=None):
logger.debug("Responding with %r", file_path)

if os.path.isfile(file_path):
Expand Down Expand Up @@ -97,31 +93,26 @@ def add_file_headers(request, media_type, file_size, upload_name):
file_size (int): Size in bytes of the media, if known.
upload_name (str): The name of the requested file, if any.
"""

def _quote(x):
return urllib.parse.quote(x.encode("utf-8"))

request.setHeader(b"Content-Type", media_type.encode("UTF-8"))
if upload_name:
if is_ascii(upload_name):
disposition = ("inline; filename=%s" % (_quote(upload_name),)).encode("ascii")
disposition = "inline; filename=%s" % (_quote(upload_name),)
else:
disposition = (
"inline; filename*=utf-8''%s" % (_quote(upload_name),)).encode("ascii")
disposition = "inline; filename*=utf-8''%s" % (_quote(upload_name),)

request.setHeader(b"Content-Disposition", disposition)
request.setHeader(b"Content-Disposition", disposition.encode('ascii'))

# cache for at least a day.
# XXX: we might want to turn this off for data we don't want to
# recommend caching as it's sensitive or private - or at least
# select private. don't bother setting Expires as all our
# clients are smart enough to be happy with Cache-Control
request.setHeader(
b"Cache-Control", b"public,max-age=86400,s-maxage=86400"
)

request.setHeader(
b"Content-Length", b"%d" % (file_size,)
)
request.setHeader(b"Cache-Control", b"public,max-age=86400,s-maxage=86400")
request.setHeader(b"Content-Length", b"%d" % (file_size,))


@defer.inlineCallbacks
Expand Down Expand Up @@ -153,6 +144,7 @@ class Responder(object):
Responder is a context manager which *must* be used, so that any resources
held can be cleaned up.
"""

def write_to_consumer(self, consumer):
"""Stream response into consumer

Expand Down Expand Up @@ -186,9 +178,18 @@ class FileInfo(object):
thumbnail_method (str)
thumbnail_type (str): Content type of thumbnail, e.g. image/png
"""
def __init__(self, server_name, file_id, url_cache=False,
thumbnail=False, thumbnail_width=None, thumbnail_height=None,
thumbnail_method=None, thumbnail_type=None):

def __init__(
self,
server_name,
file_id,
url_cache=False,
thumbnail=False,
thumbnail_width=None,
thumbnail_height=None,
thumbnail_method=None,
thumbnail_type=None,
):
self.server_name = server_name
self.file_id = file_id
self.url_cache = url_cache
Expand All @@ -197,3 +198,71 @@ def __init__(self, server_name, file_id, url_cache=False,
self.thumbnail_height = thumbnail_height
self.thumbnail_method = thumbnail_method
self.thumbnail_type = thumbnail_type


def _split_outside_quotes(raw, separator):
    """
    Split a bytestring on a single-byte separator, ignoring any separator
    which appears inside a double-quoted section.

    Args:
        raw (bytes): The bytestring to split.
        separator (bytes): The single byte to split on.

    Returns:
        list[bytes]: The pieces, in order. Quotes and surrounding whitespace
        are left in place.
    """
    pieces = []
    piece_start = 0
    in_quotes = False
    pos = 0
    while pos < len(raw):
        # Slice rather than index: indexing bytes gives an int on Python 3
        # but a one-character str on Python 2.
        char = raw[pos:pos + 1]
        if in_quotes and char == b"\\":
            # A backslash escapes the next byte inside a quoted section, so
            # an escaped quote must not end the section.
            pos += 2
            continue
        if char == b'"':
            in_quotes = not in_quotes
        elif char == separator and not in_quotes:
            pieces.append(raw[piece_start:pos])
            piece_start = pos + 1
        pos += 1
    pieces.append(raw[piece_start:])
    return pieces


def _parse_disposition_params(header_value):
    """
    Parse the ``key=value`` parameters of a raw Content-Disposition value.

    Args:
        header_value (bytes): The raw header value, e.g.
            ``b'inline; filename="foo.txt"'``.

    Returns:
        dict[unicode, bytes]: Map from the lower-cased parameter name (as a
        unicode string) to the raw value bytes, with any surrounding double
        quotes removed. The values are *not* %-decoded or charset-decoded.
    """
    params = {}
    for part in _split_outside_quotes(header_value, b";"):
        # Partition on the first "=" only: the value itself may contain "=".
        name, equals, value = part.partition(b"=")
        if not equals:
            # Not a key-value pair (e.g. the leading "inline"/"attachment").
            continue

        try:
            # Parameter names are case-insensitive, so normalise them.
            key = name.strip().decode("ascii").lower()
        except UnicodeDecodeError:
            # A non-ASCII parameter name can't be one we care about.
            continue

        value = value.strip()
        if len(value) >= 2 and value[:1] == b'"' and value[-1:] == b'"':
            # Strip the quotes and undo backslash escaping, the same way
            # the stdlib's cgi.parse_header does.
            value = value[1:-1].replace(b"\\\\", b"\\").replace(b'\\"', b'"')

        params[key] = value
    return params


def get_filename_from_headers(headers):
    """
    Get the filename of the downloaded file by inspecting the
    Content-Disposition HTTP header.

    Args:
        headers: The HTTP headers, as a mapping from the header name (bytes)
            to a list of raw header values (bytes). Only ``.get()`` is used.

    Returns:
        A Unicode string of the filename, or None.
    """
    header_values = headers.get(b"Content-Disposition", [b""])

    # No header (or an empty one), bail out.
    if not header_values or not header_values[0]:
        return None

    # Map from unicode parameter name to the raw bytes of its value.
    params = _parse_disposition_params(header_values[0])

    upload_name = None

    # First check if there is a valid UTF-8 filename. A "filename*" value
    # MUST be ASCII, with any UTF-8 bytes %-encoded.
    extended_name = params.get("filename*")
    if extended_name and extended_name.lower().startswith(b"utf-8''"):
        extended_name = extended_name[7:]
        try:
            if PY3:
                # First decode the ASCII bytes to a str, then we can
                # %-decode it strictly as UTF-8.
                upload_name = urllib.parse.unquote(
                    extended_name.decode("ascii"), errors="strict"
                )
            else:
                # On Python 2, we can %-decode it directly, and then decode
                # the UTF-8 bytes to a unicode strictly.
                upload_name = urllib.parse.unquote(extended_name).decode("utf8")
        except UnicodeDecodeError:
            # Not ASCII on the wire, or not valid UTF-8 once %-decoded.
            pass

    # If there isn't one, check for a plain ASCII name.
    if not upload_name:
        plain_name = params.get("filename")
        # Make sure there are no %-escaped bytes. If there are, reject the
        # name as not being plain ASCII.
        if plain_name and b"%" not in plain_name:
            try:
                upload_name = plain_name.decode("ascii")
            except UnicodeDecodeError:
                pass

    # This may be None here, indicating we did not find a matching name.
    return upload_name
48 changes: 10 additions & 38 deletions synapse/rest/media/v1/media_repository.py
Expand Up @@ -14,14 +14,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import cgi
import errno
import logging
import os
import shutil

from six import PY3, iteritems
from six.moves.urllib import parse as urlparse
from six import iteritems

import twisted.internet.error
import twisted.web.http
Expand All @@ -34,14 +32,18 @@
NotFoundError,
SynapseError,
)
from synapse.http.matrixfederationclient import MatrixFederationHttpClient
from synapse.metrics.background_process_metrics import run_as_background_process
from synapse.util import logcontext
from synapse.util.async_helpers import Linearizer
from synapse.util.retryutils import NotRetryingDestination
from synapse.util.stringutils import is_ascii, random_string
from synapse.util.stringutils import random_string

from ._base import FileInfo, respond_404, respond_with_responder
from ._base import (
FileInfo,
get_filename_from_headers,
respond_404,
respond_with_responder,
)
from .config_resource import MediaConfigResource
from .download_resource import DownloadResource
from .filepath import MediaFilePaths
Expand All @@ -62,7 +64,7 @@ class MediaRepository(object):
def __init__(self, hs):
self.hs = hs
self.auth = hs.get_auth()
self.client = MatrixFederationHttpClient(hs)
self.client = hs.get_http_client()
self.clock = hs.get_clock()
self.server_name = hs.hostname
self.store = hs.get_datastore()
Expand Down Expand Up @@ -397,39 +399,9 @@ def _download_remote_file(self, server_name, media_id, file_id):
yield finish()

media_type = headers[b"Content-Type"][0].decode('ascii')

upload_name = get_filename_from_headers(headers)
time_now_ms = self.clock.time_msec()

content_disposition = headers.get(b"Content-Disposition", None)
if content_disposition:
_, params = cgi.parse_header(content_disposition[0].decode('ascii'),)
upload_name = None

# First check if there is a valid UTF-8 filename
upload_name_utf8 = params.get("filename*", None)
if upload_name_utf8:
if upload_name_utf8.lower().startswith("utf-8''"):
upload_name = upload_name_utf8[7:]

# If there isn't check for an ascii name.
if not upload_name:
upload_name_ascii = params.get("filename", None)
if upload_name_ascii and is_ascii(upload_name_ascii):
upload_name = upload_name_ascii

if upload_name:
if PY3:
upload_name = urlparse.unquote(upload_name)
else:
upload_name = urlparse.unquote(upload_name.encode('ascii'))
try:
if isinstance(upload_name, bytes):
upload_name = upload_name.decode("utf-8")
except UnicodeDecodeError:
upload_name = None
else:
upload_name = None

logger.info("Stored remote media in file %r", fname)

yield self.store.store_cached_remote_media(
Expand Down
30 changes: 3 additions & 27 deletions synapse/rest/media/v1/preview_url_resource.py
Expand Up @@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import cgi
import datetime
import errno
import fnmatch
Expand Down Expand Up @@ -44,10 +43,11 @@
)
from synapse.http.servlet import parse_integer, parse_string
from synapse.metrics.background_process_metrics import run_as_background_process
from synapse.rest.media.v1._base import get_filename_from_headers
from synapse.util.async_helpers import ObservableDeferred
from synapse.util.caches.expiringcache import ExpiringCache
from synapse.util.logcontext import make_deferred_yieldable, run_in_background
from synapse.util.stringutils import is_ascii, random_string
from synapse.util.stringutils import random_string

from ._base import FileInfo

Expand Down Expand Up @@ -336,31 +336,7 @@ def _download_url(self, url, user):
media_type = "application/octet-stream"
time_now_ms = self.clock.time_msec()

content_disposition = headers.get(b"Content-Disposition", None)
if content_disposition:
_, params = cgi.parse_header(content_disposition[0],)
download_name = None

# First check if there is a valid UTF-8 filename
download_name_utf8 = params.get("filename*", None)
if download_name_utf8:
if download_name_utf8.lower().startswith("utf-8''"):
download_name = download_name_utf8[7:]

# If there isn't check for an ascii name.
if not download_name:
download_name_ascii = params.get("filename", None)
if download_name_ascii and is_ascii(download_name_ascii):
download_name = download_name_ascii

if download_name:
download_name = urlparse.unquote(download_name)
try:
download_name = download_name.decode("utf-8")
except UnicodeDecodeError:
download_name = None
else:
download_name = None
download_name = get_filename_from_headers(headers)

yield self.store.store_local_media(
media_id=file_id,
Expand Down