From b65fe274fe6abc2102f237d042ca5470fd61d8bb Mon Sep 17 00:00:00 2001 From: Amber Brown Date: Thu, 8 Nov 2018 05:49:38 +1100 Subject: [PATCH 01/10] fixes --- .coveragerc | 12 ++++++++++++ synapse/rest/media/v1/media_repository.py | 16 +++++++++++++--- tox.ini | 15 +++++++++++++-- 3 files changed, 38 insertions(+), 5 deletions(-) create mode 100644 .coveragerc diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 000000000000..ca333961f3b4 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,12 @@ +[run] +branch = True +parallel = True +source = synapse + +[paths] +source= + coverage + +[report] +precision = 2 +ignore_errors = True diff --git a/synapse/rest/media/v1/media_repository.py b/synapse/rest/media/v1/media_repository.py index d6c5f07af0d4..1a05027673cd 100644 --- a/synapse/rest/media/v1/media_repository.py +++ b/synapse/rest/media/v1/media_repository.py @@ -400,9 +400,19 @@ def _download_remote_file(self, server_name, media_id, file_id): time_now_ms = self.clock.time_msec() - content_disposition = headers.get(b"Content-Disposition", None) + content_disposition = headers.get(b"Content-Disposition", b'') + + # Decode the Content-Disposition header (cgi.parse_header requires + # unicode on Python 3, not bytes) the best we can. + try: + content_disposition = content_disposition[0].decode('utf8') + except UnicodeDecodeError: + # Wasn't valid UTF-8, therefore not valid ASCII. Give up on figuring + # out what the mess they've sent is. + content_disposition = None + if content_disposition: - _, params = cgi.parse_header(content_disposition[0].decode('ascii'),) + _, params = cgi.parse_header(content_disposition) upload_name = None # First check if there is a valid UTF-8 filename @@ -421,7 +431,7 @@ def _download_remote_file(self, server_name, media_id, file_id): if PY3: upload_name = urlparse.unquote(upload_name) else: - upload_name = urlparse.unquote(upload_name.encode('ascii')) + upload_name = urlparse.unquote(upload_name.encode('utf8')) try: if isinstance(upload_name, bytes): upload_name = upload_name.decode("utf-8") diff --git a/tox.ini b/tox.ini index 920211bf50f8..eb1d9f809495 100644 --- a/tox.ini +++ b/tox.ini @@ -70,7 +70,7 @@ usedevelop=true usedevelop=true deps = {[base]deps} - psycopg2 + psycopg2 setenv = {[base]setenv} SYNAPSE_POSTGRES = 1 @@ -101,11 +101,22 @@ usedevelop=true [testenv:py36] usedevelop=true + +[testenv:py36-coverage] +usedevelop=true +deps = + {[base]deps} + coverage +commands = + /usr/bin/find "{toxinidir}" -name '*.pyc' -delete + python -m coverage run -m twisted.trial {env:TRIAL_FLAGS:} {posargs:tests} {env:TOXSUFFIX:} + + [testenv:py36-postgres] usedevelop=true deps = {[base]deps} - psycopg2 + psycopg2 setenv = {[base]setenv} SYNAPSE_POSTGRES = 1 From ac1149d282c83c0325414822cc7a37c80b67657e Mon Sep 17 00:00:00 2001 From: Amber Brown Date: Fri, 9 Nov 2018 16:27:46 -0600 Subject: [PATCH 02/10] some initial testing --- synapse/rest/media/v1/media_repository.py | 3 +- tests/rest/media/v1/test_media_storage.py | 147 ++++++++++++++++++++++ 2 files changed, 148 insertions(+), 2 deletions(-) diff --git a/synapse/rest/media/v1/media_repository.py b/synapse/rest/media/v1/media_repository.py index 1a05027673cd..f3b11a5e4a91 100644 --- a/synapse/rest/media/v1/media_repository.py +++ b/synapse/rest/media/v1/media_repository.py @@ -34,7 +34,6 @@ NotFoundError, SynapseError, ) -from synapse.http.matrixfederationclient import MatrixFederationHttpClient from synapse.metrics.background_process_metrics import run_as_background_process from synapse.util import logcontext from synapse.util.async_helpers import Linearizer @@ -62,7 +61,7 @@ class MediaRepository(object): def __init__(self, hs): self.hs = hs self.auth = hs.get_auth() - self.client = MatrixFederationHttpClient(hs) + self.client = hs.get_http_client() self.clock = hs.get_clock() self.server_name = hs.hostname self.store = hs.get_datastore() diff --git a/tests/rest/media/v1/test_media_storage.py b/tests/rest/media/v1/test_media_storage.py index a86901c2d833..e54a7a4c4290 100644 --- a/tests/rest/media/v1/test_media_storage.py +++ b/tests/rest/media/v1/test_media_storage.py @@ -14,6 +14,17 @@ # limitations under the License. +import os + +from mock import Mock + +from twisted.internet.defer import Deferred + +from synapse.config.repository import MediaStorageProviderConfig +from synapse.util.module_loader import load_module + +from tests import unittest + import os import shutil import tempfile @@ -83,3 +94,139 @@ def test_ensure_media_is_in_local_cache(self): body = f.read() self.assertEqual(test_body, body) + + +class MediaRepoTests(unittest.HomeserverTestCase): + + hijack_auth = True + user_id = "@test:user" + + def make_homeserver(self, reactor, clock): + + self.fetches = [] + + def get_file(url, output_stream, max_size): + """ + Returns tuple[int,dict,str,int] of file length, response headers, + absolute URI, and response code. + """ + + def write_to(r): + data, response = r + output_stream.write(data) + return response + + d = Deferred() + d.addCallback(write_to) + self.fetches.append((d, url)) + return d + + client = Mock() + client.get_file = get_file + + self.storage_path = self.mktemp() + os.mkdir(self.storage_path) + + config = self.default_config() + config.media_store_path = self.storage_path + + provider_config = { + "module": "synapse.rest.media.v1.storage_provider.FileStorageProviderBackend", + "store_local": True, + "store_synchronous": False, + "store_remote": True, + "config": {"directory": self.storage_path}, + } + + loaded = list(load_module(provider_config)) + [ + MediaStorageProviderConfig(False, False, False) + ] + + config.media_storage_providers = [loaded] + + hs = self.setup_test_homeserver(config=config, http_client=client) + + return hs + + def prepare(self, reactor, clock, hs): + + self.media_repo = hs.get_media_repository_resource() + self.download_resource = self.media_repo.children[b'download_resource'] + + def test_get_remote_media(self): + + request, channel = self.make_request( + "GET", "download/example.com/12345", shorthand=False + ) + request.render(self.download_resource) + self.pump() + + # We've made one fetch + self.assertEqual(len(self.fetches), 1) + + end_content = ( + b'' + b'' + b'' + b'' + ) + + self.fetches[0][0].callback( + ( + end_content, + ( + len(end_content), + { + b"Content-Length": [b"%d" % (len(end_content))], + b"Content-Type": [b'text/html; charset="utf8"'], + }, + "https://example.com", + 200, + ), + ) + ) + + self.pump() + self.assertEqual(channel.code, 200) + + print(request) + self.assertEqual( + channel.json_body, {"og:title": "~matrix~", "og:description": "hi"} + ) + + # Check the cache returns the correct response + request, channel = self.make_request( + "GET", "url_preview?url=matrix.org", shorthand=False + ) + request.render(self.preview_url) + self.pump() + + # Only one fetch, still, since we'll lean on the cache + self.assertEqual(len(self.fetches), 1) + + # Check the cache response has the same content + self.assertEqual(channel.code, 200) + self.assertEqual( + channel.json_body, {"og:title": "~matrix~", "og:description": "hi"} + ) + + # Clear the in-memory cache + self.assertIn("matrix.org", self.preview_url._cache) + self.preview_url._cache.pop("matrix.org") + self.assertNotIn("matrix.org", self.preview_url._cache) + + # Check the database cache returns the correct response + request, channel = self.make_request( + "GET", "url_preview?url=matrix.org", shorthand=False + ) + request.render(self.preview_url) + self.pump() + + # Only one fetch, still, since we'll lean on the cache + self.assertEqual(len(self.fetches), 1) + + # Check the cache response has the same content + self.assertEqual(channel.code, 200) + self.assertEqual( + channel.json_body, {"og:title": "~matrix~", "og:description": "hi"} + ) From a387992de12da43147ec67b0f2af5242dee77cd9 Mon Sep 17 00:00:00 2001 From: Amber Brown Date: Mon, 12 Nov 2018 15:37:19 -0600 Subject: [PATCH 03/10] fix --- synapse/rest/media/v1/media_repository.py | 2 +- tests/rest/media/v1/test_media_storage.py | 125 +++++++++++----------- tests/server.py | 15 +++ 3 files changed, 81 insertions(+), 61 deletions(-) diff --git a/synapse/rest/media/v1/media_repository.py b/synapse/rest/media/v1/media_repository.py index f3b11a5e4a91..acab80d968f8 100644 --- a/synapse/rest/media/v1/media_repository.py +++ b/synapse/rest/media/v1/media_repository.py @@ -399,7 +399,7 @@ def _download_remote_file(self, server_name, media_id, file_id): time_now_ms = self.clock.time_msec() - content_disposition = headers.get(b"Content-Disposition", b'') + content_disposition = headers.get(b"Content-Disposition", [b'']) # Decode the Content-Disposition header (cgi.parse_header requires # unicode on Python 3, not bytes) the best we can. diff --git a/tests/rest/media/v1/test_media_storage.py b/tests/rest/media/v1/test_media_storage.py index e54a7a4c4290..7841b7243685 100644 --- a/tests/rest/media/v1/test_media_storage.py +++ b/tests/rest/media/v1/test_media_storage.py @@ -14,29 +14,23 @@ # limitations under the License. -import os - -from mock import Mock - -from twisted.internet.defer import Deferred - -from synapse.config.repository import MediaStorageProviderConfig -from synapse.util.module_loader import load_module - -from tests import unittest - import os import shutil import tempfile +from binascii import unhexlify from mock import Mock +from six.moves.urllib import parse from twisted.internet import defer, reactor +from twisted.internet.defer import Deferred +from synapse.config.repository import MediaStorageProviderConfig from synapse.rest.media.v1._base import FileInfo from synapse.rest.media.v1.filepath import MediaFilePaths from synapse.rest.media.v1.media_storage import MediaStorage from synapse.rest.media.v1.storage_provider import FileStorageProviderBackend +from synapse.util.module_loader import load_module from tests import unittest @@ -105,7 +99,7 @@ def make_homeserver(self, reactor, clock): self.fetches = [] - def get_file(url, output_stream, max_size): + def get_file(destination, path, output_stream, args=None, max_size=None): """ Returns tuple[int,dict,str,int] of file length, response headers, absolute URI, and response code. @@ -118,7 +112,7 @@ def write_to(r): d = Deferred() d.addCallback(write_to) - self.fetches.append((d, url)) + self.fetches.append((d, destination, path, args)) return d client = Mock() @@ -129,6 +123,8 @@ def write_to(r): config = self.default_config() config.media_store_path = self.storage_path + config.thumbnail_requirements = {} + config.max_image_pixels = 2000000 provider_config = { "module": "synapse.rest.media.v1.storage_provider.FileStorageProviderBackend", @@ -151,37 +147,42 @@ def write_to(r): def prepare(self, reactor, clock, hs): self.media_repo = hs.get_media_repository_resource() - self.download_resource = self.media_repo.children[b'download_resource'] + self.download_resource = self.media_repo.children[b'download'] + + # smol png + self.end_content = unhexlify( + b"89504e470d0a1a0a0000000d4948445200000001000000010806" + b"0000001f15c4890000000a49444154789c63000100000500010d" + b"0a2db40000000049454e44ae426082" + ) - def test_get_remote_media(self): + def _req(self, content_disposition): request, channel = self.make_request( - "GET", "download/example.com/12345", shorthand=False + "GET", "example.com/12345", shorthand=False ) request.render(self.download_resource) self.pump() - # We've made one fetch + # We've made one fetch, to example.com, using the media URL, and asking + # the other server not to do a remote fetch self.assertEqual(len(self.fetches), 1) - - end_content = ( - b'' - b'' - b'' - b'' + self.assertEqual(self.fetches[0][1], "example.com") + self.assertEqual( + self.fetches[0][2], "/_matrix/media/v1/download/example.com/12345" ) + self.assertEqual(self.fetches[0][3], {"allow_remote": "false"}) self.fetches[0][0].callback( ( - end_content, + self.end_content, ( - len(end_content), + len(self.end_content), { - b"Content-Length": [b"%d" % (len(end_content))], - b"Content-Type": [b'text/html; charset="utf8"'], + b"Content-Length": [b"%d" % (len(self.end_content))], + b"Content-Type": [b'image/png'], + b"Content-Disposition": [content_disposition], }, - "https://example.com", - 200, ), ) ) @@ -189,44 +190,48 @@ def test_get_remote_media(self): self.pump() self.assertEqual(channel.code, 200) - print(request) - self.assertEqual( - channel.json_body, {"og:title": "~matrix~", "og:description": "hi"} - ) - - # Check the cache returns the correct response - request, channel = self.make_request( - "GET", "url_preview?url=matrix.org", shorthand=False - ) - request.render(self.preview_url) - self.pump() + return channel - # Only one fetch, still, since we'll lean on the cache - self.assertEqual(len(self.fetches), 1) + def test_disposition_filename_ascii(self): + """ + If the filename is filename= then Synapse will decode it as an + ASCII string, and use filename= in the response. + """ + channel = self._req(b"inline; filename=out.png") - # Check the cache response has the same content - self.assertEqual(channel.code, 200) + headers = channel.headers + self.assertEqual(headers.getRawHeaders(b"Content-Type"), [b"image/png"]) self.assertEqual( - channel.json_body, {"og:title": "~matrix~", "og:description": "hi"} + headers.getRawHeaders(b"Content-Disposition"), [b"inline; filename=out.png"] ) - # Clear the in-memory cache - self.assertIn("matrix.org", self.preview_url._cache) - self.preview_url._cache.pop("matrix.org") - self.assertNotIn("matrix.org", self.preview_url._cache) - - # Check the database cache returns the correct response - request, channel = self.make_request( - "GET", "url_preview?url=matrix.org", shorthand=False + def test_disposition_filenamestar_utf8escaped(self): + """ + If the filename is filename=*utf8'' then Synapse will + correctly decode it as the UTF-8 string, and use filename* in the + response. + """ + filename = parse.quote(u"\u2603").encode('ascii') + channel = self._req(b"inline; filename*=utf-8''" + filename + b".png") + + headers = channel.headers + self.assertEqual(headers.getRawHeaders(b"Content-Type"), [b"image/png"]) + self.assertEqual( + headers.getRawHeaders(b"Content-Disposition"), + [b"inline; filename*=utf-8''" + filename + b".png"], ) - request.render(self.preview_url) - self.pump() - # Only one fetch, still, since we'll lean on the cache - self.assertEqual(len(self.fetches), 1) + def test_disposition_filename_utf8escaped(self): + """ + If the filename is `filename=` then Synapse will correctly + decode it as the UTF-8 string, but use filename* for responses. + """ + filename = parse.quote(u"\u2603").encode('ascii') + channel = self._req(b"inline; filename=" + filename + b".png") - # Check the cache response has the same content - self.assertEqual(channel.code, 200) + headers = channel.headers + self.assertEqual(headers.getRawHeaders(b"Content-Type"), [b"image/png"]) self.assertEqual( - channel.json_body, {"og:title": "~matrix~", "og:description": "hi"} + headers.getRawHeaders(b"Content-Disposition"), + [b"inline; filename*=utf-8''" + filename + b".png"], ) diff --git a/tests/server.py b/tests/server.py index 7919a1f12498..ceec2f2d4e92 100644 --- a/tests/server.py +++ b/tests/server.py @@ -14,6 +14,8 @@ from twisted.internet.interfaces import IReactorPluggableNameResolver from twisted.python.failure import Failure from twisted.test.proto_helpers import MemoryReactorClock +from twisted.web.http import unquote +from twisted.web.http_headers import Headers from synapse.http.site import SynapseRequest from synapse.util import Clock @@ -50,6 +52,15 @@ def code(self): raise Exception("No result yet.") return int(self.result["code"]) + @property + def headers(self): + if not self.result: + raise Exception("No result yet.") + h = Headers() + for i in self.result["headers"]: + h.addRawHeader(*i) + return h + def writeHeaders(self, version, code, reason, headers): self.result["version"] = version self.result["code"] = code @@ -152,6 +163,9 @@ def make_request( path = b"/_matrix/client/r0/" + path path = path.replace(b"//", b"/") + if not path.startswith(b"/"): + path = b"/" + path + if isinstance(content, text_type): content = content.encode('utf8') @@ -161,6 +175,7 @@ def make_request( req = request(site, channel) req.process = lambda: b"" req.content = BytesIO(content) + req.postpath = list(map(unquote, path[1:].split(b'/'))) if access_token: req.requestHeaders.addRawHeader( From c15ed71022ffc5d87f70210d0771f81421117db5 Mon Sep 17 00:00:00 2001 From: Amber Brown Date: Mon, 12 Nov 2018 15:41:00 -0600 Subject: [PATCH 04/10] fix --- changelog.d/4176.bugfix | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/4176.bugfix diff --git a/changelog.d/4176.bugfix b/changelog.d/4176.bugfix new file mode 100644 index 000000000000..3846f8a27bbb --- /dev/null +++ b/changelog.d/4176.bugfix @@ -0,0 +1 @@ +The media repository now no longer fails to decode UTF-8 filenames when downloading remote media. From 37ab57a82712135e0a9fbb10d322e0a47ba32483 Mon Sep 17 00:00:00 2001 From: Amber Brown Date: Mon, 12 Nov 2018 16:46:24 -0600 Subject: [PATCH 05/10] remove code duplication and fix the preview API suffering a similar bug --- synapse/rest/media/v1/_base.py | 63 +++++++++++++++++++ synapse/rest/media/v1/media_repository.py | 55 +++------------- synapse/rest/media/v1/preview_url_resource.py | 30 +-------- 3 files changed, 75 insertions(+), 73 deletions(-) diff --git a/synapse/rest/media/v1/_base.py b/synapse/rest/media/v1/_base.py index 76e479afa3b6..dc9f387029f5 100644 --- a/synapse/rest/media/v1/_base.py +++ b/synapse/rest/media/v1/_base.py @@ -13,9 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +import cgi import logging import os +from six import PY3 from six.moves import urllib from twisted.internet import defer @@ -197,3 +199,64 @@ def __init__(self, server_name, file_id, url_cache=False, self.thumbnail_height = thumbnail_height self.thumbnail_method = thumbnail_method self.thumbnail_type = thumbnail_type + + +def get_filename_from_headers(headers): + """ + Get the filename of the downloaded file by inspecting the + Content-Disposition HTTP header. + + Args: + headers (twisted.web.http_headers.Headers): The HTTP + request headers. + + Returns: + A Unicode string of the filename, or None. + """ + content_disposition = headers.get(b"Content-Disposition", [b'']) + + # Decode the Content-Disposition header (cgi.parse_header requires + # unicode on Python 3, not bytes) the best we can. + try: + content_disposition = content_disposition[0].decode('utf8') + except UnicodeDecodeError: + # Wasn't valid UTF-8, therefore not valid ASCII. Give up on figuring + # out what the mess they've sent is. + content_disposition = None + + if not content_disposition: + return None + + _, params = cgi.parse_header(content_disposition) + upload_name = None + + # First check if there is a valid UTF-8 filename + upload_name_utf8 = params.get("filename*", None) + if upload_name_utf8: + if upload_name_utf8.lower().startswith("utf-8''"): + upload_name = upload_name_utf8[7:] + + # If there isn't check for an ascii name. + if not upload_name: + upload_name_ascii = params.get("filename", None) + if upload_name_ascii and is_ascii(upload_name_ascii): + upload_name = upload_name_ascii + + if not upload_name: + # We couldn't find a valid filename in the headers. + return None + + # Unquote the string + if PY3: + upload_name = urllib.parse.unquote(upload_name) + else: + # Needs to be bytes on Python 2 + upload_name = urllib.parse.unquote(upload_name.encode('utf8')) + + try: + if isinstance(upload_name, bytes): + upload_name = upload_name.decode("utf-8") + except UnicodeDecodeError: + upload_name = None + + return upload_name diff --git a/synapse/rest/media/v1/media_repository.py b/synapse/rest/media/v1/media_repository.py index acab80d968f8..e117836e9a7f 100644 --- a/synapse/rest/media/v1/media_repository.py +++ b/synapse/rest/media/v1/media_repository.py @@ -14,14 +14,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -import cgi import errno import logging import os import shutil -from six import PY3, iteritems -from six.moves.urllib import parse as urlparse +from six import iteritems import twisted.internet.error import twisted.web.http @@ -38,9 +36,14 @@ from synapse.util import logcontext from synapse.util.async_helpers import Linearizer from synapse.util.retryutils import NotRetryingDestination -from synapse.util.stringutils import is_ascii, random_string +from synapse.util.stringutils import random_string -from ._base import FileInfo, respond_404, respond_with_responder +from ._base import ( + FileInfo, + get_filename_from_headers, + respond_404, + respond_with_responder, +) from .config_resource import MediaConfigResource from .download_resource import DownloadResource from .filepath import MediaFilePaths @@ -396,49 +399,9 @@ def _download_remote_file(self, server_name, media_id, file_id): yield finish() media_type = headers[b"Content-Type"][0].decode('ascii') - + upload_name = get_filename_from_headers(headers) time_now_ms = self.clock.time_msec() - content_disposition = headers.get(b"Content-Disposition", [b'']) - - # Decode the Content-Disposition header (cgi.parse_header requires - # unicode on Python 3, not bytes) the best we can. - try: - content_disposition = content_disposition[0].decode('utf8') - except UnicodeDecodeError: - # Wasn't valid UTF-8, therefore not valid ASCII. Give up on figuring - # out what the mess they've sent is. - content_disposition = None - - if content_disposition: - _, params = cgi.parse_header(content_disposition) - upload_name = None - - # First check if there is a valid UTF-8 filename - upload_name_utf8 = params.get("filename*", None) - if upload_name_utf8: - if upload_name_utf8.lower().startswith("utf-8''"): - upload_name = upload_name_utf8[7:] - - # If there isn't check for an ascii name. - if not upload_name: - upload_name_ascii = params.get("filename", None) - if upload_name_ascii and is_ascii(upload_name_ascii): - upload_name = upload_name_ascii - - if upload_name: - if PY3: - upload_name = urlparse.unquote(upload_name) - else: - upload_name = urlparse.unquote(upload_name.encode('utf8')) - try: - if isinstance(upload_name, bytes): - upload_name = upload_name.decode("utf-8") - except UnicodeDecodeError: - upload_name = None - else: - upload_name = None - logger.info("Stored remote media in file %r", fname) yield self.store.store_cached_remote_media( diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 91d1dafe647e..8bf7a115ea7d 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import cgi import datetime import errno import fnmatch @@ -44,10 +43,11 @@ ) from synapse.http.servlet import parse_integer, parse_string from synapse.metrics.background_process_metrics import run_as_background_process +from synapse.rest.media.v1._base import get_filename_from_headers from synapse.util.async_helpers import ObservableDeferred from synapse.util.caches.expiringcache import ExpiringCache from synapse.util.logcontext import make_deferred_yieldable, run_in_background -from synapse.util.stringutils import is_ascii, random_string +from synapse.util.stringutils import random_string from ._base import FileInfo @@ -323,31 +323,7 @@ def _download_url(self, url, user): media_type = "application/octet-stream" time_now_ms = self.clock.time_msec() - content_disposition = headers.get(b"Content-Disposition", None) - if content_disposition: - _, params = cgi.parse_header(content_disposition[0],) - download_name = None - - # First check if there is a valid UTF-8 filename - download_name_utf8 = params.get("filename*", None) - if download_name_utf8: - if download_name_utf8.lower().startswith("utf-8''"): - download_name = download_name_utf8[7:] - - # If there isn't check for an ascii name. - if not download_name: - download_name_ascii = params.get("filename", None) - if download_name_ascii and is_ascii(download_name_ascii): - download_name = download_name_ascii - - if download_name: - download_name = urlparse.unquote(download_name) - try: - download_name = download_name.decode("utf-8") - except UnicodeDecodeError: - download_name = None - else: - download_name = None + download_name = get_filename_from_headers(headers) yield self.store.store_local_media( media_id=file_id, From bc52fe52f1522ebbc56475ac0bb173bec70b2b92 Mon Sep 17 00:00:00 2001 From: Amber Brown Date: Mon, 12 Nov 2018 16:49:09 -0600 Subject: [PATCH 06/10] fix on py2 --- tests/rest/media/v1/test_media_storage.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/rest/media/v1/test_media_storage.py b/tests/rest/media/v1/test_media_storage.py index 7841b7243685..a16f8c14b0f2 100644 --- a/tests/rest/media/v1/test_media_storage.py +++ b/tests/rest/media/v1/test_media_storage.py @@ -211,7 +211,7 @@ def test_disposition_filenamestar_utf8escaped(self): correctly decode it as the UTF-8 string, and use filename* in the response. """ - filename = parse.quote(u"\u2603").encode('ascii') + filename = parse.quote(u"\u2603".encode('utf8')).encode('ascii') channel = self._req(b"inline; filename*=utf-8''" + filename + b".png") headers = channel.headers @@ -226,7 +226,7 @@ def test_disposition_filename_utf8escaped(self): If the filename is `filename=` then Synapse will correctly decode it as the UTF-8 string, but use filename* for responses. """ - filename = parse.quote(u"\u2603").encode('ascii') + filename = parse.quote(u"\u2603".encode('utf8')).encode('ascii') channel = self._req(b"inline; filename=" + filename + b".png") headers = channel.headers From 8007cdf7970901d36d250c2e0e07aa8bc142f1b0 Mon Sep 17 00:00:00 2001 From: Amber Brown Date: Mon, 12 Nov 2018 16:49:44 -0600 Subject: [PATCH 07/10] fix packaging --- MANIFEST.in | 1 + 1 file changed, 1 insertion(+) diff --git a/MANIFEST.in b/MANIFEST.in index 25cdf0a61bd6..d0e49713daec 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -34,6 +34,7 @@ prune .github prune demo/etc prune docker prune .circleci +prune .coveragerc exclude jenkins* recursive-exclude jenkins *.sh From 4d42256deca33f8352d3f001a7d4b941bd642651 Mon Sep 17 00:00:00 2001 From: Richard van der Hoff <1389908+richvdh@users.noreply.github.com> Date: Tue, 13 Nov 2018 11:00:57 -0600 Subject: [PATCH 08/10] Update synapse/rest/media/v1/_base.py Co-Authored-By: hawkowl --- synapse/rest/media/v1/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/synapse/rest/media/v1/_base.py b/synapse/rest/media/v1/_base.py index dc9f387029f5..b8c61c3ca06d 100644 --- a/synapse/rest/media/v1/_base.py +++ b/synapse/rest/media/v1/_base.py @@ -208,7 +208,7 @@ def get_filename_from_headers(headers): Args: headers (twisted.web.http_headers.Headers): The HTTP - request headers. + request headers. Returns: A Unicode string of the filename, or None. From 446117d6f9aa731b94c83b64a568e9c209996358 Mon Sep 17 00:00:00 2001 From: Amber Brown Date: Tue, 13 Nov 2018 12:08:07 -0600 Subject: [PATCH 09/10] fixes --- synapse/rest/media/v1/_base.py | 120 ++++++++++++---------- tests/rest/media/v1/test_media_storage.py | 33 +++--- 2 files changed, 76 insertions(+), 77 deletions(-) diff --git a/synapse/rest/media/v1/_base.py b/synapse/rest/media/v1/_base.py index dc9f387029f5..d569f9220567 100644 --- a/synapse/rest/media/v1/_base.py +++ b/synapse/rest/media/v1/_base.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import cgi import logging import os @@ -50,26 +49,21 @@ def parse_media_id(request): return server_name, media_id, file_name except Exception: raise SynapseError( - 404, - "Invalid media id token %r" % (request.postpath,), - Codes.UNKNOWN, + 404, "Invalid media id token %r" % (request.postpath,), Codes.UNKNOWN ) def respond_404(request): respond_with_json( - request, 404, - cs_error( - "Not found %r" % (request.postpath,), - code=Codes.NOT_FOUND, - ), - send_cors=True + request, + 404, + cs_error("Not found %r" % (request.postpath,), code=Codes.NOT_FOUND), + send_cors=True, ) @defer.inlineCallbacks -def respond_with_file(request, media_type, file_path, - file_size=None, upload_name=None): +def respond_with_file(request, media_type, file_path, file_size=None, upload_name=None): logger.debug("Responding with %r", file_path) if os.path.isfile(file_path): @@ -99,31 +93,26 @@ def add_file_headers(request, media_type, file_size, upload_name): file_size (int): Size in bytes of the media, if known. upload_name (str): The name of the requested file, if any. """ + def _quote(x): return urllib.parse.quote(x.encode("utf-8")) request.setHeader(b"Content-Type", media_type.encode("UTF-8")) if upload_name: if is_ascii(upload_name): - disposition = ("inline; filename=%s" % (_quote(upload_name),)).encode("ascii") + disposition = "inline; filename=%s" % (_quote(upload_name),) else: - disposition = ( - "inline; filename*=utf-8''%s" % (_quote(upload_name),)).encode("ascii") + disposition = "inline; filename*=utf-8''%s" % (_quote(upload_name),) - request.setHeader(b"Content-Disposition", disposition) + request.setHeader(b"Content-Disposition", disposition.encode('ascii')) # cache for at least a day. # XXX: we might want to turn this off for data we don't want to # recommend caching as it's sensitive or private - or at least # select private. don't bother setting Expires as all our # clients are smart enough to be happy with Cache-Control - request.setHeader( - b"Cache-Control", b"public,max-age=86400,s-maxage=86400" - ) - - request.setHeader( - b"Content-Length", b"%d" % (file_size,) - ) + request.setHeader(b"Cache-Control", b"public,max-age=86400,s-maxage=86400") + request.setHeader(b"Content-Length", b"%d" % (file_size,)) @defer.inlineCallbacks @@ -155,6 +144,7 @@ class Responder(object): Responder is a context manager which *must* be used, so that any resources held can be cleaned up. """ + def write_to_consumer(self, consumer): """Stream response into consumer @@ -188,9 +178,18 @@ class FileInfo(object): thumbnail_method (str) thumbnail_type (str): Content type of thumbnail, e.g. image/png """ - def __init__(self, server_name, file_id, url_cache=False, - thumbnail=False, thumbnail_width=None, thumbnail_height=None, - thumbnail_method=None, thumbnail_type=None): + + def __init__( + self, + server_name, + file_id, + url_cache=False, + thumbnail=False, + thumbnail_width=None, + thumbnail_height=None, + thumbnail_method=None, + thumbnail_type=None, + ): self.server_name = server_name self.file_id = file_id self.url_cache = url_cache @@ -215,48 +214,55 @@ def get_filename_from_headers(headers): """ content_disposition = headers.get(b"Content-Disposition", [b'']) - # Decode the Content-Disposition header (cgi.parse_header requires - # unicode on Python 3, not bytes) the best we can. - try: - content_disposition = content_disposition[0].decode('utf8') - except UnicodeDecodeError: - # Wasn't valid UTF-8, therefore not valid ASCII. Give up on figuring - # out what the mess they've sent is. - content_disposition = None + # No header, bail out. + if not content_disposition[0]: + return - if not content_disposition: - return None + params = {} + parts = content_disposition[0].split(b";") + for i in parts: + # Split into key-value pairs, if able + if b"=" not in i: + continue + + key, value = i.strip().split(b"=") + # Store it with a decoded key and unencoded value + params[key.decode('ascii')] = value - _, params = cgi.parse_header(content_disposition) upload_name = None # First check if there is a valid UTF-8 filename upload_name_utf8 = params.get("filename*", None) if upload_name_utf8: - if upload_name_utf8.lower().startswith("utf-8''"): - upload_name = upload_name_utf8[7:] + if upload_name_utf8.lower().startswith(b"utf-8''"): + upload_name_utf8 = upload_name_utf8[7:] + if PY3: + try: + # We have a filename*= section. This MUST be ASCII, and any + # UTF-8 bytes are quoted. Once it is decoded, we can then + # unquote it strictly. + upload_name = urllib.parse.unquote( + upload_name_utf8.decode('ascii'), errors="strict" + ) + except UnicodeDecodeError: + # Incorrect UTF-8. + pass + else: + # On Python 2, we can unquote it directly, and then decode it + # strictly. + try: + upload_name = urllib.parse.unquote(upload_name_utf8).decode('utf8') + except UnicodeDecodeError: + pass # If there isn't check for an ascii name. if not upload_name: upload_name_ascii = params.get("filename", None) if upload_name_ascii and is_ascii(upload_name_ascii): - upload_name = upload_name_ascii - - if not upload_name: - # We couldn't find a valid filename in the headers. - return None - - # Unquote the string - if PY3: - upload_name = urllib.parse.unquote(upload_name) - else: - # Needs to be bytes on Python 2 - upload_name = urllib.parse.unquote(upload_name.encode('utf8')) - - try: - if isinstance(upload_name, bytes): - upload_name = upload_name.decode("utf-8") - except UnicodeDecodeError: - upload_name = None + # Make sure there's no percent-escaped bytes. If there is, reject it + # as non-valid ASCII. + if b"%" not in upload_name_ascii: + upload_name = upload_name_ascii.decode('ascii') + # This may be None here, indicating we did not find a matching name. return upload_name diff --git a/tests/rest/media/v1/test_media_storage.py b/tests/rest/media/v1/test_media_storage.py index a16f8c14b0f2..fd131e3454fd 100644 --- a/tests/rest/media/v1/test_media_storage.py +++ b/tests/rest/media/v1/test_media_storage.py @@ -173,18 +173,15 @@ def _req(self, content_disposition): ) self.assertEqual(self.fetches[0][3], {"allow_remote": "false"}) + headers = { + b"Content-Length": [b"%d" % (len(self.end_content))], + b"Content-Type": [b'image/png'], + } + if content_disposition: + headers[b"Content-Disposition"] = [content_disposition] + self.fetches[0][0].callback( - ( - self.end_content, - ( - len(self.end_content), - { - b"Content-Length": [b"%d" % (len(self.end_content))], - b"Content-Type": [b'image/png'], - b"Content-Disposition": [content_disposition], - }, - ), - ) + (self.end_content, (len(self.end_content), headers)) ) self.pump() @@ -221,17 +218,13 @@ def test_disposition_filenamestar_utf8escaped(self): [b"inline; filename*=utf-8''" + filename + b".png"], ) - def test_disposition_filename_utf8escaped(self): + def test_disposition_none(self): """ - If the filename is `filename=` then Synapse will correctly - decode it as the UTF-8 string, but use filename* for responses. + If there is no filename, one isn't passed on in the Content-Disposition + of the request. """ - filename = parse.quote(u"\u2603".encode('utf8')).encode('ascii') - channel = self._req(b"inline; filename=" + filename + b".png") + channel = self._req(None) headers = channel.headers self.assertEqual(headers.getRawHeaders(b"Content-Type"), [b"image/png"]) - self.assertEqual( - headers.getRawHeaders(b"Content-Disposition"), - [b"inline; filename*=utf-8''" + filename + b".png"], - ) + self.assertEqual(headers.getRawHeaders(b"Content-Disposition"), None) From 92415bc597f469f0e3dfa492dd9bbd3ceeda4049 Mon Sep 17 00:00:00 2001 From: Amber Brown Date: Thu, 15 Nov 2018 15:09:29 -0600 Subject: [PATCH 10/10] fix comments --- synapse/rest/media/v1/_base.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/synapse/rest/media/v1/_base.py b/synapse/rest/media/v1/_base.py index 83f0408aeb5b..efe42a429d76 100644 --- a/synapse/rest/media/v1/_base.py +++ b/synapse/rest/media/v1/_base.py @@ -218,15 +218,17 @@ def get_filename_from_headers(headers): if not content_disposition[0]: return + # dict of unicode: bytes, corresponding to the key value sections of the + # Content-Disposition header. params = {} parts = content_disposition[0].split(b";") for i in parts: # Split into key-value pairs, if able + # We don't care about things like `inline`, so throw it out if b"=" not in i: continue key, value = i.strip().split(b"=") - # Store it with a decoded key and unencoded value params[key.decode('ascii')] = value upload_name = None @@ -236,11 +238,12 @@ def get_filename_from_headers(headers): if upload_name_utf8: if upload_name_utf8.lower().startswith(b"utf-8''"): upload_name_utf8 = upload_name_utf8[7:] + # We have a filename*= section. This MUST be ASCII, and any UTF-8 + # bytes are %-quoted. if PY3: try: - # We have a filename*= section. This MUST be ASCII, and any - # UTF-8 bytes are quoted. Once it is decoded, we can then - # unquote it strictly. + # Once it is decoded, we can then unquote the %-encoded + # parts strictly into a unicode string. upload_name = urllib.parse.unquote( upload_name_utf8.decode('ascii'), errors="strict" ) @@ -248,8 +251,8 @@ def get_filename_from_headers(headers): # Incorrect UTF-8. pass else: - # On Python 2, we can unquote it directly, and then decode it - # strictly. + # On Python 2, we first unquote the %-encoded parts and then + # decode it strictly using UTF-8. try: upload_name = urllib.parse.unquote(upload_name_utf8).decode('utf8') except UnicodeDecodeError: @@ -259,8 +262,8 @@ def get_filename_from_headers(headers): if not upload_name: upload_name_ascii = params.get("filename", None) if upload_name_ascii and is_ascii(upload_name_ascii): - # Make sure there's no percent-escaped bytes. If there is, reject it - # as non-valid ASCII. + # Make sure there's no %-quoted bytes. If there is, reject it as + # non-valid ASCII. if b"%" not in upload_name_ascii: upload_name = upload_name_ascii.decode('ascii')