From 99095d800965a7807cf731f5f2a2e508fb00460f Mon Sep 17 00:00:00 2001 From: dklimpel <5740567+dklimpel@users.noreply.github.com> Date: Mon, 20 Feb 2023 09:43:00 +0100 Subject: [PATCH 1/8] Add media_ids to export command --- docs/usage/administration/admin_faq.md | 33 ++++++++++++++++---------- synapse/app/admin_cmd.py | 10 ++++++++ synapse/handlers/admin.py | 27 +++++++++++++++++++++ tests/handlers/test_admin.py | 24 +++++++++++++++++++ 4 files changed, 81 insertions(+), 13 deletions(-) diff --git a/docs/usage/administration/admin_faq.md b/docs/usage/administration/admin_faq.md index 7a2774119964..ad12297661c7 100644 --- a/docs/usage/administration/admin_faq.md +++ b/docs/usage/administration/admin_faq.md @@ -70,12 +70,19 @@ output-directory │ ├───state │ ├───invite_state │ └───knock_state -└───user_data - ├───connections - ├───devices - └───profile +├───user_data +│ ├───connections +│ ├───devices +│ └───profile +└───media_ids + └─── ``` +The `media_ids` folder contains only the metadata of the media, not the media itself. +Furthermore, only the `media_ids` that synapse manages itself are exported. +If another media repository (eg. [matrix-media-repo](https://github.com/turt2live/matrix-media-repo)) +is used, the data must be exported at this one. + Manually resetting passwords --- Users can reset their password through their client. Alternatively, a server admin @@ -84,7 +91,7 @@ can reset a user's password using the [admin API](../../admin_api/user_admin_api I have a problem with my server. Can I just delete my database and start again? --- -Deleting your database is unlikely to make anything better. +Deleting your database is unlikely to make anything better. It's easy to make the mistake of thinking that you can start again from a clean slate by dropping your database, but things don't work like that in a federated @@ -99,7 +106,7 @@ Come and seek help in https://matrix.to/#/#synapse:matrix.org. 
There are two exceptions when it might be sensible to delete your database and start again: * You have *never* joined any rooms which are federated with other servers. For -instance, a local deployment which the outside world can't talk to. +instance, a local deployment which the outside world can't talk to. * You are changing the `server_name` in the homeserver configuration. In effect this makes your server a completely new one from the point of view of the network, so in this case it makes sense to start with a clean database. @@ -112,7 +119,7 @@ Using the following curl command: curl -H 'Authorization: Bearer ' -X DELETE https://matrix.org/_matrix/client/r0/directory/room/ ``` `` - can be obtained in riot by looking in the riot settings, down the bottom is: -Access Token:\ +Access Token:\ `` - the room alias, eg. #my_room:matrix.org this possibly needs to be URL encoded also, for example %23my_room%3Amatrix.org @@ -149,13 +156,13 @@ What are the biggest rooms on my server? --- ```sql -SELECT s.canonical_alias, g.room_id, count(*) AS num_rows -FROM - state_groups_state AS g, - room_stats_state AS s -WHERE g.room_id = s.room_id +SELECT s.canonical_alias, g.room_id, count(*) AS num_rows +FROM + state_groups_state AS g, + room_stats_state AS s +WHERE g.room_id = s.room_id GROUP BY s.canonical_alias, g.room_id -ORDER BY num_rows desc +ORDER BY num_rows desc LIMIT 10; ``` diff --git a/synapse/app/admin_cmd.py b/synapse/app/admin_cmd.py index fe7afb94755e..98f8ddb9fe56 100644 --- a/synapse/app/admin_cmd.py +++ b/synapse/app/admin_cmd.py @@ -44,6 +44,7 @@ ) from synapse.storage.databases.main.events_worker import EventsWorkerStore from synapse.storage.databases.main.filtering import FilteringWorkerStore +from synapse.storage.databases.main.media_repository import MediaRepositoryStore from synapse.storage.databases.main.profile import ProfileWorkerStore from synapse.storage.databases.main.push_rule import PushRulesWorkerStore from synapse.storage.databases.main.receipts 
import ReceiptsWorkerStore @@ -86,6 +87,7 @@ class AdminCmdSlavedStore( RegistrationWorkerStore, RoomWorkerStore, ProfileWorkerStore, + MediaRepositoryStore, ): def __init__( self, @@ -222,6 +224,14 @@ def write_connections(self, connections: List[JsonDict]) -> None: with open(connection_file, "a") as f: print(json.dumps(connection), file=f) + def write_media_id(self, media_id: str, media_metadata: JsonDict) -> None: + file_directory = os.path.join(self.base_directory, "media_ids") + os.makedirs(file_directory, exist_ok=True) + media_id_file = os.path.join(file_directory, media_id) + + with open(media_id_file, "w") as f: + json.dumps(media_metadata, fp=f) + def finished(self) -> str: return self.base_directory diff --git a/synapse/handlers/admin.py b/synapse/handlers/admin.py index b03c214b145a..af92e712fd48 100644 --- a/synapse/handlers/admin.py +++ b/synapse/handlers/admin.py @@ -263,6 +263,21 @@ async def export_user_data(self, user_id: str, writer: "ExfiltrationWriter") -> connections["devices"][""]["sessions"][0]["connections"] ) + # Get all media ids the user has + limit = 100 + start = 0 + while True: + media_ids, total = await self.store.get_local_media_by_user_paginate( + start, limit, user_id + ) + for media in media_ids: + writer.write_media_id(media.media_id, media) + + logger.info("Written %d media_ids of %s", (start + len(media_ids)), total) + if (start + limit) >= total: + break + start += limit + return writer.finished() @@ -340,6 +355,18 @@ def write_connections(self, connections: List[JsonDict]) -> None: """ raise NotImplementedError() + @abc.abstractmethod + def write_media_id(self, media_id: str, media_metadata: JsonDict) -> None: + """Write the media's metadata of a user. + Exports only the metadata, as this can be fetched from the database via + read only. In order to access the files, a connection to the correct + media repository would be required. + + Args: + media_id: ID of the media. + media_metadata: Metadata of one media file. 
+ """ + @abc.abstractmethod def finished(self) -> Any: """Called when all data has successfully been exported and written. diff --git a/tests/handlers/test_admin.py b/tests/handlers/test_admin.py index 6f300b8e1119..ebef8a059be2 100644 --- a/tests/handlers/test_admin.py +++ b/tests/handlers/test_admin.py @@ -296,3 +296,27 @@ def test_connections(self) -> None: self.assertEqual(args[0][0]["user_agent"], "user_agent") self.assertGreater(args[0][0]["last_seen"], 0) self.assertNotIn("access_token", args[0][0]) + + def test_media_ids(self) -> None: + """Tests that media's metadata get exported.""" + + self.get_success( + self._store.store_local_media( + media_id="media_1", + media_type="image/png", + time_now_ms=self.clock.time_msec(), + upload_name=None, + media_length=50, + user_id=self.user2, + ) + ) + self.get_success( + self._store.store_local_media( + media_id="media_2", + media_type="image/png", + time_now_ms=self.clock.time_msec(), + upload_name=None, + media_length=50, + user_id=self.user2, + ) + ) From 95fe7e75eee0362362199448896ffc2ccbe71e62 Mon Sep 17 00:00:00 2001 From: dklimpel <5740567+dklimpel@users.noreply.github.com> Date: Mon, 20 Feb 2023 11:36:15 +0100 Subject: [PATCH 2/8] add test --- synapse/handlers/admin.py | 4 ++-- tests/handlers/test_admin.py | 27 ++++++++++++++++----------- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/synapse/handlers/admin.py b/synapse/handlers/admin.py index 549b1ebf3511..1e6d8b5c670e 100644 --- a/synapse/handlers/admin.py +++ b/synapse/handlers/admin.py @@ -274,11 +274,11 @@ async def export_user_data(self, user_id: str, writer: "ExfiltrationWriter") -> limit = 100 start = 0 while True: - media_ids, total = await self.store.get_local_media_by_user_paginate( + media_ids, total = await self._store.get_local_media_by_user_paginate( start, limit, user_id ) for media in media_ids: - writer.write_media_id(media.media_id, media) + writer.write_media_id(media["media_id"], media) logger.info("Written %d media_ids 
of %s", (start + len(media_ids)), total) if (start + limit) >= total: diff --git a/tests/handlers/test_admin.py b/tests/handlers/test_admin.py index b341909661ca..5569ccef8aef 100644 --- a/tests/handlers/test_admin.py +++ b/tests/handlers/test_admin.py @@ -23,6 +23,7 @@ from synapse.api.room_versions import RoomVersions from synapse.rest.client import knock, login, room from synapse.server import HomeServer +from synapse.types import UserID from synapse.util import Clock from tests import unittest @@ -334,16 +335,20 @@ def test_media_ids(self) -> None: time_now_ms=self.clock.time_msec(), upload_name=None, media_length=50, - user_id=self.user2, - ) - ) - self.get_success( - self._store.store_local_media( - media_id="media_2", - media_type="image/png", - time_now_ms=self.clock.time_msec(), - upload_name=None, - media_length=50, - user_id=self.user2, + user_id=UserID.from_string(self.user2), ) ) + + writer = Mock() + + self.get_success(self.admin_handler.export_user_data(self.user2, writer)) + + writer.write_media_id.assert_called_once() + + args = writer.write_media_id.call_args[0] + self.assertEqual(args[0], "media_1") + self.assertEqual(args[1]["media_id"], "media_1") + self.assertEqual(args[1]["media_length"], 50) + self.assertGreater(args[1]["created_ts"], 0) + self.assertIsNone(args[1]["upload_name"]) + self.assertIsNone(args[1]["last_access_ts"]) From 01a4fca1d538fdc4e178907c3c617042043e6127 Mon Sep 17 00:00:00 2001 From: dklimpel <5740567+dklimpel@users.noreply.github.com> Date: Mon, 20 Feb 2023 12:33:53 +0100 Subject: [PATCH 3/8] docs and logging --- docs/usage/administration/admin_faq.md | 33 ++++++++++++++++++++++++++ synapse/app/admin_cmd.py | 2 +- synapse/handlers/admin.py | 6 ++++- 3 files changed, 39 insertions(+), 2 deletions(-) diff --git a/docs/usage/administration/admin_faq.md b/docs/usage/administration/admin_faq.md index d94e4494e412..1fdb94b33fba 100644 --- a/docs/usage/administration/admin_faq.md +++ b/docs/usage/administration/admin_faq.md @@ 
-86,6 +86,39 @@ Furthermore, only the `media_ids` that synapse manages itself are exported. If another media repository (eg. [matrix-media-repo](https://github.com/turt2live/matrix-media-repo)) is used, the data must be exported at this one. +With the `media_ids` the media files can be downloaded. +Media that have been sent in encrypted rooms are only retrieved in encrypted form. +The following script can help with this: + +```bash +#!/usr/bin/env bash + +# Parameters +# +# source_directory: Directory which contains the export with the media_ids. +# target_directory: Directory into which all files are to be downloaded. +# repository_url: Address of the media repository resp. media worker +# serverName: Name of the server (`server_name` from homeserver.yaml) +# +# Example: ./download_media.sh /tmp/export_data/media_ids/ /tmp/export_data/media_files/ http://localhost:8008 matrix.org + +source_directory=$1 +target_directory=$2 +repository_url=$3 +serverName=$4 + +mkdir -p $target_directory + +for file in $source_directory/*; do + filename=$(basename ${file}) + url=$repository_url/_matrix/media/v3/download/$serverName/$filename + echo "Downloading $filename - $url" + if ! wget -o /dev/null -P $target_directory $url; then + echo "Could not download $filename" + fi +done +``` + Manually resetting passwords --- Users can reset their password through their client. 
Alternatively, a server admin diff --git a/synapse/app/admin_cmd.py b/synapse/app/admin_cmd.py index ecfec448cb3f..a0e0db37766e 100644 --- a/synapse/app/admin_cmd.py +++ b/synapse/app/admin_cmd.py @@ -243,7 +243,7 @@ def write_media_id(self, media_id: str, media_metadata: JsonDict) -> None: media_id_file = os.path.join(file_directory, media_id) with open(media_id_file, "w") as f: - json.dumps(media_metadata, fp=f) + json.dump(media_metadata, fp=f) def finished(self) -> str: return self.base_directory diff --git a/synapse/handlers/admin.py b/synapse/handlers/admin.py index 1e6d8b5c670e..d26234661fab 100644 --- a/synapse/handlers/admin.py +++ b/synapse/handlers/admin.py @@ -252,16 +252,19 @@ async def export_user_data(self, user_id: str, writer: "ExfiltrationWriter") -> profile = await self.get_user(UserID.from_string(user_id)) if profile is not None: writer.write_profile(profile) + logger.info("[%s] Written profile", user_id) # Get all devices the user has devices = await self._device_handler.get_devices_by_user(user_id) writer.write_devices(devices) + logger.info("[%s] Written %s devices", user_id, len(devices)) # Get all connections the user has connections = await self.get_whois(UserID.from_string(user_id)) writer.write_connections( connections["devices"][""]["sessions"][0]["connections"] ) + logger.info("[%s] Written %s connections", user_id, len(connections)) # Get all account data the user has global and in rooms global_data = await self._store.get_global_account_data_for_user(user_id) @@ -269,6 +272,7 @@ async def export_user_data(self, user_id: str, writer: "ExfiltrationWriter") -> writer.write_account_data("global", global_data) for room_id in by_room_data: writer.write_account_data(room_id, by_room_data[room_id]) + logger.info("[%s] Written account data for %s rooms", user_id, len(by_room_data)) # Get all media ids the user has limit = 100 @@ -280,7 +284,7 @@ async def export_user_data(self, user_id: str, writer: "ExfiltrationWriter") -> for media in 
media_ids: writer.write_media_id(media["media_id"], media) - logger.info("Written %d media_ids of %s", (start + len(media_ids)), total) + logger.info("[%s] Written %d media_ids of %s", user_id, (start + len(media_ids)), total) if (start + limit) >= total: break start += limit From 8389a6094fc0df245ea83ea960c8af502841064d Mon Sep 17 00:00:00 2001 From: dklimpel <5740567+dklimpel@users.noreply.github.com> Date: Mon, 20 Feb 2023 12:42:54 +0100 Subject: [PATCH 4/8] typos --- docs/usage/administration/admin_faq.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/usage/administration/admin_faq.md b/docs/usage/administration/admin_faq.md index 1fdb94b33fba..0d3fb0486b19 100644 --- a/docs/usage/administration/admin_faq.md +++ b/docs/usage/administration/admin_faq.md @@ -82,13 +82,13 @@ output-directory ``` The `media_ids` folder contains only the metadata of the media, not the media itself. -Furthermore, only the `media_ids` that synapse manages itself are exported. +Furthermore, only the `media_ids` that Synapse manages itself are exported. If another media repository (eg. [matrix-media-repo](https://github.com/turt2live/matrix-media-repo)) is used, the data must be exported at this one. With the `media_ids` the media files can be downloaded. Media that have been sent in encrypted rooms are only retrieved in encrypted form. -The following script can help with this: +The following script can help with downloading the media files: ```bash #!/usr/bin/env bash @@ -97,10 +97,11 @@ The following script can help with this: # # source_directory: Directory which contains the export with the media_ids. # target_directory: Directory into which all files are to be downloaded. -# repository_url: Address of the media repository resp. media worker -# serverName: Name of the server (`server_name` from homeserver.yaml) +# repository_url: Address of the media repository or media worker. 
+# serverName: Name of the server (`server_name` from homeserver.yaml). # -# Example: ./download_media.sh /tmp/export_data/media_ids/ /tmp/export_data/media_files/ http://localhost:8008 matrix.org +# Example: +# ./download_media.sh /tmp/export_data/media_ids/ /tmp/export_data/media_files/ http://localhost:8008 matrix.example.com source_directory=$1 target_directory=$2 From 441501494f4312985785fc977be0e1d568fcc618 Mon Sep 17 00:00:00 2001 From: dklimpel <5740567+dklimpel@users.noreply.github.com> Date: Mon, 20 Feb 2023 12:47:02 +0100 Subject: [PATCH 5/8] newsfile --- changelog.d/15107.feature | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/15107.feature diff --git a/changelog.d/15107.feature b/changelog.d/15107.feature new file mode 100644 index 000000000000..2bdb6a29fceb --- /dev/null +++ b/changelog.d/15107.feature @@ -0,0 +1 @@ +Add media information to the command line [user data export tool](https://matrix-org.github.io/synapse/v1.79/usage/administration/admin_faq.html#how-can-i-export-user-data). 
\ No newline at end of file From bae8dceacc30bf2870d8efdd6976476ccc7431c3 Mon Sep 17 00:00:00 2001 From: dklimpel <5740567+dklimpel@users.noreply.github.com> Date: Mon, 20 Feb 2023 12:55:58 +0100 Subject: [PATCH 6/8] lint --- synapse/handlers/admin.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/synapse/handlers/admin.py b/synapse/handlers/admin.py index d26234661fab..b06f25b03c21 100644 --- a/synapse/handlers/admin.py +++ b/synapse/handlers/admin.py @@ -272,7 +272,9 @@ async def export_user_data(self, user_id: str, writer: "ExfiltrationWriter") -> writer.write_account_data("global", global_data) for room_id in by_room_data: writer.write_account_data(room_id, by_room_data[room_id]) - logger.info("[%s] Written account data for %s rooms", user_id, len(by_room_data)) + logger.info( + "[%s] Written account data for %s rooms", user_id, len(by_room_data) + ) # Get all media ids the user has limit = 100 @@ -284,7 +286,12 @@ async def export_user_data(self, user_id: str, writer: "ExfiltrationWriter") -> for media in media_ids: writer.write_media_id(media["media_id"], media) - logger.info("[%s] Written %d media_ids of %s", user_id, (start + len(media_ids)), total) + logger.info( + "[%s] Written %d media_ids of %s", + user_id, + (start + len(media_ids)), + total, + ) if (start + limit) >= total: break start += limit From 923e7b6b982953755fb633298fb82c8d9f8b7996 Mon Sep 17 00:00:00 2001 From: Dirk Klimpel <5740567+dklimpel@users.noreply.github.com> Date: Wed, 22 Feb 2023 21:07:25 +0100 Subject: [PATCH 7/8] Apply suggestions from code review Co-authored-by: Patrick Cloke --- docs/usage/administration/admin_faq.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/usage/administration/admin_faq.md b/docs/usage/administration/admin_faq.md index 0d3fb0486b19..c972f6e36c48 100644 --- a/docs/usage/administration/admin_faq.md +++ b/docs/usage/administration/admin_faq.md @@ -83,8 +83,8 @@ output-directory The `media_ids` folder 
contains only the metadata of the media, not the media itself. Furthermore, only the `media_ids` that Synapse manages itself are exported. -If another media repository (eg. [matrix-media-repo](https://github.com/turt2live/matrix-media-repo)) -is used, the data must be exported at this one. +If another media repository (e.g. [matrix-media-repo](https://github.com/turt2live/matrix-media-repo)) +is used, the data must be exported separately. With the `media_ids` the media files can be downloaded. Media that have been sent in encrypted rooms are only retrieved in encrypted form. From ec354de3261cd20b555a2ffa37fbd5a69a33b8f2 Mon Sep 17 00:00:00 2001 From: dklimpel <5740567+dklimpel@users.noreply.github.com> Date: Thu, 23 Feb 2023 08:09:30 +0100 Subject: [PATCH 8/8] add notice to uploaded user --- docs/usage/administration/admin_faq.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/usage/administration/admin_faq.md b/docs/usage/administration/admin_faq.md index c972f6e36c48..28c3dd53a5f4 100644 --- a/docs/usage/administration/admin_faq.md +++ b/docs/usage/administration/admin_faq.md @@ -81,7 +81,8 @@ output-directory └─── ``` -The `media_ids` folder contains only the metadata of the media, not the media itself. +The `media_ids` folder contains only the metadata of the media uploaded by the user. +It does not contain the media itself. Furthermore, only the `media_ids` that Synapse manages itself are exported. If another media repository (e.g. [matrix-media-repo](https://github.com/turt2live/matrix-media-repo)) is used, the data must be exported separately.