This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Add account data to export command #14969

Merged
merged 6 commits on Feb 17, 2023
1 change: 1 addition & 0 deletions changelog.d/14969.feature
@@ -0,0 +1 @@
Add account data to the command line [user data export tool](https://matrix-org.github.io/synapse/v1.78/usage/administration/admin_faq.html#how-can-i-export-user-data).
3 changes: 3 additions & 0 deletions docs/usage/administration/admin_faq.md
@@ -71,6 +71,9 @@ output-directory
│       ├───invite_state
│       └───knock_state
└───user_data
    ├───account_data
    │   ├───global
    │   └───<room_id>
Comment on lines +74 to +76
Contributor

I wonder why not do rooms > account_data and then just have a user_data > account_data (or global_account_data)?

Contributor Author

There's no special reason. My thinking was that account data is user-related rather than room-related. I will change it.

Contributor Author

Not anymore. The PR has now been merged.

Contributor

It isn't a big deal, just curious what the thinking was!

They're related to both users & rooms, which is annoying in this case. 😄

    ├───connections
    ├───devices
    └───profile
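For orientation, the exporter below writes each of these account_data entries as a single JSON object per file: one global file plus one file per room. A minimal sketch of reading a fresh export back, assuming the layout shown in the tree above (the output path is a placeholder):

```python
import json
import os

# Placeholder for wherever the export command wrote its output.
export_dir = "output-directory"

# The global account data: a mapping of account data type -> content.
global_path = os.path.join(export_dir, "user_data", "account_data", "global")
with open(global_path) as f:
    global_account_data = json.load(f)

for data_type, content in global_account_data.items():
    print(data_type, content)

# Room account data lives in sibling files named after each room ID.
```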
15 changes: 14 additions & 1 deletion synapse/app/admin_cmd.py
@@ -17,7 +17,7 @@
import os
import sys
import tempfile
from typing import List, Optional
from typing import List, Mapping, Optional

from twisted.internet import defer, task

@@ -222,6 +222,19 @@ def write_connections(self, connections: List[JsonDict]) -> None:
            with open(connection_file, "a") as f:
                print(json.dumps(connection), file=f)

    def write_account_data(
        self, file_name: str, account_data: Mapping[str, JsonDict]
    ) -> None:
        account_data_directory = os.path.join(
            self.base_directory, "user_data", "account_data"
        )
        os.makedirs(account_data_directory, exist_ok=True)

        account_data_file = os.path.join(account_data_directory, file_name)

        with open(account_data_file, "a") as f:
Contributor

Why append mode here? Is this what we typically do when exporting data?

Contributor Author

As far as I can see, some functions need append because e.g. room events are written incrementally. In this specific case it isn't needed. I can check in each function whether append or write is needed, if it makes a difference. In any case, the export starts with an empty directory.

Contributor

Don't worry: I think it's okay as-is. I just wanted to understand what was going on (e.g. if we are expected to append to a previous data export).

            print(json.dumps(account_data), file=f)
Contributor

This is fine here (we don't expect account_data to be large), but for general interest it might be more efficient to use json.dump(account_data, f) for large account_data blobs.

Contributor Author

Wouldn't that make sense for all functions in this file?

Contributor

Probably! But we don't need to change it here and now.
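For reference, the two options discussed above would look roughly like this; the file name and data are placeholders, and this sketch is not part of the PR:

```python
import json

data = {"m.push_rules": {"global": {}}}  # stand-in for a large account_data mapping

# Current approach: build the entire JSON string in memory, then write it out.
with open("account_data_file", "a") as f:
    print(json.dumps(data), file=f)

# Reviewer's suggestion: stream the serialization directly into the file object,
# avoiding the intermediate string for large blobs.
with open("account_data_file", "a") as f:
    json.dump(data, f)
    f.write("\n")  # keep the trailing newline that print() would have added
```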


    def finished(self) -> str:
        return self.base_directory

49 changes: 34 additions & 15 deletions synapse/handlers/admin.py
@@ -14,7 +14,7 @@

import abc
import logging
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set
from typing import TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Set

from synapse.api.constants import Direction, Membership
from synapse.events import EventBase
@@ -29,7 +29,7 @@

class AdminHandler:
def __init__(self, hs: "HomeServer"):
self.store = hs.get_datastores().main
self._store = hs.get_datastores().main
self._device_handler = hs.get_device_handler()
self._storage_controllers = hs.get_storage_controllers()
self._state_storage_controller = self._storage_controllers.state
@@ -38,7 +38,7 @@ def __init__(self, hs: "HomeServer"):
async def get_whois(self, user: UserID) -> JsonDict:
connections = []

sessions = await self.store.get_user_ip_and_agents(user)
sessions = await self._store.get_user_ip_and_agents(user)
for session in sessions:
connections.append(
{
@@ -57,7 +57,7 @@ async def get_whois(self, user: UserID) -> JsonDict:

async def get_user(self, user: UserID) -> Optional[JsonDict]:
"""Function to get user details"""
user_info_dict = await self.store.get_user_by_id(user.to_string())
user_info_dict = await self._store.get_user_by_id(user.to_string())
if user_info_dict is None:
return None

@@ -89,19 +89,19 @@ async def get_user(self, user: UserID) -> Optional[JsonDict]:
}

# Add additional user metadata
profile = await self.store.get_profileinfo(user.localpart)
threepids = await self.store.user_get_threepids(user.to_string())
profile = await self._store.get_profileinfo(user.localpart)
threepids = await self._store.user_get_threepids(user.to_string())
external_ids = [
({"auth_provider": auth_provider, "external_id": external_id})
for auth_provider, external_id in await self.store.get_external_ids_by_user(
for auth_provider, external_id in await self._store.get_external_ids_by_user(
user.to_string()
)
]
user_info_dict["displayname"] = profile.display_name
user_info_dict["avatar_url"] = profile.avatar_url
user_info_dict["threepids"] = threepids
user_info_dict["external_ids"] = external_ids
user_info_dict["erased"] = await self.store.is_user_erased(user.to_string())
user_info_dict["erased"] = await self._store.is_user_erased(user.to_string())

return user_info_dict

@@ -117,7 +117,7 @@ async def export_user_data(self, user_id: str, writer: "ExfiltrationWriter") ->
The returned value is that returned by `writer.finished()`.
"""
# Get all rooms the user is in or has been in
rooms = await self.store.get_rooms_for_local_user_where_membership_is(
rooms = await self._store.get_rooms_for_local_user_where_membership_is(
user_id,
membership_list=(
Membership.JOIN,
@@ -131,7 +131,7 @@ async def export_user_data(self, user_id: str, writer: "ExfiltrationWriter") ->
# We only try and fetch events for rooms the user has been in. If
# they've been e.g. invited to a room without joining then we handle
# those separately.
rooms_user_has_been_in = await self.store.get_rooms_user_has_been_in(user_id)
rooms_user_has_been_in = await self._store.get_rooms_user_has_been_in(user_id)

for index, room in enumerate(rooms):
room_id = room.room_id
@@ -140,7 +140,7 @@ async def export_user_data(self, user_id: str, writer: "ExfiltrationWriter") ->
"[%s] Handling room %s, %d/%d", user_id, room_id, index + 1, len(rooms)
)

forgotten = await self.store.did_forget(user_id, room_id)
forgotten = await self._store.did_forget(user_id, room_id)
if forgotten:
logger.info("[%s] User forgot room %d, ignoring", user_id, room_id)
continue
@@ -152,14 +152,14 @@ async def export_user_data(self, user_id: str, writer: "ExfiltrationWriter") ->

if room.membership == Membership.INVITE:
event_id = room.event_id
invite = await self.store.get_event(event_id, allow_none=True)
invite = await self._store.get_event(event_id, allow_none=True)
if invite:
invited_state = invite.unsigned["invite_room_state"]
writer.write_invite(room_id, invite, invited_state)

if room.membership == Membership.KNOCK:
event_id = room.event_id
knock = await self.store.get_event(event_id, allow_none=True)
knock = await self._store.get_event(event_id, allow_none=True)
if knock:
knock_state = knock.unsigned["knock_room_state"]
writer.write_knock(room_id, knock, knock_state)
@@ -170,7 +170,7 @@ async def export_user_data(self, user_id: str, writer: "ExfiltrationWriter") ->
# were joined. We estimate that point by looking at the
# stream_ordering of the last membership if it wasn't a join.
if room.membership == Membership.JOIN:
stream_ordering = self.store.get_room_max_stream_ordering()
stream_ordering = self._store.get_room_max_stream_ordering()
else:
stream_ordering = room.stream_ordering

@@ -197,7 +197,7 @@ async def export_user_data(self, user_id: str, writer: "ExfiltrationWriter") ->
# events that we have and then filtering, this isn't the most
# efficient method perhaps but it does guarantee we get everything.
while True:
events, _ = await self.store.paginate_room_events(
events, _ = await self._store.paginate_room_events(
room_id, from_key, to_key, limit=100, direction=Direction.FORWARDS
)
if not events:
@@ -263,6 +263,13 @@ async def export_user_data(self, user_id: str, writer: "ExfiltrationWriter") ->
connections["devices"][""]["sessions"][0]["connections"]
)

        # Get all account data the user has, both global and per room
        global_data = await self._store.get_global_account_data_for_user(user_id)
        by_room_data = await self._store.get_room_account_data_for_user(user_id)
        writer.write_account_data("global", global_data)
        for room_id in by_room_data:
            writer.write_account_data(room_id, by_room_data[room_id])

return writer.finished()


@@ -340,6 +347,18 @@ def write_connections(self, connections: List[JsonDict]) -> None:
"""
raise NotImplementedError()

    @abc.abstractmethod
    def write_account_data(
        self, file_name: str, account_data: Mapping[str, JsonDict]
    ) -> None:
        """Write the account data of a user.

        Args:
            file_name: file name to write the data to
            account_data: the account data to write, keyed by account data type
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def finished(self) -> Any:
        """Called when all data has successfully been exported and written.
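To illustrate how export_user_data drives the new hook, here is a minimal in-memory sketch that mirrors the write_account_data signature. It is not part of the PR; a real writer must subclass ExfiltrationWriter and implement all of its abstract methods, and the values passed at the bottom are invented for the example:

```python
from typing import Any, Dict, Mapping

JsonDict = Dict[str, Any]  # stand-in for synapse's JsonDict alias


class InMemoryAccountDataWriter:
    """Records account data calls in memory instead of writing files."""

    def __init__(self) -> None:
        self.account_data: Dict[str, Mapping[str, JsonDict]] = {}

    def write_account_data(
        self, file_name: str, account_data: Mapping[str, JsonDict]
    ) -> None:
        # file_name is "global" for global account data, otherwise a room ID.
        self.account_data[file_name] = account_data

    def finished(self) -> Any:
        return self.account_data


# export_user_data calls the writer along these lines (contents invented):
writer = InMemoryAccountDataWriter()
writer.write_account_data("global", {"m.push_rules": {"global": {}}})
writer.write_account_data("!room:example.com", {"m.fully_read": {"event_id": "$abc"}})
print(writer.finished())
```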
27 changes: 27 additions & 0 deletions tests/handlers/test_admin.py
@@ -296,3 +296,30 @@ def test_connections(self) -> None:
        self.assertEqual(args[0][0]["user_agent"], "user_agent")
        self.assertGreater(args[0][0]["last_seen"], 0)
        self.assertNotIn("access_token", args[0][0])

    def test_account_data(self) -> None:
        """Tests that a user's account data gets exported."""
        # Add some global and some per-room account data.
        self.get_success(
            self._store.add_account_data_for_user(self.user2, "m.global", {"a": 1})
        )
        self.get_success(
            self._store.add_account_data_to_room(
                self.user2, "test_room", "m.per_room", {"b": 2}
            )
        )

        writer = Mock()

        self.get_success(self.admin_handler.export_user_data(self.user2, writer))

        # Two calls are expected: one for the global account data and one for
        # the room account data.
        writer.write_account_data.assert_called()

        args = writer.write_account_data.call_args_list[0][0]
        self.assertEqual(args[0], "global")
        self.assertEqual(args[1]["m.global"]["a"], 1)

        args = writer.write_account_data.call_args_list[1][0]
        self.assertEqual(args[0], "test_room")
        self.assertEqual(args[1]["m.per_room"]["b"], 2)
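For readers less familiar with Mock call inspection, the pattern the assertions above rely on can be reproduced standalone; a sketch with invented values, not part of the PR:

```python
from unittest.mock import Mock

writer = Mock()
writer.write_account_data("global", {"m.global": {"a": 1}})
writer.write_account_data("test_room", {"m.per_room": {"b": 2}})

# Exactly two calls, in the order the handler makes them: global first, room second.
assert writer.write_account_data.call_count == 2
file_names = [c[0][0] for c in writer.write_account_data.call_args_list]
assert file_names == ["global", "test_room"]
```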