Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Faster joins: Refactor handling of servers in room #14954

Merged
merged 4 commits into from
Feb 3, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/14954.misc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Faster room joins: Refactor internal handling of servers in room to never store an empty list.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this fixing a bug? A potential bug? Or just clean-up?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's somewhere in between fixing a potential bug and a clean up. Probably closer to a clean up.

33 changes: 22 additions & 11 deletions synapse/federation/federation_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import logging
from typing import (
TYPE_CHECKING,
AbstractSet,
Awaitable,
Callable,
Collection,
Expand Down Expand Up @@ -110,8 +111,9 @@ class SendJoinResult:
# True if 'state' elides non-critical membership events
partial_state: bool

# if 'partial_state' is set, a list of the servers in the room (otherwise empty)
servers_in_room: List[str]
# If 'partial_state' is set, a set of the servers in the room (otherwise empty).
# Always contains the server we joined off.
servers_in_room: AbstractSet[str]


class FederationClient(FederationBase):
Expand Down Expand Up @@ -1152,23 +1154,32 @@ async def _execute(pdu: EventBase) -> None:
% (auth_chain_create_events,)
)

if response.members_omitted and not response.servers_in_room:
raise InvalidResponseError(
"members_omitted was set, but no servers were listed in the room"
)
servers_in_room = None
if response.servers_in_room is not None:
servers_in_room = set(response.servers_in_room)

if response.members_omitted and not partial_state:
raise InvalidResponseError(
"members_omitted was set, but we asked for full state"
)
if response.members_omitted:
if not servers_in_room:
raise InvalidResponseError(
"members_omitted was set, but no servers were listed in the room"
)

if not partial_state:
raise InvalidResponseError(
"members_omitted was set, but we asked for full state"
)
Comment on lines +1167 to +1170
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd recommend moving this above the destination check to have all the error checking up-front.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done in 163c68c.


# `servers_in_room` is supposed to be a complete list.
# Fix things up in case the remote homeserver is badly behaved.
servers_in_room.add(destination)

return SendJoinResult(
event=event,
state=signed_state,
auth_chain=signed_auth,
origin=destination,
partial_state=response.members_omitted,
servers_in_room=response.servers_in_room or [],
servers_in_room=servers_in_room or frozenset(),
)

# MSC3083 defines additional error codes for room joins.
Expand Down
2 changes: 1 addition & 1 deletion synapse/federation/sender/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,7 +447,7 @@ async def handle_event(event: EventBase) -> None:
)
)

if len(partial_state_destinations) > 0:
if partial_state_destinations is not None:
Copy link
Contributor Author

@squahtx squahtx Jan 31, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I initially thought there was a bug here, where we would block on the full state of the room (below) when the server we joined off returned an empty list of servers in the room. But while writing this PR to fix the bug, I discovered that we validate that the list is truthy and so the bug can't happen.

destinations = partial_state_destinations

if destinations is None:
Expand Down
1 change: 1 addition & 0 deletions synapse/handlers/device.py
Original file line number Diff line number Diff line change
Expand Up @@ -859,6 +859,7 @@ async def handle_room_un_partial_stated(self, room_id: str) -> None:
known_hosts_at_join = await self.store.get_partial_state_servers_at_join(
room_id
)
assert known_hosts_at_join is not None
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this be an error? Usually assertions are used for programming errors, but this seem to be input validation?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The assert can't (shouldn't?) ever be hit unless we've introduced a bug. This method is only run when we're about to finish syncing state. At that point the room is still partial stated, which implies we have a list of servers in it.

potentially_changed_hosts.difference_update(known_hosts_at_join)

potentially_changed_hosts.discard(self.server_name)
Expand Down
20 changes: 15 additions & 5 deletions synapse/handlers/federation.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,17 @@
import logging
from enum import Enum
from http import HTTPStatus
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Set, Tuple, Union
from typing import (
TYPE_CHECKING,
AbstractSet,
Dict,
Iterable,
List,
Optional,
Set,
Tuple,
Union,
)

import attr
from prometheus_client import Histogram
Expand Down Expand Up @@ -169,7 +179,7 @@ def __init__(self, hs: "HomeServer"):
# A dictionary mapping room IDs to (initial destination, other destinations)
# tuples.
self._partial_state_syncs_maybe_needing_restart: Dict[
str, Tuple[Optional[str], StrCollection]
str, Tuple[Optional[str], AbstractSet[str]]
] = {}
# A lock guarding the partial state flag for rooms.
# When the lock is held for a given room, no other concurrent code may
Expand Down Expand Up @@ -1720,7 +1730,7 @@ async def _resume_partial_state_room_sync(self) -> None:
def _start_partial_state_room_sync(
self,
initial_destination: Optional[str],
other_destinations: StrCollection,
other_destinations: AbstractSet[str],
room_id: str,
) -> None:
"""Starts the background process to resync the state of a partial state room,
Expand Down Expand Up @@ -1802,7 +1812,7 @@ async def _sync_partial_state_room_wrapper() -> None:
async def _sync_partial_state_room(
self,
initial_destination: Optional[str],
other_destinations: StrCollection,
other_destinations: AbstractSet[str],
room_id: str,
) -> None:
"""Background process to resync the state of a partial-state room
Expand Down Expand Up @@ -1939,7 +1949,7 @@ async def _sync_partial_state_room(

def _prioritise_destinations_for_partial_state_resync(
initial_destination: Optional[str],
other_destinations: StrCollection,
other_destinations: AbstractSet[str],
room_id: str,
) -> StrCollection:
"""Work out the order in which we should ask servers to resync events.
Expand Down
3 changes: 2 additions & 1 deletion synapse/storage/controllers/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -569,10 +569,11 @@ async def get_current_hosts_in_room_or_partial_state_approximation(
is arbitrary for rooms with partial state.
"""
# We have to read this list first to mitigate races with un-partial stating.
# This will be empty for rooms with full state.
hosts_at_join = await self.stores.main.get_partial_state_servers_at_join(
room_id
)
if hosts_at_join is None:
hosts_at_join = frozenset()

hosts_from_state = await self.stores.main.get_current_hosts_in_room(room_id)

Expand Down
50 changes: 33 additions & 17 deletions synapse/storage/databases/main/room.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,14 @@
from enum import Enum
from typing import (
TYPE_CHECKING,
AbstractSet,
Any,
Awaitable,
Collection,
Dict,
List,
Mapping,
Optional,
Sequence,
Set,
Tuple,
Union,
Expand Down Expand Up @@ -108,7 +108,7 @@ class RoomSortOrder(Enum):
@attr.s(slots=True, frozen=True, auto_attribs=True)
class PartialStateResyncInfo:
joined_via: Optional[str]
servers_in_room: List[str] = attr.ib(factory=list)
servers_in_room: Set[str] = attr.ib(factory=set)


class RoomWorkerStore(CacheInvalidationWorkerStore):
Expand Down Expand Up @@ -1192,21 +1192,35 @@ def get_rooms_for_retention_period_in_range_txn(
get_rooms_for_retention_period_in_range_txn,
)

@cached(iterable=True)
async def get_partial_state_servers_at_join(self, room_id: str) -> Sequence[str]:
"""Gets the list of servers in a partial state room at the time we joined it.
async def get_partial_state_servers_at_join(
self, room_id: str
) -> Optional[AbstractSet[str]]:
"""Gets the set of servers in a partial state room at the time we joined it.

Returns:
The `servers_in_room` list from the `/send_join` response for partial state
rooms. May not be accurate or complete, as it comes from a remote
homeserver.
An empty list for full state rooms.
`None` for full state rooms.
"""
return await self.db_pool.simple_select_onecol(
"partial_state_rooms_servers",
keyvalues={"room_id": room_id},
retcol="server_name",
desc="get_partial_state_servers_at_join",
servers_in_room = await self._get_partial_state_servers_at_join(room_id)

if len(servers_in_room) == 0:
return None

return servers_in_room

@cached(iterable=True)
async def _get_partial_state_servers_at_join(
self, room_id: str
) -> AbstractSet[str]:
return frozenset(
await self.db_pool.simple_select_onecol(
"partial_state_rooms_servers",
keyvalues={"room_id": room_id},
retcol="server_name",
desc="get_partial_state_servers_at_join",
)
)

async def get_partial_state_room_resync_info(
Expand Down Expand Up @@ -1251,7 +1265,7 @@ async def get_partial_state_room_resync_info(
# partial-joined between the two SELECTs, but this is unlikely to happen
# in practice.)
continue
entry.servers_in_room.append(server_name)
entry.servers_in_room.add(server_name)

return room_servers

Expand Down Expand Up @@ -1941,7 +1955,7 @@ async def upsert_room_on_join(
async def store_partial_state_room(
self,
room_id: str,
servers: Collection[str],
servers: AbstractSet[str],
device_lists_stream_id: int,
joined_via: str,
) -> None:
Expand All @@ -1956,11 +1970,13 @@ async def store_partial_state_room(

Args:
room_id: the ID of the room
servers: other servers known to be in the room
servers: other servers known to be in the room. must include `joined_via`.
device_lists_stream_id: the device_lists stream ID at the time when we first
joined the room.
joined_via: the server name we requested a partial join from.
"""
assert joined_via in servers

await self.db_pool.runInteraction(
"store_partial_state_room",
self._store_partial_state_room_txn,
Expand All @@ -1974,7 +1990,7 @@ def _store_partial_state_room_txn(
self,
txn: LoggingTransaction,
room_id: str,
servers: Collection[str],
servers: AbstractSet[str],
device_lists_stream_id: int,
joined_via: str,
) -> None:
Expand All @@ -1997,7 +2013,7 @@ def _store_partial_state_room_txn(
)
self._invalidate_cache_and_stream(txn, self.is_partial_state_room, (room_id,))
self._invalidate_cache_and_stream(
txn, self.get_partial_state_servers_at_join, (room_id,)
txn, self._get_partial_state_servers_at_join, (room_id,)
)

async def write_partial_state_rooms_join_event_id(
Expand Down Expand Up @@ -2408,7 +2424,7 @@ def _clear_partial_state_room_txn(
)
self._invalidate_cache_and_stream(txn, self.is_partial_state_room, (room_id,))
self._invalidate_cache_and_stream(
txn, self.get_partial_state_servers_at_join, (room_id,)
txn, self._get_partial_state_servers_at_join, (room_id,)
)

DatabasePool.simple_insert_txn(
Expand Down
2 changes: 1 addition & 1 deletion tests/handlers/test_federation.py
Original file line number Diff line number Diff line change
Expand Up @@ -656,7 +656,7 @@ def test_failed_partial_join_is_clean(self) -> None:
EVENT_INVITATION_MEMBERSHIP,
],
partial_state=True,
servers_in_room=["example.com"],
servers_in_room={"example.com"},
)
)
)
Expand Down
2 changes: 1 addition & 1 deletion tests/handlers/test_room_member.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ def test_remote_joins_contribute_to_rate_limit(self) -> None:
state=[create_event],
auth_chain=[create_event],
partial_state=False,
servers_in_room=[],
servers_in_room=frozenset(),
)
)
)
Expand Down