Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Add admin endpoint to query room sizes #15482

Merged
merged 4 commits into from
Apr 26, 2023
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/15482.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add admin endpoint to query the largest rooms by disk space used in the database.
46 changes: 46 additions & 0 deletions docs/admin_api/statistics.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,3 +81,49 @@ The following fields are returned in the JSON response body:
- `user_id` - string - Fully-qualified user ID (ex. `@user:server.com`).
* `next_token` - integer - Opaque value used for pagination. See above.
* `total` - integer - Total number of users after filtering.


# Get largest rooms by size in database
clokep marked this conversation as resolved.
Show resolved Hide resolved

Returns the 10 largest rooms and an estimate of how much space in the database
they are taking.

This does not include the size of any associated media associated with the room.

Returns an empty list on SQLite.
clokep marked this conversation as resolved.
Show resolved Hide resolved

*Note:* This uses the planner statistics from PostgreSQL to do the estimates,
which means that the returned information can vary widely from reality. However,
it should be enough to get a rough idea of where database disk space is going.


The API is:

```
GET /_synapse/admin/v1/statistics/statistics/database/rooms
```

A response body like the following is returned:

```json
{
"rooms": [
{
"room_id": "!OGEhHVWSdvArJzumhm:matrix.org",
"estimated_size": 47325417353
}
],
}
```



**Response**

The following fields are returned in the JSON response body:

* `rooms` - An array of objects, sorted by largest room first. Objects contain
the following fields:
- `room_id` - string - The room ID.
- `estimated_size` - integer - Estimated disk space used in bytes by the room
in the database.
Comment on lines +128 to +129
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Err this sentence is tripping me up. Maybe?

Suggested change
- `estimated_size` - integer - Estimated disk space used in bytes by the room
in the database.
- `estimated_size` - integer - Estimated database disk space, in bytes, used by the room.

6 changes: 5 additions & 1 deletion synapse/rest/admin/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,10 @@
RoomTimestampToEventRestServlet,
)
from synapse.rest.admin.server_notice_servlet import SendServerNoticeServlet
from synapse.rest.admin.statistics import UserMediaStatisticsRestServlet
from synapse.rest.admin.statistics import (
LargestRoomsStatistics,
UserMediaStatisticsRestServlet,
)
from synapse.rest.admin.username_available import UsernameAvailableRestServlet
from synapse.rest.admin.users import (
AccountDataRestServlet,
Expand Down Expand Up @@ -259,6 +262,7 @@ def register_servlets(hs: "HomeServer", http_server: HttpServer) -> None:
UserRestServletV2(hs).register(http_server)
UsersRestServletV2(hs).register(http_server)
UserMediaStatisticsRestServlet(hs).register(http_server)
LargestRoomsStatistics(hs).register(http_server)
EventReportDetailRestServlet(hs).register(http_server)
EventReportsRestServlet(hs).register(http_server)
AccountDataRestServlet(hs).register(http_server)
Expand Down
25 changes: 25 additions & 0 deletions synapse/rest/admin/statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,3 +113,28 @@ async def on_GET(self, request: SynapseRequest) -> Tuple[int, JsonDict]:
ret["next_token"] = start + len(users_media)

return HTTPStatus.OK, ret


class LargestRoomsStatistics(RestServlet):
"""Get the largest rooms by database size.

Only works when using PostgreSQL.
"""

PATTERNS = admin_patterns("/statistics/database/rooms$")

def __init__(self, hs: "HomeServer"):
self.auth = hs.get_auth()
self.stats_controller = hs.get_storage_controllers().stats

async def on_GET(self, request: SynapseRequest) -> Tuple[int, JsonDict]:
await assert_requester_is_admin(self.auth, request)

room_sizes = await self.stats_controller.get_room_db_size_estimate()

return HTTPStatus.OK, {
"rooms": [
{"room_id": room_id, "estimated_size": size}
for room_id, size in room_sizes
]
}
2 changes: 2 additions & 0 deletions synapse/storage/controllers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
)
from synapse.storage.controllers.purge_events import PurgeEventsStorageController
from synapse.storage.controllers.state import StateStorageController
from synapse.storage.controllers.stats import StatsController
from synapse.storage.databases import Databases
from synapse.storage.databases.main import DataStore

Expand All @@ -40,6 +41,7 @@ def __init__(self, hs: "HomeServer", stores: Databases):

self.purge_events = PurgeEventsStorageController(hs, stores)
self.state = StateStorageController(hs, stores)
self.stats = StatsController(hs, stores)

self.persistence = None
if stores.persist_events:
Expand Down
113 changes: 113 additions & 0 deletions synapse/storage/controllers/stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
# Copyright 2023 The Matrix.org Foundation C.I.C.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from collections import Counter
from typing import TYPE_CHECKING, Collection, List, Tuple

from synapse.api.errors import SynapseError
from synapse.storage.database import LoggingTransaction
from synapse.storage.databases import Databases
from synapse.storage.engines import PostgresEngine

if TYPE_CHECKING:
from synapse.server import HomeServer

logger = logging.getLogger(__name__)


class StatsController:
"""High level interface for getting statistics."""

def __init__(self, hs: "HomeServer", stores: Databases):
self.stores = stores

async def get_room_db_size_estimate(self) -> List[Tuple[str, int]]:
erikjohnston marked this conversation as resolved.
Show resolved Hide resolved
"""Get an estimate of the largest rooms and how much database space they
use, in bytes.

Only works against PostgreSQL.

Note: this uses the postgres statistics so is a very rough estimate.
"""

# Note: We look at both tables on the main and state databases.
if not isinstance(self.stores.main.database_engine, PostgresEngine):
raise SynapseError(400, "Endpoint requires using PostgreSQL")

if not isinstance(self.stores.state.database_engine, PostgresEngine):
raise SynapseError(400, "Endpoint requires using PostgreSQL")

# For each "large" table, we go through and get the largest rooms
# and an estimate of how much space they take. We can then sum the
# results and return the top 10.
#
# This isn't the most accurate, but given all of these are estimates
# anyway its good enough.
room_estimates: Counter[str] = Counter()

# Return size of the table on disk, including indexes and TOAST.
table_sql = """
SELECT pg_total_relation_size(?)
"""

# Get an estimate for the largest rooms and their frequency.
erikjohnston marked this conversation as resolved.
Show resolved Hide resolved
#
# Note: the cast here is a hack to cast from `anyarray` to an actual
# type. This ensures that psycopg2 passes us a back a a Python list.
column_sql = """
SELECT
most_common_vals::TEXT::TEXT[], most_common_freqs::TEXT::NUMERIC[]
FROM pg_stats
WHERE tablename = ? and attname = 'room_id'
"""

def get_room_db_size_estimate_txn(
txn: LoggingTransaction,
tables: Collection[str],
) -> None:
for table in tables:
txn.execute(table_sql, (table,))
row = txn.fetchone()
assert row is not None
(table_size,) = row

txn.execute(column_sql, (table,))
row = txn.fetchone()
assert row is not None
vals, freqs = row

for room_id, freq in zip(vals, freqs):
room_estimates[room_id] += int(freq * table_size)

await self.stores.main.db_pool.runInteraction(
"get_room_db_size_estimate_main",
get_room_db_size_estimate_txn,
(
"event_json",
"events",
"event_search",
"event_edges",
"event_push_actions",
"stream_ordering_to_exterm",
),
)

await self.stores.state.db_pool.runInteraction(
"get_room_db_size_estimate_state",
get_room_db_size_estimate_txn,
("state_groups_state",),
)

return room_estimates.most_common(10)