This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Fix a bug introduced in Synapse v1.74.0 where searching with colons when using ICU for search term tokenisation would fail with an error. #15079

Merged
merged 9 commits on Feb 20, 2023
1 change: 1 addition & 0 deletions changelog.d/15079.bugfix
@@ -0,0 +1 @@
+Fix a bug introduced in Synapse v1.74.0 where searching with colons when using ICU for search term tokenisation would fail with an error.
24 changes: 20 additions & 4 deletions synapse/storage/databases/main/user_directory.py
@@ -918,11 +918,19 @@ def _parse_query_postgres(search_term: str) -> Tuple[str, str, str]:
     We use this so that we can add prefix matching, which isn't something
     that is supported by default.
     """
-    results = _parse_words(search_term)
+    escaped_words = []
+    for word in _parse_words(search_term):
+        # Postgres tsvector and tsquery quoting rules:
+        # words potentially containing punctuation should be quoted
+        # and then existing quotes and backslashes should be doubled
+        # See: https://www.postgresql.org/docs/current/datatype-textsearch.html#DATATYPE-TSQUERY
+
+        quoted_word = word.replace("'", "''").replace("\\", "\\\\")
+        escaped_words.append(f"'{quoted_word}'")
 
-    both = " & ".join("(%s:* | %s)" % (result, result) for result in results)
-    exact = " & ".join("%s" % (result,) for result in results)
-    prefix = " & ".join("%s:*" % (result,) for result in results)
+    both = " & ".join("(%s:* | %s)" % (word, word) for word in escaped_words)
+    exact = " & ".join("%s" % (word,) for word in escaped_words)
+    prefix = " & ".join("%s:*" % (word,) for word in escaped_words)
 
     return both, exact, prefix
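For illustration (not part of the diff): if the tokeniser returns ["foo", "bar"], each word is wrapped in single quotes and the three tsquery strings come out as

    both   = "('foo':* | 'foo') & ('bar':* | 'bar')"
    exact  = "'foo' & 'bar'"
    prefix = "'foo':* & 'bar':*"

where the :* suffix asks Postgres for prefix matching on that lexeme.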

@@ -944,6 +952,14 @@ def _parse_words(search_term: str) -> List[str]:
     if USE_ICU:
         return _parse_words_with_icu(search_term)
 
-    return re.findall(r"([\w\-]+)", search_term, re.UNICODE)
+    return _parse_words_with_regex(search_term)
+
+
+def _parse_words_with_regex(search_term: str) -> List[str]:
+    """
+    Break down search term into words, when we don't have ICU available.
+    See: `_parse_words`
+    """
+    return re.findall(r"([\w\-]+)", search_term, re.UNICODE)

7 changes: 7 additions & 0 deletions tests/handlers/test_user_directory.py
@@ -192,6 +192,13 @@ def test_excludes_appservice_sender(self) -> None:
         self.helper.join(room, self.appservice.sender, tok=self.appservice.token)
         self._check_only_one_user_in_directory(user, room)
 
+    def test_search_term_with_colon_in_it_does_not_raise(self) -> None:
+        """
+        Regression test: Test that search terms with colons in them are acceptable.
+        """
+        u1 = self.register_user("user1", "pass")
+        self.get_success(self.handler.search_users(u1, "haha:paamayim-nekudotayim", 10))
+
     def test_user_not_in_users_table(self) -> None:
         """Unclear how it happens, but on matrix.org we've seen join events
         for users who aren't in the users table. Test that we don't fall over
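A note on why colons in particular trip this up: a bare colon is tsquery syntax (it attaches weight labels and the :* prefix operator to a lexeme), and some ICU builds keep punctuation inside tokens, so a term like the one in this regression test could reach Postgres's to_tsquery() unquoted and fail with a tsquery syntax error. A minimal, self-contained sketch of the escaping the fix applies (the variable names here are illustrative, not from the diff):

    # A single token, colon included, as some ICU versions produce it.
    word = "haha:paamayim-nekudotayim"
    # Double embedded quotes and backslashes, then wrap in single quotes,
    # per the tsquery quoting rules referenced in the diff above.
    quoted = "'" + word.replace("'", "''").replace("\\", "\\\\") + "'"
    assert quoted == "'haha:paamayim-nekudotayim'"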
63 changes: 62 additions & 1 deletion tests/storage/test_user_directory.py
@@ -25,6 +25,11 @@
 from synapse.server import HomeServer
 from synapse.storage import DataStore
 from synapse.storage.background_updates import _BackgroundUpdateHandler
+from synapse.storage.databases.main import user_directory
+from synapse.storage.databases.main.user_directory import (
+    _parse_words_with_icu,
+    _parse_words_with_regex,
+)
 from synapse.storage.roommember import ProfileInfo
 from synapse.util import Clock

@@ -42,7 +47,7 @@
 BOB = "@bob:b"
 BOBBY = "@bobby:a"
 # The localpart isn't 'Bela' on purpose so we can test looking up display names.
-BELA = "@somenickname:a"
+BELA = "@somenickname:example.org"
 
 
 class GetUserDirectoryTables:
@@ -423,6 +428,8 @@ async def mocked_process_users(*args: Any, **kwargs: Any) -> int:
 
 
 class UserDirectoryStoreTestCase(HomeserverTestCase):
+    use_icu = False
+
     def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
         self.store = hs.get_datastores().main

@@ -434,6 +441,12 @@ def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
         self.get_success(self.store.update_profile_in_user_dir(BELA, "Bela", None))
         self.get_success(self.store.add_users_in_public_rooms("!room:id", (ALICE, BOB)))
 
+        self._restore_use_icu = user_directory.USE_ICU
+        user_directory.USE_ICU = self.use_icu
+
+    def tearDown(self) -> None:
+        user_directory.USE_ICU = self._restore_use_icu
+
     def test_search_user_dir(self) -> None:
         # normally when alice searches the directory she should just find
         # bob because bobby doesn't share a room with her.
@@ -478,6 +491,26 @@ def test_search_user_dir_stop_words(self) -> None:
             {"user_id": BELA, "display_name": "Bela", "avatar_url": None},
         )
 
+    @override_config({"user_directory": {"search_all_users": True}})
+    def test_search_user_dir_start_of_user_id(self) -> None:
+        """Tests that a user can look up another user by searching for the start
+        of their user ID.
+        """
+        r = self.get_success(self.store.search_user_dir(ALICE, "somenickname:exa", 10))
+        self.assertFalse(r["limited"])
+        self.assertEqual(1, len(r["results"]))
+        self.assertDictEqual(
+            r["results"][0],
+            {"user_id": BELA, "display_name": "Bela", "avatar_url": None},
+        )

Comment on lines +494 to +506

Contributor: Is this test (and any others) running under with-ICU and without-ICU? (Should it?)

Contributor Author: now it is :)


+class UserDirectoryStoreTestCaseWithIcu(UserDirectoryStoreTestCase):
+    use_icu = True
+
+    if not icu:
+        skip = "Requires PyICU"


class UserDirectoryICUTestCase(HomeserverTestCase):
    if not icu:
@@ -513,3 +546,31 @@ def test_icu_word_boundary(self) -> None:
             r["results"][0],
             {"user_id": ALICE, "display_name": display_name, "avatar_url": None},
         )
+
+    def test_icu_word_boundary_punctuation(self) -> None:
+        """
+        Tests the behaviour of punctuation with the ICU tokeniser.
+
+        Seems to depend on underlying version of ICU.
+        """
+
+        # Note: either tokenisation is fine, because Postgres actually splits
+        # words itself afterwards.
+        self.assertIn(
+            _parse_words_with_icu("lazy'fox jumped:over the.dog"),
+            (
+                # ICU 66 on Ubuntu 20.04
+                ["lazy'fox", "jumped", "over", "the", "dog"],
+                # ICU 70 on Ubuntu 22.04
+                ["lazy'fox", "jumped:over", "the.dog"],
+            ),
+        )
+
+    def test_regex_word_boundary_punctuation(self) -> None:
+        """
+        Tests the behaviour of punctuation with the non-ICU tokeniser
+        """
+        self.assertEqual(
+            _parse_words_with_regex("lazy'fox jumped:over the.dog"),
+            ["lazy", "fox", "jumped", "over", "the", "dog"],
+        )
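For reference, _parse_words_with_icu (imported by the tests above but not itself changed in this diff) walks ICU word boundaries. Below is a minimal sketch of that approach, assuming the PyICU package (imported as icu) is installed; the function name is hypothetical and this mirrors the shape of Synapse's tokeniser rather than reproducing it exactly:

    import re
    from typing import List

    import icu  # PyICU

    def parse_words_with_icu_sketch(search_term: str) -> List[str]:
        """Split a search term on ICU word boundaries."""
        results = []
        breaker = icu.BreakIterator.createWordInstance(icu.Locale.getDefault())
        breaker.setText(search_term)
        i = 0
        while True:
            j = breaker.nextBoundary()
            if j < 0:
                break
            chunk = search_term[i:j]
            i = j
            # ICU also reports boundaries around runs of whitespace and
            # punctuation; keep only chunks with at least one word character.
            if re.search(r"\w", chunk, re.UNICODE):
                results.append(chunk)
        return results

As the punctuation test above shows, exactly where ICU places those boundaries varies between ICU releases, which is why that test accepts either tokenisation.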