Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Commit

Permalink
Fix a bug introduced in Synapse v1.74.0 where searching with colons w…
Browse files Browse the repository at this point in the history
…hen using ICU for search term tokenisation would fail with an error. (#15079)

Co-authored-by: David Robertson <davidr@element.io>
  • Loading branch information
reivilibre and David Robertson committed Feb 20, 2023
1 parent 7ee7f49 commit 1cbc3f1
Show file tree
Hide file tree
Showing 4 changed files with 90 additions and 5 deletions.
1 change: 1 addition & 0 deletions changelog.d/15079.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix a bug introduced in Synapse v1.74.0 where searching with colons when using ICU for search term tokenisation would fail with an error.
24 changes: 20 additions & 4 deletions synapse/storage/databases/main/user_directory.py
Original file line number Diff line number Diff line change
Expand Up @@ -918,11 +918,19 @@ def _parse_query_postgres(search_term: str) -> Tuple[str, str, str]:
We use this so that we can add prefix matching, which isn't something
that is supported by default.
"""
results = _parse_words(search_term)
escaped_words = []
for word in _parse_words(search_term):
# Postgres tsvector and tsquery quoting rules:
# words potentially containing punctuation should be quoted
# and then existing quotes and backslashes should be doubled
# See: https://www.postgresql.org/docs/current/datatype-textsearch.html#DATATYPE-TSQUERY

quoted_word = word.replace("'", "''").replace("\\", "\\\\")
escaped_words.append(f"'{quoted_word}'")

both = " & ".join("(%s:* | %s)" % (result, result) for result in results)
exact = " & ".join("%s" % (result,) for result in results)
prefix = " & ".join("%s:*" % (result,) for result in results)
both = " & ".join("(%s:* | %s)" % (word, word) for word in escaped_words)
exact = " & ".join("%s" % (word,) for word in escaped_words)
prefix = " & ".join("%s:*" % (word,) for word in escaped_words)

return both, exact, prefix

Expand All @@ -944,6 +952,14 @@ def _parse_words(search_term: str) -> List[str]:
if USE_ICU:
return _parse_words_with_icu(search_term)

return _parse_words_with_regex(search_term)


def _parse_words_with_regex(search_term: str) -> List[str]:
"""
Break down search term into words, when we don't have ICU available.
See: `_parse_words`
"""
return re.findall(r"([\w\-]+)", search_term, re.UNICODE)


Expand Down
7 changes: 7 additions & 0 deletions tests/handlers/test_user_directory.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,13 @@ def test_excludes_appservice_sender(self) -> None:
self.helper.join(room, self.appservice.sender, tok=self.appservice.token)
self._check_only_one_user_in_directory(user, room)

def test_search_term_with_colon_in_it_does_not_raise(self) -> None:
"""
Regression test: Test that search terms with colons in them are acceptable.
"""
u1 = self.register_user("user1", "pass")
self.get_success(self.handler.search_users(u1, "haha:paamayim-nekudotayim", 10))

def test_user_not_in_users_table(self) -> None:
"""Unclear how it happens, but on matrix.org we've seen join events
for users who aren't in the users table. Test that we don't fall over
Expand Down
63 changes: 62 additions & 1 deletion tests/storage/test_user_directory.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@
from synapse.server import HomeServer
from synapse.storage import DataStore
from synapse.storage.background_updates import _BackgroundUpdateHandler
from synapse.storage.databases.main import user_directory
from synapse.storage.databases.main.user_directory import (
_parse_words_with_icu,
_parse_words_with_regex,
)
from synapse.storage.roommember import ProfileInfo
from synapse.util import Clock

Expand All @@ -42,7 +47,7 @@
BOB = "@bob:b"
BOBBY = "@bobby:a"
# The localpart isn't 'Bela' on purpose so we can test looking up display names.
BELA = "@somenickname:a"
BELA = "@somenickname:example.org"


class GetUserDirectoryTables:
Expand Down Expand Up @@ -423,6 +428,8 @@ async def mocked_process_users(*args: Any, **kwargs: Any) -> int:


class UserDirectoryStoreTestCase(HomeserverTestCase):
use_icu = False

def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
self.store = hs.get_datastores().main

Expand All @@ -434,6 +441,12 @@ def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
self.get_success(self.store.update_profile_in_user_dir(BELA, "Bela", None))
self.get_success(self.store.add_users_in_public_rooms("!room:id", (ALICE, BOB)))

self._restore_use_icu = user_directory.USE_ICU
user_directory.USE_ICU = self.use_icu

def tearDown(self) -> None:
user_directory.USE_ICU = self._restore_use_icu

def test_search_user_dir(self) -> None:
# normally when alice searches the directory she should just find
# bob because bobby doesn't share a room with her.
Expand Down Expand Up @@ -478,6 +491,26 @@ def test_search_user_dir_stop_words(self) -> None:
{"user_id": BELA, "display_name": "Bela", "avatar_url": None},
)

@override_config({"user_directory": {"search_all_users": True}})
def test_search_user_dir_start_of_user_id(self) -> None:
"""Tests that a user can look up another user by searching for the start
of their user ID.
"""
r = self.get_success(self.store.search_user_dir(ALICE, "somenickname:exa", 10))
self.assertFalse(r["limited"])
self.assertEqual(1, len(r["results"]))
self.assertDictEqual(
r["results"][0],
{"user_id": BELA, "display_name": "Bela", "avatar_url": None},
)


class UserDirectoryStoreTestCaseWithIcu(UserDirectoryStoreTestCase):
use_icu = True

if not icu:
skip = "Requires PyICU"


class UserDirectoryICUTestCase(HomeserverTestCase):
if not icu:
Expand Down Expand Up @@ -513,3 +546,31 @@ def test_icu_word_boundary(self) -> None:
r["results"][0],
{"user_id": ALICE, "display_name": display_name, "avatar_url": None},
)

def test_icu_word_boundary_punctuation(self) -> None:
"""
Tests the behaviour of punctuation with the ICU tokeniser.
Seems to depend on underlying version of ICU.
"""

# Note: either tokenisation is fine, because Postgres actually splits
# words itself afterwards.
self.assertIn(
_parse_words_with_icu("lazy'fox jumped:over the.dog"),
(
# ICU 66 on Ubuntu 20.04
["lazy'fox", "jumped", "over", "the", "dog"],
# ICU 70 on Ubuntu 22.04
["lazy'fox", "jumped:over", "the.dog"],
),
)

def test_regex_word_boundary_punctuation(self) -> None:
"""
Tests the behaviour of punctuation with the non-ICU tokeniser
"""
self.assertEqual(
_parse_words_with_regex("lazy'fox jumped:over the.dog"),
["lazy", "fox", "jumped", "over", "the", "dog"],
)

0 comments on commit 1cbc3f1

Please sign in to comment.