From 0ce55f5f7c37b97d9331bd77ad1a34a42b99dbd9 Mon Sep 17 00:00:00 2001 From: "Olivier Wilkinson (reivilibre)" Date: Wed, 15 Feb 2023 18:31:46 +0000 Subject: [PATCH 1/9] Add failing test --- tests/handlers/test_user_directory.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/handlers/test_user_directory.py b/tests/handlers/test_user_directory.py index f65a68b9c2df..861345c7d5da 100644 --- a/tests/handlers/test_user_directory.py +++ b/tests/handlers/test_user_directory.py @@ -192,6 +192,13 @@ def test_excludes_appservice_sender(self) -> None: self.helper.join(room, self.appservice.sender, tok=self.appservice.token) self._check_only_one_user_in_directory(user, room) + def test_search_term_with_colon_in_it(self) -> None: + """ + Regression test: Test that search terms with colons in them are acceptable. + """ + u1 = self.register_user("user1", "pass") + self.get_success(self.handler.search_users(u1, "haha:paamayim-nekudotayim", 10)) + def test_user_not_in_users_table(self) -> None: """Unclear how it happens, but on matrix.org we've seen join events for users who aren't in the users table. Test that we don't fall over From b312e6d0796e679212aa8d7dbaf4a15402376be0 Mon Sep 17 00:00:00 2001 From: "Olivier Wilkinson (reivilibre)" Date: Wed, 15 Feb 2023 18:39:54 +0000 Subject: [PATCH 2/9] Small rename of variables --- synapse/storage/databases/main/user_directory.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/synapse/storage/databases/main/user_directory.py b/synapse/storage/databases/main/user_directory.py index f6a6fd4079ec..8cf8f96ee0c0 100644 --- a/synapse/storage/databases/main/user_directory.py +++ b/synapse/storage/databases/main/user_directory.py @@ -918,11 +918,11 @@ def _parse_query_postgres(search_term: str) -> Tuple[str, str, str]: We use this so that we can add prefix matching, which isn't something that is supported by default. """ - results = _parse_words(search_term) + words = _parse_words(search_term) - both = " & ".join("(%s:* | %s)" % (result, result) for result in results) - exact = " & ".join("%s" % (result,) for result in results) - prefix = " & ".join("%s:*" % (result,) for result in results) + both = " & ".join("(%s:* | %s)" % (word, word) for word in words) + exact = " & ".join("%s" % (word,) for word in words) + prefix = " & ".join("%s:*" % (word,) for word in words) return both, exact, prefix From 2085a4066961e37dd463e6277e213b0dc7d90ab8 Mon Sep 17 00:00:00 2001 From: "Olivier Wilkinson (reivilibre)" Date: Wed, 15 Feb 2023 19:40:02 +0000 Subject: [PATCH 3/9] Add minitests to characterise how we do tokenisation today --- .../storage/databases/main/user_directory.py | 8 +++++++ tests/storage/test_user_directory.py | 22 +++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/synapse/storage/databases/main/user_directory.py b/synapse/storage/databases/main/user_directory.py index 8cf8f96ee0c0..5914301e1093 100644 --- a/synapse/storage/databases/main/user_directory.py +++ b/synapse/storage/databases/main/user_directory.py @@ -944,6 +944,14 @@ def _parse_words(search_term: str) -> List[str]: if USE_ICU: return _parse_words_with_icu(search_term) + return _parse_words_with_regex(search_term) + + +def _parse_words_with_regex(search_term: str) -> List[str]: + """ + Break down search term into words, when we don't have ICU available. + See: `_parse_words` + """ return re.findall(r"([\w\-]+)", search_term, re.UNICODE) diff --git a/tests/storage/test_user_directory.py b/tests/storage/test_user_directory.py index f1ca523d23f9..4a20e9d10ba7 100644 --- a/tests/storage/test_user_directory.py +++ b/tests/storage/test_user_directory.py @@ -25,6 +25,10 @@ from synapse.server import HomeServer from synapse.storage import DataStore from synapse.storage.background_updates import _BackgroundUpdateHandler +from synapse.storage.databases.main.user_directory import ( + _parse_words_with_icu, + _parse_words_with_regex, +) from synapse.storage.roommember import ProfileInfo from synapse.util import Clock @@ -513,3 +517,21 @@ def test_icu_word_boundary(self) -> None: r["results"][0], {"user_id": ALICE, "display_name": display_name, "avatar_url": None}, ) + + def test_icu_word_boundary_punctuation(self) -> None: + """ + Tests the behaviour of punctuation with the ICU tokeniser + """ + self.assertEqual( + _parse_words_with_icu("lazy'fox jumped:over the.dog"), + ["lazy'fox", "jumped:over", "the.dog"], + ) + + def test_regex_word_boundary_punctuation(self) -> None: + """ + Tests the behaviour of punctuation with the non-ICU tokeniser + """ + self.assertEqual( + _parse_words_with_regex("lazy'fox jumped:over the.dog"), + ["lazy", "fox", "jumped", "over", "the", "dog"], + ) From bc5c51153e0ccfb16e7ef405ada7b117ea34def4 Mon Sep 17 00:00:00 2001 From: "Olivier Wilkinson (reivilibre)" Date: Wed, 15 Feb 2023 19:40:27 +0000 Subject: [PATCH 4/9] Add escaping to our tsquery builder --- .../storage/databases/main/user_directory.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/synapse/storage/databases/main/user_directory.py b/synapse/storage/databases/main/user_directory.py index 5914301e1093..30af4b3b6cfe 100644 --- a/synapse/storage/databases/main/user_directory.py +++ b/synapse/storage/databases/main/user_directory.py @@ -918,11 +918,19 @@ def _parse_query_postgres(search_term: str) -> Tuple[str, str, str]: We use this so that we can add prefix matching, which isn't something that is supported by default. """ - words = _parse_words(search_term) - - both = " & ".join("(%s:* | %s)" % (word, word) for word in words) - exact = " & ".join("%s" % (word,) for word in words) - prefix = " & ".join("%s:*" % (word,) for word in words) + escaped_words = [] + for word in _parse_words(search_term): + # Postgres tsvector and tsquery quoting rules: + # words potentially containing punctuation should be quoted + # and then existing quotes and backslashes should be doubled + # See: https://www.postgresql.org/docs/current/datatype-textsearch.html#DATATYPE-TSQUERY + + quoted_word = word.replace("'", "''").replace("\\", "\\\\") + escaped_words.append(f"'{quoted_word}'") + + both = " & ".join("(%s:* | %s)" % (word, word) for word in escaped_words) + exact = " & ".join("%s" % (word,) for word in escaped_words) + prefix = " & ".join("%s:*" % (word,) for word in escaped_words) return both, exact, prefix From a19355bce7a55782bed59a1251a55d2d2dd2a79e Mon Sep 17 00:00:00 2001 From: "Olivier Wilkinson (reivilibre)" Date: Wed, 15 Feb 2023 19:50:25 +0000 Subject: [PATCH 5/9] Test that we can find a user by prefix of MXID (I wasn't super confident that the ICU tokeniser wouldn't break this, but a bit of poking later, I think I'm happy. But thought it should be tested anyway!) --- tests/storage/test_user_directory.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tests/storage/test_user_directory.py b/tests/storage/test_user_directory.py index 4a20e9d10ba7..29f4874c2043 100644 --- a/tests/storage/test_user_directory.py +++ b/tests/storage/test_user_directory.py @@ -46,7 +46,7 @@ BOB = "@bob:b" BOBBY = "@bobby:a" # The localpart isn't 'Bela' on purpose so we can test looking up display names. -BELA = "@somenickname:a" +BELA = "@somenickname:example.org" class GetUserDirectoryTables: @@ -482,6 +482,19 @@ def test_search_user_dir_stop_words(self) -> None: {"user_id": BELA, "display_name": "Bela", "avatar_url": None}, ) + @override_config({"user_directory": {"search_all_users": True}}) + def test_search_user_dir_start_of_user_id(self) -> None: + """Tests that a user can look up another user by searching for the start + of their user ID. + """ + r = self.get_success(self.store.search_user_dir(ALICE, "somenickname:exa", 10)) + self.assertFalse(r["limited"]) + self.assertEqual(1, len(r["results"])) + self.assertDictEqual( + r["results"][0], + {"user_id": BELA, "display_name": "Bela", "avatar_url": None}, + ) + class UserDirectoryICUTestCase(HomeserverTestCase): if not icu: From 30c722afe79a8ada7f1593c5f2db53404cb85e25 Mon Sep 17 00:00:00 2001 From: "Olivier Wilkinson (reivilibre)" Date: Wed, 15 Feb 2023 19:53:43 +0000 Subject: [PATCH 6/9] Newsfile Signed-off-by: Olivier Wilkinson (reivilibre) --- changelog.d/15079.bugfix | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/15079.bugfix diff --git a/changelog.d/15079.bugfix b/changelog.d/15079.bugfix new file mode 100644 index 000000000000..907892c1ef5f --- /dev/null +++ b/changelog.d/15079.bugfix @@ -0,0 +1 @@ +Fix a bug introduced in Synapse v1.74.0 where searching with colons when using ICU for search term tokenisation would fail with an error. \ No newline at end of file From 68d3c645d695a71d69e68c262065eaa391c9a679 Mon Sep 17 00:00:00 2001 From: "Olivier Wilkinson (reivilibre)" Date: Thu, 16 Feb 2023 11:45:07 +0000 Subject: [PATCH 7/9] Loosen test to accept at least 2 versions of ICU --- tests/storage/test_user_directory.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/tests/storage/test_user_directory.py b/tests/storage/test_user_directory.py index 29f4874c2043..c02998809a0b 100644 --- a/tests/storage/test_user_directory.py +++ b/tests/storage/test_user_directory.py @@ -533,11 +533,21 @@ def test_icu_word_boundary(self) -> None: def test_icu_word_boundary_punctuation(self) -> None: """ - Tests the behaviour of punctuation with the ICU tokeniser + Tests the behaviour of punctuation with the ICU tokeniser. + + Seems to depend on underlying version of ICU. """ - self.assertEqual( + + # Note: either tokenisation is fine, because Postgres actually splits + # words itself afterwards. + self.assertIn( _parse_words_with_icu("lazy'fox jumped:over the.dog"), - ["lazy'fox", "jumped:over", "the.dog"], + ( + # ICU 66 on Ubuntu 20.04 + ["lazy'fox", "jumped", "over", "the", "dog"], + # ICU 70 on Ubuntu 22.04 + ["lazy'fox", "jumped:over", "the.dog"], + ), ) def test_regex_word_boundary_punctuation(self) -> None: From e5c2c78dce49c4d1475f2190184f087d751077eb Mon Sep 17 00:00:00 2001 From: reivilibre Date: Fri, 17 Feb 2023 18:14:02 +0000 Subject: [PATCH 8/9] Update tests/handlers/test_user_directory.py Co-authored-by: David Robertson --- tests/handlers/test_user_directory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/handlers/test_user_directory.py b/tests/handlers/test_user_directory.py index 861345c7d5da..a02c1c62279a 100644 --- a/tests/handlers/test_user_directory.py +++ b/tests/handlers/test_user_directory.py @@ -192,7 +192,7 @@ def test_excludes_appservice_sender(self) -> None: self.helper.join(room, self.appservice.sender, tok=self.appservice.token) self._check_only_one_user_in_directory(user, room) - def test_search_term_with_colon_in_it(self) -> None: + def test_search_term_with_colon_in_it_does_not_raise(self) -> None: """ Regression test: Test that search terms with colons in them are acceptable. """ From ef374527b7c431b729243304ea0175e259bb78b6 Mon Sep 17 00:00:00 2001 From: "Olivier Wilkinson (reivilibre)" Date: Fri, 17 Feb 2023 18:41:27 +0000 Subject: [PATCH 9/9] Let the tests run on both ICU and non-ICU --- tests/storage/test_user_directory.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/storage/test_user_directory.py b/tests/storage/test_user_directory.py index c02998809a0b..2d169684cf2e 100644 --- a/tests/storage/test_user_directory.py +++ b/tests/storage/test_user_directory.py @@ -25,6 +25,7 @@ from synapse.server import HomeServer from synapse.storage import DataStore from synapse.storage.background_updates import _BackgroundUpdateHandler +from synapse.storage.databases.main import user_directory from synapse.storage.databases.main.user_directory import ( _parse_words_with_icu, _parse_words_with_regex, @@ -427,6 +428,8 @@ async def mocked_process_users(*args: Any, **kwargs: Any) -> int: class UserDirectoryStoreTestCase(HomeserverTestCase): + use_icu = False + def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None: self.store = hs.get_datastores().main @@ -438,6 +441,12 @@ def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None: self.get_success(self.store.update_profile_in_user_dir(BELA, "Bela", None)) self.get_success(self.store.add_users_in_public_rooms("!room:id", (ALICE, BOB))) + self._restore_use_icu = user_directory.USE_ICU + user_directory.USE_ICU = self.use_icu + + def tearDown(self) -> None: + user_directory.USE_ICU = self._restore_use_icu + def test_search_user_dir(self) -> None: # normally when alice searches the directory she should just find # bob because bobby doesn't share a room with her. @@ -496,6 +505,13 @@ def test_search_user_dir_start_of_user_id(self) -> None: ) +class UserDirectoryStoreTestCaseWithIcu(UserDirectoryStoreTestCase): + use_icu = True + + if not icu: + skip = "Requires PyICU" + + class UserDirectoryICUTestCase(HomeserverTestCase): if not icu: skip = "Requires PyICU"