From e46f13bd57d98e7cdc0154a452a906013f0fc2aa Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Mon, 16 Oct 2023 13:34:50 -0400 Subject: [PATCH 1/5] Clarify comments. --- synapse/util/retryutils.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/synapse/util/retryutils.py b/synapse/util/retryutils.py index 0e1f907667ce..d5efff164ebc 100644 --- a/synapse/util/retryutils.py +++ b/synapse/util/retryutils.py @@ -170,10 +170,10 @@ def __init__( database in milliseconds, or zero if the last request was successful. backoff_on_404: Back off if we get a 404 - backoff_on_failure: set to False if we should not increase the retry interval on a failure. - + notifier: A notifier used to mark servers as up. + replication_client A replication client used to mark servers as up. backoff_on_all_error_codes: Whether we should back off on any error code. """ @@ -296,11 +296,7 @@ async def store_retry_timings() -> None: self.notifier.notify_remote_server_up(self.destination) if self.replication_client: - # If we're on a worker we try and inform master about this. The - # replication client doesn't hook into the notifier to avoid - # infinite loops where we send a `REMOTE_SERVER_UP` command to - # master, which then echoes it back to us which in turn pokes - # the notifier. + # Inform other workers that the remote server is up. self.replication_client.send_remote_server_up(self.destination) except Exception: From 327a2c9e18e0e06d3592cd8457acc816990de0e6 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Mon, 16 Oct 2023 13:53:46 -0400 Subject: [PATCH 2/5] Do not mark servers as up for failures. --- synapse/util/retryutils.py | 17 ++++++---- tests/util/test_retryutils.py | 59 +++++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+), 6 deletions(-) diff --git a/synapse/util/retryutils.py b/synapse/util/retryutils.py index d5efff164ebc..58f0f7cd300e 100644 --- a/synapse/util/retryutils.py +++ b/synapse/util/retryutils.py @@ -237,6 +237,9 @@ def __exit__( else: valid_err_code = False + # Store whether the destination had previously been failing. + previously_failing = bool(self.failure_ts) + if success: # We connected successfully. if not self.retry_interval: @@ -291,13 +294,15 @@ async def store_retry_timings() -> None: self.retry_interval, ) - if self.notifier: - # Inform the relevant places that the remote server is back up. - self.notifier.notify_remote_server_up(self.destination) + # If the server was previously failing, but is no longer. + if previously_failing: + if self.notifier: + # Inform the relevant places that the remote server is back up. + self.notifier.notify_remote_server_up(self.destination) - if self.replication_client: - # Inform other workers that the remote server is up. - self.replication_client.send_remote_server_up(self.destination) + if self.replication_client: + # Inform other workers that the remote server is up. + self.replication_client.send_remote_server_up(self.destination) except Exception: logger.exception("Failed to store destination_retry_timings") diff --git a/tests/util/test_retryutils.py b/tests/util/test_retryutils.py index 4bcd17a6fc93..6bf7e701290c 100644 --- a/tests/util/test_retryutils.py +++ b/tests/util/test_retryutils.py @@ -11,6 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from unittest import mock + +from synapse.notifier import Notifier +from synapse.replication.tcp.handler import ReplicationCommandHandler from synapse.util.retryutils import NotRetryingDestination, get_retry_limiter from tests.unittest import HomeserverTestCase @@ -109,6 +113,61 @@ def test_limiter(self) -> None: new_timings = self.get_success(store.get_destination_retry_timings("test_dest")) self.assertIsNone(new_timings) + def test_notifier_replcation(self) -> None: + """Ensure the notifier/replication client is called only when expected.""" + store = self.hs.get_datastores().main + + notifier = mock.Mock(spec=Notifier) + replication_client = mock.Mock(spec=ReplicationCommandHandler) + + limiter = self.get_success( + get_retry_limiter( + "test_dest", + self.clock, + store, + notifier=notifier, + replication_client=replication_client, + ) + ) + + # The server is already up, nothing should occur. + self.pump(1) + with limiter: + pass + self.pump() + + new_timings = self.get_success(store.get_destination_retry_timings("test_dest")) + self.assertIsNone(new_timings) + notifier.notify_remote_server_up.assert_not_called() + replication_client.send_remote_server_up.assert_not_called() + + # Attempt again, but return an error. This will cause new retry timings. + self.pump(1) + try: + with limiter: + raise AssertionError("argh") + except AssertionError: + pass + self.pump() + + new_timings = self.get_success(store.get_destination_retry_timings("test_dest")) + # The exact retry timings are tested separately. + self.assertIsNotNone(new_timings) + notifier.notify_remote_server_up.assert_not_called() + replication_client.send_remote_server_up.assert_not_called() + + # One more attempt, successfully this time. + self.pump(1) + with limiter: + pass + self.pump() + + new_timings = self.get_success(store.get_destination_retry_timings("test_dest")) + # The exact retry timings are tested separately. + self.assertIsNone(new_timings) + notifier.notify_remote_server_up.assert_called_once_with("test_dest") + replication_client.send_remote_server_up.assert_called_once_with("test_dest") + def test_max_retry_interval(self) -> None: """Test that `destination_max_retry_interval` setting works as expected""" store = self.hs.get_datastores().main From 7a25e70630d6f5ab675f422f41bf67eff9ea1623 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Mon, 16 Oct 2023 13:55:16 -0400 Subject: [PATCH 3/5] Newsfragment --- changelog.d/16506.bugfix | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/16506.bugfix diff --git a/changelog.d/16506.bugfix b/changelog.d/16506.bugfix new file mode 100644 index 000000000000..a2c7e82b9e09 --- /dev/null +++ b/changelog.d/16506.bugfix @@ -0,0 +1 @@ +Fix a bug introduced in Synapse 1.59.0 where servers would be incorrectly marked as available when a request resulted in an error. From 2da1f39b5b6400968ded2a08084332cf313438e3 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Mon, 16 Oct 2023 14:49:10 -0400 Subject: [PATCH 4/5] Additional fixes. --- synapse/util/retryutils.py | 7 +++++-- tests/util/test_retryutils.py | 20 ++++++++++++++++++-- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/synapse/util/retryutils.py b/synapse/util/retryutils.py index 58f0f7cd300e..547202c96b0b 100644 --- a/synapse/util/retryutils.py +++ b/synapse/util/retryutils.py @@ -237,7 +237,7 @@ def __exit__( else: valid_err_code = False - # Store whether the destination had previously been failing. + # Whether previous requests to the destination had been failing. previously_failing = bool(self.failure_ts) if success: @@ -285,6 +285,9 @@ def __exit__( if self.failure_ts is None: self.failure_ts = retry_last_ts + # Whether the current request to the destination had been failing. + currently_failing = bool(self.failure_ts) + async def store_retry_timings() -> None: try: await self.store.set_destination_retry_timings( @@ -295,7 +298,7 @@ async def store_retry_timings() -> None: ) # If the server was previously failing, but is no longer. - if previously_failing: + if previously_failing and not currently_failing: if self.notifier: # Inform the relevant places that the remote server is back up. self.notifier.notify_remote_server_up(self.destination) diff --git a/tests/util/test_retryutils.py b/tests/util/test_retryutils.py index 6bf7e701290c..1ab3b183942a 100644 --- a/tests/util/test_retryutils.py +++ b/tests/util/test_retryutils.py @@ -141,7 +141,8 @@ def test_notifier_replcation(self) -> None: notifier.notify_remote_server_up.assert_not_called() replication_client.send_remote_server_up.assert_not_called() - # Attempt again, but return an error. This will cause new retry timings. + # Attempt again, but return an error. This will cause new retry timings, but + # should not trigger server up notifications. self.pump(1) try: with limiter: @@ -156,7 +157,22 @@ def test_notifier_replcation(self) -> None: notifier.notify_remote_server_up.assert_not_called() replication_client.send_remote_server_up.assert_not_called() - # One more attempt, successfully this time. + # A second failing request should be treated as the above. + self.pump(1) + try: + with limiter: + raise AssertionError("argh") + except AssertionError: + pass + self.pump() + + new_timings = self.get_success(store.get_destination_retry_timings("test_dest")) + # The exact retry timings are tested separately. + self.assertIsNotNone(new_timings) + notifier.notify_remote_server_up.assert_not_called() + replication_client.send_remote_server_up.assert_not_called() + + # A final successful attempt should generate a server up notification. self.pump(1) with limiter: pass From 022c35973475f1af5ad73b97e7f0a53f4b6d2e57 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Mon, 16 Oct 2023 15:25:37 -0400 Subject: [PATCH 5/5] Typo fix. --- tests/util/test_retryutils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/util/test_retryutils.py b/tests/util/test_retryutils.py index 1ab3b183942a..ad88b2456670 100644 --- a/tests/util/test_retryutils.py +++ b/tests/util/test_retryutils.py @@ -113,7 +113,7 @@ def test_limiter(self) -> None: new_timings = self.get_success(store.get_destination_retry_timings("test_dest")) self.assertIsNone(new_timings) - def test_notifier_replcation(self) -> None: + def test_notifier_replication(self) -> None: """Ensure the notifier/replication client is called only when expected.""" store = self.hs.get_datastores().main