neondatabase · VladLazar · Mar 20, 2024 · Mar 19, 2024
diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
@@ -20,6 +20,7 @@
     VanillaPostgres,
     wait_for_last_flush_lsn,
 )
+from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import (
     assert_tenant_state,
     timeline_delete_wait_completed,
@@ -684,6 +685,13 @@ def assert_physical_size_invariants(sizes: TimelinePhysicalSizeValues):
     # XXX would be nice to assert layer file physical storage utilization here as well, but we can only do that for LocalFS
 
 
+def wait_for_tenant_startup_completions(client: PageserverHttpClient, count: int):
+    def condition():
+        assert client.get_metric_value("pageserver_tenant_startup_complete_total") == count
+
+    wait_until(5, 1.0, condition)
+
+
 def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
     """
     Tenants warmuping up opportunistically will wait for one another's logical size calculations to complete
@@ -767,10 +775,7 @@ def at_least_one_active():
     # That one that we successfully accessed is now Active
     expect_activated += 1
     assert pageserver_http.tenant_status(tenant_id=stuck_tenant_id)["state"]["slug"] == "Active"
-    assert (
-        pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total")
-        == expect_activated - 1
-    )
+    wait_for_tenant_startup_completions(pageserver_http, count=expect_activated - 1)
 
     # The ones we didn't touch are still in Attaching
     assert (
@@ -790,10 +795,7 @@ def at_least_one_active():
         == n_tenants - expect_activated
     )
 
-    assert (
-        pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total")
-        == expect_activated - 1
-    )
+    wait_for_tenant_startup_completions(pageserver_http, count=expect_activated - 1)
 
     # When we unblock logical size calculation, all tenants should proceed to active state via
     # the warmup route.
@@ -813,7 +815,7 @@ def all_active():
     assert (
         pageserver_http.get_metric_value("pageserver_tenant_startup_scheduled_total") == n_tenants
     )
-    assert pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") == n_tenants
+    wait_for_tenant_startup_completions(pageserver_http, count=n_tenants)
 
     # Check that tenant deletion/detach proactively wakes tenants: this is done separately to the main
     # body of the test because it will disrupt tenant counts