Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

test: fix on demand activation test flakiness #7180

Merged
merged 1 commit into from
Mar 20, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
20 changes: 11 additions & 9 deletions test_runner/regress/test_timeline_size.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
VanillaPostgres,
wait_for_last_flush_lsn,
)
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pageserver.utils import (
assert_tenant_state,
timeline_delete_wait_completed,
Expand Down Expand Up @@ -684,6 +685,13 @@ def assert_physical_size_invariants(sizes: TimelinePhysicalSizeValues):
# XXX would be nice to assert layer file physical storage utilization here as well, but we can only do that for LocalFS


def wait_for_tenant_startup_completions(client: PageserverHttpClient, count: int):
    """
    Wait for the `pageserver_tenant_startup_complete_total` metric to equal `count`.

    The metric is re-read through `client` on every attempt rather than being
    asserted a single time.

    NOTE(review): the retry behaviour lives in `wait_until`, which is defined
    outside this block -- presumably up to 5 attempts spaced 1.0s apart, with
    the assertion failure surfacing if no attempt succeeds. Confirm against
    that helper.
    """

    def startup_completions_match():
        # Fetch a fresh value each time this check is invoked.
        observed = client.get_metric_value("pageserver_tenant_startup_complete_total")
        assert observed == count

    wait_until(5, 1.0, startup_completions_match)


def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
"""
Tenants warmuping up opportunistically will wait for one another's logical size calculations to complete
Expand Down Expand Up @@ -767,10 +775,7 @@ def at_least_one_active():
# That one that we successfully accessed is now Active
expect_activated += 1
assert pageserver_http.tenant_status(tenant_id=stuck_tenant_id)["state"]["slug"] == "Active"
assert (
pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total")
== expect_activated - 1
)
wait_for_tenant_startup_completions(pageserver_http, count=expect_activated - 1)

# The ones we didn't touch are still in Attaching
assert (
Expand All @@ -790,10 +795,7 @@ def at_least_one_active():
== n_tenants - expect_activated
)

assert (
pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total")
== expect_activated - 1
)
wait_for_tenant_startup_completions(pageserver_http, count=expect_activated - 1)

# When we unblock logical size calculation, all tenants should proceed to active state via
# the warmup route.
Expand All @@ -813,7 +815,7 @@ def all_active():
assert (
pageserver_http.get_metric_value("pageserver_tenant_startup_scheduled_total") == n_tenants
)
assert pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") == n_tenants
wait_for_tenant_startup_completions(pageserver_http, count=n_tenants)

# Check that tenant deletion/detach proactively wakes tenants: this is done separately to the main
# body of the test because it will disrupt tenant counts
Expand Down