bench: run branch_creation_many at 500, seeded (#6959)
We have a benchmark for creating a lot of branches, but it does random
things, and the branch count is not the largest maximum we aim to
support. If this PR stabilizes the benchmark's total duration, it means
that some branch structures are very much slower than others; in that
case we should add a seed-outputting variant to help find and reproduce
such cases.
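
A minimal sketch of the seeding idea (Python, mirroring the test change
below; branch_parents is a hypothetical helper, not part of the test
suite): seeding the PRNG with a fixed string makes the "random" branch
topology identical on every run, so total durations become comparable
between runs.

import random
from typing import List

def branch_parents(n_branches: int, seed: str = "2024-02-29") -> List[int]:
    """Pick a parent index for each new branch, deterministically."""
    rng = random.Random(seed)  # fixed seed -> same "random" tree shape on every run
    return [rng.randint(0, i) for i in range(n_branches)]

# two runs with the same seed choose the identical branch structure
assert branch_parents(500) == branch_parents(500)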

Additionally, record for the benchmark:
- shutdown duration
- startup metrics once done (on restart)
- duration of first compaction completion via debug logging
koivunej committed Mar 7, 2024
1 parent d3c583e commit 602a4da
Showing 2 changed files with 109 additions and 8 deletions.
7 changes: 6 additions & 1 deletion pageserver/src/tenant/tasks.rs
@@ -101,6 +101,7 @@ pub fn start_background_loops(
             _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
         };
         compaction_loop(tenant, cancel)
+            // If you rename this span, change the RUST_LOG env variable in test_runner/performance/test_branch_creation.py
             .instrument(info_span!("compaction_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
             .await;
         Ok(())
@@ -198,7 +199,11 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
             }
         };

-        warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Compaction);
+        let elapsed = started_at.elapsed();
+        warn_when_period_overrun(elapsed, period, BackgroundLoopKind::Compaction);
+
+        // the duration is recorded by performance tests by enabling debug in this function
+        tracing::debug!(elapsed_ms=elapsed.as_millis(), "compaction iteration complete");

         // Perhaps we did no work and the walredo process has been idle for some time:
         // give it a chance to shut down to avoid leaving walredo process running indefinitely.
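The debug line above is what the performance test scrapes after restart.
A hedged sketch of both sides of that contract (Python; the RUST_LOG
value uses tracing-subscriber's EnvFilter span-directive syntax, the
tenant id is a placeholder, and compaction_seconds is a hypothetical
helper that mirrors the regex in the test below):

import re

# enable debug only inside the compaction_loop span of one tenant, keep
# everything else at info (tracing-subscriber EnvFilter span directive)
rust_log = "[compaction_loop{tenant_id=<TENANT_ID>}]=debug,info"

def compaction_seconds(log_line: str) -> float:
    """Extract the duration from the 'compaction iteration complete' debug line."""
    needle = re.search(" elapsed_ms=([0-9]+)", log_line)
    assert needle is not None, "failed to find the elapsed time"
    return int(needle.group(1)) / 1000.0

assert compaction_seconds("... elapsed_ms=1234 compaction iteration complete") == 1.234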
110 changes: 103 additions & 7 deletions test_runner/performance/test_branch_creation.py
@@ -1,4 +1,5 @@
 import random
+import re
 import statistics
 import threading
 import time
@@ -7,11 +8,14 @@
 from typing import List

 import pytest
-from fixtures.benchmark_fixture import MetricReport
+from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
 from fixtures.compare_fixtures import NeonCompare
 from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonPageserver
 from fixtures.pageserver.utils import wait_for_last_record_lsn
 from fixtures.types import Lsn
+from fixtures.utils import wait_until
+from prometheus_client.samples import Sample


 def _record_branch_creation_durations(neon_compare: NeonCompare, durs: List[float]):
@@ -89,11 +93,17 @@ def run_pgbench(branch: str):
     _record_branch_creation_durations(neon_compare, branch_creation_durations)


-@pytest.mark.parametrize("n_branches", [1024])
-# Test measures the latency of branch creation when creating a lot of branches.
-def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int):
+@pytest.mark.parametrize("n_branches", [500, 1024])
+@pytest.mark.parametrize("shape", ["one_ancestor", "random"])
+def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: str):
+    """
+    Test measures the latency of branch creation when creating a lot of branches.
+    """
     env = neon_compare.env

+    # seed the prng so we will measure the same structure every time
+    rng = random.Random("2024-02-29")
+
     env.neon_cli.create_branch("b0")

     endpoint = env.endpoints.create_start("b0")
@@ -102,15 +112,101 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int):
     branch_creation_durations = []

     for i in range(n_branches):
-        # random a source branch
-        p = random.randint(0, i)
+        if shape == "random":
+            parent = f"b{rng.randint(0, i)}"
+        elif shape == "one_ancestor":
+            parent = "b0"
+        else:
+            raise RuntimeError(f"unimplemented shape: {shape}")
+
         timer = timeit.default_timer()
-        env.neon_cli.create_branch("b{}".format(i + 1), "b{}".format(p))
+        # each of these uploads to remote storage before completion
+        env.neon_cli.create_branch(f"b{i + 1}", parent)
         dur = timeit.default_timer() - timer
         branch_creation_durations.append(dur)

     _record_branch_creation_durations(neon_compare, branch_creation_durations)

+    endpoint.stop_and_destroy()
+
+    with neon_compare.record_duration("shutdown"):
+        # this sleeps 100ms between polls
+        env.pageserver.stop()
+
+    startup_line = "INFO version: git(-env)?:"
+
+    # find the first line of the log file so we can find the next start later
+    _, first_start = wait_until(5, 1, lambda: env.pageserver.assert_log_contains(startup_line))
+
+    # start without gc so we can time compaction with less noise; use shorter
+    # period for compaction so it starts earlier
+    env.pageserver.start(
+        overrides=(
+            "--pageserver-config-override=tenant_config={ compaction_period = '3s', gc_period = '0s' }",
+        ),
+        # this does print more than we want, but the number should be comparable between runs
+        extra_env_vars={
+            "RUST_LOG": f"[compaction_loop{{tenant_id={env.initial_tenant}}}]=debug,info"
+        },
+    )
+
+    _, second_start = wait_until(
+        5, 1, lambda: env.pageserver.assert_log_contains(startup_line, first_start)
+    )
+    env.pageserver.quiesce_tenants()
+
+    wait_and_record_startup_metrics(env.pageserver, neon_compare.zenbenchmark, "restart_after")
+
+    # wait for compaction to complete, which most likely has already done so multiple times
+    msg, _ = wait_until(
+        30,
+        1,
+        lambda: env.pageserver.assert_log_contains(
+            f".*tenant_id={env.initial_tenant}.*: compaction iteration complete.*", second_start
+        ),
+    )
+    needle = re.search(" elapsed_ms=([0-9]+)", msg)
+    assert needle is not None, "failed to find the elapsed time"
+    duration = int(needle.group(1)) / 1000.0
+    neon_compare.zenbenchmark.record("compaction", duration, "s", MetricReport.LOWER_IS_BETTER)
+
+
+def wait_and_record_startup_metrics(
+    pageserver: NeonPageserver, target: NeonBenchmarker, prefix: str
+):
+    """
+    Waits until all startup metrics have non-zero values on the pageserver, then records them on the target
+    """
+
+    client = pageserver.http_client()
+
+    expected_labels = set(
+        [
+            "background_jobs_can_start",
+            "complete",
+            "initial",
+            "initial_tenant_load",
+            "initial_tenant_load_remote",
+        ]
+    )
+
+    def metrics_are_filled() -> List[Sample]:
+        m = client.get_metrics()
+        samples = m.query_all("pageserver_startup_duration_seconds")
+        # we should not have duplicate labels
+        matching = [
+            x for x in samples if x.labels.get("phase") in expected_labels and x.value > 0.0
+        ]
+        assert len(matching) == len(expected_labels)
+        return matching
+
+    samples = wait_until(10, 1, metrics_are_filled)
+
+    for sample in samples:
+        phase = sample.labels["phase"]
+        name = f"{prefix}.{phase}"
+        target.record(name, sample.value, "s", MetricReport.LOWER_IS_BETTER)


 # Test measures the branch creation time when branching from a timeline with a lot of relations.
 #

1 comment on commit 602a4da

@github-actions


2570 tests run: 2435 passed, 0 failed, 135 skipped (full report)


Flaky tests (1)

Postgres 15

  • test_secondary_downloads: release

Code coverage* (full report)

  • functions: 28.8% (6993 of 24312 functions)
  • lines: 47.3% (43016 of 90878 lines)

* collected from Rust tests only


The comment gets automatically updated with the latest test results
602a4da at 2024-03-07T15:26:48.935Z :recycle:
