From d22a0920aaf2e2c65b36e0ec34e1a8a3d22c6f52 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 7 Feb 2024 19:58:08 +0200 Subject: [PATCH] tests: try to make restored-datadir comparison tests not flaky v2 This test occasionally fails with a difference in "pg_xact/0000" file between the local and restored datadirs. My hypothesis is that something changed in the database between the last explicit checkpoint and the shutdown. I suspect autovacuum, it could certainly create transactions. To fix, be more precise about the point in time that we compare. Shut down the endpoint first, then read the last LSN (i.e. the shutdown checkpoint's LSN), from the local disk with pg_controldata. And use exactly that LSN in the basebackup. Closes #559 --- test_runner/fixtures/neon_fixtures.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 7ecd7f334978..6833b60da5e6 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4102,24 +4102,21 @@ def list_files_to_compare(pgdata_dir: Path) -> List[str]: # pg is the existing and running compute node, that we want to compare with a basebackup def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint: Endpoint): + pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version) + # Get the timeline ID. We need it for the 'basebackup' command timeline_id = TimelineId(endpoint.safe_psql("SHOW neon.timeline_id")[0][0]) - # many tests already checkpoint, but do it just in case - with closing(endpoint.connect()) as conn: - with conn.cursor() as cur: - cur.execute("CHECKPOINT") - - # wait for pageserver to catch up - wait_for_last_flush_lsn(env, endpoint, endpoint.tenant_id, timeline_id) # stop postgres to ensure that files won't change endpoint.stop() + # Read the shutdown checkpoint's LSN + checkpoint_lsn = pg_bin.get_pg_controldata_checkpoint_lsn(endpoint.pg_data_dir_path()) + # Take a basebackup from pageserver restored_dir_path = env.repo_dir / f"{endpoint.endpoint_id}_restored_datadir" restored_dir_path.mkdir(exist_ok=True) - pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version) psql_path = os.path.join(pg_bin.pg_bin_path, "psql") pageserver_id = env.attachment_service.locate(endpoint.tenant_id)[0]["node_id"] @@ -4127,7 +4124,7 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint {psql_path} \ --no-psqlrc \ postgres://localhost:{env.get_pageserver(pageserver_id).service_port.pg} \ - -c 'basebackup {endpoint.tenant_id} {timeline_id}' \ + -c 'basebackup {endpoint.tenant_id} {timeline_id} {checkpoint_lsn}' \ | tar -x -C {restored_dir_path} """