Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

tests: log hygiene checks for storage controller #6710

Merged
merged 7 commits into from
Mar 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 23 additions & 2 deletions control_plane/attachment_service/src/service.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@ use std::{
time::{Duration, Instant},
};

use crate::{id_lock_map::IdLockMap, persistence::AbortShardSplitStatus};
use crate::{
id_lock_map::IdLockMap, persistence::AbortShardSplitStatus, reconciler::ReconcileError,
};
use anyhow::Context;
use control_plane::storage_controller::{
AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse,
Expand Down Expand Up @@ -733,7 +735,19 @@ impl Service {
tenant.waiter.advance(result.sequence);
}
Err(e) => {
tracing::warn!("Reconcile error: {}", e);
match e {
ReconcileError::Cancel => {
tracing::info!("Reconciler was cancelled");
}
ReconcileError::Remote(mgmt_api::Error::Cancelled) => {
// This might be due to the reconciler getting cancelled, or it might
// be due to the `Node` being marked offline.
tracing::info!("Reconciler cancelled during pageserver API call");
}
_ => {
tracing::warn!("Reconcile error: {}", e);
}
}

// Ordering: populate last_error before advancing error_seq,
// so that waiters will see the correct error after waiting.
Expand Down Expand Up @@ -3631,6 +3645,13 @@ impl Service {
observed_loc.conf = None;
}

if new_nodes.len() == 1 {
// Special case for single-node cluster: there is no point trying to reschedule
// any tenant shards: avoid doing so, in order to avoid spewing warnings about
// failures to schedule them.
continue;
}

if tenant_state.intent.demote_attached(node_id) {
tenant_state.sequence = tenant_state.sequence.next();
match tenant_state.schedule(scheduler) {
Expand Down
26 changes: 13 additions & 13 deletions test_runner/fixtures/neon_fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
from fixtures.pageserver.allowed_errors import (
DEFAULT_PAGESERVER_ALLOWED_ERRORS,
scan_pageserver_log_for_errors,
DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS,
)
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pageserver.types import IndexPartDump
Expand All @@ -77,6 +77,7 @@
ATTACHMENT_NAME_REGEX,
allure_add_grafana_links,
allure_attach_from_dir,
assert_no_errors,
get_self_dir,
subprocess_capture,
wait_until,
Expand Down Expand Up @@ -944,6 +945,8 @@ def __exit__(
for pageserver in self.env.pageservers:
pageserver.assert_no_errors()

self.env.storage_controller.assert_no_errors()

try:
self.overlay_cleanup_teardown()
except Exception as e:
Expand Down Expand Up @@ -1961,6 +1964,7 @@ def __init__(self, env: NeonEnv, auth_enabled: bool):
self.env = env
self.running = False
self.auth_enabled = auth_enabled
self.allowed_errors: list[str] = DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS

def start(self):
assert not self.running
Expand All @@ -1985,6 +1989,11 @@ def raise_api_exception(res: requests.Response):
msg = ""
raise StorageControllerApiException(msg, res.status_code) from e

def assert_no_errors(self):
assert_no_errors(
self.env.repo_dir / "storage_controller.log", "storage_controller", self.allowed_errors
)

def pageserver_api(self) -> PageserverHttpClient:
"""
The storage controller implements a subset of the pageserver REST API, for mapping
Expand Down Expand Up @@ -2357,18 +2366,9 @@ def workdir(self) -> Path:
return self.env.repo_dir / f"pageserver_{self.id}"

def assert_no_errors(self):
logfile = self.workdir / "pageserver.log"
if not logfile.exists():
log.warning(f"Skipping log check: {logfile} does not exist")
return

with logfile.open("r") as f:
errors = scan_pageserver_log_for_errors(f, self.allowed_errors)
jcsp marked this conversation as resolved.
Show resolved Hide resolved

for _lineno, error in errors:
log.info(f"not allowed error: {error.strip()}")

assert not errors
assert_no_errors(
self.workdir / "pageserver.log", f"pageserver_{self.id}", self.allowed_errors
)

def assert_no_metric_errors(self):
"""
Expand Down
10 changes: 10 additions & 0 deletions test_runner/fixtures/pageserver/allowed_errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,16 @@ def scan_pageserver_log_for_errors(
)


# Regexes for ERROR/WARN log lines the storage controller is expected to emit
# during normal test runs; matching lines are excluded from log-hygiene checks.
# NOTE(review): at least one consumer binds this list directly (not a copy) into
# `self.allowed_errors` and tests then extend() that attribute — confirm callers
# copy the list first, otherwise per-test additions leak into the shared default.
DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [
    # Many tests will take pageservers offline, resulting in log warnings on the controller
    # failing to connect to them.
    ".*Call to node.*management API.*failed.*receive body.*",
    ".*Call to node.*management API.*failed.*ReceiveBody.*",
    # Many tests will start up with a node offline
    ".*startup_reconcile: Could not scan node.*",
]


def _check_allowed_errors(input):
allowed_errors: List[str] = list(DEFAULT_PAGESERVER_ALLOWED_ERRORS)

Expand Down
37 changes: 37 additions & 0 deletions test_runner/fixtures/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
Any,
Callable,
Dict,
Iterable,
List,
Optional,
Tuple,
Expand Down Expand Up @@ -447,3 +448,39 @@ def humantime_to_ms(humantime: str) -> float:
)

return round(total_ms, 3)


def scan_log_for_errors(input: Iterable[str], allowed_errors: List[str]) -> List[Tuple[int, str]]:
    """Scan an iterable of log lines for ERROR/WARN entries.

    Args:
        input: iterable of log lines (e.g. an open text file).
        allowed_errors: regexes (matched from line start) for ERROR/WARN
            lines that should be tolerated.

    Returns:
        List of ``(lineno, line)`` tuples (1-based line numbers) for every
        ERROR or WARN line not matched by any allow-list regex.  "Torn"
        lines — produced when a force-killed process's partial write is
        overlapped by a restart banner — are skipped.
    """
    error_or_warn = re.compile(r"\s(ERROR|WARN)")
    # Torn log line, e.g.:
    # "2023-10-25T09:38:31.752314Z WARN deletion executo2023-10-25T09:38:31.875947Z INFO version: git-env:0f9452f76e8ccdfc88291bccb3f53e3016f40192"
    # Compiled once here rather than on every matching line.
    torn_line = re.compile(r"\d{4}-\d{2}-\d{2}T.+\d{4}-\d{2}-\d{2}T.+INFO version.+")
    errors = []
    for lineno, line in enumerate(input, start=1):
        if not line:
            continue

        if error_or_warn.search(line):
            # Is this a torn log line? This happens when force-killing a process and restarting
            if torn_line.match(line):
                continue

            # It's an ERROR or WARN. Is it in the allow-list?
            for a in allowed_errors:
                if re.match(a, line):
                    break
            else:
                errors.append((lineno, line))
    return errors


def assert_no_errors(log_file, service, allowed_errors):
    """Fail if *log_file* contains ERROR/WARN lines outside *allowed_errors*.

    A missing log file is tolerated: a warning is logged and the check is
    skipped, since a service may never have started in a given test.
    Each offending line is logged before the final assertion fires.
    """
    if not log_file.exists():
        log.warning(f"Skipping {service} log check: {log_file} does not exist")
        return

    with log_file.open("r") as f:
        offending = scan_log_for_errors(f, allowed_errors)

    for _lineno, bad_line in offending:
        log.info(f"not allowed {service} error: {bad_line.strip()}")

    assert not offending, f"Log errors on {service}: {offending[0]}"
12 changes: 6 additions & 6 deletions test_runner/regress/test_branch_and_gc.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,12 +120,12 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv):
env = neon_simple_env
pageserver_http_client = env.pageserver.http_client()

env.pageserver.allowed_errors.extend(
[
".*invalid branch start lsn: less than latest GC cutoff.*",
".*invalid branch start lsn: less than planned GC cutoff.*",
]
)
error_regexes = [
".*invalid branch start lsn: less than latest GC cutoff.*",
".*invalid branch start lsn: less than planned GC cutoff.*",
]
env.pageserver.allowed_errors.extend(error_regexes)
env.storage_controller.allowed_errors.extend(error_regexes)

# Disable background GC but set the `pitr_interval` to be small, so GC can delete something
tenant, _ = env.neon_cli.create_tenant(
Expand Down
9 changes: 6 additions & 3 deletions test_runner/regress/test_branch_behind.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,12 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder):
neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
env = neon_env_builder.init_start()

env.pageserver.allowed_errors.extend(
[".*invalid branch start lsn.*", ".*invalid start lsn .* for ancestor timeline.*"]
)
error_regexes = [
".*invalid branch start lsn.*",
".*invalid start lsn .* for ancestor timeline.*",
]
env.pageserver.allowed_errors.extend(error_regexes)
env.storage_controller.allowed_errors.extend(error_regexes)

# Branch at the point where only 100 rows were inserted
branch_behind_timeline_id = env.neon_cli.create_branch("test_branch_behind")
Expand Down
4 changes: 4 additions & 0 deletions test_runner/regress/test_compatibility.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,10 @@ def test_forward_compatibility(
pg_distrib_dir=compatibility_postgres_distrib_dir,
)

# TODO: remove this workaround after release-5090 is no longer the most recent release.
# There was a bug in that code that generates a warning in the storage controller log.
env.storage_controller.allowed_errors.append(".*no tenant_shard_id specified.*")

# Use current neon_local even though we're using old binaries for
# everything else: our test code is written for latest CLI args.
env.neon_local_binpath = neon_local_binpath
Expand Down
14 changes: 14 additions & 0 deletions test_runner/regress/test_sharding.py
Original file line number Diff line number Diff line change
Expand Up @@ -725,6 +725,20 @@ def test_sharding_split_failures(
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline

env.storage_controller.allowed_errors.extend(
[
# All split failures log a warning when then enqueue the abort operation
".*Enqueuing background abort.*",
# We exercise failure cases where abort itself will also fail (node offline)
".*abort_tenant_shard_split.*",
".*Failed to abort.*",
# Tolerate any error lots that mention a failpoint
".*failpoint.*",
# Node offline cases will fail to send requests
".*Reconcile error: receive body: error sending request for url.*",
]
)

for ps in env.pageservers:
# When we do node failures and abandon a shard, it will de-facto have old generation and
# thereby be unable to publish remote consistent LSN updates
Expand Down
10 changes: 10 additions & 0 deletions test_runner/regress/test_sharding_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,7 @@ def test_node_status_after_restart(
assert len(nodes) == 2

env.pageservers[1].stop()
env.storage_controller.allowed_errors.extend([".*Could not scan node"])

env.storage_controller.stop()
env.storage_controller.start()
Expand Down Expand Up @@ -681,6 +682,9 @@ def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder):
tenant_id = TenantId.generate()
body: Dict[str, Any] = {"new_tenant_id": str(tenant_id)}

env.storage_controller.allowed_errors.append(".*Unauthorized.*")
env.storage_controller.allowed_errors.append(".*Forbidden.*")

# No token
with pytest.raises(
StorageControllerApiException,
Expand Down Expand Up @@ -843,6 +847,12 @@ def test_sharding_service_heartbeats(
env = neon_env_builder.init_configs()
env.start()

# Default log allow list permits connection errors, but this test will use error responses on
# the utilization endpoint.
env.storage_controller.allowed_errors.append(
".*Call to node.*management API.*failed.*failpoint.*"
)

# Initially we have two online pageservers
nodes = env.storage_controller.node_list()
assert len(nodes) == 2
Expand Down
4 changes: 3 additions & 1 deletion test_runner/regress/test_tenants.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,9 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv):
)
[d for d in tenants_dir.iterdir()]

neon_simple_env.pageserver.allowed_errors.append(".*tenant-config-before-write.*")
error_regexes = [".*tenant-config-before-write.*"]
neon_simple_env.pageserver.allowed_errors.extend(error_regexes)
neon_simple_env.storage_controller.allowed_errors.extend(error_regexes)

pageserver_http = neon_simple_env.pageserver.http_client()
pageserver_http.configure_failpoints(("tenant-config-before-write", "return"))
Expand Down
Loading