Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ versioning for public release policy decisions.
## [Unreleased]

<!-- core-ops-release:start -->
### Changed

- Verifier no longer fails on socket-activated services that are correctly Inactive (their listening sockets are Active and will trigger the service on demand); ConfigFile and SocketDropIn workloads no longer contribute alias entries that misattribute real service failures to their config files
<!-- core-ops-release:end -->

## [2.1.0] - 2026-05-03
Expand Down
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "core-ops"
version = "2.1.0"
version = "2.1.1"
edition = "2021"
license = "AGPL-3.0-or-later"

Expand Down
7 changes: 7 additions & 0 deletions changes/fix-socket-activated-verification.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
change_id: fix-socket-activated-verification
release_intent: patch
summary: Verifier no longer fails on socket-activated services that are correctly Inactive (their listening sockets are Active and will trigger the service on demand); ConfigFile and SocketDropIn workloads no longer contribute alias entries that misattribute real service failures to their config files
scope: verifier
release_preparation: false
---
14 changes: 14 additions & 0 deletions src/core/reconcile.rs
Original file line number Diff line number Diff line change
Expand Up @@ -544,6 +544,20 @@ fn desired_target_aliases(desired: &DesiredState) -> std::collections::BTreeMap<
desired
.workloads
.iter()
.filter(|workload| {
// ConfigFile and SocketDropIn workloads do not have their own runtime
// unit. Letting them through here would map a synthesised
// `<stem>.service` alias (from the catch-all in
// systemd_unit_for_quadlet_file) onto the config file's path or the
// drop-in's directory entry, which then steals verification results
// from the real service that owns that runtime unit. See the
// regression scenario at
// tests/fixtures/verification/scenarios/accepted-socket-activated-trigger.yaml.
!matches!(
workload.quadlet_type,
QuadletType::ConfigFile | QuadletType::SocketDropIn
)
})
.flat_map(|workload| {
let managed_id = workload.systemd_unit_name.clone();
let runtime_unit = systemd_unit_for_quadlet_file(&workload.systemd_unit_name);
Expand Down
119 changes: 112 additions & 7 deletions src/core/verify.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use crate::core::retry::{build_retry_observation, evaluate_retry_history, RetryObservation};
use crate::core::types::{
ConvergenceStatus, DesiredState, DeterministicConvergenceRecord, MountDeclaration,
ObservedState, QuadletType, UnitActiveState, VerificationResult, VerificationStatus,
ObservedState, QuadletType, UnitActiveState, VerificationResult, VerificationStatus, Workload,
};
use crate::core::unit::systemd_unit_for_quadlet_file;
use std::collections::BTreeMap;
Expand All @@ -22,6 +22,7 @@ pub fn verify_state(desired: &DesiredState, observed: &ObservedState) -> Vec<Ver
.map(|automount_unit| (automount_unit, mount.clone()))
})
.collect();
let socket_triggers = socket_trigger_map(&desired.workloads);
desired
.workloads
.iter()
Expand All @@ -37,12 +38,97 @@ pub fn verify_state(desired: &DesiredState, observed: &ObservedState) -> Vec<Ver
&workload.systemd_unit_name,
mount_map.get(&workload.systemd_unit_name),
automount_map.get(&workload.systemd_unit_name),
&socket_triggers,
observed,
)
})
.collect()
}

/// Index the desired workloads by activated service: service unit name ->
/// names of the `.socket` units that trigger it.
///
/// Only workloads whose `quadlet_type` is `QuadletType::Socket` contribute.
/// The service each socket activates is resolved by
/// `effective_socket_target_service`, which is given the full workload list so
/// it can take the socket's drop-ins into account. Several sockets may point
/// at one service (e.g. `http.socket`, `https.socket` and `traefik.socket` all
/// targeting `traefik.service`), hence the `Vec` value.
///
/// Each socket list is returned sorted with duplicates removed, so the result
/// does not depend on the order of `workloads`.
///
/// `verify_workload` consults this map to accept a socket-activated service
/// that is `Inactive` (no connection yet) while one of its listening sockets
/// is `Active`: systemd starts such a service on first connection, so the
/// state is not a failure.
pub(crate) fn socket_trigger_map(workloads: &[Workload]) -> BTreeMap<String, Vec<String>> {
    let mut triggers_by_service = workloads
        .iter()
        .filter(|candidate| candidate.quadlet_type == QuadletType::Socket)
        .fold(
            BTreeMap::<String, Vec<String>>::new(),
            |mut acc, socket| {
                acc.entry(effective_socket_target_service(socket, workloads))
                    .or_default()
                    .push(socket.systemd_unit_name.clone());
                acc
            },
        );

    // Normalise every bucket: deterministic order, no repeated socket names.
    triggers_by_service.values_mut().for_each(|sockets| {
        sockets.sort();
        sockets.dedup();
    });

    triggers_by_service
}

/// Resolve the service unit that `socket` activates, honouring drop-ins.
///
/// Sources are read in this order: the socket's own `quadlet_contents`
/// first, then the contents of every `SocketDropIn` workload in `all` whose
/// `systemd_unit_name` starts with `<socket-unit-name>.d/`, ordered
/// lexicographically by that name. `Service=` is treated as single-valued:
/// the last assignment encountered decides the result. An empty assignment
/// (`Service=`) resets the target to the default `<stem>.service`, which is
/// also the result when no assignment is present at all.
///
/// Whitespace around the `=` is tolerated (`Service = foo.service`), so an
/// assignment written that way is not silently skipped. Lines whose first
/// non-blank character is `#` or `;` are comments and are ignored.
///
/// Returns the resolved service unit name as an owned `String`.
fn effective_socket_target_service(socket: &Workload, all: &[Workload]) -> String {
    let socket_name = socket.systemd_unit_name.as_str();
    let stem = Path::new(socket_name)
        .file_stem()
        .and_then(|s| s.to_str())
        .unwrap_or(socket_name);
    let default_target = format!("{stem}.service");

    // Drop-ins for this socket, in the lexicographic order used for overrides.
    let dropin_prefix = format!("{socket_name}.d/");
    let mut dropins: Vec<&Workload> = all
        .iter()
        .filter(|w| {
            w.quadlet_type == QuadletType::SocketDropIn
                && w.systemd_unit_name.starts_with(&dropin_prefix)
        })
        .collect();
    dropins.sort_by(|a, b| a.systemd_unit_name.cmp(&b.systemd_unit_name));

    // Only the final assignment matters: each one overwrites the previous
    // value, and an empty one falls back to the default target.
    let last_assignment = std::iter::once(socket)
        .chain(dropins)
        .flat_map(|w| w.quadlet_contents.as_str().lines())
        .filter_map(parse_service_assignment)
        .last();

    match last_assignment {
        Some(target) if !target.is_empty() => target.to_string(),
        _ => default_target,
    }
}

/// Extract the trimmed value of a `Service` assignment from one unit-file
/// line, or `None` when the line is a comment or assigns a different key.
///
/// The key is compared after trimming surrounding whitespace, so both
/// `Service=x` and `Service = x` match. The lowercase `service` spelling is
/// accepted as well, preserving the parser's earlier leniency.
// NOTE(review): systemd key names appear to be case-sensitive, so `service=`
// would be ignored by systemd itself — confirm whether this leniency is wanted.
fn parse_service_assignment(raw_line: &str) -> Option<&str> {
    let line = raw_line.trim_start();
    if line.starts_with('#') || line.starts_with(';') {
        return None;
    }
    let (key, value) = line.split_once('=')?;
    let key = key.trim_end();
    (key == "Service" || key == "service").then(|| value.trim())
}

pub fn evaluate_convergence(
desired: &DesiredState,
observed: &ObservedState,
Expand Down Expand Up @@ -125,6 +211,7 @@ fn verify_workload(
unit_file: &str,
mount: Option<&MountDeclaration>,
automount: Option<&MountDeclaration>,
socket_triggers: &BTreeMap<String, Vec<String>>,
observed: &ObservedState,
) -> VerificationResult {
let unit_name = systemd_unit_for_quadlet_file(unit_file);
Expand Down Expand Up @@ -221,13 +308,31 @@ fn verify_workload(
}
(_, Some(unit)) => {
if unit.active_state == UnitActiveState::Active {
success(unit_name)
} else {
failure(
unit_name,
&format!("unit not active: {:?}", unit.active_state),
)
return success(unit_name);
}
// Socket-activated services are correctly Inactive until first
// connection. Accept Inactive when a triggering socket is Active —
// systemd will start the service on demand. A Failed service is
// never accepted, even with Active sockets, because that means the
// service started and crashed.
if unit.active_state == UnitActiveState::Inactive {
if let Some(triggers) = socket_triggers.get(&unit_name) {
let any_socket_active = triggers.iter().any(|socket_unit| {
observed
.units
.iter()
.any(|u| u.unit_name == *socket_unit
&& u.active_state == UnitActiveState::Active)
});
if any_socket_active {
return success(unit_name);
}
}
}
failure(
unit_name,
&format!("unit not active: {:?}", unit.active_state),
)
}
(_, None) => failure(unit_name, "unit not found"),
}
Expand Down
2 changes: 1 addition & 1 deletion tests/fixtures/provenance_state/valid-success.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"schema_version": 1,
"controller": {
"version": "2.1.0",
"version": "2.1.1",
"revision": "8f3c2ab",
"build_time": "2026-03-23T10:00:00Z",
"tree_state": "clean"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
host: test-vm
services:
- frontend
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Regression scenario config file. Deployed to /etc/frontend/frontend.toml.
# Its filename stem ("frontend") matches the runtime unit name of the service
# ("frontend.service"), which is the alias-collision input that drove the
# original misattribution bug.
[regression]
fix = "fix-socket-activated-verification"
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
[Unit]
Description=Frontend regression workload (socket-activated)
After=frontend.socket
Wants=frontend.socket

[Container]
Image=docker.io/library/caddy:2.10.2-alpine
ContainerName=frontend

[Service]
Restart=on-failure
Sockets=frontend.socket

[Install]
WantedBy=default.target
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
[Unit]
Description=Frontend regression scenario socket trigger

[Socket]
ListenStream=18080
FileDescriptorName=frontend
Service=frontend.service

[Install]
WantedBy=sockets.target
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
scenario_id: accepted-socket-activated-trigger
title: Socket-activated service verifies as converged when listening socket is Active and service is Inactive
description: After an external stop of a socket-activated service, the next apply must verify as converged because the listening socket will retrigger the service on demand. Additionally, the service's failure must not be misattributed to its config file via alias collision.
scenario_classes:
- regression_detection
source: accepted
behavioral_claim: When a triggering `.socket` is Active and its target service is Inactive, post-apply verification accepts the state as converged. The verifier never re-attributes a service-level failure to a ConfigFile workload via stem-based alias collision.
rationale: Regression guard for fix-socket-activated-verification, which addresses two coupled bugs surfaced together by the matrix homeserver bring-up. (a) `verify_workload` required the runtime `.service` unit to be Active, which incorrectly fails socket-activated services that are correctly Inactive until first connection. (b) `desired_target_aliases` populated `runtime_unit -> managed_id` entries for ConfigFile workloads using the catch-all in `systemd_unit_for_quadlet_file` that synthesises `<stem>.service` from any unrecognised extension; for `/etc/frontend/frontend.toml` this collided with `frontend.service` and last-write-wins remapped the verification failure away from `frontend.container` (recoverable) onto the config file path (not recoverable), suppressing the recovery StartUnit too.
environment:
profile: single-blessed-vm
fixtures:
repo_fixture: tests/fixtures/verification/repos/socket-activated-history
revision_under_test: frontend-v1
repository_evolution:
history_fixture: tests/fixtures/verification/repos/socket-activated-history
revisions:
- frontend-v1
states:
- valid
transition_expectations:
- initial apply converges with both socket and service Active
- external systemctl stop leaves the socket Active and the service Inactive
- second apply with no source-repo changes still verifies as converged because the socket will retrigger the service on demand
- second apply does not surface a config-file failure caused by alias collision
steps:
- step_id: boot
step_type: boot
target: guest
- step_id: init
step_type: coreops_action
target: guest
action:
action: init
repository_source: fixture
revision: frontend-v1
- step_id: apply-initial
step_type: coreops_action
target: guest
action:
action: apply
host: test-vm
mode: humane
- step_id: service-active-after-initial-apply
step_type: guest_command
target: guest
command: sudo systemctl is-active frontend.service
- step_id: socket-active-after-initial-apply
step_type: guest_command
target: guest
command: sudo systemctl is-active frontend.socket
- step_id: stop-service-leaving-socket-active
step_type: guest_command
target: guest
command: sudo systemctl stop frontend.service
- step_id: confirm-service-inactive-and-socket-active
step_type: guest_command
target: guest
command: "sudo sh -lc 'test \"$(systemctl is-active frontend.service)\" = inactive && test \"$(systemctl is-active frontend.socket)\" = active'"
- step_id: apply-after-external-stop
step_type: coreops_action
target: guest
action:
action: apply
host: test-vm
mode: humane
- step_id: detect-config-file-misattribution
step_type: guest_command
target: guest
command: "sudo sh -lc '! core-ops apply --host test-vm 2>&1 | grep -F \"[!] config/etc/frontend/frontend.toml\"'"
assertions:
- assertion_id: init-succeeded
assertion_type: step_exit_code_is
target: init
expected_state: "0"
failure_message: Init did not exit successfully.
- assertion_id: apply-initial-succeeded
assertion_type: step_exit_code_is
target: apply-initial
expected_state: "0"
failure_message: Initial apply did not exit successfully.
- assertion_id: apply-initial-converged
assertion_type: step_stdout_contains
target: apply-initial
expected_state: "Outcome: converged"
failure_message: Initial apply did not report converged outcome.
- assertion_id: service-active-after-initial-apply
assertion_type: step_exit_code_is
target: service-active-after-initial-apply
expected_state: "0"
failure_message: frontend.service was not Active after initial apply (scenario precondition).
- assertion_id: socket-active-after-initial-apply
assertion_type: step_exit_code_is
target: socket-active-after-initial-apply
expected_state: "0"
failure_message: frontend.socket was not Active after initial apply (scenario precondition).
- assertion_id: confirm-precondition-state
assertion_type: step_exit_code_is
target: confirm-service-inactive-and-socket-active
expected_state: "0"
failure_message: Could not establish the Inactive-service / Active-socket precondition the regression depends on.
- assertion_id: apply-after-external-stop-succeeded
assertion_type: step_exit_code_is
target: apply-after-external-stop
expected_state: "0"
failure_message: Apply after external stop must exit successfully — a socket-activated service whose listening socket is Active should verify as converged (regression check for socket-activation acceptance in verify_workload).
- assertion_id: apply-after-external-stop-converged
assertion_type: step_stdout_contains
target: apply-after-external-stop
expected_state: "Outcome: converged"
failure_message: Apply after external stop did not report converged outcome — verifier did not accept Inactive service paired with Active socket as converged.
- assertion_id: no-config-file-misattribution
assertion_type: step_exit_code_is
target: detect-config-file-misattribution
expected_state: "0"
failure_message: Apply output contained a `[!] config/etc/frontend/frontend.toml` line — the ConfigFile workload was re-aliased onto the failed runtime unit name and inherited the failure marker (regression check for ConfigFile alias contamination in desired_target_aliases).
Loading
Loading