From b042082289b73bb2e313a0d2fcf05a0783ffa31d Mon Sep 17 00:00:00 2001
From: "Andrew J. Stone"
Date: Sat, 18 Oct 2025 22:39:08 +0000
Subject: [PATCH 01/22] TQ: Async Nodes and P2P connections

Builds on https://github.com/oxidecomputer/omicron/pull/9232

This is the first step in wrapping the `trust_quorum::Node` so that it
can be used in an async context and integrated with sled-agent. Only the
sprockets networking has been fully integrated so far, such that each
`NodeTask` has a `ConnMgr` that sets up a full mesh of sprockets
connections. A test for this connectivity behavior has been written, but
the code is not yet wired into the production code.

Messages can be sent between `NodeTask`s over sprockets connections.
Each connection exists in its own task managed by an `EstablishedConn`.
The main `NodeTask` task sends messages to and receives messages from
this task to interact with the outside world via sprockets. Currently
only `Ping` messages are sent over the wire, as a means to keep the
connections alive and detect disconnects.

A `NodeTaskHandle` allows one to interact with the `NodeTask`. Currently
only three operations are implemented, with messages defined in
`NodeApiRequest`. The user can inform the node of its peers on the
bootstrap network to establish connectivity, can poll for connectivity
status, and can shut down the node. All of this functionality is used in
the accompanying test.

It's important to reiterate that this code only implements connectivity
between trust quorum nodes; no actual trust quorum messages are sent.
They can't be, as a handle cannot yet initiate a reconfiguration or LRTQ
upgrade. That behavior will come in a follow-up, as this PR is large
enough already.

A lot of this code is similar to the LRTQ connection management code,
except that it operates over sprockets rather than TCP channels. This
introduces some complexity, but it is mostly abstracted away into the
`SprocketsConfig`.
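To make the shape of the new API concrete, here is a rough usage sketch,
mirroring what the accompanying test does. This is illustrative only, not
code from this PR: `config`, `log`, and `bootstrap_addrs` are assumed to be
in scope, and errors are unwrapped for brevity.

```rust
// Illustrative sketch of driving a NodeTask through its handle.
async fn example(
    config: Config,
    log: slog::Logger,
    bootstrap_addrs: std::collections::BTreeSet<std::net::SocketAddrV6>,
) {
    // Spawn the node's main loop into its own tokio task.
    let (mut task, handle) = NodeTask::new(config, &log).await;
    tokio::spawn(async move { task.run().await });

    // Inform the node of the known bootstrap-network addresses; the
    // ConnMgr then builds out the sprockets mesh in the background.
    handle.load_peer_addresses(bootstrap_addrs).await.unwrap();

    // Poll connectivity status until the expected connections exist.
    let status = handle.conn_mgr_status().await.unwrap();
    println!("connections: {:?}", status.connections);

    // Tear down the node's tokio tasks.
    handle.shutdown().await.unwrap();
}
```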
--- Cargo.lock | 271 +++++++++- Cargo.toml | 4 +- sled-agent/src/bootstrap/config.rs | 1 + trust-quorum/Cargo.toml | 6 + trust-quorum/src/connection_manager.rs | 717 +++++++++++++++++++++++++ trust-quorum/src/established_conn.rs | 343 ++++++++++++ trust-quorum/src/lib.rs | 8 + trust-quorum/src/task.rs | 469 ++++++++++++++++ 8 files changed, 1795 insertions(+), 24 deletions(-) create mode 100644 trust-quorum/src/connection_manager.rs create mode 100644 trust-quorum/src/established_conn.rs create mode 100644 trust-quorum/src/task.rs diff --git a/Cargo.lock b/Cargo.lock index bb5f10a683f..7b194fff992 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -536,6 +536,41 @@ dependencies = [ "thiserror 2.0.17", ] +[[package]] +name = "attest-data" +version = "0.5.0" +source = "git+https://github.com/oxidecomputer/dice-util?rev=10952e8d9599b735b85d480af3560a11700e5b64#10952e8d9599b735b85d480af3560a11700e5b64" +dependencies = [ + "const-oid", + "der", + "getrandom 0.3.4", + "hex", + "hubpack", + "rats-corim", + "salty", + "serde", + "serde_with", + "sha3", + "static_assertions", + "thiserror 2.0.17", +] + +[[package]] +name = "attest-mock" +version = "0.1.0" +source = "git+https://github.com/oxidecomputer/dice-util?rev=10952e8d9599b735b85d480af3560a11700e5b64#10952e8d9599b735b85d480af3560a11700e5b64" +dependencies = [ + "anyhow", + "attest-data 0.5.0", + "clap", + "hex", + "hubpack", + "knuffel", + "miette", + "rats-corim", + "serde_json", +] + [[package]] name = "atty" version = "0.2.14" @@ -624,6 +659,15 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "backtrace-ext" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "537beee3be4a18fb023b570f80e3ae28003db9167a751266b259926e25539d50" +dependencies = [ + "backtrace", +] + [[package]] name = "base16ct" version = "0.2.0" @@ -1344,6 +1388,15 @@ dependencies = [ "serde", ] +[[package]] +name = "chumsky" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8eebd66744a15ded14960ab4ccdbfb51ad3b81f51f3f04a80adac98c985396c9" +dependencies = [ + "hashbrown 0.14.5", +] + [[package]] name = "ciborium" version = "0.2.2" @@ -1413,7 +1466,7 @@ dependencies = [ "anstyle", "clap_lex", "strsim", - "terminal_size", + "terminal_size 0.4.0", ] [[package]] @@ -2643,7 +2696,7 @@ name = "dice-verifier" version = "0.3.0-pre0" source = "git+https://github.com/oxidecomputer/dice-util?rev=4b408edc1d00f108ddf635415d783e6f12fe9641#4b408edc1d00f108ddf635415d783e6f12fe9641" dependencies = [ - "attest-data", + "attest-data 0.4.0", "const-oid", "ed25519-dalek", "env_logger", @@ -3037,14 +3090,14 @@ dependencies = [ "indent_write", "newtype_derive", "openapiv3", - "owo-colors", + "owo-colors 4.2.2", "paste", "semver 1.0.27", "serde_json", "sha2", "similar", - "supports-color", - "textwrap", + "supports-color 3.0.2", + "textwrap 0.16.2", "thiserror 2.0.17", ] @@ -4210,6 +4263,16 @@ dependencies = [ "ahash", ] +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +dependencies = [ + "ahash", + "allocator-api2", +] + [[package]] name = "hashbrown" version = "0.15.4" @@ -4291,6 +4354,9 @@ name = "heck" version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" +dependencies = [ + "unicode-segmentation", +] [[package]] name = "heck" @@ -5619,6 
+5685,33 @@ dependencies = [ "zeroize", ] +[[package]] +name = "knuffel" +version = "3.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04bee6ddc6071011314b1ce4f7705fef6c009401dba4fd22cb0009db6a177413" +dependencies = [ + "base64 0.21.7", + "chumsky", + "knuffel-derive", + "miette", + "thiserror 1.0.69", + "unicode-width 0.1.14", +] + +[[package]] +name = "knuffel-derive" +version = "3.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91977f56c49cfb961e3d840e2e7c6e4a56bde7283898cf606861f1421348283d" +dependencies = [ + "heck 0.4.1", + "proc-macro-error", + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "kstat-macro" version = "0.1.0" @@ -6235,6 +6328,38 @@ dependencies = [ "tokio", ] +[[package]] +name = "miette" +version = "5.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59bb584eaeeab6bd0226ccf3509a69d7936d148cf3d036ad350abe35e8c6856e" +dependencies = [ + "backtrace", + "backtrace-ext", + "is-terminal", + "miette-derive", + "once_cell", + "owo-colors 3.5.0", + "supports-color 2.1.0", + "supports-hyperlinks", + "supports-unicode", + "terminal_size 0.1.17", + "textwrap 0.15.2", + "thiserror 1.0.69", + "unicode-width 0.1.14", +] + +[[package]] +name = "miette-derive" +version = "5.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49e7bc1560b95a3c4a25d03de42fe76ca718ab92d1a22a55b9b4cf67b3ae635c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +] + [[package]] name = "mime" version = "0.3.17" @@ -7371,7 +7496,7 @@ dependencies = [ "swrite", "tabled 0.15.0", "test-strategy", - "textwrap", + "textwrap 0.16.2", "thiserror 2.0.17", "tokio", "tough", @@ -8078,7 +8203,7 @@ dependencies = [ "serde", "slog", "slog-error-chain", - "textwrap", + "textwrap 0.16.2", "tokio", "uuid", ] @@ -8369,7 +8494,7 @@ dependencies = [ "omicron-test-utils", "omicron-uuid-kinds", "omicron-workspace-hack", - "owo-colors", + "owo-colors 4.2.2", "oxide-tokio-rt", "oximeter-client", "oximeter-db", @@ -8389,9 +8514,9 @@ dependencies = [ "strum 0.27.2", "subprocess", "support-bundle-viewer", - "supports-color", + "supports-color 3.0.2", "tabled 0.15.0", - "textwrap", + "textwrap 0.16.2", "tokio", "tufaceous-artifact", "unicode-width 0.1.14", @@ -8496,7 +8621,7 @@ dependencies = [ "repo-depot-client", "serde_json", "slog", - "supports-color", + "supports-color 3.0.2", "tokio", "update-engine", "uuid", @@ -9128,6 +9253,12 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" +[[package]] +name = "owo-colors" +version = "3.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1b04fb49957986fdce4d6ee7a65027d55d4b6d2265e5848bbb507b58ccfdb6f" + [[package]] name = "owo-colors" version = "4.2.2" @@ -10141,6 +10272,36 @@ version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2" +[[package]] +name = "pki-playground" +version = "0.2.0" +source = "git+https://github.com/oxidecomputer/pki-playground?rev=7600756029ce046a02c6234aa84ce230cc5eaa04#7600756029ce046a02c6234aa84ce230cc5eaa04" +dependencies = [ + "camino", + "clap", + "const-oid", + "der", + "digest", + "ed25519-dalek", + "flagset", + "hex", + "ipnet", + "knuffel", + "miette", + "p384", + "pem-rfc7468", + "pkcs8", + "rand 0.8.5", + "rsa", + "sha1", + "sha2", + 
"sha3", + "signature", + "spki", + "x509-cert", + "zeroize", +] + [[package]] name = "plain" version = "0.2.3" @@ -11492,9 +11653,9 @@ dependencies = [ [[package]] name = "rsa" -version = "0.9.6" +version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d0e5124fcb30e76a7e79bfee683a2746db83784b86289f6251b54b7950a0dfc" +checksum = "78928ac1ed176a5ca1d17e578a1825f3d81ca54cf41053a592584b020cfd691b" dependencies = [ "const-oid", "digest", @@ -13203,10 +13364,10 @@ dependencies = [ [[package]] name = "sprockets-tls" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/sprockets.git?rev=7da1f0b5dcd3d631da18b43ba78a84b1a2b425ee#7da1f0b5dcd3d631da18b43ba78a84b1a2b425ee" +source = "git+https://github.com/oxidecomputer/sprockets.git?rev=dea3bbfac7d9d3c45f088898fcd05ee5d2ec2210#dea3bbfac7d9d3c45f088898fcd05ee5d2ec2210" dependencies = [ "anyhow", - "attest-data", + "attest-data 0.4.0", "camino", "cfg-if", "clap", @@ -13233,6 +13394,15 @@ dependencies = [ "zeroize", ] +[[package]] +name = "sprockets-tls-test-utils" +version = "0.1.0" +source = "git+https://github.com/oxidecomputer/sprockets.git?rev=dea3bbfac7d9d3c45f088898fcd05ee5d2ec2210#dea3bbfac7d9d3c45f088898fcd05ee5d2ec2210" +dependencies = [ + "camino", + "pki-playground", +] + [[package]] name = "sqlformat" version = "0.3.5" @@ -13505,6 +13675,16 @@ dependencies = [ "zip 4.2.0", ] +[[package]] +name = "supports-color" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6398cde53adc3c4557306a96ce67b302968513830a77a95b2b17305d9719a89" +dependencies = [ + "is-terminal", + "is_ci", +] + [[package]] name = "supports-color" version = "3.0.2" @@ -13514,6 +13694,24 @@ dependencies = [ "is_ci", ] +[[package]] +name = "supports-hyperlinks" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f84231692eb0d4d41e4cdd0cabfdd2e6cd9e255e65f80c9aa7c98dd502b4233d" +dependencies = [ + "is-terminal", +] + +[[package]] +name = "supports-unicode" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f850c19edd184a205e883199a261ed44471c81e39bd95b1357f5febbef00e77a" +dependencies = [ + "is-terminal", +] + [[package]] name = "swrite" version = "0.1.0" @@ -13763,6 +13961,16 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "terminal_size" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "633c1a546cee861a1a6d0dc69ebeca693bf4296661ba7852b9d21d159e0506df" +dependencies = [ + "libc", + "winapi", +] + [[package]] name = "terminal_size" version = "0.4.0" @@ -13810,6 +14018,17 @@ dependencies = [ "unicode-width 0.2.0", ] +[[package]] +name = "textwrap" +version = "0.15.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7b3e525a49ec206798b40326a44121291b530c963cfb01018f63e135bac543d" +dependencies = [ + "smawk", + "unicode-linebreak", + "unicode-width 0.1.14", +] + [[package]] name = "textwrap" version = "0.16.2" @@ -13817,7 +14036,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c13547615a44dc9c452a8a534638acdf07120d4b6847c8178705da06306a3057" dependencies = [ "smawk", - "terminal_size", + "terminal_size 0.4.0", "unicode-linebreak", "unicode-width 0.2.0", ] @@ -14590,13 +14809,17 @@ version = "0.1.0" dependencies = [ "anyhow", "assert_matches", + "attest-mock", "bcs", "bootstore", + "bytes", "camino", "chacha20poly1305", + "ciborium", "daft", "derive_more 0.99.20", 
"dropshot", + "futures", "gfss", "hex", "hkdf", @@ -14614,6 +14837,8 @@ dependencies = [ "sled-agent-types", "slog", "slog-error-chain", + "sprockets-tls", + "sprockets-tls-test-utils", "static_assertions", "subtle", "test-strategy", @@ -15123,14 +15348,14 @@ dependencies = [ "linear-map", "omicron-test-utils", "omicron-workspace-hack", - "owo-colors", + "owo-colors 4.2.2", "petgraph 0.8.2", "schemars 0.8.22", "serde", "serde_json", "serde_with", "slog", - "supports-color", + "supports-color 3.0.2", "swrite", "tokio", "tokio-stream", @@ -15660,7 +15885,7 @@ dependencies = [ "omicron-common", "omicron-passwords", "omicron-workspace-hack", - "owo-colors", + "owo-colors 4.2.2", "proptest", "ratatui", "reqwest", @@ -15674,9 +15899,9 @@ dependencies = [ "slog-async", "slog-envlogger", "slog-term", - "supports-color", + "supports-color 3.0.2", "tempfile", - "textwrap", + "textwrap 0.16.2", "tokio", "tokio-util", "toml 0.8.23", @@ -15703,7 +15928,7 @@ dependencies = [ "maplit", "omicron-common", "omicron-workspace-hack", - "owo-colors", + "owo-colors 4.2.2", "oxnet", "schemars 0.8.22", "serde", @@ -16373,7 +16598,7 @@ dependencies = [ "serde", "swrite", "tabled 0.15.0", - "textwrap", + "textwrap 0.16.2", "toml 0.8.23", "usdt 0.5.0", ] diff --git a/Cargo.toml b/Cargo.toml index 513abc0577b..c52388ef37f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -370,6 +370,7 @@ assert_matches = "1.5.0" assert_cmd = "2.0.17" async-bb8-diesel = "0.2" async-trait = "0.1.89" +attest-mock = { git = "https://github.com/oxidecomputer/dice-util", rev = "10952e8d9599b735b85d480af3560a11700e5b64" } atomicwrites = "0.4.4" authz-macros = { path = "nexus/authz-macros" } backoff = { version = "0.4.0", features = [ "tokio" ] } @@ -724,7 +725,8 @@ slog-term = "2.9.1" smf = "0.2" socket2 = { version = "0.5", features = ["all"] } sp-sim = { path = "sp-sim" } -sprockets-tls = { git = "https://github.com/oxidecomputer/sprockets.git", rev = "7da1f0b5dcd3d631da18b43ba78a84b1a2b425ee" } +sprockets-tls = { git = "https://github.com/oxidecomputer/sprockets.git", rev = "dea3bbfac7d9d3c45f088898fcd05ee5d2ec2210" } +sprockets-tls-test-utils = { git = "https://github.com/oxidecomputer/sprockets.git", rev = "dea3bbfac7d9d3c45f088898fcd05ee5d2ec2210" } sqlformat = "0.3.5" sqlparser = { version = "0.45.0", features = [ "visitor" ] } static_assertions = "1.1.0" diff --git a/sled-agent/src/bootstrap/config.rs b/sled-agent/src/bootstrap/config.rs index 6833cb76071..3b6b5e3e443 100644 --- a/sled-agent/src/bootstrap/config.rs +++ b/sled-agent/src/bootstrap/config.rs @@ -7,3 +7,4 @@ pub const BOOTSTRAP_AGENT_HTTP_PORT: u16 = 80; pub const BOOTSTRAP_AGENT_RACK_INIT_PORT: u16 = 12346; pub const BOOTSTORE_PORT: u16 = 12347; +pub const TRUST_QUORUM_PORT: u16 = 12349; diff --git a/trust-quorum/Cargo.toml b/trust-quorum/Cargo.toml index 0b0dfefb0fe..7a034f78f8d 100644 --- a/trust-quorum/Cargo.toml +++ b/trust-quorum/Cargo.toml @@ -11,10 +11,13 @@ workspace = true anyhow.workspace = true bcs.workspace = true bootstore.workspace = true +bytes.workspace = true camino.workspace = true chacha20poly1305.workspace = true +ciborium.workspace = true daft.workspace = true derive_more.workspace = true +futures.workspace = true gfss.workspace = true hex.workspace = true hkdf.workspace = true @@ -28,6 +31,7 @@ sha3.workspace = true sled-agent-types.workspace = true slog.workspace = true slog-error-chain.workspace = true +sprockets-tls.workspace = true static_assertions.workspace = true subtle.workspace = true thiserror.workspace = true @@ -38,12 +42,14 @@ 
omicron-workspace-hack.workspace = true
 
 [dev-dependencies]
 assert_matches.workspace = true
+attest-mock.workspace = true
 dropshot.workspace = true
 omicron-test-utils.workspace = true
 proptest.workspace = true
 serde_json.workspace = true
 test-strategy.workspace = true
 trust-quorum-test-utils.workspace = true
+sprockets-tls-test-utils.workspace = true
 
 [features]
 # Impl `PartialEq` and `Eq` for types implementing `subtle::ConstantTimeEq` when
diff --git a/trust-quorum/src/connection_manager.rs b/trust-quorum/src/connection_manager.rs
new file mode 100644
index 00000000000..4d24e365148
--- /dev/null
+++ b/trust-quorum/src/connection_manager.rs
@@ -0,0 +1,717 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! A mechanism for maintaining a full mesh of trust quorum node connections
+
+use crate::established_conn::EstablishedConn;
+use crate::{BaseboardId, PeerMsg};
+// TODO: Move or copy this into this crate?
+use bootstore::schemes::v0::NetworkConfig;
+use camino::Utf8PathBuf;
+use futures::StreamExt;
+use futures::stream::FuturesUnordered;
+use serde::{Deserialize, Serialize};
+use slog::{Logger, debug, error, info, o, warn};
+use slog_error_chain::SlogInlineError;
+use sprockets_tls::keys::SprocketsConfig;
+use sprockets_tls::server::SprocketsAcceptor;
+use std::collections::{BTreeMap, BTreeSet};
+use std::net::{SocketAddr, SocketAddrV4, SocketAddrV6};
+use std::time::Duration;
+use tokio::sync::mpsc;
+use tokio::task::JoinHandle;
+use tokio::time::{MissedTickBehavior, interval};
+
+/// We only expect a handful of concurrent requests at most.
+const CHANNEL_BOUND: usize = 10;
+
+// Time between checks to see if we need to reconnect to any peers
+const RECONNECT_TIME: Duration = Duration::from_secs(5);
+
+/// An error returned from `ConnMgr::accept`
+#[derive(Debug, thiserror::Error, SlogInlineError)]
+pub enum AcceptError {
+    #[error("Accepted connection from IPv4 address {addr}. Only IPv6 allowed.")]
+    Ipv4Accept { addr: SocketAddrV4 },
+
+    #[error("sprockets error")]
+    Sprockets(
+        #[from]
+        #[source]
+        sprockets_tls::Error,
+    ),
+}
+
+/// A mechanism for uniquely identifying a task managing a connection
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
+pub struct TaskId(u64);
+
+impl TaskId {
+    pub fn new(id: u64) -> TaskId {
+        TaskId(id)
+    }
+
+    /// Increment the ID and then return the value before the increment
+    pub fn inc(&mut self) -> TaskId {
+        let id = *self;
+        self.0 += 1;
+        id
+    }
+}
+
+/// Messages sent from the main task to the connection managing tasks
+#[derive(Debug, PartialEq)]
+pub enum MainToConnMsg {
+    Close,
+    #[allow(unused)]
+    Msg(WireMsg),
+}
+
+/// All possible messages sent over established connections
+///
+/// This includes trust quorum related `PeerMsg`s, but also ancillary network
+/// messages used for other purposes.
+///
+/// All `WireMsg`s sent between nodes are prefixed with a 4-byte size header
+/// used for framing.
+#[derive(Debug, PartialEq, Serialize, Deserialize)]
+pub enum WireMsg {
+    /// Used for connection keep-alive
+    Ping,
+    /// Trust quorum peer messages
+    Tq(PeerMsg),
+    /// Early network configuration to enable NTP timesync
+    ///
+    /// Technically this is not part of the trust quorum protocol.
+    /// However, it is necessary to gossip this information to all nodes on
+    /// the system so that each can establish the NTP sync required for the
+    /// rest of the control plane to boot. In short, we can't have rack
+    /// unlock without this information, even if we can decrypt the drives.
+    /// For simplicity, we just piggyback this information on the trust
+    /// quorum connections. This is why the implementation of LRTQ lived
+    /// inside the `bootstore` directory in the `omicron` repo. This is
+    /// technically a tiny, eventually consistent database layered on top of
+    /// trust quorum. You can still think of it as a bootstore, although we
+    /// no longer use that name.
+    NetworkConfig(NetworkConfig),
+}
+
+/// Messages sent from connection managing tasks to the main peer task
+///
+/// We include `task_id` to differentiate which task they come from so we can
+/// exclude requests from tasks that have been cancelled or have been told to
+/// shut down.
+#[derive(Debug, PartialEq)]
+pub struct ConnToMainMsg {
+    pub task_id: TaskId,
+    pub msg: ConnToMainMsgInner,
+}
+
+#[derive(Debug, PartialEq)]
+pub enum ConnToMainMsgInner {
+    Accepted { addr: SocketAddrV6, peer_id: BaseboardId },
+    Connected { addr: SocketAddrV6, peer_id: BaseboardId },
+    Received { from: BaseboardId, msg: PeerMsg },
+    ReceivedNetworkConfig { from: BaseboardId, config: NetworkConfig },
+    Disconnected { peer_id: BaseboardId },
+}
+
+pub struct TaskHandle {
+    pub task_id: TaskId,
+    pub tx: mpsc::Sender<MainToConnMsg>,
+    pub conn_type: ConnectionType,
+}
+
+impl TaskHandle {
+    pub fn addr(&self) -> SocketAddrV6 {
+        self.conn_type.addr()
+    }
+}
+
+pub enum ConnectionType {
+    Connected(SocketAddrV6),
+    Accepted(SocketAddrV6),
+}
+
+impl ConnectionType {
+    pub fn addr(&self) -> SocketAddrV6 {
+        match self {
+            Self::Connected(addr) => *addr,
+            Self::Accepted(addr) => *addr,
+        }
+    }
+}
+
+#[derive(Debug, Clone)]
+pub enum ConnState {
+    Connecting,
+    Accepting,
+    Established(BaseboardId),
+}
+
+/// Information about a single connection task
+#[derive(Debug, Clone)]
+pub struct ConnInfo {
+    pub state: ConnState,
+    pub addr: SocketAddrV6,
+    pub task_id: TaskId,
+}
+
+/// Status information useful for debugging
+#[derive(Debug, Clone)]
+pub struct ConnMgrStatus {
+    pub bootstrap_addrs: BTreeSet<SocketAddrV6>,
+    pub connections: Vec<ConnInfo>,
+    pub num_task_join_handles: u64,
+    pub next_task_id: TaskId,
+}
+
+/// A structure to manage all sprockets connections to peer nodes
+///
+/// Each sprockets connection runs in its own task which communicates with the
+/// main `NodeTask`. All methods on the `ConnMgr` run inside the main
+/// `NodeTask`, as `ConnMgr` is a member field of `NodeTask`. This allows
+/// isolating the connection management logic from the main node message
+/// handling logic without adding yet another task.
+pub struct ConnMgr {
+    log: Logger,
+
+    /// A channel for sending messages from a connection task to the main task
+    main_tx: mpsc::Sender<ConnToMainMsg>,
+
+    /// The sprockets config
+    config: SprocketsConfig,
+
+    /// The sprockets server
+    server: sprockets_tls::Server,
+
+    /// The address the sprockets server listens on
+    listen_addr: SocketAddrV6,
+
+    // A unique, monotonically incrementing id for each task to help map tasks
+    // to their handles in case the task aborts, or there is a new connection
+    // accepted and established for an existing `BaseboardId`.
+    next_task_id: TaskId,
+
+    /// `JoinHandle`s to all tasks that can be polled for crashes
+    join_handles: FuturesUnordered<JoinHandle<TaskId>>,
+
+    /// All known addresses on the bootstrap network, learned via DDMD
+    bootstrap_addrs: BTreeSet<SocketAddrV6>,
+
+    /// All tasks currently connecting to remote nodes and attempting a
+    /// sprockets handshake.
+    connecting: BTreeMap<SocketAddrV6, TaskHandle>,
+
+    /// All tasks with an accepted TCP connection performing a sprockets
+    /// handshake
+    accepting: BTreeMap<SocketAddrV6, TaskHandle>,
+
+    /// All tasks containing established connections that can be used to
+    /// communicate with other nodes.
+    established: BTreeMap<BaseboardId, TaskHandle>,
+}
+
+impl ConnMgr {
+    pub async fn new(
+        log: &Logger,
+        mut listen_addr: SocketAddrV6,
+        sprockets_config: SprocketsConfig,
+        main_tx: mpsc::Sender<ConnToMainMsg>,
+    ) -> ConnMgr {
+        let log = log.new(o!("component" => "trust-quorum-conn-mgr"));
+
+        let config = sprockets_config.clone();
+        let server = sprockets_tls::Server::new(
+            sprockets_config,
+            listen_addr,
+            log.clone(),
+        )
+        .await
+        .expect("sprockets server can listen");
+
+        // If the listen port was 0, we want to update our addr to use the
+        // actual port. This is really only useful for testing, but the
+        // connection manager won't work properly without doing this, because
+        // it will never trigger connections: its own address will always
+        // sort lower than other addresses if only the ports differ.
+        let listen_port = server.listen_addr().unwrap().port();
+
+        if listen_port != listen_addr.port() {
+            listen_addr.set_port(listen_port);
+        }
+
+        info!(
+            log,
+            "Started listening";
+            "local_addr" => %listen_addr
+        );
+
+        ConnMgr {
+            log,
+            main_tx,
+            config,
+            server,
+            listen_addr,
+            next_task_id: TaskId::new(0),
+            join_handles: Default::default(),
+            bootstrap_addrs: BTreeSet::new(),
+            connecting: BTreeMap::new(),
+            accepting: BTreeMap::new(),
+            established: BTreeMap::new(),
+        }
+    }
+
+    pub async fn shutdown(&mut self) {
+        // Shutdown all connection processing tasks
+        for (_, handle) in &self.accepting {
+            let _ = handle.tx.send(MainToConnMsg::Close).await;
+        }
+        for (_, handle) in &self.connecting {
+            let _ = handle.tx.send(MainToConnMsg::Close).await;
+        }
+        for (_, handle) in &self.established {
+            let _ = handle.tx.send(MainToConnMsg::Close).await;
+        }
+    }
+
+    pub fn status(&self) -> ConnMgrStatus {
+        let connections = self
+            .connecting
+            .iter()
+            .map(|(addr, task_handle)| ConnInfo {
+                state: ConnState::Connecting,
+                addr: *addr,
+                task_id: task_handle.task_id,
+            })
+            .chain(self.accepting.iter().map(|(addr, task_handle)| ConnInfo {
+                state: ConnState::Accepting,
+                addr: *addr,
+                task_id: task_handle.task_id,
+            }))
+            .chain(self.established.iter().map(
+                |(baseboard_id, task_handle)| ConnInfo {
+                    state: ConnState::Established(baseboard_id.clone()),
+                    addr: task_handle.addr(),
+                    task_id: task_handle.task_id,
+                },
+            ))
+            .collect();
+
+        ConnMgrStatus {
+            bootstrap_addrs: self.bootstrap_addrs.clone(),
+            connections,
+            num_task_join_handles: self.join_handles.len() as u64,
+            next_task_id: self.next_task_id,
+        }
+    }
+
+    pub fn listen_addr(&self) -> SocketAddrV6 {
+        self.listen_addr
+    }
+
+    /// Perform any polling related operations that the connection
+    /// manager must perform concurrently.
+    pub async fn step(
+        &mut self,
+        corpus: Vec<Utf8PathBuf>,
+    ) -> Result<(), AcceptError> {
+        let mut interval = interval(RECONNECT_TIME);
+        interval.set_missed_tick_behavior(MissedTickBehavior::Delay);
+
+        loop {
+            tokio::select!
{
+                acceptor = self.server.accept(corpus.clone()) => {
+                    self.accept(acceptor?).await?;
+                }
+                Some(res) = self.join_handles.next() => {
+                    match res {
+                        Ok(task_id) => {
+                            self.on_task_exit(task_id).await;
+                        }
+                        Err(err) => {
+                            error!(self.log, "Connection task panic: {}", err);
+                        }
+                    }
+                }
+                _ = interval.tick() => {
+                    self.reconnect(corpus.clone()).await;
+                }
+            }
+        }
+    }
+
+    pub async fn accept(
+        &mut self,
+        acceptor: SprocketsAcceptor,
+    ) -> Result<(), AcceptError> {
+        let addr = match acceptor.addr() {
+            SocketAddr::V4(addr) => {
+                return Err(AcceptError::Ipv4Accept { addr });
+            }
+            SocketAddr::V6(addr) => addr,
+        };
+        let log = self.log.clone();
+        let task_id = self.next_task_id.inc();
+        let (tx, rx) = mpsc::channel(CHANNEL_BOUND);
+        let task_handle = TaskHandle {
+            task_id,
+            tx,
+            conn_type: ConnectionType::Accepted(addr),
+        };
+        let main_tx = self.main_tx.clone();
+        let join_handle = tokio::spawn(async move {
+            match acceptor.handshake().await {
+                Ok((stream, _)) => {
+                    let platform_id =
+                        stream.peer_platform_id().as_str().unwrap();
+                    let baseboard_id = platform_id_to_baseboard_id(platform_id);
+
+                    // TODO: Conversion between `PlatformId` and `BaseboardId`
+                    // should happen in `sled-agent-types`. This is waiting on
+                    // an update to the `dice-mfg-msgs` crate.
+                    let log =
+                        log.new(o!("peer_id" => baseboard_id.to_string()));
+                    info!(log, "Accepted sprockets connection"; "addr" => %addr);
+
+                    let mut conn = EstablishedConn::new(
+                        baseboard_id.clone(),
+                        task_id,
+                        stream,
+                        main_tx.clone(),
+                        rx,
+                        &log,
+                    );
+
+                    // Inform the main task that the accepted connection is
+                    // established
+                    if let Err(e) = main_tx
+                        .send(ConnToMainMsg {
+                            task_id,
+                            msg: ConnToMainMsgInner::Accepted {
+                                addr,
+                                peer_id: baseboard_id,
+                            },
+                        })
+                        .await
+                    {
+                        // The system is shutting down.
+                        // Just bail from this task.
+                        warn!(
+                            log,
+                            "Failed to send 'accepted' msg to main task: {e:?}"
+                        );
+                    } else {
+                        conn.run().await;
+                    }
+                }
+                Err(err) => {
+                    error!(log, "Failed to accept a connection"; &err);
+                }
+            }
+            task_id
+        });
+        self.join_handles.push(join_handle);
+        self.accepting.insert(addr, task_handle);
+        Ok(())
+    }
+
+    pub async fn server_handshake_completed(
+        &mut self,
+        task_id: TaskId,
+        addr: SocketAddrV6,
+        peer_id: BaseboardId,
+    ) {
+        if let Some(task_handle) = self.accepting.remove(&addr) {
+            info!(
+                self.log,
+                "Established server connection";
+                "task_id" => ?task_id,
+                "remote_addr" => %addr,
+                "remote_peer_id" => peer_id.to_string()
+            );
+            let already_established =
+                self.established.insert(peer_id, task_handle);
+            assert!(already_established.is_none());
+        }
+    }
+
+    pub async fn client_handshake_completed(
+        &mut self,
+        task_id: TaskId,
+        addr: SocketAddrV6,
+        peer_id: BaseboardId,
+    ) {
+        if let Some(task_handle) = self.connecting.remove(&addr) {
+            info!(
+                self.log,
+                "Established client connection";
+                "task_id" => ?task_id,
+                "remote_addr" => %addr,
+                "remote_peer_id" => peer_id.to_string()
+            );
+            let already_established =
+                self.established.insert(peer_id, task_handle);
+
+            assert!(already_established.is_none());
+        }
+    }
+
+    /// The established connection task has asynchronously exited.
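+    ///
+    /// Note: the `task_id` comparison below guards against a stale
+    /// notification. If a new connection to the same peer was already
+    /// established under a fresh `TaskId`, the late `Disconnected` message
+    /// from the old task is ignored.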
+    pub async fn on_disconnected(
+        &mut self,
+        task_id: TaskId,
+        peer_id: BaseboardId,
+    ) {
+        if let Some(task_handle) = self.established.get(&peer_id) {
+            if task_handle.task_id != task_id {
+                // This was a stale disconnect
+                return;
+            }
+        }
+        warn!(self.log, "peer disconnected"; "peer_id" => %peer_id);
+        let _ = self.established.remove(&peer_id);
+    }
+
+    /// Initiate connections if a corresponding task doesn't already exist.
+    /// This must be called periodically to handle transient disconnections,
+    /// which cause tasks to exit.
+    pub async fn reconnect(&mut self, corpus: Vec<Utf8PathBuf>) {
+        debug!(self.log, "Reconnect called");
+        let mut to_connect = vec![];
+        for addr in
+            self.bootstrap_addrs.iter().filter(|&&addr| self.listen_addr > addr)
+        {
+            if self.connecting.contains_key(addr) {
+                continue;
+            }
+
+            if self
+                .established
+                .values()
+                .any(|task_handle| task_handle.addr() == *addr)
+            {
+                continue;
+            }
+
+            to_connect.push(addr.clone());
+        }
+
+        for addr in to_connect {
+            // We don't have an existing connection
+            self.connect_client(corpus.clone(), addr).await
+        }
+    }
+
+    /// The set of known addresses on the bootstrap network has changed
+    ///
+    /// We need to connect to peers with addresses less than our own
+    /// and tear down any connections that no longer exist in `addrs`.
+    pub async fn update_bootstrap_connections(
+        &mut self,
+        addrs: BTreeSet<SocketAddrV6>,
+        corpus: Vec<Utf8PathBuf>,
+    ) {
+        if self.bootstrap_addrs == addrs {
+            return;
+        }
+
+        // We don't try to compare addresses from accepted nodes. If DDMD
+        // loses an accepting address we assume that the connection will go
+        // away soon, if it hasn't already. We can't compare without an extra
+        // handshake message to identify the listen address of the remote
+        // connection, because clients use ephemeral ports. We always compare
+        // on the full `SocketAddrV6`, which includes the port; this helps
+        // when testing on localhost.
+        let to_connect: BTreeSet<_> = addrs
+            .difference(&self.bootstrap_addrs)
+            .filter(|&&addr| self.listen_addr > addr)
+            .cloned()
+            .collect();
+        let to_disconnect: BTreeSet<_> = self
+            .bootstrap_addrs
+            .difference(&addrs)
+            .filter(|&&addr| self.listen_addr > addr)
+            .cloned()
+            .collect();
+
+        self.bootstrap_addrs = addrs;
+
+        for addr in to_connect {
+            self.connect_client(corpus.clone(), addr).await;
+        }
+
+        for addr in to_disconnect {
+            self.disconnect_client(addr).await;
+        }
+    }
+
+    /// Spawn a task to establish a sprockets connection for the given address
+    async fn connect_client(
+        &mut self,
+        corpus: Vec<Utf8PathBuf>,
+        addr: SocketAddrV6,
+    ) {
+        let task_id = self.next_task_id.inc();
+        let (tx, rx) = mpsc::channel(CHANNEL_BOUND);
+        let task_handle = TaskHandle {
+            task_id,
+            tx,
+            conn_type: ConnectionType::Connected(addr),
+        };
+        info!(self.log, "Initiating connection to new peer: {addr}");
+        let main_tx = self.main_tx.clone();
+        let log = self.log.clone();
+        let config = self.config.clone();
+        let join_handle = tokio::spawn(async move {
+            match sprockets_tls::Client::connect(
+                config,
+                addr,
+                corpus.clone(),
+                log.clone(),
+            )
+            .await
+            {
+                Ok(stream) => {
+                    let platform_id =
+                        stream.peer_platform_id().as_str().unwrap();
+                    let baseboard_id = platform_id_to_baseboard_id(platform_id);
+
+                    // TODO: Conversion between `PlatformId` and `BaseboardId`
+                    // should happen in `sled-agent-types`. This is waiting on
+                    // an update to the `dice-mfg-msgs` crate.
+ let log = + log.new(o!("peer_id" => baseboard_id.to_string())); + info!(log, "Sprockets connection established"; "addr" => %addr); + + let mut conn = EstablishedConn::new( + baseboard_id.clone(), + task_id, + stream, + main_tx.clone(), + rx, + &log, + ); + // Inform the main task that the client connection is + // established. + if let Err(e) = main_tx + .send(ConnToMainMsg { + task_id: task_id, + msg: ConnToMainMsgInner::Connected { + addr, + peer_id: baseboard_id, + }, + }) + .await + { + // The system is shutting down + // Just bail from this task + error!( + log, + "Failed to send 'connected' msg to main task: {e:?}" + ); + } else { + conn.run().await; + } + } + Err(err) => { + warn!(log, "Failed to connect"; &err); + } + } + task_id + }); + self.join_handles.push(join_handle); + self.connecting.insert(addr, task_handle); + } + + /// Remove any information about a sprockets client connection and inform + /// the corresponding task to stop. + /// + /// We don't tear down server connections this way as we don't know their + /// listen port, just the ephemeral port. + async fn disconnect_client(&mut self, addr: SocketAddrV6) { + if let Some(handle) = self.connecting.remove(&addr) { + // The connection has not yet completed its handshake + info!( + self.log, + "Deleting initiating connection"; + "remote_addr" => addr.to_string() + ); + let _ = handle.tx.send(MainToConnMsg::Close).await; + } else { + if let Some((id, handle)) = self + .established + .iter() + .find(|(_, handle)| handle.addr() == addr) + { + info!( + self.log, + "Deleting established connection"; + "remote_addr" => addr.to_string(), + "remote_peer_id" => id.to_string(), + ); + let _ = handle.tx.send(MainToConnMsg::Close).await; + // probably a better way to avoid borrowck issues + let id = id.clone(); + self.established.remove(&id); + } + } + } + + /// Remove any references to the given task + async fn on_task_exit(&mut self, task_id: TaskId) { + // We're most likely to find the task as established so we start with that + if let Some((id, handle)) = self + .established + .iter() + .find(|(_, handle)| handle.task_id == task_id) + { + info!( + self.log, + "Established connection task exited"; + "task_id" => ?task_id, + "remote_addr" => handle.addr().to_string(), + "remote_peer_id" => id.to_string(), + ); + // probably a better way to avoid borrowck issues + let id = id.clone(); + self.established.remove(&id); + } else if let Some((addr, handle)) = + self.accepting.iter().find(|(_, handle)| handle.task_id == task_id) + { + info!( + self.log, + "Accepting task exited"; + "task_id" => ?task_id, + "remote_addr" => handle.addr().to_string(), + ); + let addr = *addr; + self.accepting.remove(&addr); + } else if let Some((addr, handle)) = + self.connecting.iter().find(|(_, handle)| handle.task_id == task_id) + { + info!( + self.log, + "Connecting task exited"; + "task_id" => ?task_id, + "remote_addr" => handle.addr().to_string(), + ); + let addr = *addr; + self.connecting.remove(&addr); + } else { + info!( + self.log, + "Task exited. No cleanup required."; + "task_id" => ?task_id + ); + } + } +} + +// TODO: Eventually this will go away, once we pull in and use the latest +// `dice-util` code. 
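+//
+// Note on the assumed input shape (illustrative; the authoritative format
+// comes from `dice-mfg-msgs`): the `PlatformId` string is colon-separated,
+// along the lines of `<prefix>:<part-number>:<revision>:<serial-number>`.
+// The parser below extracts the second and fourth fields and panics if
+// either is missing.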
+pub fn platform_id_to_baseboard_id(platform_id: &str) -> BaseboardId {
+    let mut platform_id_iter = platform_id.split(":");
+    let part_number = platform_id_iter.nth(1).unwrap().to_string();
+    let serial_number = platform_id_iter.skip(1).next().unwrap().to_string();
+    BaseboardId { part_number, serial_number }
+}
diff --git a/trust-quorum/src/established_conn.rs b/trust-quorum/src/established_conn.rs
new file mode 100644
index 00000000000..01a04bd9e76
--- /dev/null
+++ b/trust-quorum/src/established_conn.rs
@@ -0,0 +1,343 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! An individual sprockets connection running in its own task
+
+use crate::{
+    BaseboardId, ConnToMainMsg, ConnToMainMsgInner, MainToConnMsg, TaskId,
+    WireMsg,
+};
+use bytes::Buf;
+use serde::Serialize;
+use slog::{Logger, debug, error, o, warn};
+use slog_error_chain::SlogInlineError;
+use std::collections::VecDeque;
+use std::io::Cursor;
+use std::time::Duration;
+use tokio::io::{AsyncReadExt, AsyncWriteExt, ReadHalf, WriteHalf, split};
+use tokio::net::TcpStream;
+use tokio::sync::mpsc;
+use tokio::time::{Instant, MissedTickBehavior, interval};
+
+/// Max buffer size of a connection
+const CONN_BUF_SIZE: usize = 1024 * 1024;
+
+/// Each message starts with a 4-byte size header
+const FRAME_HEADER_SIZE: usize = 4;
+
+/// The number of serialized messages to queue for writing before closing the
+/// socket. Filling this queue means the remote side is reading very slowly.
+///
+/// TODO: Alternatively we could drop the oldest message.
+const MSG_WRITE_QUEUE_CAPACITY: usize = 5;
+
+// Timing parameters for keeping the connection healthy
+const PING_INTERVAL: Duration = Duration::from_secs(1);
+
+/// The time limit for not receiving a complete message from a peer.
+/// The connection is shut down after this time.
+const INACTIVITY_TIMEOUT: Duration = Duration::from_secs(10);
+
+/// An error from within an `EstablishedConn` that triggers connection close
+///
+/// Also a great movie
+#[derive(Debug, thiserror::Error, SlogInlineError)]
+pub enum ConnErr {
+    #[error("Main task instructed this connection to close")]
+    Close,
+    #[error("Failed to write")]
+    FailedWrite(#[source] std::io::Error),
+    #[error("Failed to read")]
+    FailedRead(#[source] std::io::Error),
+    #[error("Failed to deserialize wire message")]
+    DeserializeWireMsg(#[from] ciborium::de::Error<std::io::Error>),
+    #[error("Failed to serialize wire message")]
+    SerializeWireMsg(#[from] ciborium::ser::Error<std::io::Error>),
+    #[error("Write queue filled with serialized messages")]
+    WriteQueueFull,
+    #[error("Inactivity timeout")]
+    InactivityTimeout,
+}
+
+/// Container for code running in its own task per sprockets connection
+pub struct EstablishedConn {
+    peer_id: BaseboardId,
+    task_id: TaskId,
+    reader: ReadHalf<sprockets_tls::Stream<TcpStream>>,
+    writer: WriteHalf<sprockets_tls::Stream<TcpStream>>,
+    main_tx: mpsc::Sender<ConnToMainMsg>,
+    rx: mpsc::Receiver<MainToConnMsg>,
+    log: Logger,
+
+    // Buffer we read raw data into from a sprockets connection
+    read_buf: Box<[u8]>,
+
+    // The amount of data currently in `read_buf`
+    total_read: usize,
+
+    // Used for managing inactivity timeouts for the connection
+    last_received_msg: Instant,
+
+    // Keep a queue to write serialized messages into. We limit the queue
+    // size, and if it gets exceeded it means the peer at the other
+    // end isn't pulling data out fast enough. This should be basically
+    // impossible to hit given the size and rate of message exchange
+    // between peers.
+    // We go ahead and close the connection if the queue fills.
+    write_queue: VecDeque<Vec<u8>>,
+
+    // The current serialized message being written, if there is one
+    current_write: Cursor<Vec<u8>>,
+}
+
+impl EstablishedConn {
+    pub fn new(
+        peer_id: BaseboardId,
+        task_id: TaskId,
+        stream: sprockets_tls::Stream<TcpStream>,
+        main_tx: mpsc::Sender<ConnToMainMsg>,
+        rx: mpsc::Receiver<MainToConnMsg>,
+        log: &Logger,
+    ) -> EstablishedConn {
+        let log = log.new(o!("component" => "trust-quorum-established-conn"));
+        let (reader, writer) = split(stream);
+        EstablishedConn {
+            peer_id,
+            task_id,
+            reader,
+            writer,
+            main_tx,
+            rx,
+            log,
+            read_buf: vec![0u8; CONN_BUF_SIZE].into_boxed_slice(),
+            total_read: 0,
+            last_received_msg: Instant::now(),
+            write_queue: VecDeque::with_capacity(MSG_WRITE_QUEUE_CAPACITY),
+            current_write: Cursor::new(Vec::new()),
+        }
+    }
+
+    pub async fn run(&mut self) {
+        let mut interval = interval(PING_INTERVAL);
+        interval.set_missed_tick_behavior(MissedTickBehavior::Delay);
+
+        // This is the main loop of the connection
+        //
+        // Continuously process messages until the connection closes
+        loop {
+            if !self.current_write.has_remaining()
+                && !self.write_queue.is_empty()
+            {
+                self.current_write =
+                    Cursor::new(self.write_queue.pop_front().unwrap());
+            }
+
+            let res = tokio::select! {
+                _ = interval.tick() => {
+                    self.ping().await
+                }
+                Some(msg) = self.rx.recv() => {
+                    self.on_msg_from_main(msg).await
+                }
+                res = self.reader.read(&mut self.read_buf[self.total_read..]) => {
+                    self.on_read(res).await
+                }
+                res = self.writer.write_buf(&mut self.current_write),
+                    if self.current_write.has_remaining() =>
+                {
+                    self.check_write_result(res).await
+                }
+            };
+
+            if let Err(err) = res {
+                warn!(self.log, "Closing connection"; &err);
+                self.close().await;
+                return;
+            }
+        }
+    }
+
+    async fn close(&mut self) {
+        if let Err(e) = self
+            .main_tx
+            .send(ConnToMainMsg {
+                task_id: self.task_id,
+                msg: ConnToMainMsgInner::Disconnected {
+                    peer_id: self.peer_id.clone(),
+                },
+            })
+            .await
+        {
+            warn!(self.log, "Failed to send to main task: {e:?}");
+        }
+        // TODO: This causes a deadlock and breaks the test.
+        //
+        // I'm unclear why, although I plan to dig a bit further in the
+        // future. It should be noted that the writer and reader share a
+        // std::mutex under the hood and that could be causing issues.
+        // Regardless, it is not actually critical to issue a shutdown, as
+        // disconnection will be discovered via missing ping messages at the
+        // other end of the connection.
+        //
+        // let _ = self.writer.shutdown().await;
+    }
+
+    async fn on_read(
+        &mut self,
+        res: Result<usize, std::io::Error>,
+    ) -> Result<(), ConnErr> {
+        match res {
+            Ok(n) => {
+                self.total_read += n;
+            }
+            Err(e) => {
+                return Err(ConnErr::FailedRead(e));
+            }
+        }
+
+        // We may have more than one message that has been read
+        loop {
+            if self.total_read < FRAME_HEADER_SIZE {
+                return Ok(());
+            }
+            // Read frame size
+            let size = read_frame_size(
+                self.read_buf[..FRAME_HEADER_SIZE].try_into().unwrap(),
+            );
+            let end = size + FRAME_HEADER_SIZE;
+
+            // If we haven't read the whole message yet, then return
+            if end > self.total_read {
+                return Ok(());
+            }
+            let msg: WireMsg =
+                ciborium::from_reader(&self.read_buf[FRAME_HEADER_SIZE..end])?;
+            // Move any remaining bytes to the beginning of the buffer.
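+            //
+            // For example (an illustrative trace): with total_read = 10 and
+            // end = 7, bytes 7..10 (the prefix of the next frame) shift down
+            // to positions 0..3 and total_read becomes 3.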
+            self.read_buf.copy_within(end..self.total_read, 0);
+            self.total_read -= end;
+
+            self.last_received_msg = Instant::now();
+            debug!(self.log, "Received {msg:?}");
+            match msg {
+                WireMsg::Tq(msg) => {
+                    if let Err(e) = self
+                        .main_tx
+                        .send(ConnToMainMsg {
+                            task_id: self.task_id,
+                            msg: ConnToMainMsgInner::Received {
+                                from: self.peer_id.clone(),
+                                msg,
+                            },
+                        })
+                        .await
+                    {
+                        warn!(
+                            self.log,
+                            "Failed to send received fsm msg to main task: {e:?}"
+                        );
+                    }
+                }
+                WireMsg::Ping => {
+                    // Nothing to do here, since Ping is just to keep us alive
+                    // and we updated self.last_received_msg above.
+                }
+                WireMsg::NetworkConfig(config) => {
+                    let generation = config.generation;
+                    if let Err(e) = self
+                        .main_tx
+                        .send(ConnToMainMsg {
+                            task_id: self.task_id,
+                            msg: ConnToMainMsgInner::ReceivedNetworkConfig {
+                                from: self.peer_id.clone(),
+                                config,
+                            },
+                        })
+                        .await
+                    {
+                        warn!(
+                            self.log,
+                            "Failed to send received NetworkConfig with \
+                             generation {generation} to main task: {e:?}"
+                        );
+                    }
+                }
+            }
+        }
+    }
+
+    async fn check_write_result(
+        &mut self,
+        res: Result<usize, std::io::Error>,
+    ) -> Result<(), ConnErr> {
+        match res {
+            Ok(_) => {
+                if !self.current_write.has_remaining() {
+                    self.current_write = Cursor::new(Vec::new());
+                }
+                Ok(())
+            }
+            Err(e) => {
+                let _ = self.writer.shutdown().await;
+                Err(ConnErr::FailedWrite(e))
+            }
+        }
+    }
+
+    async fn on_msg_from_main(
+        &mut self,
+        msg: MainToConnMsg,
+    ) -> Result<(), ConnErr> {
+        match msg {
+            MainToConnMsg::Close => {
+                return Err(ConnErr::Close);
+            }
+            MainToConnMsg::Msg(msg) => self.write_framed_to_queue(msg).await,
+        }
+    }
+
+    async fn write_framed_to_queue(
+        &mut self,
+        msg: WireMsg,
+    ) -> Result<(), ConnErr> {
+        if self.write_queue.len() == MSG_WRITE_QUEUE_CAPACITY {
+            return Err(ConnErr::WriteQueueFull);
+        } else {
+            let msg = write_framed(&msg)?;
+            self.write_queue.push_back(msg);
+            Ok(())
+        }
+    }
+
+    async fn ping(&mut self) -> Result<(), ConnErr> {
+        if Instant::now() - self.last_received_msg > INACTIVITY_TIMEOUT {
+            return Err(ConnErr::InactivityTimeout);
+        }
+        self.write_framed_to_queue(WireMsg::Ping).await
+    }
+}
+
+// Decode the 4-byte big-endian frame size header
+fn read_frame_size(buf: [u8; FRAME_HEADER_SIZE]) -> usize {
+    u32::from_be_bytes(buf) as usize
+}
+
+/// Serialize `msg` into a new buffer, prefixed by a 4-byte big-endian size
+/// header
+/// 
+/// Returns the framed buffer, including the 4-byte header.
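+///
+/// As a sketch of the resulting layout (assuming ciborium's default,
+/// externally-tagged enum encoding): `WireMsg::Ping` serializes to the CBOR
+/// text string `"Ping"` (5 bytes: `0x64` followed by the four ASCII bytes),
+/// so the full frame on the wire would be
+/// `[0x00, 0x00, 0x00, 0x05, 0x64, b'P', b'i', b'n', b'g']`.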
+fn write_framed<T: Serialize>(
+    msg: &T,
+) -> Result<Vec<u8>, ciborium::ser::Error<std::io::Error>> {
+    let mut cursor = Cursor::new(vec![]);
+    // Write a size placeholder
+    std::io::Write::write(&mut cursor, &[0u8; FRAME_HEADER_SIZE])?;
+    cursor.set_position(FRAME_HEADER_SIZE as u64);
+    ciborium::into_writer(msg, &mut cursor)?;
+    let size: u32 =
+        (cursor.position() - FRAME_HEADER_SIZE as u64).try_into().unwrap();
+    let mut buf = cursor.into_inner();
+    buf[0..FRAME_HEADER_SIZE].copy_from_slice(&size.to_be_bytes());
+    Ok(buf)
+}
diff --git a/trust-quorum/src/lib.rs b/trust-quorum/src/lib.rs
index a389022af0f..b4b18736163 100644
--- a/trust-quorum/src/lib.rs
+++ b/trust-quorum/src/lib.rs
@@ -21,6 +21,7 @@ mod compute_key_share;
 mod configuration;
 mod coordinator_state;
 pub(crate) mod crypto;
+pub(crate) mod established_conn;
 mod messages;
 mod node;
 mod node_ctx;
@@ -38,6 +39,13 @@ pub use validators::{
     ValidatedLrtqUpgradeMsgDiff, ValidatedReconfigureMsgDiff,
 };
 mod alarm;
+mod connection_manager;
+mod task;
+
+pub(crate) use connection_manager::{
+    ConnToMainMsg, ConnToMainMsgInner, MainToConnMsg, TaskId, WireMsg,
+};
+pub use task::NodeTask;
 
 pub use alarm::Alarm;
 pub use crypto::RackSecret;
diff --git a/trust-quorum/src/task.rs b/trust-quorum/src/task.rs
new file mode 100644
index 00000000000..26d92c3783e
--- /dev/null
+++ b/trust-quorum/src/task.rs
@@ -0,0 +1,469 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! A runnable async trust quorum node that wraps the sans-io [`crate::Node`]
+
+use crate::connection_manager::{
+    ConnMgr, ConnMgrStatus, ConnToMainMsg, ConnToMainMsgInner,
+};
+use crate::{BaseboardId, Node, NodeCtx};
+use slog::{Logger, debug, error, info, o};
+use sprockets_tls::keys::SprocketsConfig;
+use std::collections::BTreeSet;
+use std::net::SocketAddrV6;
+use thiserror::Error;
+use tokio::sync::mpsc::error::SendError;
+use tokio::sync::oneshot::error::RecvError;
+use tokio::sync::{mpsc, oneshot};
+
+#[derive(Debug, Clone)]
+pub struct Config {
+    pub baseboard_id: BaseboardId,
+    pub listen_addr: SocketAddrV6,
+    // pub tq_state_ledger_paths: Vec<Utf8PathBuf>,
+    // pub network_config_ledger_paths: Vec<Utf8PathBuf>,
+    pub sprockets: SprocketsConfig,
+}
+
+/// A request sent to the `NodeTask` from the `NodeTaskHandle`
+pub enum NodeApiRequest {
+    /// Inform the `Node` of currently known IP addresses on the bootstrap
+    /// network
+    ///
+    /// These are generated from DDM prefixes learned by the bootstrap agent.
+    BootstrapAddresses(BTreeSet<SocketAddrV6>),
+
+    /// Retrieve connectivity status via the `ConnMgr`
+    ConnMgrStatus { responder: oneshot::Sender<ConnMgrStatus> },
+
+    /// Shutdown the node's tokio tasks
+    Shutdown,
+}
+
+/// An error response from a `NodeApiRequest`
+#[derive(Error, Debug, PartialEq)]
+pub enum NodeApiError {
+    #[error("Failed to send request to node task")]
+    Send,
+    #[error("Failed to receive response from node task")]
+    Recv,
+}
+
+impl From<SendError<NodeApiRequest>> for NodeApiError {
+    fn from(_: SendError<NodeApiRequest>) -> Self {
+        NodeApiError::Send
+    }
+}
+
+impl From<RecvError> for NodeApiError {
+    fn from(_: RecvError) -> Self {
+        NodeApiError::Recv
+    }
+}
+
+#[derive(Debug, Clone)]
+pub struct NodeTaskHandle {
+    baseboard_id: BaseboardId,
+    tx: mpsc::Sender<NodeApiRequest>,
+    listen_addr: SocketAddrV6,
+}
+
+impl NodeTaskHandle {
+    /// Return the actual port being listened on
+    ///
+    /// This is useful when the port passed in was `0`.
+    pub fn listen_addr(&self) -> SocketAddrV6 {
+        self.listen_addr
+    }
+
+    pub fn baseboard_id(&self) -> &BaseboardId {
+        &self.baseboard_id
+    }
+
+    /// Inform the node of currently known IP addresses on the bootstrap
+    /// network
+    ///
+    /// These are generated from DDM prefixes learned by the bootstrap agent.
+    pub async fn load_peer_addresses(
+        &self,
+        addrs: BTreeSet<SocketAddrV6>,
+    ) -> Result<(), NodeApiError> {
+        self.tx.send(NodeApiRequest::BootstrapAddresses(addrs)).await?;
+        Ok(())
+    }
+
+    pub async fn conn_mgr_status(
+        &self,
+    ) -> Result<ConnMgrStatus, NodeApiError> {
+        let (tx, rx) = oneshot::channel();
+        self.tx.send(NodeApiRequest::ConnMgrStatus { responder: tx }).await?;
+        let res = rx.await?;
+        Ok(res)
+    }
+
+    pub async fn shutdown(&self) -> Result<(), NodeApiError> {
+        self.tx.send(NodeApiRequest::Shutdown).await?;
+        Ok(())
+    }
+}
+
+pub struct NodeTask {
+    shutdown: bool,
+    log: Logger,
+    #[allow(unused)]
+    config: Config,
+    #[allow(unused)]
+    node: Node,
+    #[allow(unused)]
+    ctx: NodeCtx,
+    conn_mgr: ConnMgr,
+    conn_mgr_rx: mpsc::Receiver<ConnToMainMsg>,
+
+    // Handle requests received from the `NodeTaskHandle`
+    rx: mpsc::Receiver<NodeApiRequest>,
+}
+
+impl NodeTask {
+    pub async fn new(
+        config: Config,
+        log: &Logger,
+    ) -> (NodeTask, NodeTaskHandle) {
+        let log = log.new(o!(
+            "component" => "trust-quorum",
+            "baseboard_id" => config.baseboard_id.to_string()
+        ));
+        // We only expect one outstanding request at a time for `Init_` or
+        // `LoadRackSecret` requests. We can have one of those requests in
+        // flight while allowing `PeerAddresses` updates. We also allow status
+        // requests in parallel. Just leave some room.
+        let (tx, rx) = mpsc::channel(10);
+
+        let (conn_mgr_tx, conn_mgr_rx) = mpsc::channel(100);
+
+        let baseboard_id = config.baseboard_id.clone();
+
+        // TODO: Load persistent state from ledger
+        let mut ctx = NodeCtx::new(config.baseboard_id.clone());
+        let node = Node::new(&log, &mut ctx);
+        let conn_mgr = ConnMgr::new(
+            &log,
+            config.listen_addr,
+            config.sprockets.clone(),
+            conn_mgr_tx,
+        )
+        .await;
+        let listen_addr = conn_mgr.listen_addr();
+        (
+            NodeTask {
+                shutdown: false,
+                log,
+                config,
+                node,
+                ctx,
+                conn_mgr,
+                conn_mgr_rx,
+                rx,
+            },
+            NodeTaskHandle { baseboard_id, tx, listen_addr },
+        )
+    }
+
+    /// Run the main loop of the node
+    ///
+    /// This should be spawned into its own tokio task
+    pub async fn run(&mut self) {
+        while !self.shutdown {
+            // TODO: Real corpus
+            let corpus = vec![];
+            tokio::select!
{
+                Some(request) = self.rx.recv() => {
+                    self.on_api_request(request).await;
+                }
+                res = self.conn_mgr.step(corpus.clone()) => {
+                    if let Err(err) = res {
+                        error!(self.log, "Failed to accept connection"; &err);
+                        continue;
+                    }
+                }
+                Some(msg) = self.conn_mgr_rx.recv() => {
+                    self.on_conn_msg(msg).await
+                }
+            }
+        }
+    }
+
+    // Handle messages from connection management tasks
+    async fn on_conn_msg(&mut self, msg: ConnToMainMsg) {
+        let task_id = msg.task_id;
+        match msg.msg {
+            ConnToMainMsgInner::Accepted { addr, peer_id } => {
+                self.conn_mgr
+                    .server_handshake_completed(task_id, addr, peer_id)
+                    .await;
+            }
+            ConnToMainMsgInner::Connected { addr, peer_id } => {
+                self.conn_mgr
+                    .client_handshake_completed(task_id, addr, peer_id)
+                    .await;
+            }
+            ConnToMainMsgInner::Disconnected { peer_id } => {
+                self.conn_mgr.on_disconnected(task_id, peer_id).await;
+            }
+            ConnToMainMsgInner::Received { from: _, msg: _ } => {
+                todo!();
+            }
+            ConnToMainMsgInner::ReceivedNetworkConfig {
+                from: _,
+                config: _,
+            } => {
+                todo!();
+            }
+        }
+    }
+
+    async fn on_api_request(&mut self, request: NodeApiRequest) {
+        match request {
+            NodeApiRequest::BootstrapAddresses(addrs) => {
+                info!(self.log, "Updated Peer Addresses: {addrs:?}");
+                // TODO: real corpus
+                let corpus = vec![];
+                self.conn_mgr.update_bootstrap_connections(addrs, corpus).await;
+            }
+            NodeApiRequest::ConnMgrStatus { responder } => {
+                debug!(self.log, "Received Request for ConnMgrStatus");
+                let _ = responder.send(self.conn_mgr.status());
+            }
+            NodeApiRequest::Shutdown => {
+                info!(self.log, "Shutting down Node tokio tasks");
+                self.shutdown = true;
+                self.conn_mgr.shutdown().await;
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::TaskId;
+    use crate::connection_manager::{ConnState, platform_id_to_baseboard_id};
+    use camino::Utf8PathBuf;
+    use dropshot::test_util::log_prefix_for_test;
+    use omicron_test_utils::dev::poll::{CondCheckError, wait_for_condition};
+    use omicron_test_utils::dev::test_setup_log;
+    use sprockets_tls::keys::ResolveSetting;
+    use sprockets_tls_test_utils::{
+        alias_prefix, cert_path, certlist_path, private_key_path, root_prefix,
+        sprockets_auth_prefix,
+    };
+    use std::time::Duration;
+
+    fn pki_doc_to_node_configs(dir: Utf8PathBuf, n: usize) -> Vec<Config> {
+        (1..=n)
+            .map(|i| {
+                let baseboard_id = platform_id_to_baseboard_id(
+                    &sprockets_tls_test_utils::platform_id(i),
+                );
+                let listen_addr =
+                    SocketAddrV6::new(std::net::Ipv6Addr::LOCALHOST, 0, 0, 0);
+                let sprockets_auth_key_name = sprockets_auth_prefix(i);
+                let alias_key_name = alias_prefix(i);
+                let sprockets = SprocketsConfig {
+                    resolve: ResolveSetting::Local {
+                        priv_key: private_key_path(
+                            dir.clone(),
+                            &sprockets_auth_key_name,
+                        ),
+                        cert_chain: certlist_path(
+                            dir.clone(),
+                            &sprockets_auth_key_name,
+                        ),
+                    },
+                    attest: sprockets_tls::keys::AttestConfig::Local {
+                        priv_key: private_key_path(
+                            dir.clone(),
+                            &alias_key_name,
+                        ),
+                        cert_chain: certlist_path(dir.clone(), &alias_key_name),
+                        // TODO: We need attest-mock to generate a real log
+                        log: dir.join("log.bin"),
+                    },
+                    roots: vec![cert_path(dir.clone(), &root_prefix())],
+                };
+                Config { baseboard_id, listen_addr, sprockets }
+            })
+            .collect()
+    }
+
+    /// Test that all nodes can connect to each other when each is given the
+    /// full set of "bootstrap addresses".
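+    ///
+    /// The test has three phases: establish a full mesh, shut down one node
+    /// and observe the survivors attempting to reconnect, then restart the
+    /// node and wait for the full mesh to re-form.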
+    #[tokio::test]
+    async fn full_mesh_connectivity() {
+        let logctx = test_setup_log("full_mesh_connectivity");
+        let (dir, _) = log_prefix_for_test("full_mesh_connectivity");
+        println!("Writing keys and certs to {dir}");
+        let num_nodes = 4;
+
+        let file_behavior =
+            sprockets_tls_test_utils::OutputFileExistsBehavior::Overwrite;
+
+        // Create `num_nodes` nodes worth of keys and certs
+        let doc = sprockets_tls_test_utils::generate_config(num_nodes);
+        doc.write_key_pairs(dir.clone(), file_behavior).unwrap();
+        doc.write_certificates(dir.clone(), file_behavior).unwrap();
+        doc.write_certificate_lists(dir.clone(), file_behavior).unwrap();
+
+        // This is just a made-up digest. We aren't currently using a corpus,
+        // so it doesn't matter what the measurements are, just that there is
+        // at least one in a file named "log.bin".
+        let digest =
+            "be4df4e085175f3de0c8ac4837e1c2c9a34e8983209dac6b549e94154f7cdd9c"
+                .into();
+        let attest_log_doc = attest_mock::log::Document {
+            measurements: vec![attest_mock::log::Measurement {
+                algorithm: "sha3-256".into(),
+                digest,
+            }],
+        };
+        // Write out the log document to the filesystem
+        let out = attest_mock::log::mock(attest_log_doc).unwrap();
+        std::fs::write(dir.join("log.bin"), &out).unwrap();
+
+        let configs = pki_doc_to_node_configs(dir, num_nodes);
+
+        let mut node_handles = vec![];
+        let mut join_handles = vec![];
+        for config in configs.clone() {
+            let (mut task, handle) = NodeTask::new(config, &logctx.log).await;
+            node_handles.push(handle);
+            join_handles.push(tokio::spawn(async move { task.run().await }));
+        }
+
+        let listen_addrs: BTreeSet<_> =
+            node_handles.iter().map(|h| h.listen_addr()).collect();
+
+        for h in &node_handles {
+            h.load_peer_addresses(listen_addrs.clone()).await.unwrap();
+        }
+
+        let poll_interval = Duration::from_millis(1);
+        let poll_max = Duration::from_secs(10);
+
+        // Wait for all nodes to have `num_nodes - 1` established connections
+        wait_for_condition(
+            async || {
+                let mut count = 0;
+                for h in &node_handles {
+                    let status = h.conn_mgr_status().await.unwrap();
+                    if status
+                        .connections
+                        .iter()
+                        .all(|c| matches!(c.state, ConnState::Established(_)))
+                        && status.connections.len() == num_nodes - 1
+                        && status.next_task_id == TaskId::new(3)
+                    {
+                        count += 1;
+                    }
+                }
+                if count == num_nodes {
+                    Ok(())
+                } else {
+                    Err(CondCheckError::<()>::NotYet)
+                }
+            },
+            &poll_interval,
+            &poll_max,
+        )
+        .await
+        .unwrap();
+
+        // Killing a single node should cause all other nodes to start
+        // reconnecting. This should cause the task id counter to start
+        // incrementing at all nodes and for there to be one fewer
+        // established connection.
+        let h = node_handles.pop().unwrap();
+        h.shutdown().await.unwrap();
+        let _ = join_handles.pop().unwrap();
+        let stopped_addr = h.listen_addr;
+
+        let poll_interval = Duration::from_millis(50);
+        wait_for_condition(
+            async || {
+                let mut valid = 0;
+                for h in &node_handles {
+                    let status = h.conn_mgr_status().await.unwrap();
+                    let established_count = status
+                        .connections
+                        .iter()
+                        .filter(|c| {
+                            matches!(c.state, ConnState::Established(_))
+                        })
+                        .count();
+
+                    // Nodes only connect to other nodes if their listening
+                    // address sorts greater. The only node where a reconnect
+                    // will be attempted is the stopped node.
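+                    //
+                    // In the happy path each node spawned exactly three
+                    // connection tasks to build the initial 4-node mesh
+                    // (TaskIds 0, 1, and 2, hence the `TaskId::new(3)` check
+                    // above), so any reconnect attempt pushes `next_task_id`
+                    // past 3.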
+ let should_be_connecting = h.listen_addr > stopped_addr; + let valid_task_id = if should_be_connecting { + status.next_task_id > TaskId::new(3) + } else { + true + }; + if established_count == num_nodes - 2 && valid_task_id { + valid += 1; + } + } + if valid == num_nodes - 1 { + Ok(()) + } else { + Err(CondCheckError::<()>::NotYet) + } + }, + &poll_interval, + &poll_max, + ) + .await + .unwrap(); + + // Now let's bring back up the old node and ensure full connectivity again + let (mut task, handle) = + NodeTask::new(configs.last().unwrap().clone(), &logctx.log).await; + node_handles.push(handle.clone()); + join_handles.push(tokio::spawn(async move { task.run().await })); + + // The port likely changed, so we must refresh everyone's set of addresses + let listen_addrs: BTreeSet<_> = + node_handles.iter().map(|h| h.listen_addr()).collect(); + + for h in &node_handles { + h.load_peer_addresses(listen_addrs.clone()).await.unwrap(); + } + + // Wait for all nodes to have `num_nodes - 1` established connections + wait_for_condition( + async || { + let mut count = 0; + for h in &node_handles { + let status = h.conn_mgr_status().await.unwrap(); + if status + .connections + .iter() + .all(|c| matches!(c.state, ConnState::Established(_))) + && status.connections.len() == num_nodes - 1 + { + count += 1; + } + } + if count == num_nodes { + Ok(()) + } else { + Err(CondCheckError::<()>::NotYet) + } + }, + &poll_interval, + &poll_max, + ) + .await + .unwrap(); + + logctx.cleanup_successful(); + } +} From 5cd3454ff48beac2c0804ef9859eeb13980103c2 Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Sat, 18 Oct 2025 23:23:53 +0000 Subject: [PATCH 02/22] hakari --- Cargo.lock | 2 ++ workspace-hack/Cargo.toml | 18 ++++++++++++++---- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7b194fff992..c01db75e58e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8940,6 +8940,7 @@ dependencies = [ "getrandom 0.3.4", "group", "hashbrown 0.16.0", + "heck 0.4.1", "hickory-proto 0.25.2", "hmac", "hyper", @@ -8961,6 +8962,7 @@ dependencies = [ "log", "managed", "memchr", + "miniz_oxide", "mio", "newtype-uuid", "nix 0.29.0", diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index d7f415098b9..096d5d26dcd 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -64,6 +64,7 @@ generic-array = { version = "0.14.7", default-features = false, features = ["mor getrandom-6f8ce4dd05d13bba = { package = "getrandom", version = "0.2.15", default-features = false, features = ["js", "rdrand", "std"] } group = { version = "0.13.0", default-features = false, features = ["alloc"] } hashbrown = { version = "0.16.0", default-features = false, features = ["allocator-api2", "inline-more"] } +heck = { version = "0.4.1", features = ["unicode"] } hickory-proto = { version = "0.25.2", features = ["serde", "text-parsing"] } hmac = { version = "0.12.1", default-features = false, features = ["reset"] } hyper = { version = "1.7.0", features = ["full"] } @@ -108,7 +109,7 @@ regex = { version = "1.11.3" } regex-automata = { version = "0.4.11", default-features = false, features = ["dfa", "hybrid", "meta", "nfa", "perf", "std", "unicode"] } regex-syntax = { version = "0.8.5" } reqwest = { version = "0.12.22", features = ["blocking", "cookies", "json", "rustls-tls", "stream"] } -rsa = { version = "0.9.6", features = ["serde", "sha2"] } +rsa = { version = "0.9.8", features = ["serde", "sha2"] } rustc-hash = { version = "2.1.1" } rustls = { version = "0.23.19", features = ["ring"] }
rustls-webpki = { version = "0.102.8", default-features = false, features = ["aws_lc_rs", "ring", "std"] } @@ -117,7 +118,7 @@ scopeguard = { version = "1.2.0" } semver = { version = "1.0.27", features = ["serde"] } serde = { version = "1.0.226", features = ["alloc", "derive", "rc"] } serde_core = { version = "1.0.226", features = ["alloc", "rc"] } -serde_json = { version = "1.0.145", features = ["raw_value", "unbounded_depth"] } +serde_json = { version = "1.0.145", features = ["alloc", "raw_value", "unbounded_depth"] } serde_with = { version = "3.14.0" } sha1 = { version = "0.10.6", features = ["oid"] } sha2 = { version = "0.10.9", features = ["oid"] } @@ -202,6 +203,7 @@ generic-array = { version = "0.14.7", default-features = false, features = ["mor getrandom-6f8ce4dd05d13bba = { package = "getrandom", version = "0.2.15", default-features = false, features = ["js", "rdrand", "std"] } group = { version = "0.13.0", default-features = false, features = ["alloc"] } hashbrown = { version = "0.16.0", default-features = false, features = ["allocator-api2", "inline-more"] } +heck = { version = "0.4.1", features = ["unicode"] } hickory-proto = { version = "0.25.2", features = ["serde", "text-parsing"] } hmac = { version = "0.12.1", default-features = false, features = ["reset"] } hyper = { version = "1.7.0", features = ["full"] } @@ -246,7 +248,7 @@ regex = { version = "1.11.3" } regex-automata = { version = "0.4.11", default-features = false, features = ["dfa", "hybrid", "meta", "nfa", "perf", "std", "unicode"] } regex-syntax = { version = "0.8.5" } reqwest = { version = "0.12.22", features = ["blocking", "cookies", "json", "rustls-tls", "stream"] } -rsa = { version = "0.9.6", features = ["serde", "sha2"] } +rsa = { version = "0.9.8", features = ["serde", "sha2"] } rustc-hash = { version = "2.1.1" } rustls = { version = "0.23.19", features = ["ring"] } rustls-webpki = { version = "0.102.8", default-features = false, features = ["aws_lc_rs", "ring", "std"] } @@ -255,7 +257,7 @@ scopeguard = { version = "1.2.0" } semver = { version = "1.0.27", features = ["serde"] } serde = { version = "1.0.226", features = ["alloc", "derive", "rc"] } serde_core = { version = "1.0.226", features = ["alloc", "rc"] } -serde_json = { version = "1.0.145", features = ["raw_value", "unbounded_depth"] } +serde_json = { version = "1.0.145", features = ["alloc", "raw_value", "unbounded_depth"] } serde_with = { version = "3.14.0" } sha1 = { version = "0.10.6", features = ["oid"] } sha2 = { version = "0.10.9", features = ["oid"] } @@ -305,6 +307,7 @@ getrandom-468e82937335b1c9 = { package = "getrandom", version = "0.3.4", default hyper-rustls = { version = "0.27.7", features = ["http2", "ring", "webpki-tokio"] } hyper-util = { version = "0.1.17", features = ["full"] } linux-raw-sys = { version = "0.4.14", default-features = false, features = ["elf", "errno", "general", "if_ether", "ioctl", "net", "netlink", "no_std", "prctl", "std", "system", "xdp"] } +miniz_oxide = { version = "0.8.5", default-features = false, features = ["with-alloc"] } mio = { version = "1.0.2", features = ["net", "os-ext"] } rustix-d585fab2519d2d1 = { package = "rustix", version = "0.38.37", features = ["event", "fs", "net", "pipe", "process", "stdio", "system", "termios", "time"] } rustix-dff4ba8e3ae991db = { package = "rustix", version = "1.0.7", features = ["fs", "stdio", "termios"] } @@ -318,6 +321,7 @@ getrandom-468e82937335b1c9 = { package = "getrandom", version = "0.3.4", default hyper-rustls = { version = "0.27.7", features = ["http2", "ring", 
"webpki-tokio"] } hyper-util = { version = "0.1.17", features = ["full"] } linux-raw-sys = { version = "0.4.14", default-features = false, features = ["elf", "errno", "general", "if_ether", "ioctl", "net", "netlink", "no_std", "prctl", "std", "system", "xdp"] } +miniz_oxide = { version = "0.8.5", default-features = false, features = ["with-alloc"] } mio = { version = "1.0.2", features = ["net", "os-ext"] } rustix-d585fab2519d2d1 = { package = "rustix", version = "0.38.37", features = ["event", "fs", "net", "pipe", "process", "stdio", "system", "termios", "time"] } rustix-dff4ba8e3ae991db = { package = "rustix", version = "1.0.7", features = ["fs", "stdio", "termios"] } @@ -328,6 +332,7 @@ cookie = { version = "0.18.1", default-features = false, features = ["percent-en getrandom-468e82937335b1c9 = { package = "getrandom", version = "0.3.4", default-features = false, features = ["std"] } hyper-rustls = { version = "0.27.7", features = ["http2", "ring", "webpki-tokio"] } hyper-util = { version = "0.1.17", features = ["full"] } +miniz_oxide = { version = "0.8.5", default-features = false, features = ["with-alloc"] } mio = { version = "1.0.2", features = ["net", "os-ext"] } rustix-d585fab2519d2d1 = { package = "rustix", version = "0.38.37", features = ["event", "fs", "net", "pipe", "process", "stdio", "system", "termios", "time"] } rustix-dff4ba8e3ae991db = { package = "rustix", version = "1.0.7", features = ["fs", "stdio", "termios"] } @@ -338,6 +343,7 @@ cookie = { version = "0.18.1", default-features = false, features = ["percent-en getrandom-468e82937335b1c9 = { package = "getrandom", version = "0.3.4", default-features = false, features = ["std"] } hyper-rustls = { version = "0.27.7", features = ["http2", "ring", "webpki-tokio"] } hyper-util = { version = "0.1.17", features = ["full"] } +miniz_oxide = { version = "0.8.5", default-features = false, features = ["with-alloc"] } mio = { version = "1.0.2", features = ["net", "os-ext"] } rustix-d585fab2519d2d1 = { package = "rustix", version = "0.38.37", features = ["event", "fs", "net", "pipe", "process", "stdio", "system", "termios", "time"] } rustix-dff4ba8e3ae991db = { package = "rustix", version = "1.0.7", features = ["fs", "stdio", "termios"] } @@ -348,6 +354,7 @@ cookie = { version = "0.18.1", default-features = false, features = ["percent-en getrandom-468e82937335b1c9 = { package = "getrandom", version = "0.3.4", default-features = false, features = ["std"] } hyper-rustls = { version = "0.27.7", features = ["http2", "ring", "webpki-tokio"] } hyper-util = { version = "0.1.17", features = ["full"] } +miniz_oxide = { version = "0.8.5", default-features = false, features = ["with-alloc"] } mio = { version = "1.0.2", features = ["net", "os-ext"] } rustix-d585fab2519d2d1 = { package = "rustix", version = "0.38.37", features = ["event", "fs", "net", "pipe", "process", "stdio", "system", "termios", "time"] } rustix-dff4ba8e3ae991db = { package = "rustix", version = "1.0.7", features = ["fs", "stdio", "termios"] } @@ -358,6 +365,7 @@ cookie = { version = "0.18.1", default-features = false, features = ["percent-en getrandom-468e82937335b1c9 = { package = "getrandom", version = "0.3.4", default-features = false, features = ["std"] } hyper-rustls = { version = "0.27.7", features = ["http2", "ring", "webpki-tokio"] } hyper-util = { version = "0.1.17", features = ["full"] } +miniz_oxide = { version = "0.8.5", default-features = false, features = ["with-alloc"] } mio = { version = "1.0.2", features = ["net", "os-ext"] } rustix-d585fab2519d2d1 = { 
package = "rustix", version = "0.38.37", features = ["event", "fs", "net", "pipe", "process", "stdio", "system", "termios", "time"] } rustix-dff4ba8e3ae991db = { package = "rustix", version = "1.0.7", features = ["fs", "stdio", "termios"] } @@ -372,6 +380,7 @@ hyper-rustls = { version = "0.27.7", features = ["http2", "ring", "webpki-tokio" hyper-util = { version = "0.1.17", features = ["full"] } itertools-5ef9efb8ec2df382 = { package = "itertools", version = "0.12.1" } itertools-93f6ce9d446188ac = { package = "itertools", version = "0.10.5" } +miniz_oxide = { version = "0.8.5", default-features = false, features = ["with-alloc"] } mio = { version = "1.0.2", features = ["net", "os-ext"] } nom = { version = "7.1.3" } rustix-d585fab2519d2d1 = { package = "rustix", version = "0.38.37", features = ["event", "fs", "net", "pipe", "process", "stdio", "system", "termios", "time"] } @@ -390,6 +399,7 @@ hyper-rustls = { version = "0.27.7", features = ["http2", "ring", "webpki-tokio" hyper-util = { version = "0.1.17", features = ["full"] } itertools-5ef9efb8ec2df382 = { package = "itertools", version = "0.12.1" } itertools-93f6ce9d446188ac = { package = "itertools", version = "0.10.5" } +miniz_oxide = { version = "0.8.5", default-features = false, features = ["with-alloc"] } mio = { version = "1.0.2", features = ["net", "os-ext"] } nom = { version = "7.1.3" } rustix-d585fab2519d2d1 = { package = "rustix", version = "0.38.37", features = ["event", "fs", "net", "pipe", "process", "stdio", "system", "termios", "time"] } From a8e8be6ba8224e97da5e21856c8f1a1cddda5a13 Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Wed, 22 Oct 2025 16:28:47 +0000 Subject: [PATCH 03/22] Fix up step method --- trust-quorum/src/connection_manager.rs | 50 ++++++++++++++------------ trust-quorum/src/task.rs | 10 +++++- 2 files changed, 36 insertions(+), 24 deletions(-) diff --git a/trust-quorum/src/connection_manager.rs b/trust-quorum/src/connection_manager.rs index 4d24e365148..e98a88b2b97 100644 --- a/trust-quorum/src/connection_manager.rs +++ b/trust-quorum/src/connection_manager.rs @@ -21,18 +21,18 @@ use std::net::{SocketAddr, SocketAddrV4, SocketAddrV6}; use std::time::Duration; use tokio::sync::mpsc; use tokio::task::JoinHandle; -use tokio::time::{MissedTickBehavior, interval}; +use tokio::time::{Interval, MissedTickBehavior, interval}; /// We only expect a handful of concurrent requests at most. const CHANNEL_BOUND: usize = 10; // Time between checks to see if we need to reconnect to to any peers -const RECONNECT_TIME: Duration = Duration::from_secs(5); +pub const RECONNECT_TIME: Duration = Duration::from_secs(5); /// An error returned from `ConnMgr::accept` #[derive(Debug, thiserror::Error, SlogInlineError)] pub enum AcceptError { - #[error("Accepted connection from IPv4 address {addr}. Only IPv6 allowed.")] + #[error("accepted connection from IPv4 address {addr}. Only IPv6 allowed.")] Ipv4Accept { addr: SocketAddrV4 }, #[error("sprockets error")] @@ -209,6 +209,9 @@ pub struct ConnMgr { /// All tasks containing established connections that can be used to communicate /// with other nodes. 
established: BTreeMap<BaseboardId, TaskHandle>, + + /// An interval for reconnect operations + reconnect_interval: Interval, } impl ConnMgr { @@ -246,6 +249,9 @@ impl ConnMgr { "local_addr" => %listen_addr ); + let mut reconnect_interval = interval(RECONNECT_TIME); + reconnect_interval.set_missed_tick_behavior(MissedTickBehavior::Delay); + ConnMgr { log, main_tx, @@ -258,6 +264,7 @@ impl ConnMgr { connecting: BTreeMap::new(), accepting: BTreeMap::new(), established: BTreeMap::new(), + reconnect_interval, } } @@ -315,30 +322,27 @@ impl ConnMgr { &mut self, corpus: Vec<Utf8PathBuf>, ) -> Result<(), AcceptError> { - let mut interval = interval(RECONNECT_TIME); - interval.set_missed_tick_behavior(MissedTickBehavior::Delay); - - loop { - tokio::select! { - acceptor = self.server.accept(corpus.clone()) => { - self.accept(acceptor?).await?; - } - Some(res) = self.join_handles.next() => { - match res { - Ok(task_id) => { - self.on_task_exit(task_id).await; - } - Err(err) => { - error!(self.log, "Connection task panic: {}", err); - } - + tokio::select! { + acceptor = self.server.accept(corpus.clone()) => { + self.accept(acceptor?).await?; + } + Some(res) = self.join_handles.next() => { + match res { + Ok(task_id) => { + self.on_task_exit(task_id).await; } - } - _ = interval.tick() => { - self.reconnect(corpus.clone()).await; + Err(err) => { + error!(self.log, "Connection task panic: {}", err); + } + } } + _ = self.reconnect_interval.tick() => { + self.reconnect(corpus.clone()).await; + } } + + Ok(()) } pub async fn accept( diff --git a/trust-quorum/src/task.rs b/trust-quorum/src/task.rs index 26d92c3783e..85438fe43ed 100644 --- a/trust-quorum/src/task.rs +++ b/trust-quorum/src/task.rs @@ -244,7 +244,9 @@ impl NodeTask { mod tests { use super::*; use crate::TaskId; - use crate::connection_manager::{ConnState, platform_id_to_baseboard_id}; + use crate::connection_manager::{ + ConnState, RECONNECT_TIME, platform_id_to_baseboard_id, + }; use camino::Utf8PathBuf; use dropshot::test_util::log_prefix_for_test; use omicron_test_utils::dev::poll::{CondCheckError, wait_for_condition}; @@ -375,6 +377,9 @@ mod tests { .await .unwrap(); + // Pause time so we can jump it for reconnects + tokio::time::pause(); + // Killing a single node should cause all other nodes to start // reconnecting. This should cause the task id counter to start // incrementing at all nodes and for there to be one fewer established // connection. let h = node_handles.pop().unwrap(); h.shutdown().await.unwrap(); let _ = join_handles.pop().unwrap(); let stopped_addr = h.listen_addr; + // Speed up reconnection in the test + tokio::time::advance(RECONNECT_TIME).await; + let poll_interval = Duration::from_millis(50); wait_for_condition( async || { From 9c2a7e35aa2376cdfbb40253646cb74162962fbf Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Wed, 22 Oct 2025 17:35:29 +0000 Subject: [PATCH 04/22] Use JoinSet and tokio::task::Id instead of FuturesUnordered and crate::TaskId --- trust-quorum/src/connection_manager.rs | 104 ++++++++++--------------- trust-quorum/src/established_conn.rs | 8 +- trust-quorum/src/lib.rs | 2 +- trust-quorum/src/task.rs | 5 +- 4 files changed, 47 insertions(+), 72 deletions(-) diff --git a/trust-quorum/src/connection_manager.rs b/trust-quorum/src/connection_manager.rs index e98a88b2b97..f5a2f4cbf45 100644 --- a/trust-quorum/src/connection_manager.rs +++ b/trust-quorum/src/connection_manager.rs @@ -9,8 +9,6 @@ use crate::{BaseboardId, PeerMsg}; // TODO: Move or copy this to this crate?
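// (`NetworkConfig` is the early network configuration blob that the LRTQ // bootstore gossips today; the `ReceivedNetworkConfig` message in this crate // reuses the same type.)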
use bootstore::schemes::v0::NetworkConfig; use camino::Utf8PathBuf; -use futures::StreamExt; -use futures::stream::FuturesUnordered; use serde::{Deserialize, Serialize}; use slog::{Logger, debug, error, info, o, warn}; use slog_error_chain::SlogInlineError; @@ -20,7 +18,7 @@ use std::collections::{BTreeMap, BTreeSet}; use std::net::{SocketAddr, SocketAddrV4, SocketAddrV6}; use std::time::Duration; use tokio::sync::mpsc; -use tokio::task::JoinHandle; +use tokio::task::{self, JoinSet}; use tokio::time::{Interval, MissedTickBehavior, interval}; /// We only expect a handful of concurrent requests at most. @@ -43,23 +41,6 @@ pub enum AcceptError { ), } -/// A mechanism for uniquely identifying a task managing a connection -#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] -pub struct TaskId(u64); - -impl TaskId { - pub fn new(id: u64) -> TaskId { - TaskId(id) - } - - /// Increment the ID and then return the value before the increment - pub fn inc(&mut self) -> TaskId { - let id = *self; - self.0 += 1; - id - } -} - /// Messages sent from the main task to the connection managing tasks #[derive(Debug, PartialEq)] pub enum MainToConnMsg { @@ -103,7 +84,7 @@ pub enum WireMsg { /// shutdown. #[derive(Debug, PartialEq)] pub struct ConnToMainMsg { - pub task_id: TaskId, + pub task_id: task::Id, pub msg: ConnToMainMsgInner, } @@ -117,7 +98,7 @@ pub enum ConnToMainMsgInner { } pub struct TaskHandle { - pub task_id: TaskId, + pub task_id: task::Id, pub tx: mpsc::Sender<MainToConnMsg>, pub conn_type: ConnectionType, } @@ -154,7 +135,7 @@ pub enum ConnState { pub struct ConnInfo { pub state: ConnState, pub addr: SocketAddrV6, - pub task_id: TaskId, + pub task_id: task::Id, } /// Status information useful for debugging @@ -162,8 +143,8 @@ pub struct ConnInfo { pub struct ConnMgrStatus { pub bootstrap_addrs: BTreeSet<SocketAddrV6>, pub connections: Vec<ConnInfo>, - pub num_task_join_handles: u64, - pub next_task_id: TaskId, + pub num_conn_tasks: u64, + pub total_tasks_spawned: u64, } /// A structure to manage all sprockets connections to peer nodes @@ -188,13 +169,8 @@ pub struct ConnMgr { /// The address the sprockets server listens on listen_addr: SocketAddrV6, - // A unique, monotonically incrementing id for each task to help map tasks - // to their handles in case the task aborts, or there is a new connection - // accepted and established for an existing `BaseboardId`.
- next_task_id: TaskId, - - /// `JoinHandle`s to all tasks that can be polled for crashes - join_handles: FuturesUnordered<JoinHandle<TaskId>>, + /// A mechanism for spawning connection tasks + join_set: JoinSet<()>, /// All known addresses on the bootstrap network, learned via DDMD bootstrap_addrs: BTreeSet<SocketAddrV6>, @@ -212,6 +188,9 @@ pub struct ConnMgr { /// An interval for reconnect operations reconnect_interval: Interval, + + /// The number of total connection tasks spawned + total_tasks_spawned: u64, } impl ConnMgr { @@ -258,13 +237,13 @@ impl ConnMgr { config, server, listen_addr, - next_task_id: TaskId::new(0), - join_handles: Default::default(), + join_set: JoinSet::new(), bootstrap_addrs: BTreeSet::new(), connecting: BTreeMap::new(), accepting: BTreeMap::new(), established: BTreeMap::new(), reconnect_interval, + total_tasks_spawned: 0, } } @@ -307,8 +286,8 @@ impl ConnMgr { ConnMgrStatus { bootstrap_addrs: self.bootstrap_addrs.clone(), connections, - num_task_join_handles: self.join_handles.len() as u64, - next_task_id: self.next_task_id, + num_conn_tasks: self.join_set.len() as u64, + total_tasks_spawned: self.total_tasks_spawned, } } @@ -326,13 +305,14 @@ impl ConnMgr { acceptor = self.server.accept(corpus.clone()) => { self.accept(acceptor?).await?; } - Some(res) = self.join_handles.next() => { + Some(res) = self.join_set.join_next_with_id() => { match res { - Ok(task_id) => { + Ok((task_id, _)) => { self.on_task_exit(task_id).await; } Err(err) => { error!(self.log, "Connection task panic: {}", err); + self.on_task_exit(err.id()).await; } } @@ -356,15 +336,9 @@ impl ConnMgr { SocketAddr::V6(addr) => addr, }; let log = self.log.clone(); - let task_id = self.next_task_id.inc(); let (tx, rx) = mpsc::channel(CHANNEL_BOUND); - let task_handle = TaskHandle { - task_id, - tx, - conn_type: ConnectionType::Accepted(addr), - }; let main_tx = self.main_tx.clone(); - let join_handle = tokio::spawn(async move { + let abort_handle = self.join_set.spawn(async move { match acceptor.handshake().await { Ok((stream, _)) => { let platform_id = @@ -380,7 +354,7 @@ impl ConnMgr { let mut conn = EstablishedConn::new( baseboard_id.clone(), - task_id, + task::id(), stream, main_tx.clone(), rx, @@ -390,7 +364,7 @@ impl ConnMgr { // Inform the main task that accepted connection is established if let Err(e) = main_tx .send(ConnToMainMsg { - task_id: task_id, + task_id: task::id(), msg: ConnToMainMsgInner::Accepted { addr, peer_id: baseboard_id, @@ -412,16 +386,20 @@ impl ConnMgr { error!(log, "Failed to accept a connection"; &err); } } - task_id }); - self.join_handles.push(join_handle); + self.total_tasks_spawned += 1; + let task_handle = TaskHandle { + task_id: abort_handle.id(), + tx, + conn_type: ConnectionType::Accepted(addr), + }; self.accepting.insert(addr, task_handle); Ok(()) } pub async fn server_handshake_completed( &mut self, - task_id: TaskId, + task_id: task::Id, addr: SocketAddrV6, peer_id: BaseboardId, ) { @@ -441,7 +419,7 @@ pub async fn client_handshake_completed( &mut self, - task_id: TaskId, + task_id: task::Id, addr: SocketAddrV6, peer_id: BaseboardId, ) { @@ -463,7 +441,7 @@ /// The established connection task has asynchronously exited.
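/// /// The reported task id is compared against the handle we currently hold: /// a newer connection for the same peer may have already replaced the one /// that exited, and such a stale notification must not tear down the new /// entry.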
pub async fn on_disconnected( &mut self, - task_id: TaskId, + task_id: task::Id, peer_id: BaseboardId, ) { if let Some(task_handle) = self.established.get(&peer_id) { @@ -555,18 +533,12 @@ impl ConnMgr { corpus: Vec, addr: SocketAddrV6, ) { - let task_id = self.next_task_id.inc(); let (tx, rx) = mpsc::channel(CHANNEL_BOUND); - let task_handle = TaskHandle { - task_id, - tx, - conn_type: ConnectionType::Connected(addr), - }; info!(self.log, "Initiating connection to new peer: {addr}"); let main_tx = self.main_tx.clone(); let log = self.log.clone(); let config = self.config.clone(); - let join_handle = tokio::spawn(async move { + let abort_handle = self.join_set.spawn(async move { match sprockets_tls::Client::connect( config, addr, @@ -589,7 +561,7 @@ impl ConnMgr { let mut conn = EstablishedConn::new( baseboard_id.clone(), - task_id, + task::id(), stream, main_tx.clone(), rx, @@ -599,7 +571,7 @@ impl ConnMgr { // established. if let Err(e) = main_tx .send(ConnToMainMsg { - task_id: task_id, + task_id: task::id(), msg: ConnToMainMsgInner::Connected { addr, peer_id: baseboard_id, @@ -621,9 +593,13 @@ impl ConnMgr { warn!(log, "Failed to connect"; &err); } } - task_id }); - self.join_handles.push(join_handle); + self.total_tasks_spawned += 1; + let task_handle = TaskHandle { + task_id: abort_handle.id(), + tx, + conn_type: ConnectionType::Connected(addr), + }; self.connecting.insert(addr, task_handle); } @@ -662,7 +638,7 @@ impl ConnMgr { } /// Remove any references to the given task - async fn on_task_exit(&mut self, task_id: TaskId) { + async fn on_task_exit(&mut self, task_id: task::Id) { // We're most likely to find the task as established so we start with that if let Some((id, handle)) = self .established diff --git a/trust-quorum/src/established_conn.rs b/trust-quorum/src/established_conn.rs index 01a04bd9e76..c5a4c9dd11a 100644 --- a/trust-quorum/src/established_conn.rs +++ b/trust-quorum/src/established_conn.rs @@ -5,8 +5,7 @@ //! 
An individual sprockets connection running in its own task use crate::{ - BaseboardId, ConnToMainMsg, ConnToMainMsgInner, MainToConnMsg, TaskId, - WireMsg, + BaseboardId, ConnToMainMsg, ConnToMainMsgInner, MainToConnMsg, WireMsg, }; use bytes::Buf; use serde::Serialize; @@ -18,6 +17,7 @@ use std::time::Duration; use tokio::io::{AsyncReadExt, AsyncWriteExt, ReadHalf, WriteHalf, split}; use tokio::net::TcpStream; use tokio::sync::mpsc; +use tokio::task; use tokio::time::{Instant, MissedTickBehavior, interval}; /// Max buffer size of a connection @@ -63,7 +63,7 @@ pub enum ConnErr { /// Container for code running in its own task per sprockets connection pub struct EstablishedConn { peer_id: BaseboardId, - task_id: TaskId, + task_id: task::Id, reader: ReadHalf<sprockets_tls::Stream<TcpStream>>, writer: WriteHalf<sprockets_tls::Stream<TcpStream>>, main_tx: mpsc::Sender<ConnToMainMsg>, @@ -94,7 +94,7 @@ pub struct EstablishedConn { impl EstablishedConn { pub fn new( peer_id: BaseboardId, - task_id: TaskId, + task_id: task::Id, stream: sprockets_tls::Stream<TcpStream>, main_tx: mpsc::Sender<ConnToMainMsg>, rx: mpsc::Receiver<MainToConnMsg>, diff --git a/trust-quorum/src/lib.rs b/trust-quorum/src/lib.rs index b4b18736163..457403999e4 100644 --- a/trust-quorum/src/lib.rs +++ b/trust-quorum/src/lib.rs @@ -43,7 +43,7 @@ mod connection_manager; mod task; pub(crate) use connection_manager::{ - ConnToMainMsg, ConnToMainMsgInner, MainToConnMsg, TaskId, WireMsg, + ConnToMainMsg, ConnToMainMsgInner, MainToConnMsg, WireMsg, }; pub use task::NodeTask; diff --git a/trust-quorum/src/task.rs b/trust-quorum/src/task.rs index 85438fe43ed..ef82aa93063 100644 --- a/trust-quorum/src/task.rs +++ b/trust-quorum/src/task.rs @@ -243,7 +243,6 @@ impl NodeTask { #[cfg(test)] mod tests { use super::*; - use crate::TaskId; use crate::connection_manager::{ ConnState, RECONNECT_TIME, platform_id_to_baseboard_id, }; @@ -360,7 +359,7 @@ mod tests { .iter() .all(|c| matches!(c.state, ConnState::Established(_))) && status.connections.len() == num_nodes - 1 - && status.next_task_id == TaskId::new(3) + && status.total_tasks_spawned == 3 { count += 1; } @@ -411,7 +410,7 @@ mod tests { // is the stopped node. let should_be_connecting = h.listen_addr > stopped_addr; let valid_task_id = if should_be_connecting { - status.next_task_id > TaskId::new(3) + status.total_tasks_spawned > 3 } else { true }; From e1e49ea5c97deab079f039645728304bb1333931 Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Wed, 22 Oct 2025 19:22:18 +0000 Subject: [PATCH 05/22] logging cleanup --- trust-quorum/src/connection_manager.rs | 202 +++++++++++++------------ 1 file changed, 104 insertions(+), 98 deletions(-) diff --git a/trust-quorum/src/connection_manager.rs b/trust-quorum/src/connection_manager.rs index f5a2f4cbf45..dc142b353db 100644 --- a/trust-quorum/src/connection_manager.rs +++ b/trust-quorum/src/connection_manager.rs @@ -339,52 +339,51 @@ impl ConnMgr { let (tx, rx) = mpsc::channel(CHANNEL_BOUND); let main_tx = self.main_tx.clone(); let abort_handle = self.join_set.spawn(async move { - match acceptor.handshake().await { - Ok((stream, _)) => { - let platform_id = - stream.peer_platform_id().as_str().unwrap(); - let baseboard_id = platform_id_to_baseboard_id(platform_id); - - // TODO: Conversion between `PlatformId` and `BaseboardId` should - // happen in `sled-agent-types`. This is waiting on an update - // to the `dice-mfg-msgs` crate.
- let log = - log.new(o!("peer_id" => baseboard_id.to_string())); - info!(log, "Accepted sprockets connection"; "addr" => %addr); - - let mut conn = EstablishedConn::new( - baseboard_id.clone(), - task::id(), - stream, - main_tx.clone(), - rx, - &log, - ); - - // Inform the main task that accepted connection is established - if let Err(e) = main_tx - .send(ConnToMainMsg { - task_id: task::id(), - msg: ConnToMainMsgInner::Accepted { - addr, - peer_id: baseboard_id, - }, - }) - .await - { - // The system is shutting down - // Just bail from this task - warn!( - log, - "Failed to send 'accepted' msg to main task: {e:?}" - ); - } else { - conn.run().await; - } - } + let stream = match acceptor.handshake().await { + Ok((stream, _)) => stream, + Err(err) => { error!(log, "Failed to accept a connection"; &err); + return (); } + }; + let platform_id = stream.peer_platform_id().as_str().unwrap(); + let baseboard_id = platform_id_to_baseboard_id(platform_id); + + // TODO: Conversion between `PlatformId` and `BaseboardId` should + // happen in `sled-agent-types`. This is waiting on an update + // to the `dice-mfg-msgs` crate. + let log = log.new(o!( + "peer_id" => baseboard_id.to_string(), + "peer_addr" => addr.to_string() + )); + info!(log, "Accepted sprockets connection"); + + let mut conn = EstablishedConn::new( + baseboard_id.clone(), + task::id(), + stream, + main_tx.clone(), + rx, + &log, + ); + + // Inform the main task that accepted connection is established + if let Err(e) = main_tx + .send(ConnToMainMsg { + task_id: task::id(), + msg: ConnToMainMsgInner::Accepted { + addr, + peer_id: baseboard_id, + }, + }) + .await + { + // The system is shutting down + // Just bail from this task + warn!(log, "Failed to send 'accepted' msg to main task: {e:?}"); + } else { + conn.run().await; } }); self.total_tasks_spawned += 1; @@ -408,8 +407,8 @@ impl ConnMgr { self.log, "Established server connection"; "task_id" => ?task_id, - "remote_addr" => %addr, - "remote_peer_id" => peer_id.to_string() + "peer_addr" => %addr, + "peer_id" => %peer_id ); let already_established = self.established.insert(peer_id, task_handle); @@ -428,13 +427,19 @@ impl ConnMgr { self.log, "Established client connection"; "task_id" => ?task_id, - "remote_addr" => %addr, - "remote_peer_id" => peer_id.to_string() + "peer_addr" => %addr, + "peer_id" => %peer_id ); let already_established = self.established.insert(peer_id, task_handle); assert!(already_established.is_none()); + } else { + error!(self.log, "Client handshake completed, but no client addr in map"; + "task_id" => ?task_id, + "peer_addr" => %addr, + "peer_id" => %peer_id + ); } } @@ -539,7 +544,7 @@ impl ConnMgr { let log = self.log.clone(); let config = self.config.clone(); let abort_handle = self.join_set.spawn(async move { - match sprockets_tls::Client::connect( + let stream = match sprockets_tls::Client::connect( config, addr, corpus.clone(), @@ -547,51 +552,52 @@ impl ConnMgr { ) .await { - Ok(stream) => { - let platform_id = - stream.peer_platform_id().as_str().unwrap(); - let baseboard_id = platform_id_to_baseboard_id(platform_id); - - // TODO: Conversion between `PlatformId` and `BaseboardId` should - // happen in `sled-agent-types`. This is waiting on an update - // to the `dice-mfg-msgs` crate. 
- let log = - log.new(o!("peer_id" => baseboard_id.to_string())); - info!(log, "Sprockets connection established"; "addr" => %addr); - - let mut conn = EstablishedConn::new( - baseboard_id.clone(), - task::id(), - stream, - main_tx.clone(), - rx, - &log, - ); - // Inform the main task that the client connection is - // established. - if let Err(e) = main_tx - .send(ConnToMainMsg { - task_id: task::id(), - msg: ConnToMainMsgInner::Connected { - addr, - peer_id: baseboard_id, - }, - }) - .await - { - // The system is shutting down - // Just bail from this task - error!( - log, - "Failed to send 'connected' msg to main task: {e:?}" - ); - } else { - conn.run().await; - } - } + Ok(stream) => stream, Err(err) => { - warn!(log, "Failed to connect"; &err); + warn!(log, "Failed to connect"; "peer_addr"=> %addr, &err); + return (); } + }; + let platform_id = stream.peer_platform_id().as_str().unwrap(); + let baseboard_id = platform_id_to_baseboard_id(platform_id); + + // TODO: Conversion between `PlatformId` and `BaseboardId` should + // happen in `sled-agent-types`. This is waiting on an update + // to the `dice-mfg-msgs` crate. + let log = log.new(o!( + "peer_id" => baseboard_id.to_string(), + "peer_addr" => addr.to_string() + )); + info!(log, "Sprockets connection established"); + + let mut conn = EstablishedConn::new( + baseboard_id.clone(), + task::id(), + stream, + main_tx.clone(), + rx, + &log, + ); + // Inform the main task that the client connection is + // established. + if let Err(e) = main_tx + .send(ConnToMainMsg { + task_id: task::id(), + msg: ConnToMainMsgInner::Connected { + addr, + peer_id: baseboard_id, + }, + }) + .await + { + // The system is shutting down + // Just bail from this task + error!( + log, + "Failed to send 'connected' msg to main task: {e:?}" + ); + } else { + conn.run().await; } }); self.total_tasks_spawned += 1; @@ -614,7 +620,7 @@ impl ConnMgr { info!( self.log, "Deleting initiating connection"; - "remote_addr" => addr.to_string() + "remote_addr" => %addr ); let _ = handle.tx.send(MainToConnMsg::Close).await; } else { @@ -626,8 +632,8 @@ impl ConnMgr { info!( self.log, "Deleting established connection"; - "remote_addr" => addr.to_string(), - "remote_peer_id" => id.to_string(), + "peer_addr" => %addr, + "peer_id" => %id ); let _ = handle.tx.send(MainToConnMsg::Close).await; // probably a better way to avoid borrowck issues @@ -649,8 +655,8 @@ impl ConnMgr { self.log, "Established connection task exited"; "task_id" => ?task_id, - "remote_addr" => handle.addr().to_string(), - "remote_peer_id" => id.to_string(), + "peer_addr" => %handle.addr(), + "peer_id" => %id ); // probably a better way to avoid borrowck issues let id = id.clone(); @@ -662,7 +668,7 @@ impl ConnMgr { self.log, "Accepting task exited"; "task_id" => ?task_id, - "remote_addr" => handle.addr().to_string(), + "peer_addr" => %handle.addr() ); let addr = *addr; self.accepting.remove(&addr); @@ -673,7 +679,7 @@ impl ConnMgr { self.log, "Connecting task exited"; "task_id" => ?task_id, - "remote_addr" => handle.addr().to_string(), + "peer_addr" => %handle.addr() ); let addr = *addr; self.connecting.remove(&addr); From 0a5a5e0c037f892328029c76f5b3381fd62b3d8e Mon Sep 17 00:00:00 2001 From: "Andrew J. 
Stone" Date: Wed, 22 Oct 2025 19:29:05 +0000 Subject: [PATCH 06/22] more review cleanup --- trust-quorum/src/connection_manager.rs | 8 +++++--- trust-quorum/src/task.rs | 6 +++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/trust-quorum/src/connection_manager.rs b/trust-quorum/src/connection_manager.rs index dc142b353db..2da28d659e6 100644 --- a/trust-quorum/src/connection_manager.rs +++ b/trust-quorum/src/connection_manager.rs @@ -45,7 +45,7 @@ pub enum AcceptError { #[derive(Debug, PartialEq)] pub enum MainToConnMsg { Close, - #[allow(unused)] + #[expect(unused)] Msg(WireMsg), } @@ -491,8 +491,10 @@ impl ConnMgr { /// The set of known addresses on the bootstrap network has changed /// - /// We need to connect to peers with addresses less than our own - /// and tear down any connections that no longer exist in `addrs`. + /// We only want a single connection between known peers at a time. The + /// easiest way to achieve this is to only connect to peers with addresses + /// that sort less than our own and tear down any connections that no longer + /// exist in `addrs`. pub async fn update_bootstrap_connections( &mut self, addrs: BTreeSet, diff --git a/trust-quorum/src/task.rs b/trust-quorum/src/task.rs index ef82aa93063..4585697f291 100644 --- a/trust-quorum/src/task.rs +++ b/trust-quorum/src/task.rs @@ -107,11 +107,11 @@ impl NodeTaskHandle { pub struct NodeTask { shutdown: bool, log: Logger, - #[allow(unused)] + #[expect(unused)] config: Config, - #[allow(unused)] + #[expect(unused)] node: Node, - #[allow(unused)] + #[expect(unused)] ctx: NodeCtx, conn_mgr: ConnMgr, conn_mgr_rx: mpsc::Receiver, From ffbd488fb4a0ac0392139df9c2c79acdc8cb27d2 Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Wed, 22 Oct 2025 19:41:50 +0000 Subject: [PATCH 07/22] sock writer shutdown works again --- trust-quorum/src/established_conn.rs | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/trust-quorum/src/established_conn.rs b/trust-quorum/src/established_conn.rs index c5a4c9dd11a..209cb0af3f7 100644 --- a/trust-quorum/src/established_conn.rs +++ b/trust-quorum/src/established_conn.rs @@ -171,16 +171,7 @@ impl EstablishedConn { { warn!(self.log, "Failed to send to main task: {e:?}"); } - // TODO: This causes a deadlock and breaks the test. - // - // I'm unclear why, although I plan to dig a bit further in the future. - // It should be noted that the writer and reader share a std::mutex - // under the hood and that could be causing issues. Regardless, it - // is not actually critical to issue a shutdown as detection will - // be discovered via missing ping messages at the other end of the - // connection. - // - // let _ = self.writer.shutdown().await; + let _ = self.writer.shutdown().await; } async fn on_read( From 4245fd01c00d48672214718ee01e8bd7bf182ec6 Mon Sep 17 00:00:00 2001 From: "Andrew J. 
Stone" Date: Wed, 22 Oct 2025 19:51:25 +0000 Subject: [PATCH 08/22] clippy --- trust-quorum/src/connection_manager.rs | 4 ++-- trust-quorum/src/task.rs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/trust-quorum/src/connection_manager.rs b/trust-quorum/src/connection_manager.rs index 2da28d659e6..bffb0190de9 100644 --- a/trust-quorum/src/connection_manager.rs +++ b/trust-quorum/src/connection_manager.rs @@ -480,7 +480,7 @@ impl ConnMgr { continue; } - to_connect.push(addr.clone()); + to_connect.push(*addr); } for addr in to_connect { @@ -700,6 +700,6 @@ impl ConnMgr { pub fn platform_id_to_baseboard_id(platform_id: &str) -> BaseboardId { let mut platform_id_iter = platform_id.split(":"); let part_number = platform_id_iter.nth(1).unwrap().to_string(); - let serial_number = platform_id_iter.skip(1).next().unwrap().to_string(); + let serial_number = platform_id_iter.nth(2).unwrap().to_string(); BaseboardId { part_number, serial_number } } diff --git a/trust-quorum/src/task.rs b/trust-quorum/src/task.rs index 4585697f291..4110259e867 100644 --- a/trust-quorum/src/task.rs +++ b/trust-quorum/src/task.rs @@ -385,7 +385,7 @@ mod tests { // connection. let h = node_handles.pop().unwrap(); h.shutdown().await.unwrap(); - let _ = join_handles.pop().unwrap(); + join_handles.pop().unwrap(); let stopped_addr = h.listen_addr; // Speed up reconnection in the test From de3b4e08c1adb900689a00b228a39e31b262b3b2 Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Wed, 22 Oct 2025 22:22:44 +0000 Subject: [PATCH 09/22] Review comments --- trust-quorum/src/connection_manager.rs | 19 +++---------------- trust-quorum/src/established_conn.rs | 19 ++++++------------- trust-quorum/src/task.rs | 1 - 3 files changed, 9 insertions(+), 30 deletions(-) diff --git a/trust-quorum/src/connection_manager.rs b/trust-quorum/src/connection_manager.rs index bffb0190de9..9b9f1a7e1ec 100644 --- a/trust-quorum/src/connection_manager.rs +++ b/trust-quorum/src/connection_manager.rs @@ -247,19 +247,6 @@ impl ConnMgr { } } - pub async fn shutdown(&mut self) { - // Shutdown all connection processing tasks - for (_, handle) in &self.accepting { - let _ = handle.tx.send(MainToConnMsg::Close).await; - } - for (_, handle) in &self.connecting { - let _ = handle.tx.send(MainToConnMsg::Close).await; - } - for (_, handle) in &self.established { - let _ = handle.tx.send(MainToConnMsg::Close).await; - } - } - pub fn status(&self) -> ConnMgrStatus { let connections = self .connecting @@ -311,7 +298,7 @@ impl ConnMgr { self.on_task_exit(task_id).await; } Err(err) => { - error!(self.log, "Connection task panic: {}", err); + error!(self.log, "Connection task panic: {err}"); self.on_task_exit(err.id()).await; } @@ -344,7 +331,7 @@ impl ConnMgr { Err(err) => { error!(log, "Failed to accept a connection"; &err); - return (); + return; } }; let platform_id = stream.peer_platform_id().as_str().unwrap(); @@ -700,6 +687,6 @@ impl ConnMgr { pub fn platform_id_to_baseboard_id(platform_id: &str) -> BaseboardId { let mut platform_id_iter = platform_id.split(":"); let part_number = platform_id_iter.nth(1).unwrap().to_string(); - let serial_number = platform_id_iter.nth(2).unwrap().to_string(); + let serial_number = platform_id_iter.nth(1).unwrap().to_string(); BaseboardId { part_number, serial_number } } diff --git a/trust-quorum/src/established_conn.rs b/trust-quorum/src/established_conn.rs index 209cb0af3f7..c1359644aae 100644 --- a/trust-quorum/src/established_conn.rs +++ b/trust-quorum/src/established_conn.rs @@ -126,11 +126,10 @@ impl 
EstablishedConn { // // Continuously process messages until the connection closes loop { - if !self.current_write.has_remaining() - && !self.write_queue.is_empty() - { - self.current_write = - Cursor::new(self.write_queue.pop_front().unwrap()); + if !self.current_write.has_remaining() { + if let Some(buf) = self.write_queue.pop_front() { + self.current_write = Cursor::new(buf); + } } let res = tokio::select! { @@ -178,14 +177,8 @@ impl EstablishedConn { &mut self, res: Result, ) -> Result<(), ConnErr> { - match res { - Ok(n) => { - self.total_read += n; - } - Err(e) => { - return Err(ConnErr::FailedRead(e)); - } - } + let n = res.map_err(ConnErr::FailedRead)?; + self.total_read += n; // We may have more than one message that has been read loop { diff --git a/trust-quorum/src/task.rs b/trust-quorum/src/task.rs index 4110259e867..bc312ecad92 100644 --- a/trust-quorum/src/task.rs +++ b/trust-quorum/src/task.rs @@ -234,7 +234,6 @@ impl NodeTask { NodeApiRequest::Shutdown => { info!(self.log, "Shutting down Node tokio tasks"); self.shutdown = true; - self.conn_mgr.shutdown().await; } } } From a4bfea57d611c87ae9097f5c0fb04cdfc18ae5e9 Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Wed, 22 Oct 2025 23:36:44 +0000 Subject: [PATCH 10/22] Use BiHashMap and TriHashMap for connections --- trust-quorum/src/connection_manager.rs | 180 +++++++++++++++---------- 1 file changed, 112 insertions(+), 68 deletions(-) diff --git a/trust-quorum/src/connection_manager.rs b/trust-quorum/src/connection_manager.rs index 9b9f1a7e1ec..612bc9ac5c0 100644 --- a/trust-quorum/src/connection_manager.rs +++ b/trust-quorum/src/connection_manager.rs @@ -9,12 +9,15 @@ use crate::{BaseboardId, PeerMsg}; // TODO: Move or copy this to this crate? use bootstore::schemes::v0::NetworkConfig; use camino::Utf8PathBuf; +use iddqd::{ + BiHashItem, BiHashMap, TriHashItem, TriHashMap, bi_upcast, tri_upcast, +}; use serde::{Deserialize, Serialize}; use slog::{Logger, debug, error, info, o, warn}; use slog_error_chain::SlogInlineError; use sprockets_tls::keys::SprocketsConfig; use sprockets_tls::server::SprocketsAcceptor; -use std::collections::{BTreeMap, BTreeSet}; +use std::collections::BTreeSet; use std::net::{SocketAddr, SocketAddrV4, SocketAddrV6}; use std::time::Duration; use tokio::sync::mpsc; @@ -109,6 +112,63 @@ impl TaskHandle { } } +impl BiHashItem for TaskHandle { + type K1<'a> = task::Id; + type K2<'a> = SocketAddrV6; + + fn key1(&self) -> Self::K1<'_> { + self.task_id + } + + fn key2(&self) -> Self::K2<'_> { + self.conn_type.addr() + } + + bi_upcast!(); +} + +pub struct EstablishedTaskHandle { + baseboard_id: BaseboardId, + task_handle: TaskHandle, +} + +impl EstablishedTaskHandle { + pub fn new( + baseboard_id: BaseboardId, + task_handle: TaskHandle, + ) -> EstablishedTaskHandle { + EstablishedTaskHandle { baseboard_id, task_handle } + } + + pub fn task_id(&self) -> task::Id { + self.task_handle.task_id + } + + pub fn addr(&self) -> SocketAddrV6 { + self.task_handle.addr() + } +} + +impl TriHashItem for EstablishedTaskHandle { + type K1<'a> = &'a BaseboardId; + type K2<'a> = task::Id; + type K3<'a> = SocketAddrV6; + + fn key1(&self) -> Self::K1<'_> { + &self.baseboard_id + } + + fn key2(&self) -> Self::K2<'_> { + self.task_handle.task_id + } + + fn key3(&self) -> Self::K3<'_> { + self.task_handle.addr() + } + + tri_upcast!(); +} + pub enum ConnectionType { Connected(SocketAddrV6), Accepted(SocketAddrV6), @@ -177,14 +237,14 @@ pub struct ConnMgr { /// All tasks currently connecting to remote nodes and attempting a /// 
sprockets handshake. - connecting: BTreeMap<SocketAddrV6, TaskHandle>, + connecting: BiHashMap<TaskHandle>, /// All tasks with an accepted TCP connection performing a sprockets handshake - accepting: BTreeMap<SocketAddrV6, TaskHandle>, + accepting: BiHashMap<TaskHandle>, /// All tasks containing established connections that can be used to communicate /// with other nodes. - established: BTreeMap<BaseboardId, TaskHandle>, + established: TriHashMap<EstablishedTaskHandle>, /// An interval for reconnect operations reconnect_interval: Interval, @@ -239,9 +299,9 @@ impl ConnMgr { listen_addr, join_set: JoinSet::new(), bootstrap_addrs: BTreeSet::new(), - connecting: BTreeMap::new(), - accepting: BTreeMap::new(), - established: BTreeMap::new(), + connecting: BiHashMap::new(), + accepting: BiHashMap::new(), + established: TriHashMap::new(), reconnect_interval, total_tasks_spawned: 0, } } @@ -251,23 +311,25 @@ impl ConnMgr { let connections = self .connecting .iter() - .map(|(addr, task_handle)| ConnInfo { + .map(|task_handle| ConnInfo { state: ConnState::Connecting, - addr: *addr, + addr: task_handle.addr(), task_id: task_handle.task_id, }) - .chain(self.accepting.iter().map(|(addr, task_handle)| ConnInfo { + .chain(self.accepting.iter().map(|task_handle| ConnInfo { state: ConnState::Accepting, - addr: *addr, + addr: task_handle.addr(), task_id: task_handle.task_id, })) - .chain(self.established.iter().map( - |(baseboard_id, task_handle)| ConnInfo { - state: ConnState::Established(baseboard_id.clone()), - addr: task_handle.addr(), - task_id: task_handle.task_id, - }, - )) + .chain(self.established.iter().map(|established_task_handle| { + ConnInfo { + state: ConnState::Established( + established_task_handle.baseboard_id.clone(), + ), + addr: established_task_handle.addr(), + task_id: established_task_handle.task_id(), + } + })) .collect(); ConnMgrStatus { @@ -379,7 +441,7 @@ impl ConnMgr { tx, conn_type: ConnectionType::Accepted(addr), }; - self.accepting.insert(addr, task_handle); + assert!(self.accepting.insert_unique(task_handle).is_ok()); Ok(()) } @@ -389,7 +451,7 @@ impl ConnMgr { addr: SocketAddrV6, peer_id: BaseboardId, ) { - if let Some(task_handle) = self.accepting.remove(&addr) { + if let Some(task_handle) = self.accepting.remove2(&addr) { info!( self.log, "Established server connection"; @@ -397,9 +459,17 @@ impl ConnMgr { "peer_addr" => %addr, "peer_id" => %peer_id ); - let already_established = - self.established.insert(peer_id, task_handle); - assert!(already_established.is_none()); + + let already_established = self.established.insert_unique( + EstablishedTaskHandle::new(peer_id, task_handle), + ); + assert!(already_established.is_ok()); + } else { + error!(self.log, "Server handshake completed, but no server addr in map"; + "task_id" => ?task_id, + "peer_addr" => %addr, + "peer_id" => %peer_id + ); } } @@ -409,7 +479,7 @@ impl ConnMgr { addr: SocketAddrV6, peer_id: BaseboardId, ) { - if let Some(task_handle) = self.connecting.remove(&addr) { + if let Some(task_handle) = self.connecting.remove2(&addr) { info!( self.log, "Established client connection"; @@ -417,10 +487,10 @@ impl ConnMgr { "peer_addr" => %addr, "peer_id" => %peer_id ); - let already_established = - self.established.insert(peer_id, task_handle); - - assert!(already_established.is_none()); + let already_established = self.established.insert_unique( + EstablishedTaskHandle::new(peer_id, task_handle), + ); + assert!(already_established.is_ok()); } else { error!(self.log, "Client handshake completed, but no client addr in map"; "task_id" => ?task_id, @@ -436,14 +506,14 @@ impl ConnMgr { task_id: task::Id, peer_id: BaseboardId, ) { - if let
Some(task_handle) = self.established.get(&peer_id) { - if task_handle.task_id != task_id { + if let Some(established_task_handle) = self.established.get1(&peer_id) { + if established_task_handle.task_id() != task_id { // This was a stale disconnect return; } } warn!(self.log, "peer disconnected"; "peer_id" => %peer_id); - let _ = self.established.remove(&peer_id); + let _ = self.established.remove1(&peer_id); } /// Initiate connections if a corresponding task doesn't already exist. This @@ -455,15 +525,11 @@ impl ConnMgr { for addr in self.bootstrap_addrs.iter().filter(|&&addr| self.listen_addr > addr) { - if self.connecting.contains_key(addr) { + if self.connecting.contains_key2(addr) { continue; } - if self - .established - .values() - .any(|task_handle| task_handle.addr() == *addr) - { + if self.established.contains_key3(addr) { continue; } @@ -595,7 +661,7 @@ impl ConnMgr { tx, conn_type: ConnectionType::Connected(addr), }; - self.connecting.insert(addr, task_handle); + assert!(self.connecting.insert_unique(task_handle).is_ok()); } /// Remove any information about a sprockets client connection and inform @@ -604,7 +670,7 @@ impl ConnMgr { /// We don't tear down server connections this way as we don't know their /// listen port, just the ephemeral port. async fn disconnect_client(&mut self, addr: SocketAddrV6) { - if let Some(handle) = self.connecting.remove(&addr) { + if let Some(handle) = self.connecting.remove2(&addr) { // The connection has not yet completed its handshake info!( self.log, @@ -613,21 +679,14 @@ impl ConnMgr { ); let _ = handle.tx.send(MainToConnMsg::Close).await; } else { - if let Some((id, handle)) = self - .established - .iter() - .find(|(_, handle)| handle.addr() == addr) - { + if let Some(handle) = self.established.remove3(&addr) { info!( self.log, "Deleting established connection"; "peer_addr" => %addr, - "peer_id" => %id + "peer_id" => %handle.baseboard_id ); - let _ = handle.tx.send(MainToConnMsg::Close).await; - // probably a better way to avoid borrowck issues - let id = id.clone(); - self.established.remove(&id); + let _ = handle.task_handle.tx.send(MainToConnMsg::Close).await; } } } @@ -635,43 +694,28 @@ impl ConnMgr { /// Remove any references to the given task async fn on_task_exit(&mut self, task_id: task::Id) { // We're most likely to find the task as established so we start with that - if let Some((id, handle)) = self - .established - .iter() - .find(|(_, handle)| handle.task_id == task_id) - { + if let Some(handle) = self.established.remove2(&task_id) { info!( self.log, "Established connection task exited"; "task_id" => ?task_id, "peer_addr" => %handle.addr(), - "peer_id" => %id + "peer_id" => %handle.baseboard_id ); - // probably a better way to avoid borrowck issues - let id = id.clone(); - self.established.remove(&id); - } else if let Some((addr, handle)) = - self.accepting.iter().find(|(_, handle)| handle.task_id == task_id) - { + } else if let Some(handle) = self.accepting.remove1(&task_id) { info!( self.log, "Accepting task exited"; "task_id" => ?task_id, "peer_addr" => %handle.addr() ); - let addr = *addr; - self.accepting.remove(&addr); - } else if let Some((addr, handle)) = - self.connecting.iter().find(|(_, handle)| handle.task_id == task_id) - { + } else if let Some(handle) = self.connecting.remove1(&task_id) { info!( self.log, "Connecting task exited"; "task_id" => ?task_id, "peer_addr" => %handle.addr() ); - let addr = *addr; - self.connecting.remove(&addr); } else { info!( self.log, From 2cfbfbc97a3b45c5d5c3b29f52e79bee6cdf7146 Mon Sep 
17 00:00:00 2001 From: "Andrew J. Stone" Date: Wed, 22 Oct 2025 23:49:07 +0000 Subject: [PATCH 11/22] No more graceful close from ConnMgr --- trust-quorum/src/connection_manager.rs | 36 +++++++++++++++++--------- trust-quorum/src/established_conn.rs | 5 ---- 2 files changed, 24 insertions(+), 17 deletions(-) diff --git a/trust-quorum/src/connection_manager.rs b/trust-quorum/src/connection_manager.rs index 612bc9ac5c0..584e4b1a6bb 100644 --- a/trust-quorum/src/connection_manager.rs +++ b/trust-quorum/src/connection_manager.rs @@ -21,7 +21,7 @@ use std::collections::BTreeSet; use std::net::{SocketAddr, SocketAddrV4, SocketAddrV6}; use std::time::Duration; use tokio::sync::mpsc; -use tokio::task::{self, JoinSet}; +use tokio::task::{self, AbortHandle, JoinSet}; use tokio::time::{Interval, MissedTickBehavior, interval}; /// We only expect a handful of concurrent requests at most. @@ -47,7 +47,6 @@ pub enum AcceptError { /// Messages sent from the main task to the connection managing tasks #[derive(Debug, PartialEq)] pub enum MainToConnMsg { - Close, #[expect(unused)] Msg(WireMsg), } @@ -101,15 +100,24 @@ pub enum ConnToMainMsgInner { } pub struct TaskHandle { - pub task_id: task::Id, + pub abort_handle: AbortHandle, + #[expect(unused)] pub tx: mpsc::Sender, pub conn_type: ConnectionType, } impl TaskHandle { + pub fn task_id(&self) -> task::Id { + self.abort_handle.id() + } + pub fn addr(&self) -> SocketAddrV6 { self.conn_type.addr() } + + pub fn abort(&self) { + self.abort_handle.abort() + } } impl BiHashItem for TaskHandle { @@ -117,7 +125,7 @@ impl BiHashItem for TaskHandle { type K2<'a> = SocketAddrV6; fn key1(&self) -> Self::K1<'_> { - self.task_id + self.task_id() } fn key2(&self) -> Self::K2<'_> { @@ -141,12 +149,16 @@ impl EstablishedTaskHandle { } pub fn task_id(&self) -> task::Id { - self.task_handle.task_id + self.task_handle.task_id() } pub fn addr(&self) -> SocketAddrV6 { self.task_handle.addr() } + + pub fn abort(&self) { + self.task_handle.abort(); + } } impl TriHashItem for EstablishedTaskHandle { @@ -159,7 +171,7 @@ impl TriHashItem for EstablishedTaskHandle { } fn key2(&self) -> Self::K2<'_> { - self.task_handle.task_id + self.task_handle.task_id() } fn key3(&self) -> Self::K3<'_> { @@ -314,12 +326,12 @@ impl ConnMgr { .map(|task_handle| ConnInfo { state: ConnState::Connecting, addr: task_handle.addr(), - task_id: task_handle.task_id, + task_id: task_handle.task_id(), }) .chain(self.accepting.iter().map(|task_handle| ConnInfo { state: ConnState::Accepting, addr: task_handle.addr(), - task_id: task_handle.task_id, + task_id: task_handle.task_id(), })) .chain(self.established.iter().map(|established_task_handle| { ConnInfo { @@ -437,7 +449,7 @@ impl ConnMgr { }); self.total_tasks_spawned += 1; let task_handle = TaskHandle { - task_id: abort_handle.id(), + abort_handle, tx, conn_type: ConnectionType::Accepted(addr), }; @@ -657,7 +669,7 @@ impl ConnMgr { }); self.total_tasks_spawned += 1; let task_handle = TaskHandle { - task_id: abort_handle.id(), + abort_handle, tx, conn_type: ConnectionType::Connected(addr), }; @@ -677,7 +689,7 @@ impl ConnMgr { "Deleting initiating connection"; "remote_addr" => %addr ); - let _ = handle.tx.send(MainToConnMsg::Close).await; + handle.abort(); } else { if let Some(handle) = self.established.remove3(&addr) { info!( @@ -686,7 +698,7 @@ impl ConnMgr { "peer_addr" => %addr, "peer_id" => %handle.baseboard_id ); - let _ = handle.task_handle.tx.send(MainToConnMsg::Close).await; + handle.abort(); } } } diff --git a/trust-quorum/src/established_conn.rs 
b/trust-quorum/src/established_conn.rs index c1359644aae..5e4e9e12641 100644 --- a/trust-quorum/src/established_conn.rs +++ b/trust-quorum/src/established_conn.rs @@ -44,8 +44,6 @@ const INACTIVITY_TIMEOUT: Duration = Duration::from_secs(10); /// Also a great movie #[derive(Debug, thiserror::Error, SlogInlineError)] pub enum ConnErr { - #[error("Main task insructed this connection to close")] - Close, #[error("Failed to write")] FailedWrite(#[source] std::io::Error), #[error("Failed to read")] @@ -273,9 +271,6 @@ impl EstablishedConn { msg: MainToConnMsg, ) -> Result<(), ConnErr> { match msg { - MainToConnMsg::Close => { - return Err(ConnErr::Close); - } MainToConnMsg::Msg(msg) => self.write_framed_to_queue(msg).await, } } From bbb6cca644ca33f523aa8d4b8ff8e417bfc756d8 Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Thu, 23 Oct 2025 00:13:27 +0000 Subject: [PATCH 12/22] no more test detritus --- trust-quorum/src/task.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/trust-quorum/src/task.rs b/trust-quorum/src/task.rs index bc312ecad92..44fd2338eb0 100644 --- a/trust-quorum/src/task.rs +++ b/trust-quorum/src/task.rs @@ -298,7 +298,9 @@ mod tests { #[tokio::test] async fn full_mesh_connectivity() { let logctx = test_setup_log("full_mesh_connectivity"); - let (dir, _) = log_prefix_for_test("full_mesh_connectivity"); + let (mut dir, s) = log_prefix_for_test("full_mesh_connectivity"); + dir.push(&s); + std::fs::create_dir(&dir).unwrap(); println!("Writing keys and certs to {dir}"); let num_nodes = 4; @@ -327,7 +329,7 @@ mod tests { let out = attest_mock::log::mock(attest_log_doc).unwrap(); std::fs::write(dir.join("log.bin"), &out).unwrap(); - let configs = pki_doc_to_node_configs(dir, num_nodes); + let configs = pki_doc_to_node_configs(dir.clone(), num_nodes); let mut node_handles = vec![]; let mut join_handles = vec![]; @@ -471,5 +473,6 @@ mod tests { .unwrap(); logctx.cleanup_successful(); + std::fs::remove_dir_all(dir).unwrap(); } } From 20b41008fda9c4d6df95f31822347afa46eb5846 Mon Sep 17 00:00:00 2001 From: "Andrew J. 
Stone" Date: Thu, 23 Oct 2025 03:56:26 +0000 Subject: [PATCH 13/22] Move sans-io code into trust-quorum-protocol crate --- Cargo.lock | 46 ++++- Cargo.toml | 3 + trust-quorum/Cargo.toml | 13 +- trust-quorum/protocol/Cargo.toml | 57 ++++++ trust-quorum/{ => protocol}/src/alarm.rs | 0 .../{ => protocol}/src/compute_key_share.rs | 0 .../{ => protocol}/src/configuration.rs | 0 .../{ => protocol}/src/coordinator_state.rs | 0 trust-quorum/{ => protocol}/src/crypto.rs | 0 trust-quorum/protocol/src/lib.rs | 162 ++++++++++++++++++ trust-quorum/{ => protocol}/src/messages.rs | 0 trust-quorum/{ => protocol}/src/node.rs | 0 trust-quorum/{ => protocol}/src/node_ctx.rs | 0 .../{ => protocol}/src/persistent_state.rs | 0 .../{ => protocol}/src/rack_secret_loader.rs | 0 trust-quorum/{ => protocol}/src/validators.rs | 0 .../tests/cluster.proptest-regressions | 0 trust-quorum/{ => protocol}/tests/cluster.rs | 2 +- trust-quorum/src/connection_manager.rs | 36 ++-- trust-quorum/src/established_conn.rs | 5 +- trust-quorum/src/lib.rs | 159 +---------------- trust-quorum/src/task.rs | 5 +- trust-quorum/test-utils/Cargo.toml | 2 +- trust-quorum/test-utils/src/event.rs | 2 +- trust-quorum/test-utils/src/lib.rs | 2 +- trust-quorum/test-utils/src/nexus.rs | 2 +- trust-quorum/test-utils/src/state.rs | 2 +- trust-quorum/tqdb/Cargo.toml | 2 +- trust-quorum/tqdb/src/bin/tqdb/main.rs | 2 +- 29 files changed, 308 insertions(+), 194 deletions(-) create mode 100644 trust-quorum/protocol/Cargo.toml rename trust-quorum/{ => protocol}/src/alarm.rs (100%) rename trust-quorum/{ => protocol}/src/compute_key_share.rs (100%) rename trust-quorum/{ => protocol}/src/configuration.rs (100%) rename trust-quorum/{ => protocol}/src/coordinator_state.rs (100%) rename trust-quorum/{ => protocol}/src/crypto.rs (100%) create mode 100644 trust-quorum/protocol/src/lib.rs rename trust-quorum/{ => protocol}/src/messages.rs (100%) rename trust-quorum/{ => protocol}/src/node.rs (100%) rename trust-quorum/{ => protocol}/src/node_ctx.rs (100%) rename trust-quorum/{ => protocol}/src/persistent_state.rs (100%) rename trust-quorum/{ => protocol}/src/rack_secret_loader.rs (100%) rename trust-quorum/{ => protocol}/src/validators.rs (100%) rename trust-quorum/{ => protocol}/tests/cluster.proptest-regressions (100%) rename trust-quorum/{ => protocol}/tests/cluster.rs (99%) diff --git a/Cargo.lock b/Cargo.lock index c01db75e58e..45b80516196 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -14641,7 +14641,7 @@ dependencies = [ "serde_json", "slog", "tabled 0.15.0", - "trust-quorum", + "trust-quorum-protocol", "trust-quorum-test-utils", ] @@ -14812,7 +14812,6 @@ dependencies = [ "anyhow", "assert_matches", "attest-mock", - "bcs", "bootstore", "bytes", "camino", @@ -14846,6 +14845,47 @@ dependencies = [ "test-strategy", "thiserror 2.0.17", "tokio", + "trust-quorum-protocol", + "trust-quorum-test-utils", + "uuid", + "zeroize", +] + +[[package]] +name = "trust-quorum-protocol" +version = "0.1.0" +dependencies = [ + "assert_matches", + "attest-mock", + "bootstore", + "bytes", + "camino", + "chacha20poly1305", + "ciborium", + "daft", + "derive_more 0.99.20", + "dropshot", + "gfss", + "hex", + "hkdf", + "iddqd", + "omicron-test-utils", + "omicron-uuid-kinds", + "omicron-workspace-hack", + "proptest", + "rand 0.9.2", + "secrecy 0.10.3", + "serde", + "serde_json", + "serde_with", + "sha3", + "sled-agent-types", + "slog", + "slog-error-chain", + "static_assertions", + "subtle", + "test-strategy", + "thiserror 2.0.17", "trust-quorum-test-utils", "uuid", "zeroize", @@ -14868,7 
+14908,7 @@ dependencies = [ "serde_json", "sled-hardware-types", "slog", - "trust-quorum", + "trust-quorum-protocol", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index c52388ef37f..cfd808dce52 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -143,6 +143,7 @@ members = [ "test-utils", "trust-quorum", "trust-quorum/gfss", + "trust-quorum/protocol", "trust-quorum/test-utils", "trust-quorum/tqdb", "typed-rng", @@ -304,6 +305,7 @@ default-members = [ "sp-sim", "trust-quorum", "trust-quorum/gfss", + "trust-quorum/protocol", "trust-quorum/test-utils", "trust-quorum/tqdb", "test-utils", @@ -472,6 +474,7 @@ gateway-types = { path = "gateway-types" } gethostname = "0.5.0" gfss = { path = "trust-quorum/gfss" } trust-quorum = { path = "trust-quorum" } +trust-quorum-protocol = { path = "trust-quorum/protocol" } trust-quorum-test-utils = { path = "trust-quorum/test-utils" } glob = "0.3.2" guppy = "0.17.20" diff --git a/trust-quorum/Cargo.toml b/trust-quorum/Cargo.toml index 7a034f78f8d..5f5ad0e88a8 100644 --- a/trust-quorum/Cargo.toml +++ b/trust-quorum/Cargo.toml @@ -3,13 +3,13 @@ name = "trust-quorum" version = "0.1.0" edition = "2021" license = "MPL-2.0" +description = "trust quorum library for use by bootstrap agent" [lints] workspace = true [dependencies] anyhow.workspace = true -bcs.workspace = true bootstore.workspace = true bytes.workspace = true camino.workspace = true @@ -36,6 +36,7 @@ static_assertions.workspace = true subtle.workspace = true thiserror.workspace = true tokio.workspace = true +trust-quorum-protocol.workspace = true uuid.workspace = true zeroize.workspace = true omicron-workspace-hack.workspace = true @@ -50,13 +51,3 @@ serde_json.workspace = true test-strategy.workspace = true trust-quorum-test-utils.workspace = true sprockets-tls-test-utils.workspace = true - -[features] -# Impl `PartialEq` and `Eq` for types implementing `subtle::ConstantTimeEq` when -# this feature is enabled. -# -# This is of unknown risk. The rust compiler may obviate the security of using -# subtle when we do this. On the other hand its very useful for testing and -# debugging outside of production. 
-danger_partial_eq_ct_wrapper = ["gfss/danger_partial_eq_ct_wrapper"] -testing = [] diff --git a/trust-quorum/protocol/Cargo.toml b/trust-quorum/protocol/Cargo.toml new file mode 100644 index 00000000000..9a5d42f7d95 --- /dev/null +++ b/trust-quorum/protocol/Cargo.toml @@ -0,0 +1,57 @@ +[package] +name = "trust-quorum-protocol" +version = "0.1.0" +edition = "2021" +license = "MPL-2.0" +description = "sans-io trust quorum protocol implementation" + +[lints] +workspace = true + +[dependencies] +bootstore.workspace = true +bytes.workspace = true +camino.workspace = true +chacha20poly1305.workspace = true +ciborium.workspace = true +daft.workspace = true +derive_more.workspace = true +gfss.workspace = true +hex.workspace = true +hkdf.workspace = true +iddqd.workspace = true +omicron-uuid-kinds.workspace = true +rand = { workspace = true, features = ["os_rng"] } +secrecy.workspace = true +serde.workspace = true +serde_with.workspace = true +sha3.workspace = true +sled-agent-types.workspace = true +slog.workspace = true +slog-error-chain.workspace = true +static_assertions.workspace = true +subtle.workspace = true +thiserror.workspace = true +uuid.workspace = true +zeroize.workspace = true +omicron-workspace-hack.workspace = true + +[dev-dependencies] +assert_matches.workspace = true +attest-mock.workspace = true +dropshot.workspace = true +omicron-test-utils.workspace = true +proptest.workspace = true +serde_json.workspace = true +test-strategy.workspace = true +trust-quorum-test-utils.workspace = true + +[features] +# Impl `PartialEq` and `Eq` for types implementing `subtle::ConstantTimeEq` when +# this feature is enabled. +# +# This is of unknown risk. The rust compiler may obviate the security of using +# subtle when we do this. On the other hand its very useful for testing and +# debugging outside of production. +danger_partial_eq_ct_wrapper = ["gfss/danger_partial_eq_ct_wrapper"] +testing = [] diff --git a/trust-quorum/src/alarm.rs b/trust-quorum/protocol/src/alarm.rs similarity index 100% rename from trust-quorum/src/alarm.rs rename to trust-quorum/protocol/src/alarm.rs diff --git a/trust-quorum/src/compute_key_share.rs b/trust-quorum/protocol/src/compute_key_share.rs similarity index 100% rename from trust-quorum/src/compute_key_share.rs rename to trust-quorum/protocol/src/compute_key_share.rs diff --git a/trust-quorum/src/configuration.rs b/trust-quorum/protocol/src/configuration.rs similarity index 100% rename from trust-quorum/src/configuration.rs rename to trust-quorum/protocol/src/configuration.rs diff --git a/trust-quorum/src/coordinator_state.rs b/trust-quorum/protocol/src/coordinator_state.rs similarity index 100% rename from trust-quorum/src/coordinator_state.rs rename to trust-quorum/protocol/src/coordinator_state.rs diff --git a/trust-quorum/src/crypto.rs b/trust-quorum/protocol/src/crypto.rs similarity index 100% rename from trust-quorum/src/crypto.rs rename to trust-quorum/protocol/src/crypto.rs diff --git a/trust-quorum/protocol/src/lib.rs b/trust-quorum/protocol/src/lib.rs new file mode 100644 index 00000000000..0d5c522b2d4 --- /dev/null +++ b/trust-quorum/protocol/src/lib.rs @@ -0,0 +1,162 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Implementation of the oxide rack trust quorum protocol +//! +//! This protocol is written as a +//! 
[no-IO](https://sans-io.readthedocs.io/how-to-sans-io.html) implementation. +//! All persistent state and all networking is managed outside of this +//! implementation. + +use crypto::Sha3_256Digest; +use daft::Diffable; +use derive_more::Display; +use gfss::shamir::Share; +use serde::{Deserialize, Serialize}; +pub use sled_agent_types::sled::BaseboardId; +use slog::{Logger, error, warn}; + +mod alarm; +mod compute_key_share; +mod configuration; +mod coordinator_state; +pub(crate) mod crypto; +mod messages; +mod node; +mod node_ctx; +mod persistent_state; +#[allow(unused)] +mod rack_secret_loader; +mod validators; + +pub use configuration::Configuration; +pub use coordinator_state::{ + CoordinatingMsg, CoordinatorOperation, CoordinatorState, + CoordinatorStateDiff, +}; +pub use rack_secret_loader::{LoadRackSecretError, RackSecretLoaderDiff}; +pub use validators::{ + ValidatedLrtqUpgradeMsgDiff, ValidatedReconfigureMsgDiff, +}; + +pub use alarm::Alarm; +pub use crypto::RackSecret; +pub use messages::*; +pub use node::{Node, NodeDiff}; +// public only for docs. +pub use node_ctx::NodeHandlerCtx; +pub use node_ctx::{NodeCallerCtx, NodeCommonCtx, NodeCtx, NodeCtxDiff}; +pub use persistent_state::{ + ExpungedMetadata, PersistentState, PersistentStateSummary, +}; + +#[derive( + Debug, + Clone, + Copy, + PartialEq, + Eq, + PartialOrd, + Ord, + Hash, + Serialize, + Deserialize, + Display, + Diffable, +)] +#[daft(leaf)] +pub struct Epoch(pub u64); + +impl Epoch { + pub fn next(&self) -> Epoch { + Epoch(self.0.checked_add(1).expect("fewer than 2^64 epochs")) + } +} + +/// The number of shares required to reconstruct the rack secret +/// +/// Typically referred to as `k` in the docs +#[derive( + Debug, + Clone, + Copy, + PartialEq, + Eq, + PartialOrd, + Ord, + Serialize, + Deserialize, + Display, + Diffable, +)] +#[daft(leaf)] +pub struct Threshold(pub u8); + +/// A container to make messages between trust quorum nodes routable +#[derive(Debug, Clone, Serialize, Deserialize, Diffable)] +#[cfg_attr(feature = "danger_partial_eq_ct_wrapper", derive(PartialEq, Eq))] +#[daft(leaf)] +pub struct Envelope { + pub to: BaseboardId, + pub from: BaseboardId, + pub msg: PeerMsg, +} + +#[cfg(feature = "testing")] +impl Envelope { + pub fn equal_except_for_crypto_data(&self, other: &Self) -> bool { + self.to == other.to + && self.from == other.from + && self.msg.equal_except_for_crypto_data(&other.msg) + } +} + +/// Check if a received share is valid for a given configuration +/// +/// Return true if valid, false otherwise. +pub fn validate_share( + log: &Logger, + config: &Configuration, + from: &BaseboardId, + epoch: Epoch, + share: &Share, +) -> bool { + // Are we trying to retrieve shares for `epoch`? + if epoch != config.epoch { + warn!( + log, + "Received Share from node with wrong epoch"; + "received_epoch" => %epoch, + "from" => %from + ); + return false; + } + + // Is the sender a member of the configuration `epoch`? + // Was the sender a member of the configuration at `old_epoch`? + let Some(expected_digest) = config.members.get(&from) else { + warn!( + log, + "Received Share from unexpected node"; + "epoch" => %epoch, + "from" => %from + ); + return false; + }; + + // Does the share hash match what we expect? 
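+ // (Context: `config.members` maps each member to the SHA3-256 digest of
+ // its key share, so recomputing the digest locally is enough to reject a
+ // corrupted or substituted share without any extra round trips.)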
+ let mut digest = Sha3_256Digest::default(); + share.digest::(&mut digest.0); + if digest != *expected_digest { + error!( + log, + "Received share with invalid digest"; + "epoch" => %epoch, + "from" => %from + ); + return false; + } + + true +} diff --git a/trust-quorum/src/messages.rs b/trust-quorum/protocol/src/messages.rs similarity index 100% rename from trust-quorum/src/messages.rs rename to trust-quorum/protocol/src/messages.rs diff --git a/trust-quorum/src/node.rs b/trust-quorum/protocol/src/node.rs similarity index 100% rename from trust-quorum/src/node.rs rename to trust-quorum/protocol/src/node.rs diff --git a/trust-quorum/src/node_ctx.rs b/trust-quorum/protocol/src/node_ctx.rs similarity index 100% rename from trust-quorum/src/node_ctx.rs rename to trust-quorum/protocol/src/node_ctx.rs diff --git a/trust-quorum/src/persistent_state.rs b/trust-quorum/protocol/src/persistent_state.rs similarity index 100% rename from trust-quorum/src/persistent_state.rs rename to trust-quorum/protocol/src/persistent_state.rs diff --git a/trust-quorum/src/rack_secret_loader.rs b/trust-quorum/protocol/src/rack_secret_loader.rs similarity index 100% rename from trust-quorum/src/rack_secret_loader.rs rename to trust-quorum/protocol/src/rack_secret_loader.rs diff --git a/trust-quorum/src/validators.rs b/trust-quorum/protocol/src/validators.rs similarity index 100% rename from trust-quorum/src/validators.rs rename to trust-quorum/protocol/src/validators.rs diff --git a/trust-quorum/tests/cluster.proptest-regressions b/trust-quorum/protocol/tests/cluster.proptest-regressions similarity index 100% rename from trust-quorum/tests/cluster.proptest-regressions rename to trust-quorum/protocol/tests/cluster.proptest-regressions diff --git a/trust-quorum/tests/cluster.rs b/trust-quorum/protocol/tests/cluster.rs similarity index 99% rename from trust-quorum/tests/cluster.rs rename to trust-quorum/protocol/tests/cluster.rs index 39c1367661b..e1b14994dd4 100644 --- a/trust-quorum/tests/cluster.rs +++ b/trust-quorum/protocol/tests/cluster.rs @@ -15,7 +15,7 @@ use secrecy::ExposeSecret; use slog::{Logger, info, o}; use std::collections::BTreeSet; use test_strategy::{Arbitrary, proptest}; -use trust_quorum::{ +use trust_quorum_protocol::{ BaseboardId, CoordinatorOperation, Epoch, NodeCallerCtx, NodeCommonCtx, Threshold, }; diff --git a/trust-quorum/src/connection_manager.rs b/trust-quorum/src/connection_manager.rs index 584e4b1a6bb..2feae008779 100644 --- a/trust-quorum/src/connection_manager.rs +++ b/trust-quorum/src/connection_manager.rs @@ -5,7 +5,7 @@ //! A mechanism for maintaining a full mesh of trust quorum node connections use crate::established_conn::EstablishedConn; -use crate::{BaseboardId, PeerMsg}; +use trust_quorum_protocol::{BaseboardId, PeerMsg}; // TODO: Move or copy this to this crate? use bootstore::schemes::v0::NetworkConfig; use camino::Utf8PathBuf; @@ -45,7 +45,7 @@ pub enum AcceptError { } /// Messages sent from the main task to the connection managing tasks -#[derive(Debug, PartialEq)] +#[derive(Debug)] pub enum MainToConnMsg { #[expect(unused)] Msg(WireMsg), @@ -58,7 +58,7 @@ pub enum MainToConnMsg { /// /// All `WireMsg`s sent between nodes is prefixed with a 4 byte size header used /// for framing. 
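/// In other words, a reader first pulls 4 bytes off the wire, decodes the
/// message size from them, and then reads exactly that many bytes before
/// attempting to deserialize a `WireMsg`.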
-#[derive(Debug, PartialEq, Serialize, Deserialize)] +#[derive(Debug, Serialize, Deserialize)] pub enum WireMsg { /// Used for connection keep alive Ping, @@ -84,19 +84,35 @@ pub enum WireMsg { /// We include `task_id` to differentiate which task they come from so we can /// exclude requests from tasks that have been cancelled or have been told to /// shutdown. -#[derive(Debug, PartialEq)] +#[derive(Debug)] pub struct ConnToMainMsg { pub task_id: task::Id, pub msg: ConnToMainMsgInner, } -#[derive(Debug, PartialEq)] +#[derive(Debug)] pub enum ConnToMainMsgInner { - Accepted { addr: SocketAddrV6, peer_id: BaseboardId }, - Connected { addr: SocketAddrV6, peer_id: BaseboardId }, - Received { from: BaseboardId, msg: PeerMsg }, - ReceivedNetworkConfig { from: BaseboardId, config: NetworkConfig }, - Disconnected { peer_id: BaseboardId }, + Accepted { + addr: SocketAddrV6, + peer_id: BaseboardId, + }, + Connected { + addr: SocketAddrV6, + peer_id: BaseboardId, + }, + #[expect(unused)] + Received { + from: BaseboardId, + msg: PeerMsg, + }, + #[expect(unused)] + ReceivedNetworkConfig { + from: BaseboardId, + config: NetworkConfig, + }, + Disconnected { + peer_id: BaseboardId, + }, } pub struct TaskHandle { diff --git a/trust-quorum/src/established_conn.rs b/trust-quorum/src/established_conn.rs index 5e4e9e12641..77cb2d73cbd 100644 --- a/trust-quorum/src/established_conn.rs +++ b/trust-quorum/src/established_conn.rs @@ -4,9 +4,7 @@ //! An individual sprockets connection running in its own task -use crate::{ - BaseboardId, ConnToMainMsg, ConnToMainMsgInner, MainToConnMsg, WireMsg, -}; +use crate::{ConnToMainMsg, ConnToMainMsgInner, MainToConnMsg, WireMsg}; use bytes::Buf; use serde::Serialize; use slog::{Logger, debug, error, o, warn}; @@ -19,6 +17,7 @@ use tokio::net::TcpStream; use tokio::sync::mpsc; use tokio::task; use tokio::time::{Instant, MissedTickBehavior, interval}; +use trust_quorum_protocol::BaseboardId; /// Max buffer size of a connection const CONN_BUF_SIZE: usize = 1024 * 1024; diff --git a/trust-quorum/src/lib.rs b/trust-quorum/src/lib.rs index 457403999e4..f508647c889 100644 --- a/trust-quorum/src/lib.rs +++ b/trust-quorum/src/lib.rs @@ -2,168 +2,13 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! Implementation of the oxide rack trust quorum protocol -//! -//! This protocol is written as a -//! [no-IO](https://sans-io.readthedocs.io/how-to-sans-io.html) implementation. -//! All persistent state and all networking is managed outside of this -//! implementation. +//! 
Async trust-quorum library code for integrating with sled-agent -use crypto::Sha3_256Digest; -use daft::Diffable; -use derive_more::Display; -use gfss::shamir::Share; -use serde::{Deserialize, Serialize}; -pub use sled_agent_types::sled::BaseboardId; -use slog::{Logger, error, warn}; - -mod compute_key_share; -mod configuration; -mod coordinator_state; -pub(crate) mod crypto; -pub(crate) mod established_conn; -mod messages; -mod node; -mod node_ctx; -mod persistent_state; -#[allow(unused)] -mod rack_secret_loader; -mod validators; -pub use configuration::Configuration; -pub use coordinator_state::{ - CoordinatingMsg, CoordinatorOperation, CoordinatorState, - CoordinatorStateDiff, -}; -pub use rack_secret_loader::{LoadRackSecretError, RackSecretLoaderDiff}; -pub use validators::{ - ValidatedLrtqUpgradeMsgDiff, ValidatedReconfigureMsgDiff, -}; -mod alarm; mod connection_manager; +pub(crate) mod established_conn; mod task; pub(crate) use connection_manager::{ ConnToMainMsg, ConnToMainMsgInner, MainToConnMsg, WireMsg, }; pub use task::NodeTask; - -pub use alarm::Alarm; -pub use crypto::RackSecret; -pub use messages::*; -pub use node::{Node, NodeDiff}; -// public only for docs. -pub use node_ctx::NodeHandlerCtx; -pub use node_ctx::{NodeCallerCtx, NodeCommonCtx, NodeCtx, NodeCtxDiff}; -pub use persistent_state::{ - ExpungedMetadata, PersistentState, PersistentStateSummary, -}; - -#[derive( - Debug, - Clone, - Copy, - PartialEq, - Eq, - PartialOrd, - Ord, - Hash, - Serialize, - Deserialize, - Display, - Diffable, -)] -#[daft(leaf)] -pub struct Epoch(pub u64); - -impl Epoch { - pub fn next(&self) -> Epoch { - Epoch(self.0.checked_add(1).expect("fewer than 2^64 epochs")) - } -} - -/// The number of shares required to reconstruct the rack secret -/// -/// Typically referred to as `k` in the docs -#[derive( - Debug, - Clone, - Copy, - PartialEq, - Eq, - PartialOrd, - Ord, - Serialize, - Deserialize, - Display, - Diffable, -)] -#[daft(leaf)] -pub struct Threshold(pub u8); - -/// A container to make messages between trust quorum nodes routable -#[derive(Debug, Clone, Serialize, Deserialize, Diffable)] -#[cfg_attr(feature = "danger_partial_eq_ct_wrapper", derive(PartialEq, Eq))] -#[daft(leaf)] -pub struct Envelope { - pub to: BaseboardId, - pub from: BaseboardId, - pub msg: PeerMsg, -} - -#[cfg(feature = "testing")] -impl Envelope { - pub fn equal_except_for_crypto_data(&self, other: &Self) -> bool { - self.to == other.to - && self.from == other.from - && self.msg.equal_except_for_crypto_data(&other.msg) - } -} - -/// Check if a received share is valid for a given configuration -/// -/// Return true if valid, false otherwise. -pub fn validate_share( - log: &Logger, - config: &Configuration, - from: &BaseboardId, - epoch: Epoch, - share: &Share, -) -> bool { - // Are we trying to retrieve shares for `epoch`? - if epoch != config.epoch { - warn!( - log, - "Received Share from node with wrong epoch"; - "received_epoch" => %epoch, - "from" => %from - ); - return false; - } - - // Is the sender a member of the configuration `epoch`? - // Was the sender a member of the configuration at `old_epoch`? - let Some(expected_digest) = config.members.get(&from) else { - warn!( - log, - "Received Share from unexpected node"; - "epoch" => %epoch, - "from" => %from - ); - return false; - }; - - // Does the share hash match what we expect?
- let mut digest = Sha3_256Digest::default(); - share.digest::(&mut digest.0); - if digest != *expected_digest { - error!( - log, - "Received share with invalid digest"; - "epoch" => %epoch, - "from" => %from - ); - return false; - } - - true -} diff --git a/trust-quorum/src/task.rs b/trust-quorum/src/task.rs index 44fd2338eb0..9ac34c80470 100644 --- a/trust-quorum/src/task.rs +++ b/trust-quorum/src/task.rs @@ -2,12 +2,12 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! A runnable async trust quorum node that wraps the sans-io [`crate::Node`] +//! A runnable async trust quorum node that wraps the sans-io +//! [`trust_quorum_protocol::Node`] use crate::connection_manager::{ ConnMgr, ConnMgrStatus, ConnToMainMsg, ConnToMainMsgInner, }; -use crate::{BaseboardId, Node, NodeCtx}; use slog::{Logger, debug, error, info, o}; use sprockets_tls::keys::SprocketsConfig; use std::collections::BTreeSet; @@ -16,6 +16,7 @@ use thiserror::Error; use tokio::sync::mpsc::error::SendError; use tokio::sync::oneshot::error::RecvError; use tokio::sync::{mpsc, oneshot}; +use trust_quorum_protocol::{BaseboardId, Node, NodeCtx}; #[derive(Debug, Clone)] pub struct Config { diff --git a/trust-quorum/test-utils/Cargo.toml b/trust-quorum/test-utils/Cargo.toml index 33181dc1ddf..853bdd464f7 100644 --- a/trust-quorum/test-utils/Cargo.toml +++ b/trust-quorum/test-utils/Cargo.toml @@ -19,6 +19,6 @@ serde.workspace = true serde_json.workspace = true sled-hardware-types.workspace = true slog.workspace = true -trust-quorum = { workspace = true, features = ["danger_partial_eq_ct_wrapper", "testing"] } +trust-quorum-protocol = { workspace = true, features = ["danger_partial_eq_ct_wrapper", "testing"] } omicron-workspace-hack.workspace = true diff --git a/trust-quorum/test-utils/src/event.rs b/trust-quorum/test-utils/src/event.rs index 3544456b3c3..8bead3c53e2 100644 --- a/trust-quorum/test-utils/src/event.rs +++ b/trust-quorum/test-utils/src/event.rs @@ -7,7 +7,7 @@ use crate::nexus::{NexusConfig, NexusReply}; use serde::{Deserialize, Serialize}; use std::collections::BTreeSet; -use trust_quorum::{BaseboardId, Envelope, Epoch}; +use trust_quorum_protocol::{BaseboardId, Envelope, Epoch}; /// An event that can be fed into our system under test (SUT) /// diff --git a/trust-quorum/test-utils/src/lib.rs b/trust-quorum/test-utils/src/lib.rs index 6cc7d617f97..9bfffdde256 100644 --- a/trust-quorum/test-utils/src/lib.rs +++ b/trust-quorum/test-utils/src/lib.rs @@ -13,7 +13,7 @@ pub use event::Event; pub use event_log::EventLog; pub use state::TqState; -use trust_quorum::BaseboardId; +use trust_quorum_protocol::BaseboardId; /// All possible members used in a test pub fn member_universe(size: usize) -> Vec { diff --git a/trust-quorum/test-utils/src/nexus.rs b/trust-quorum/test-utils/src/nexus.rs index d59ec53cc9c..c2665f37870 100644 --- a/trust-quorum/test-utils/src/nexus.rs +++ b/trust-quorum/test-utils/src/nexus.rs @@ -10,7 +10,7 @@ use iddqd::{IdOrdItem, IdOrdMap, id_upcast}; use omicron_uuid_kinds::RackUuid; use serde::{Deserialize, Serialize}; use std::collections::BTreeSet; -use trust_quorum::{ +use trust_quorum_protocol::{ BaseboardId, Epoch, LrtqUpgradeMsg, ReconfigureMsg, Threshold, }; diff --git a/trust-quorum/test-utils/src/state.rs b/trust-quorum/test-utils/src/state.rs index 59b8524d8b7..3c1b31e5a32 100644 --- a/trust-quorum/test-utils/src/state.rs +++ b/trust-quorum/test-utils/src/state.rs @@ -17,7 +17,7 @@ use 
sled_hardware_types::Baseboard; use slog::{Logger, info}; use std::collections::{BTreeMap, BTreeSet}; use std::fmt::Display; -use trust_quorum::{ +use trust_quorum_protocol::{ BaseboardId, Configuration, CoordinatingMsg, CoordinatorOperation, CoordinatorStateDiff, Envelope, Epoch, LoadRackSecretError, Node, NodeCallerCtx, NodeCommonCtx, NodeCtx, NodeCtxDiff, NodeDiff, PeerMsgKind, diff --git a/trust-quorum/tqdb/Cargo.toml b/trust-quorum/tqdb/Cargo.toml index 4436cc99fbc..18242508aa1 100644 --- a/trust-quorum/tqdb/Cargo.toml +++ b/trust-quorum/tqdb/Cargo.toml @@ -20,7 +20,7 @@ reconfigurator-cli.workspace = true serde_json.workspace = true slog.workspace = true tabled.workspace = true -trust-quorum = { workspace = true, features = ["danger_partial_eq_ct_wrapper"] } +trust-quorum-protocol = { workspace = true, features = ["danger_partial_eq_ct_wrapper"] } trust-quorum-test-utils.workspace = true omicron-workspace-hack.workspace = true diff --git a/trust-quorum/tqdb/src/bin/tqdb/main.rs b/trust-quorum/tqdb/src/bin/tqdb/main.rs index a593e697e3b..12e163f801b 100644 --- a/trust-quorum/tqdb/src/bin/tqdb/main.rs +++ b/trust-quorum/tqdb/src/bin/tqdb/main.rs @@ -24,7 +24,7 @@ use std::fmt::Write; use std::fs; use std::io::IsTerminal; use tabled::Tabled; -use trust_quorum::BaseboardId; +use trust_quorum_protocol::BaseboardId; use trust_quorum_test_utils::{Event, TqState}; fn main() -> Result<(), anyhow::Error> { From b374e072f2df107f6e0f67313dc70e607a726406 Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Thu, 23 Oct 2025 03:58:51 +0000 Subject: [PATCH 14/22] hakari --- Cargo.lock | 2 ++ workspace-hack/Cargo.toml | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 45b80516196..3848089b025 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8907,6 +8907,7 @@ dependencies = [ "clang-sys", "clap", "clap_builder", + "const-oid", "cookie", "crossbeam-epoch", "crossbeam-utils", @@ -8915,6 +8916,7 @@ dependencies = [ "curve25519-dalek", "daft", "data-encoding", + "der", "digest", "dof 0.3.0", "dof 0.4.0", diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index 096d5d26dcd..6ef9f57bbac 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -35,6 +35,7 @@ chrono = { version = "0.4.42", features = ["serde"] } cipher = { version = "0.4.4", default-features = false, features = ["block-padding", "zeroize"] } clap = { version = "4.5.48", features = ["cargo", "derive", "env", "wrap_help"] } clap_builder = { version = "4.5.48", default-features = false, features = ["cargo", "color", "env", "std", "suggestions", "usage", "wrap_help"] } +const-oid = { version = "0.9.6", default-features = false, features = ["db", "std"] } crossbeam-epoch = { version = "0.9.18" } crossbeam-utils = { version = "0.8.21" } crossterm = { version = "0.28.1", features = ["event-stream", "serde"] } @@ -42,6 +43,7 @@ crypto-common = { version = "0.1.6", default-features = false, features = ["getr curve25519-dalek = { version = "4.1.3", features = ["digest", "legacy_compatibility", "rand_core"] } daft = { version = "0.1.4", features = ["derive", "newtype-uuid1", "oxnet01", "uuid1"] } data-encoding = { version = "2.9.0" } +der = { version = "0.7.10", default-features = false, features = ["derive", "flagset", "oid", "pem", "std"] } digest = { version = "0.10.7", features = ["mac", "oid", "std"] } ecdsa = { version = "0.16.9", features = ["pem", "signing", "std", "verifying"] } ed25519-dalek = { version = "2.1.1", features = ["digest", "pem", "rand_core"] } @@ -174,6 +176,7 @@ 
chrono = { version = "0.4.42", features = ["serde"] } cipher = { version = "0.4.4", default-features = false, features = ["block-padding", "zeroize"] } clap = { version = "4.5.48", features = ["cargo", "derive", "env", "wrap_help"] } clap_builder = { version = "4.5.48", default-features = false, features = ["cargo", "color", "env", "std", "suggestions", "usage", "wrap_help"] } +const-oid = { version = "0.9.6", default-features = false, features = ["db", "std"] } crossbeam-epoch = { version = "0.9.18" } crossbeam-utils = { version = "0.8.21" } crossterm = { version = "0.28.1", features = ["event-stream", "serde"] } @@ -181,6 +184,7 @@ crypto-common = { version = "0.1.6", default-features = false, features = ["getr curve25519-dalek = { version = "4.1.3", features = ["digest", "legacy_compatibility", "rand_core"] } daft = { version = "0.1.4", features = ["derive", "newtype-uuid1", "oxnet01", "uuid1"] } data-encoding = { version = "2.9.0" } +der = { version = "0.7.10", default-features = false, features = ["derive", "flagset", "oid", "pem", "std"] } digest = { version = "0.10.7", features = ["mac", "oid", "std"] } ecdsa = { version = "0.16.9", features = ["pem", "signing", "std", "verifying"] } ed25519-dalek = { version = "2.1.1", features = ["digest", "pem", "rand_core"] } From 1efff120b97a71d58b8fc12b23d965ceb99925e4 Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Tue, 28 Oct 2025 16:37:44 +0000 Subject: [PATCH 15/22] use try_send on connToMain channel --- trust-quorum/src/connection_manager.rs | 45 ++++++++----------- trust-quorum/src/established_conn.rs | 62 +++++++++++--------------- trust-quorum/src/task.rs | 22 ++++++--- 3 files changed, 60 insertions(+), 69 deletions(-) diff --git a/trust-quorum/src/connection_manager.rs b/trust-quorum/src/connection_manager.rs index 2feae008779..1b764ffbd67 100644 --- a/trust-quorum/src/connection_manager.rs +++ b/trust-quorum/src/connection_manager.rs @@ -446,19 +446,16 @@ impl ConnMgr { ); // Inform the main task that accepted connection is established - if let Err(e) = main_tx - .send(ConnToMainMsg { - task_id: task::id(), - msg: ConnToMainMsgInner::Accepted { - addr, - peer_id: baseboard_id, - }, - }) - .await - { - // The system is shutting down + if let Err(_) = main_tx.try_send(ConnToMainMsg { + task_id: task::id(), + msg: ConnToMainMsgInner::Accepted { + addr, + peer_id: baseboard_id, + }, + }) { + // The system is shutting down or we've overloaded the main channel // Just bail from this task - warn!(log, "Failed to send 'accepted' msg to main task: {e:?}"); + warn!(log, "Failed to send 'accepted' msg to main task"); } else { conn.run().await; } @@ -663,22 +660,16 @@ impl ConnMgr { ); // Inform the main task that the client connection is // established. 
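// (Unlike `send(...).await`, `try_send` fails immediately with
// `TrySendError::Full` or `TrySendError::Closed` rather than waiting for
// channel capacity, so a wedged main task can no longer block a
// connection task at this point.)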
- if let Err(e) = main_tx - .send(ConnToMainMsg { - task_id: task::id(), - msg: ConnToMainMsgInner::Connected { - addr, - peer_id: baseboard_id, - }, - }) - .await - { - // The system is shutting down + if let Err(_) = main_tx.try_send(ConnToMainMsg { + task_id: task::id(), + msg: ConnToMainMsgInner::Connected { + addr, + peer_id: baseboard_id, + }, + }) { + // The system is shutting down or we've overloaded the main channel // Just bail from this task - error!( - log, - "Failed to send 'connected' msg to main task: {e:?}" - ); + error!(log, "Failed to send 'connected' msg to main task"); } else { conn.run().await; }
diff --git a/trust-quorum/src/established_conn.rs b/trust-quorum/src/established_conn.rs index 77cb2d73cbd..87b48055a31 100644 --- a/trust-quorum/src/established_conn.rs +++ b/trust-quorum/src/established_conn.rs @@ -155,17 +155,13 @@ impl EstablishedConn { } async fn close(&mut self) { - if let Err(e) = self - .main_tx - .send(ConnToMainMsg { - task_id: self.task_id, - msg: ConnToMainMsgInner::Disconnected { - peer_id: self.peer_id.clone(), - }, - }) - .await - { - warn!(self.log, "Failed to send to main task: {e:?}"); + if let Err(_) = self.main_tx.try_send(ConnToMainMsg { + task_id: self.task_id, + msg: ConnToMainMsgInner::Disconnected { + peer_id: self.peer_id.clone(), + }, + }) { + warn!(self.log, "Failed to send to main task"); } let _ = self.writer.shutdown().await; } @@ -202,21 +198,18 @@ impl EstablishedConn { debug!(self.log, "Received {msg:?}"); match msg { WireMsg::Tq(msg) => { - if let Err(e) = self - .main_tx - .send(ConnToMainMsg { - task_id: self.task_id, - msg: ConnToMainMsgInner::Received { - from: self.peer_id.clone(), - msg, - }, - }) - .await - { - warn!( + if let Err(_) = self.main_tx.try_send(ConnToMainMsg { + task_id: self.task_id, + msg: ConnToMainMsgInner::Received { + from: self.peer_id.clone(), + msg, + }, + }) { + error!( self.log, - "Failed to send received fsm msg to main task: {e:?}" + "Failed to send received fsm msg to main task" ); + panic!("Connection to main task channel full"); } } WireMsg::Ping => { @@ -225,22 +218,19 @@ } WireMsg::NetworkConfig(config) => { let generation = config.generation; - if let Err(e) = self - .main_tx - .send(ConnToMainMsg { - task_id: self.task_id, - msg: ConnToMainMsgInner::ReceivedNetworkConfig { - from: self.peer_id.clone(), - config, - }, - }) - .await - { + if let Err(_) = self.main_tx.try_send(ConnToMainMsg { + task_id: self.task_id, + msg: ConnToMainMsgInner::ReceivedNetworkConfig { + from: self.peer_id.clone(), + config, + }, + }) { warn!( self.log, "Failed to send received NetworkConfig with - generation {generation} to main task: {e:?}" + generation {generation} to main task" ); + panic!("Connection to main task channel full"); } } }
diff --git a/trust-quorum/src/task.rs b/trust-quorum/src/task.rs index 9ac34c80470..6e821f197db 100644 --- a/trust-quorum/src/task.rs +++ b/trust-quorum/src/task.rs @@ -18,6 +18,18 @@ use tokio::sync::oneshot::error::RecvError; use tokio::sync::{mpsc, oneshot}; use trust_quorum_protocol::{BaseboardId, Node, NodeCtx}; +/// We only expect a handful of messages at a time. +const API_CHANNEL_BOUND: usize = 32; + +/// We size this bound large enough that it should never be hit. Up to 31 +/// `EstablishedConn` tasks can send messages to the main task simultaneously when +/// messages are received. +/// +/// We use `try_send.unwrap()` when sending to the main task to prevent deadlock +/// and inform us via panic that something has gone seriously wrong.
This is +/// similar to using an unbounded channel but will not use all possible memory. +const CONN_TO_MAIN_CHANNEL_BOUND: usize = 1024; + #[derive(Debug, Clone)] pub struct Config { pub baseboard_id: BaseboardId, @@ -130,13 +142,11 @@ impl NodeTask { "component" => "trust-quorum", "baseboard_id" => config.baseboard_id.to_string() )); - // We only expect one outstanding request at a time for `Init_` or - // `LoadRackSecret` requests, We can have one of those requests in - // flight while allowing `PeerAddresses` updates. We also allow status - // requests in parallel. Just leave some room. - let (tx, rx) = mpsc::channel(10); - let (conn_mgr_tx, conn_mgr_rx) = mpsc::channel(100); + let (tx, rx) = mpsc::channel(API_CHANNEL_BOUND); + + let (conn_mgr_tx, conn_mgr_rx) = + mpsc::channel(CONN_TO_MAIN_CHANNEL_BOUND); let baseboard_id = config.baseboard_id.clone(); From 2a262c0d0de4df0fbc6e1eb331206c81981ac566 Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Tue, 28 Oct 2025 16:39:39 +0000 Subject: [PATCH 16/22] error instead of warn --- trust-quorum/src/established_conn.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/trust-quorum/src/established_conn.rs b/trust-quorum/src/established_conn.rs index 87b48055a31..b75b0576949 100644 --- a/trust-quorum/src/established_conn.rs +++ b/trust-quorum/src/established_conn.rs @@ -225,7 +225,7 @@ impl EstablishedConn { config, }, }) { - warn!( + error!( self.log, "Failed to send received NetworkConfig with generation {generation} to main task" From 99e5192dbcc815e8c259f23e702dfd41c9b5da68 Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Fri, 31 Oct 2025 15:33:48 +0000 Subject: [PATCH 17/22] put errors back --- trust-quorum/src/connection_manager.rs | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/trust-quorum/src/connection_manager.rs b/trust-quorum/src/connection_manager.rs index 1b764ffbd67..d2b427fd6eb 100644 --- a/trust-quorum/src/connection_manager.rs +++ b/trust-quorum/src/connection_manager.rs @@ -6,8 +6,11 @@ use crate::established_conn::EstablishedConn; use trust_quorum_protocol::{BaseboardId, PeerMsg}; -// TODO: Move or copy this to this crate? + +// TODO: Move to this crate +// https://github.com/oxidecomputer/omicron/issues/9311 use bootstore::schemes::v0::NetworkConfig; + use camino::Utf8PathBuf; use iddqd::{ BiHashItem, BiHashMap, TriHashItem, TriHashMap, bi_upcast, tri_upcast, @@ -446,7 +449,7 @@ impl ConnMgr { ); // Inform the main task that accepted connection is established - if let Err(_) = main_tx.try_send(ConnToMainMsg { + if let Err(e) = main_tx.try_send(ConnToMainMsg { task_id: task::id(), msg: ConnToMainMsgInner::Accepted { addr, @@ -455,7 +458,7 @@ impl ConnMgr { }) { // The system is shutting down or we've overloaded the main channel // Just bail from this task - warn!(log, "Failed to send 'accepted' msg to main task"); + warn!(log, "Failed to send 'accepted' msg to main task: {e}"); } else { conn.run().await; } @@ -660,7 +663,7 @@ impl ConnMgr { ); // Inform the main task that the client connection is // established. 
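// (`TrySendError` implements `Display`, so logging `{e}` records whether
// the channel was full or closed when the send failed.)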
- if let Err(_) = main_tx.try_send(ConnToMainMsg { + if let Err(e) = main_tx.try_send(ConnToMainMsg { task_id: task::id(), msg: ConnToMainMsgInner::Connected { addr, @@ -669,7 +672,7 @@ impl ConnMgr { }) { // The system is shutting down or we've overloaded the main channel // Just bail from this task - error!(log, "Failed to send 'connected' msg to main task"); + error!(log, "Failed to send 'connected' msg to main task: {e}"); } else { conn.run().await; } From 1ac30a385711468f0da84ae0f5828963d326e16c Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Tue, 28 Oct 2025 18:22:31 +0000 Subject: [PATCH 18/22] TQ: Integrate protocol with `NodeTask` `NodeTask` now uses the `trust_quorum_protocol::Node` and `trust_quorum_protocol::NodeCtx` to send and receive trust quorum messages. An API to drive this was added to the `NodeTaskHandle`. The majority of code in this PR is tests using the API. A follow up will deal with saving persistent state to a Ledger. --- Cargo.lock | 1 + trust-quorum/Cargo.toml | 2 + .../protocol/src/coordinator_state.rs | 4 + trust-quorum/protocol/src/crypto.rs | 2 +- trust-quorum/protocol/src/lib.rs | 7 +- trust-quorum/src/connection_manager.rs | 45 +- trust-quorum/src/task.rs | 1045 ++++++++++++++++- 7 files changed, 1046 insertions(+), 60 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3848089b025..38813d7bd65 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -14838,6 +14838,7 @@ dependencies = [ "serde_with", "sha3", "sled-agent-types", + "sled-hardware-types", "slog", "slog-error-chain", "sprockets-tls", diff --git a/trust-quorum/Cargo.toml b/trust-quorum/Cargo.toml index 5f5ad0e88a8..306764dc6f1 100644 --- a/trust-quorum/Cargo.toml +++ b/trust-quorum/Cargo.toml @@ -48,6 +48,8 @@ dropshot.workspace = true omicron-test-utils.workspace = true proptest.workspace = true serde_json.workspace = true +sled-hardware-types.workspace = true test-strategy.workspace = true +trust-quorum-protocol = { workspace = true, features = ["testing"] } trust-quorum-test-utils.workspace = true sprockets-tls-test-utils.workspace = true diff --git a/trust-quorum/protocol/src/coordinator_state.rs b/trust-quorum/protocol/src/coordinator_state.rs index 5d8d394c32c..725cdfb8397 100644 --- a/trust-quorum/protocol/src/coordinator_state.rs +++ b/trust-quorum/protocol/src/coordinator_state.rs @@ -236,6 +236,10 @@ impl CoordinatorState { &self.op } + pub fn config(&self) -> &Configuration { + &self.configuration + } + /// Send any required messages as a reconfiguration coordinator /// /// This varies depending upon the current `CoordinatorState`. 
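For context, the new `config()` accessor exists so that the async task layer can report coordinator progress without reaching into `CoordinatorState` internals. A minimal sketch of the call site, matching the `CoordinatorStatus` handler added to `task.rs` later in this patch:

```rust
// Sketch: how the async task builds a status report from the accessor.
// `get_coordinator_state()` returns `None` when this node is not currently
// coordinating a reconfiguration or LRTQ upgrade.
let status = node.get_coordinator_state().map(|cs| CoordinatorStatus {
    config: cs.config().clone(),
    acked_prepares: cs.op().acked_prepares(),
});
```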
diff --git a/trust-quorum/protocol/src/crypto.rs b/trust-quorum/protocol/src/crypto.rs index 8227bdef5b8..84ba89c4691 100644 --- a/trust-quorum/protocol/src/crypto.rs +++ b/trust-quorum/protocol/src/crypto.rs @@ -130,7 +130,7 @@ impl Clone for ReconstructedRackSecret { } } -#[cfg(test)] +#[cfg(any(test, feature = "testing"))] impl PartialEq for ReconstructedRackSecret { fn eq(&self, other: &Self) -> bool { self.expose_secret().ct_eq(other.expose_secret()).into() diff --git a/trust-quorum/protocol/src/lib.rs b/trust-quorum/protocol/src/lib.rs index 0d5c522b2d4..44f0d75379c 100644 --- a/trust-quorum/protocol/src/lib.rs +++ b/trust-quorum/protocol/src/lib.rs @@ -37,13 +37,14 @@ pub use coordinator_state::{ }; pub use rack_secret_loader::{LoadRackSecretError, RackSecretLoaderDiff}; pub use validators::{ - ValidatedLrtqUpgradeMsgDiff, ValidatedReconfigureMsgDiff, + LrtqUpgradeError, ReconfigurationError, ValidatedLrtqUpgradeMsgDiff, + ValidatedReconfigureMsgDiff, }; pub use alarm::Alarm; -pub use crypto::RackSecret; +pub use crypto::{RackSecret, ReconstructedRackSecret}; pub use messages::*; -pub use node::{Node, NodeDiff}; +pub use node::{CommitError, Node, NodeDiff, PrepareAndCommitError}; // public only for docs. pub use node_ctx::NodeHandlerCtx; pub use node_ctx::{NodeCallerCtx, NodeCommonCtx, NodeCtx, NodeCtxDiff}; diff --git a/trust-quorum/src/connection_manager.rs b/trust-quorum/src/connection_manager.rs index d2b427fd6eb..bcc6bbe4914 100644 --- a/trust-quorum/src/connection_manager.rs +++ b/trust-quorum/src/connection_manager.rs @@ -5,7 +5,7 @@ //! A mechanism for maintaining a full mesh of trust quorum node connections use crate::established_conn::EstablishedConn; -use trust_quorum_protocol::{BaseboardId, PeerMsg}; +use trust_quorum_protocol::{BaseboardId, Envelope, PeerMsg}; // TODO: Move to this crate // https://github.com/oxidecomputer/omicron/issues/9311 @@ -50,7 +50,6 @@ pub enum AcceptError { /// Messages sent from the main task to the connection managing tasks #[derive(Debug)] pub enum MainToConnMsg { - #[expect(unused)] Msg(WireMsg), } @@ -103,7 +102,6 @@ pub enum ConnToMainMsgInner { addr: SocketAddrV6, peer_id: BaseboardId, }, - #[expect(unused)] Received { from: BaseboardId, msg: PeerMsg, @@ -120,7 +118,6 @@ pub enum ConnToMainMsgInner { pub struct TaskHandle { pub abort_handle: AbortHandle, - #[expect(unused)] pub tx: mpsc::Sender, pub conn_type: ConnectionType, } @@ -137,6 +134,10 @@ impl TaskHandle { pub fn abort(&self) { self.abort_handle.abort() } + + pub async fn send(&self, msg: PeerMsg) { + let _ = self.tx.send(MainToConnMsg::Msg(WireMsg::Tq(msg))).await; + } } impl BiHashItem for TaskHandle { @@ -178,6 +179,10 @@ impl EstablishedTaskHandle { pub fn abort(&self) { self.task_handle.abort(); } + + pub async fn send(&self, msg: PeerMsg) { + let _ = self.task_handle.send(msg).await; + } } impl TriHashItem for EstablishedTaskHandle { @@ -375,6 +380,14 @@ impl ConnMgr { self.listen_addr } + pub async fn send(&self, envelope: Envelope) { + let Envelope { to, msg, .. } = envelope; + info!(self.log, "Sending {msg:?}"; "peer_id" => %to); + if let Some(handle) = self.established.get1(&to) { + handle.send(msg).await; + } + } + /// Perform any polling related operations that the connection /// manager must perform concurrently. pub async fn step( @@ -576,13 +589,15 @@ impl ConnMgr { /// easiest way to achieve this is to only connect to peers with addresses /// that sort less than our own and tear down any connections that no longer /// exist in `addrs`. 
+ /// + /// Return the `BaseboardId` of all peers that have been disconnected. pub async fn update_bootstrap_connections( &mut self, addrs: BTreeSet, corpus: Vec, - ) { + ) -> BTreeSet { if self.bootstrap_addrs == addrs { - return; + return BTreeSet::new(); } // We don't try to compare addresses from accepted nodes. If DDMD @@ -610,9 +625,13 @@ impl ConnMgr { self.connect_client(corpus.clone(), addr).await; } + let mut disconnected_peers = BTreeSet::new(); for addr in to_disconnect { - self.disconnect_client(addr).await; + if let Some(peer_id) = self.disconnect_client(addr).await { + disconnected_peers.insert(peer_id); + } } + disconnected_peers } /// Spawn a task to estalbish a sprockets connection for the given address @@ -691,7 +710,13 @@ impl ConnMgr { /// /// We don't tear down server connections this way as we don't know their /// listen port, just the ephemeral port. - async fn disconnect_client(&mut self, addr: SocketAddrV6) { + /// + /// Return the `BaseboardId` of the peer if an established connection is + // torn down. + async fn disconnect_client( + &mut self, + addr: SocketAddrV6, + ) -> Option { if let Some(handle) = self.connecting.remove2(&addr) { // The connection has not yet completed its handshake info!( @@ -700,6 +725,7 @@ impl ConnMgr { "remote_addr" => %addr ); handle.abort(); + None } else { if let Some(handle) = self.established.remove3(&addr) { info!( @@ -709,6 +735,9 @@ impl ConnMgr { "peer_id" => %handle.baseboard_id ); handle.abort(); + Some(handle.baseboard_id) + } else { + None } } } diff --git a/trust-quorum/src/task.rs b/trust-quorum/src/task.rs index 6e821f197db..0bbae4ac142 100644 --- a/trust-quorum/src/task.rs +++ b/trust-quorum/src/task.rs @@ -8,15 +8,29 @@ use crate::connection_manager::{ ConnMgr, ConnMgrStatus, ConnToMainMsg, ConnToMainMsgInner, }; +use omicron_uuid_kinds::RackUuid; +use serde::{Deserialize, Serialize}; use slog::{Logger, debug, error, info, o}; use sprockets_tls::keys::SprocketsConfig; use std::collections::BTreeSet; use std::net::SocketAddrV6; +use std::time::Duration; use thiserror::Error; use tokio::sync::mpsc::error::SendError; use tokio::sync::oneshot::error::RecvError; use tokio::sync::{mpsc, oneshot}; -use trust_quorum_protocol::{BaseboardId, Node, NodeCtx}; +use tokio::time::sleep; +use trust_quorum_protocol::{ + Alarm, BaseboardId, CommitError, Configuration, Epoch, ExpungedMetadata, + LoadRackSecretError, LrtqUpgradeError, LrtqUpgradeMsg, Node, NodeCallerCtx, + NodeCommonCtx, NodeCtx, PersistentState, PrepareAndCommitError, + ReconfigurationError, ReconfigureMsg, ReconstructedRackSecret, +}; + +#[cfg(not(test))] +const LOAD_RACK_SECRET_RETRY_TIMEOUT: Duration = Duration::from_millis(500); +#[cfg(test)] +const LOAD_RACK_SECRET_RETRY_TIMEOUT: Duration = Duration::from_millis(5); /// We only expect a handful of messages at a time. const API_CHANNEL_BOUND: usize = 32; @@ -39,6 +53,45 @@ pub struct Config { pub sprockets: SprocketsConfig, } +/// Status of the node coordinating the `Prepare` phase of a reconfiguration or +/// LRTQ upgrade. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct CoordinatorStatus { + config: Configuration, + acked_prepares: BTreeSet, +} + +// Details about a given node's status +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct NodeStatus { + connected_peers: BTreeSet, + alarms: BTreeSet, + persistent_state: NodePersistentStateSummary, +} + +/// A summary of a node's persistent state, leaving out things like key shares +/// and hashes. 
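+/// (Epochs stand in for the share and configuration contents themselves,
+/// which keeps this summary safe to return from status APIs and to log.)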
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct NodePersistentStateSummary { + has_lrtq_share: bool, + configs: BTreeSet, + shares: BTreeSet, + commits: BTreeSet, + expunged: Option, +} + +impl From<&PersistentState> for NodePersistentStateSummary { + fn from(value: &PersistentState) -> Self { + Self { + has_lrtq_share: value.lrtq.is_some(), + configs: value.configs.iter().map(|c| c.epoch).collect(), + shares: value.shares.keys().cloned().collect(), + commits: value.commits.clone(), + expunged: value.expunged.clone(), + } + } +} + /// A request sent to the `NodeTask` from the `NodeTaskHandle` pub enum NodeApiRequest { /// Inform the `Node` of currently known IP addresses on the bootstrap network @@ -46,9 +99,51 @@ pub enum NodeApiRequest { /// These are generated from DDM prefixes learned by the bootstrap agent. BootstrapAddresses(BTreeSet), + /// Remove any secrets cached in memory at this node + ClearSecrets, + /// Retrieve connectivity status via the `ConnMgr` ConnMgrStatus { responder: oneshot::Sender }, + /// Return the status of this node if it is a coordinator + CoordinatorStatus { responder: oneshot::Sender> }, + + /// Load a rack secret for the given epoch + LoadRackSecret { + epoch: Epoch, + responder: oneshot::Sender< + Result, LoadRackSecretError>, + >, + }, + + /// Coordinate an upgrade from LRTQ at this node + LrtqUpgrade { + msg: LrtqUpgradeMsg, + responder: oneshot::Sender>, + }, + + /// Get the overall status of the node + NodeStatus { responder: oneshot::Sender }, + + /// `PrepareAndCommit` a configuration at this node + PrepareAndCommit { + config: Configuration, + responder: oneshot::Sender>, + }, + + /// `Commit` a configuration at this node + Commit { + rack_id: RackUuid, + epoch: Epoch, + responder: oneshot::Sender>, + }, + + /// Coordinate a reconfiguration at this node + Reconfigure { + msg: ReconfigureMsg, + responder: oneshot::Sender>, + }, + /// Shutdown the node's tokio tasks Shutdown, } @@ -56,10 +151,20 @@ pub enum NodeApiRequest { /// An error response from a `NodeApiRequest` #[derive(Error, Debug, PartialEq)] pub enum NodeApiError { - #[error("Failed to send request to node task")] + #[error("failed to send request to node task")] Send, - #[error("Failed to receive response from node task")] + #[error("failed to receive response from node task")] Recv, + #[error("failed to reconfigure trust quorum")] + Reconfigure(#[from] ReconfigurationError), + #[error("failed to load rack secret")] + LoadRackSecret(#[from] LoadRackSecretError), + #[error("failed to upgrade from LRTQ")] + LrtqUpgrade(#[from] LrtqUpgradeError), + #[error("failed to prepare and commit")] + PrepareAndCommit(#[from] PrepareAndCommitError), + #[error("failed to commit")] + Commit(#[from] CommitError), } impl From> for NodeApiError { @@ -82,7 +187,7 @@ pub struct NodeTaskHandle { } impl NodeTaskHandle { - /// Return the actual port being listened on + /// Return the actual ip and port being listened on /// /// This is useful when the port passed in was `0`. 
pub fn listen_addr(&self) -> SocketAddrV6 { @@ -93,6 +198,114 @@ impl NodeTaskHandle { &self.baseboard_id } + /// Initiate a trust quorum reconfiguration at this node + pub async fn reconfigure( + &self, + msg: ReconfigureMsg, + ) -> Result<(), NodeApiError> { + let (tx, rx) = oneshot::channel(); + self.tx + .send(NodeApiRequest::Reconfigure { msg, responder: tx }) + .await?; + rx.await??; + Ok(()) + } + + /// Initiate an LRTQ upgrade at this node + pub async fn upgrade_from_lrtq( + &self, + msg: LrtqUpgradeMsg, + ) -> Result<(), NodeApiError> { + let (tx, rx) = oneshot::channel(); + self.tx + .send(NodeApiRequest::LrtqUpgrade { msg, responder: tx }) + .await?; + rx.await??; + Ok(()) + } + + /// Return the status of this node if it is coordinating the `Prepare` phase + /// of a reconfiguration or LRTQ upgrade. Return `Ok(None)` or an error + /// otherwise. + pub async fn coordinator_status( + &self, + ) -> Result, NodeApiError> { + let (tx, rx) = oneshot::channel(); + self.tx + .send(NodeApiRequest::CoordinatorStatus { responder: tx }) + .await?; + let res = rx.await?; + Ok(res) + } + + /// Load the rack secret for the given epoch + /// + /// This can block for an indefinite period of time before returning + /// and depends on availability of the trust quorum. + pub async fn load_rack_secret( + &self, + epoch: Epoch, + ) -> Result { + loop { + let (tx, rx) = oneshot::channel(); + self.tx + .send(NodeApiRequest::LoadRackSecret { epoch, responder: tx }) + .await?; + if let Some(rack_secret) = rx.await?? { + return Ok(rack_secret); + }; + + // The task returns immediately with `None` if the secret is still + // being loaded. We must therefore retry. + sleep(LOAD_RACK_SECRET_RETRY_TIMEOUT).await; + } + } + + /// Return `Ok(true)` if the configuration has committed, `Ok(false)` if + /// it hasn't committed yet, or an error otherwise. + /// + /// Nexus will retry this operation and so we should only try once here. + /// This is in contrast to operations like `load_rack_secret` that are + /// called directly from sled agent. + pub async fn prepare_and_commit( + &self, + config: Configuration, + ) -> Result { + let (tx, rx) = oneshot::channel(); + self.tx + .send(NodeApiRequest::PrepareAndCommit { config, responder: tx }) + .await?; + let res = rx.await??; + Ok(res) + } + + /// Return `Ok(true)` if the configuration has committed, `Ok(false)` if + /// it hasn't committed yet, or an error otherwise. + /// + /// Nexus will retry this operation and so we should only try once here. + /// This is in contrast to operations like `load_rack_secret` that are + /// called directly from sled agent. + pub async fn commit( + &self, + rack_id: RackUuid, + epoch: Epoch, + ) -> Result { + let (tx, rx) = oneshot::channel(); + self.tx + .send(NodeApiRequest::Commit { rack_id, epoch, responder: tx }) + .await?; + let res = rx.await??; + Ok(res) + } + + /// Clear all secrets loaded in memory at this node + /// + /// Rack secrets are cached after loading and must be manually cleared. + pub async fn clear_secrets(&self) -> Result<(), NodeApiError> { + self.tx.send(NodeApiRequest::ClearSecrets).await?; + Ok(()) + } + /// Inform the node of currently known IP addresses on the bootstrap network /// /// These are generated from DDM prefixes learned by the bootstrap agent. 
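Taken together, these handle methods give sled-agent a small, retry-oriented driver API. A hypothetical caller might drive a reconfiguration as in the sketch below; the `handle`, `msg`, `rack_id`, and `epoch` bindings are illustrative, not part of the patch, and error handling is elided.

```rust
// Illustrative sketch only, not production wiring.
handle.reconfigure(msg).await?;

// `commit` deliberately tries once and reports back (Nexus owns the retry
// loop), so a caller polls until it returns true.
while !handle.commit(rack_id, epoch).await? {
    tokio::time::sleep(std::time::Duration::from_millis(50)).await;
}

// `load_rack_secret` retries internally until the quorum can reconstruct
// the secret; cached secrets must then be cleared explicitly.
let rack_secret = handle.load_rack_secret(epoch).await?;
handle.clear_secrets().await?;
```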
@@ -111,6 +324,13 @@ impl NodeTaskHandle {
         Ok(res)
     }
 
+    pub async fn status(&self) -> Result<NodeStatus, NodeApiError> {
+        let (tx, rx) = oneshot::channel();
+        self.tx.send(NodeApiRequest::NodeStatus { responder: tx }).await?;
+        let res = rx.await?;
+        Ok(res)
+    }
+
     pub async fn shutdown(&self) -> Result<(), NodeApiError> {
         self.tx.send(NodeApiRequest::Shutdown).await?;
         Ok(())
@@ -122,9 +342,7 @@ pub struct NodeTask {
     log: Logger,
     #[expect(unused)]
     config: Config,
-    #[expect(unused)]
     node: Node,
-    #[expect(unused)]
     ctx: NodeCtx,
     conn_mgr: ConnMgr,
     conn_mgr_rx: mpsc::Receiver<ConnToMainMsg>,
@@ -196,7 +414,10 @@ impl NodeTask {
                 Some(msg) = self.conn_mgr_rx.recv() => {
                     self.on_conn_msg(msg).await
                 }
+            }
+            for envelope in self.ctx.drain_envelopes() {
+                self.conn_mgr.send(envelope).await;
             }
         }
     }
@@ -207,19 +428,22 @@ impl NodeTask {
         match msg.msg {
             ConnToMainMsgInner::Accepted { addr, peer_id } => {
                 self.conn_mgr
-                    .server_handshake_completed(task_id, addr, peer_id)
+                    .server_handshake_completed(task_id, addr, peer_id.clone())
                     .await;
+                self.node.on_connect(&mut self.ctx, peer_id);
             }
             ConnToMainMsgInner::Connected { addr, peer_id } => {
                 self.conn_mgr
-                    .client_handshake_completed(task_id, addr, peer_id)
+                    .client_handshake_completed(task_id, addr, peer_id.clone())
                     .await;
+                self.node.on_connect(&mut self.ctx, peer_id);
             }
             ConnToMainMsgInner::Disconnected { peer_id } => {
-                self.conn_mgr.on_disconnected(task_id, peer_id).await;
+                self.conn_mgr.on_disconnected(task_id, peer_id.clone()).await;
+                self.node.on_disconnect(&mut self.ctx, peer_id);
            }
-            ConnToMainMsgInner::Received { from: _, msg: _ } => {
-                todo!();
+            ConnToMainMsgInner::Received { from, msg } => {
+                self.node.handle(&mut self.ctx, from, msg);
             }
             ConnToMainMsgInner::ReceivedNetworkConfig {
                 from: _,
@@ -230,18 +454,77 @@
             }
         }
     }
 
+    // TODO: Process `ctx`: save persistent state
     async fn on_api_request(&mut self, request: NodeApiRequest) {
         match request {
             NodeApiRequest::BootstrapAddresses(addrs) => {
                 info!(self.log, "Updated Peer Addresses: {addrs:?}");
                 // TODO: real corpus
                 let corpus = vec![];
-                self.conn_mgr.update_bootstrap_connections(addrs, corpus).await;
+                let disconnected = self
+                    .conn_mgr
+                    .update_bootstrap_connections(addrs, corpus)
+                    .await;
+                for peer_id in disconnected {
+                    self.node.on_disconnect(&mut self.ctx, peer_id);
+                }
+            }
+            NodeApiRequest::ClearSecrets => {
+                self.node.clear_secrets();
+            }
+            NodeApiRequest::Commit { rack_id, epoch, responder } => {
+                let res = self
+                    .node
+                    .commit_configuration(&mut self.ctx, rack_id, epoch)
+                    .map(|_| {
+                        self.ctx.persistent_state().commits.contains(&epoch)
+                    });
+                let _ = responder.send(res);
             }
             NodeApiRequest::ConnMgrStatus { responder } => {
                 debug!(self.log, "Received Request for ConnMgrStatus");
                 let _ = responder.send(self.conn_mgr.status());
             }
+            NodeApiRequest::CoordinatorStatus { responder } => {
+                let status = self.node.get_coordinator_state().map(|cs| {
+                    CoordinatorStatus {
+                        config: cs.config().clone(),
+                        acked_prepares: cs.op().acked_prepares(),
+                    }
+                });
+                let _ = responder.send(status);
+            }
+            NodeApiRequest::LoadRackSecret { epoch, responder } => {
+                let res = self.node.load_rack_secret(&mut self.ctx, epoch);
+                let _ = responder.send(res);
+            }
+            NodeApiRequest::LrtqUpgrade { msg, responder } => {
+                let res =
+                    self.node.coordinate_upgrade_from_lrtq(&mut self.ctx, msg);
+                let _ = responder.send(res);
+            }
+            NodeApiRequest::NodeStatus { responder } => {
+                let _ = responder.send(NodeStatus {
+                    connected_peers: self.ctx.connected().clone(),
+                    alarms: self.ctx.alarms().clone(),
+                    persistent_state: self.ctx.persistent_state().into(),
+                });
+            }
+            NodeApiRequest::PrepareAndCommit { config, responder } => {
+                let epoch = config.epoch;
+                let res = self
+                    .node
+                    .prepare_and_commit(&mut self.ctx, config)
+                    .map(|_| {
+                        self.ctx.persistent_state().commits.contains(&epoch)
+                    });
+                let _ = responder.send(res);
+            }
+            NodeApiRequest::Reconfigure { msg, responder } => {
+                let res =
+                    self.node.coordinate_reconfiguration(&mut self.ctx, msg);
+                let _ = responder.send(res);
+            }
             NodeApiRequest::Shutdown => {
                 info!(self.log, "Shutting down Node tokio tasks");
                 self.shutdown = true;
@@ -257,15 +540,20 @@ mod tests {
         ConnState, RECONNECT_TIME, platform_id_to_baseboard_id,
     };
     use camino::Utf8PathBuf;
-    use dropshot::test_util::log_prefix_for_test;
+    use dropshot::test_util::{LogContext, log_prefix_for_test};
     use omicron_test_utils::dev::poll::{CondCheckError, wait_for_condition};
     use omicron_test_utils::dev::test_setup_log;
+    use omicron_uuid_kinds::GenericUuid;
+    use secrecy::ExposeSecretMut;
+    use sled_hardware_types::Baseboard;
     use sprockets_tls::keys::ResolveSetting;
     use sprockets_tls_test_utils::{
         alias_prefix, cert_path, certlist_path, private_key_path, root_prefix,
         sprockets_auth_prefix,
     };
     use std::time::Duration;
+    use tokio::task::JoinHandle;
+    use trust_quorum_protocol::NodeHandlerCtx;
 
     fn pki_doc_to_node_configs(dir: Utf8PathBuf, n: usize) -> Vec<Config> {
         (1..=n)
@@ -304,17 +592,7 @@
             .collect()
     }
 
-    /// Test that all nodes can connect to each other when given each the full
-    /// set of "bootstrap addresses".
-    #[tokio::test]
-    async fn full_mesh_connectivity() {
-        let logctx = test_setup_log("full_mesh_connectivity");
-        let (mut dir, s) = log_prefix_for_test("full_mesh_connectivity");
-        dir.push(&s);
-        std::fs::create_dir(&dir).unwrap();
-        println!("Writing keys and certs to {dir}");
-        let num_nodes = 4;
-
+    fn write_keys_and_measurements(dir: Utf8PathBuf, num_nodes: usize) {
         let file_behavior =
             sprockets_tls_test_utils::OutputFileExistsBehavior::Overwrite;
@@ -339,22 +617,144 @@
         // Write out the log document to the filesystem
         let out = attest_mock::log::mock(attest_log_doc).unwrap();
         std::fs::write(dir.join("log.bin"), &out).unwrap();
+    }
 
-        let configs = pki_doc_to_node_configs(dir.clone(), num_nodes);
+    struct TestSetup {
+        pub logctx: LogContext,
+        pub dir: Utf8PathBuf,
+        pub configs: Vec<Config>,
+        pub node_handles: Vec<NodeTaskHandle>,
+        pub join_handles: Vec<JoinHandle<()>>,
+        pub listen_addrs: Vec<SocketAddrV6>,
+    }
 
-        let mut node_handles = vec![];
-        let mut join_handles = vec![];
-        for config in configs.clone() {
-            let (mut task, handle) = NodeTask::new(config, &logctx.log).await;
-            node_handles.push(handle);
-            join_handles.push(tokio::spawn(async move { task.run().await }));
+    impl TestSetup {
+        pub async fn spawn_nodes(
+            name: &'static str,
+            num_nodes: usize,
+        ) -> TestSetup {
+            let logctx = test_setup_log(name);
+            let (mut dir, s) = log_prefix_for_test(name);
+            dir.push(&s);
+            std::fs::create_dir(&dir).unwrap();
+            println!("Writing keys and certs to {dir}");
+            write_keys_and_measurements(dir.clone(), num_nodes);
+            let configs = pki_doc_to_node_configs(dir.clone(), num_nodes);
+
+            let mut node_handles = vec![];
+            let mut join_handles = vec![];
+            for config in configs.clone() {
+                let (mut task, handle) =
+                    NodeTask::new(config, &logctx.log).await;
+                node_handles.push(handle);
+                join_handles
+                    .push(tokio::spawn(async move { task.run().await }));
+            }
+
+            let listen_addrs: Vec<_> =
+                node_handles.iter().map(|h| h.listen_addr()).collect();
+            TestSetup {
+                logctx,
+                dir,
+                configs,
+                node_handles,
+                join_handles,
+                listen_addrs,
+            }
         }
 
+        pub async fn spawn_nodes_with_lrtq_shares(
+            name: &'static str,
+            num_nodes: usize,
+        ) -> (TestSetup, RackUuid) {
+            let logctx = test_setup_log(name);
+            let (mut dir, s) = log_prefix_for_test(name);
+            dir.push(&s);
+            std::fs::create_dir(&dir).unwrap();
+            println!("Writing keys and certs to {dir}");
+            write_keys_and_measurements(dir.clone(), num_nodes);
+            let configs = pki_doc_to_node_configs(dir.clone(), num_nodes);
 
-        let listen_addrs: BTreeSet<_> =
-            node_handles.iter().map(|h| h.listen_addr()).collect();
+            let rack_id = RackUuid::new_v4();
+
+            // Translate `BaseboardId`s to `Baseboard`s for LRTQ membership
+            let baseboards: BTreeSet<_> = configs
+                .iter()
+                .map(|c| {
+                    Baseboard::new_pc(
+                        c.baseboard_id.serial_number.clone(),
+                        c.baseboard_id.part_number.clone(),
+                    )
+                })
+                .collect();
+
+            // Create the LRTQ key share packages and take only the common data,
+            // which is what we use for trust quorum upgrade.
+            let share_pkgs: Vec<_> = bootstore::schemes::v0::create_pkgs(
+                rack_id.into_untyped_uuid(),
+                baseboards.clone(),
+            )
+            .unwrap()
+            .expose_secret_mut()
+            .iter()
+            .map(|pkg| pkg.common.clone())
+            .collect();
 
-        for h in &node_handles {
-            h.load_peer_addresses(listen_addrs.clone()).await.unwrap();
+            let mut node_handles = vec![];
+            let mut join_handles = vec![];
+            for (config, share_pkg) in
+                configs.clone().into_iter().zip(share_pkgs)
+            {
+                let (mut task, handle) =
+                    NodeTask::new(config, &logctx.log).await;
+                task.ctx.update_persistent_state(|ps| {
+                    ps.lrtq = Some(share_pkg);
+                    // We are modifying the persistent state, but not in a way
+                    // that we want the test to recognize.
+                    false
+                });
+                node_handles.push(handle);
+                join_handles
+                    .push(tokio::spawn(async move { task.run().await }));
+            }
+
+            let listen_addrs: Vec<_> =
+                node_handles.iter().map(|h| h.listen_addr()).collect();
+            (
+                TestSetup {
+                    logctx,
+                    dir,
+                    configs,
+                    node_handles,
+                    join_handles,
+                    listen_addrs,
+                },
+                rack_id,
+            )
+        }
+
+        pub fn members(&self) -> impl Iterator<Item = &BaseboardId> {
+            self.configs.iter().map(|c| &c.baseboard_id)
+        }
+
+        pub fn cleanup_successful(self) {
+            self.logctx.cleanup_successful();
+            std::fs::remove_dir_all(self.dir).unwrap();
+        }
+    }
+
+    /// Test that all nodes can connect to each other when each is given the
+    /// full set of "bootstrap addresses".
+    #[tokio::test]
+    async fn full_mesh_connectivity() {
+        let num_nodes = 4;
+        let mut setup =
+            TestSetup::spawn_nodes("full_mesh_connectivity", num_nodes).await;
+
+        for h in &setup.node_handles {
+            h.load_peer_addresses(setup.listen_addrs.iter().cloned().collect())
+                .await
+                .unwrap();
         }
 
         let poll_interval = Duration::from_millis(1);
@@ -364,7 +764,7 @@
         wait_for_condition(
            async || {
                 let mut count = 0;
-                for h in &node_handles {
+                for h in &setup.node_handles {
                     let status = h.conn_mgr_status().await.unwrap();
                     if status
                         .connections
@@ -395,9 +795,9 @@
        // reconnecting. This should cause the task id counter to start
        // incrementing at all nodes and for there to be one fewer established
        // connection.
- let h = node_handles.pop().unwrap(); + let h = setup.node_handles.pop().unwrap(); h.shutdown().await.unwrap(); - join_handles.pop().unwrap(); + setup.join_handles.pop().unwrap(); let stopped_addr = h.listen_addr; // Speed up reconnection in the test @@ -407,7 +807,7 @@ mod tests { wait_for_condition( async || { let mut valid = 0; - for h in &node_handles { + for h in &setup.node_handles { let status = h.conn_mgr_status().await.unwrap(); let established_count = status .connections @@ -443,16 +843,19 @@ mod tests { .unwrap(); // Now let's bring back up the old node and ensure full connectivity again - let (mut task, handle) = - NodeTask::new(configs.last().unwrap().clone(), &logctx.log).await; - node_handles.push(handle.clone()); - join_handles.push(tokio::spawn(async move { task.run().await })); + let (mut task, handle) = NodeTask::new( + setup.configs.last().unwrap().clone(), + &setup.logctx.log, + ) + .await; + setup.node_handles.push(handle.clone()); + setup.join_handles.push(tokio::spawn(async move { task.run().await })); // The port likely changed, so we must refresh everyone's set of addresses let listen_addrs: BTreeSet<_> = - node_handles.iter().map(|h| h.listen_addr()).collect(); + setup.node_handles.iter().map(|h| h.listen_addr()).collect(); - for h in &node_handles { + for h in &setup.node_handles { h.load_peer_addresses(listen_addrs.clone()).await.unwrap(); } @@ -460,7 +863,7 @@ mod tests { wait_for_condition( async || { let mut count = 0; - for h in &node_handles { + for h in &setup.node_handles { let status = h.conn_mgr_status().await.unwrap(); if status .connections @@ -483,7 +886,553 @@ mod tests { .await .unwrap(); - logctx.cleanup_successful(); - std::fs::remove_dir_all(dir).unwrap(); + setup.cleanup_successful(); + } + + /// Commit an initial configuration at all nodes + #[tokio::test] + pub async fn tq_initial_config() { + let num_nodes = 4; + let setup = + TestSetup::spawn_nodes("tq_initial_config", num_nodes).await; + let rack_id = RackUuid::new_v4(); + + // Trigger an initial configuration by using the first node as a + // coordinator. We're pretending to be the sled-agent with instruction from + // Nexus here. + let initial_config = ReconfigureMsg { + rack_id, + epoch: Epoch(1), + last_committed_epoch: None, + members: setup.members().cloned().collect(), + threshold: trust_quorum_protocol::Threshold(3), + }; + + // Tell nodes how to reach each other + for h in &setup.node_handles { + h.load_peer_addresses(setup.listen_addrs.iter().cloned().collect()) + .await + .unwrap(); + } + + let coordinator = setup.node_handles.first().unwrap(); + coordinator.reconfigure(initial_config).await.unwrap(); + + let poll_interval = Duration::from_millis(10); + let poll_max = Duration::from_secs(10); + + // Wait for the coordinator to see `PrepareAck`s from all nodes + wait_for_condition( + async || { + let Ok(Some(s)) = coordinator.coordinator_status().await else { + return Err(CondCheckError::<()>::NotYet); + }; + if s.acked_prepares.len() == num_nodes { + Ok(()) + } else { + Err(CondCheckError::<()>::NotYet) + } + }, + &poll_interval, + &poll_max, + ) + .await + .unwrap(); + + // Commit at each node + // + // Nexus retries this idempotent command until each node acks. So we + // simulate that here. 
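+        //
+        // `commit` returns `Ok(true)` only once the epoch appears in the
+        // node's committed persistent state, and `Ok(false)` while the
+        // prepare is still outstanding, so polling until all nodes return
+        // `true` models that retry loop.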
+ wait_for_condition( + async || { + let mut acked = 0; + for h in &setup.node_handles { + if h.commit(rack_id, Epoch(1)).await.unwrap() { + acked += 1; + } + } + if acked == num_nodes { + Ok(()) + } else { + Err(CondCheckError::<()>::NotYet) + } + }, + &poll_interval, + &poll_max, + ) + .await + .unwrap(); + + // Now load the rack secret at all nodes + let mut secret = None; + for h in &setup.node_handles { + let rs = h.load_rack_secret(Epoch(1)).await.unwrap(); + if secret.is_none() { + secret = Some(rs.clone()); + } + assert_eq!(&rs, secret.as_ref().unwrap()); + } + + setup.cleanup_successful(); + } + + /// Eventually Commit an initial configuration at all nodes + /// + /// We leave one node out of the bootstrap network info, trigger a commit + /// at the first 3 nodes. Then we go and issue a `PrepareAndCommit` to the last + /// node and ensure it commits. + #[tokio::test] + pub async fn tq_initial_config_prepare_and_commit() { + let num_nodes = 4; + let setup = TestSetup::spawn_nodes( + "tq_initial_config_prepare_and_commit", + num_nodes, + ) + .await; + let rack_id = RackUuid::new_v4(); + + // Trigger an initial configuration by using the first node as a + // coordinator. We're pretending to be the sled-agent with instruction from + // Nexus here. + let initial_config = ReconfigureMsg { + rack_id, + epoch: Epoch(1), + last_committed_epoch: None, + members: setup.members().cloned().collect(), + threshold: trust_quorum_protocol::Threshold(3), + }; + + // Tell all but the last node how to reach each other + for h in &setup.node_handles[0..num_nodes - 1] { + h.load_peer_addresses( + setup + .listen_addrs + .iter() + .take(num_nodes - 1) + .cloned() + .collect(), + ) + .await + .unwrap(); + } + + let coordinator = setup.node_handles.first().unwrap(); + coordinator.reconfigure(initial_config).await.unwrap(); + + let poll_interval = Duration::from_millis(10); + let poll_max = Duration::from_secs(10); + + // Wait for the coordinator to see `PrepareAck`s from all but the last + // node + wait_for_condition( + async || { + let Ok(Some(s)) = coordinator.coordinator_status().await else { + return Err(CondCheckError::<()>::NotYet); + }; + if s.acked_prepares.len() == num_nodes - 1 { + Ok(()) + } else { + Err(CondCheckError::<()>::NotYet) + } + }, + &poll_interval, + &poll_max, + ) + .await + .unwrap(); + + // Save the configuration as if we were nexus + let config = + coordinator.coordinator_status().await.unwrap().unwrap().config; + + // Commit at each node + // + // Nexus retries this idempotent command until each node acks. So we + // simulate that here. + wait_for_condition( + async || { + let mut acked = 0; + for h in &setup.node_handles[0..num_nodes - 1] { + if h.commit(rack_id, Epoch(1)).await.unwrap() { + acked += 1; + } + } + if acked == num_nodes - 1 { + Ok(()) + } else { + Err(CondCheckError::<()>::NotYet) + } + }, + &poll_interval, + &poll_max, + ) + .await + .unwrap(); + + // Now ensure that the last node still hasn't prepared or committed for + // epoch 1, and isn't connected to any other node. 
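+        //
+        // It was left out of the bootstrap address set above, so it never
+        // received a `Prepare` for epoch 1 and its persistent state should
+        // still be empty.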
+        let status = setup.node_handles.last().unwrap().status().await.unwrap();
+        assert!(status.connected_peers.is_empty());
+        assert!(status.persistent_state.configs.is_empty());
+        assert!(status.persistent_state.shares.is_empty());
+        assert!(status.persistent_state.commits.is_empty());
+
+        // Update connectivity at all nodes
+        for h in &setup.node_handles {
+            h.load_peer_addresses(setup.listen_addrs.iter().cloned().collect())
+                .await
+                .unwrap();
+        }
+
+        // Now issue a `PrepareAndCommit` to the last node and wait for it to
+        // commit
+        wait_for_condition(
+            async || {
+                let h = &setup.node_handles.last().unwrap();
+                if h.prepare_and_commit(config.clone()).await.unwrap() {
+                    Ok(())
+                } else {
+                    Err(CondCheckError::<()>::NotYet)
+                }
+            },
+            &poll_interval,
+            &poll_max,
+        )
+        .await
+        .unwrap();
+
+        // The last node should now have all the info we expect
+        let status = setup.node_handles.last().unwrap().status().await.unwrap();
+        assert_eq!(status.connected_peers.len(), num_nodes - 1);
+        assert!(status.persistent_state.configs.contains(&Epoch(1)));
+        assert!(status.persistent_state.shares.contains(&Epoch(1)));
+        assert!(status.persistent_state.commits.contains(&Epoch(1)));
+
+        // Now load the rack secret at all nodes
+        let mut secret = None;
+        for h in &setup.node_handles {
+            let rs = h.load_rack_secret(Epoch(1)).await.unwrap();
+            if secret.is_none() {
+                secret = Some(rs.clone());
+            }
+            assert_eq!(&rs, secret.as_ref().unwrap());
+        }
+
+        setup.cleanup_successful();
+    }
+
+    /// Perform an initial config, followed by a reconfiguration. Leave one
+    /// node out of the reconfiguration, then connect it and attempt to load
+    /// the rack secret for the prior epoch. This should result in the node's
+    /// commit advancing to the latest epoch.
+    #[tokio::test]
+    pub async fn tq_reconfig_with_commit_advance() {
+        let num_nodes = 4;
+        let setup = TestSetup::spawn_nodes(
+            "tq_reconfig_with_commit_advance",
+            num_nodes,
+        )
+        .await;
+        let rack_id = RackUuid::new_v4();
+
+        // Trigger an initial configuration by using the first node as a
+        // coordinator. We're pretending to be the sled-agent with instruction from
+        // Nexus here.
+        let initial_config = ReconfigureMsg {
+            rack_id,
+            epoch: Epoch(1),
+            last_committed_epoch: None,
+            members: setup.members().cloned().collect(),
+            threshold: trust_quorum_protocol::Threshold(3),
+        };
+
+        // Tell all nodes how to reach each other
+        for h in &setup.node_handles {
+            h.load_peer_addresses(setup.listen_addrs.iter().cloned().collect())
+                .await
+                .unwrap();
+        }
+
+        let coordinator = setup.node_handles.first().unwrap();
+        coordinator.reconfigure(initial_config.clone()).await.unwrap();
+
+        let poll_interval = Duration::from_millis(10);
+        let poll_max = Duration::from_secs(10);
+
+        // Wait for the coordinator to see `PrepareAck`s from all nodes
+        wait_for_condition(
+            async || {
+                let Ok(Some(s)) = coordinator.coordinator_status().await else {
+                    return Err(CondCheckError::<()>::NotYet);
+                };
+                if s.acked_prepares.len() == num_nodes {
+                    Ok(())
+                } else {
+                    Err(CondCheckError::<()>::NotYet)
+                }
+            },
+            &poll_interval,
+            &poll_max,
+        )
+        .await
+        .unwrap();
+
+        // Commit at each node
+        //
+        // Nexus retries this idempotent command until each node acks. So we
+        // simulate that here.
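+        //
+        // This is the first of two commit rounds in this test: epoch 1 at
+        // all four nodes here, then epoch 2 at all but the last node after
+        // the reconfiguration below.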
+ wait_for_condition( + async || { + let mut acked = 0; + for h in &setup.node_handles { + if h.commit(rack_id, Epoch(1)).await.unwrap() { + acked += 1; + } + } + if acked == num_nodes { + Ok(()) + } else { + Err(CondCheckError::<()>::NotYet) + } + }, + &poll_interval, + &poll_max, + ) + .await + .unwrap(); + + // Now load the rack secret at all nodes + let mut secret = None; + for h in &setup.node_handles { + let rs = h.load_rack_secret(Epoch(1)).await.unwrap(); + if secret.is_none() { + secret = Some(rs.clone()); + } + assert_eq!(&rs, secret.as_ref().unwrap()); + } + + // Tell all but the last node how to reach each other + // This should disconnect the last node from everybody + for h in &setup.node_handles[0..num_nodes - 1] { + h.load_peer_addresses( + setup.listen_addrs.iter().take(3).cloned().collect(), + ) + .await + .unwrap(); + } + setup + .node_handles + .last() + .unwrap() + .load_peer_addresses(BTreeSet::new()) + .await + .unwrap(); + + // Wait for peers to disconnect + wait_for_condition( + async || { + let mut acked = 0; + for h in &setup.node_handles[0..num_nodes - 1] { + let status = h.status().await.unwrap(); + if status.connected_peers.len() == num_nodes - 2 { + acked += 1; + } + } + let status = + setup.node_handles.last().unwrap().status().await.unwrap(); + if status.connected_peers.is_empty() { + acked += 1; + } + + if acked == num_nodes { + Ok(()) + } else { + Err(CondCheckError::<()>::NotYet) + } + }, + &poll_interval, + &poll_max, + ) + .await + .unwrap(); + + // Just stick to the same set of nodes for simplicity + let mut new_config = initial_config; + new_config.epoch = Epoch(2); + new_config.last_committed_epoch = Some(Epoch(1)); + + // Pick a different coordinator for the hell of it + let coordinator = setup.node_handles.get(1).unwrap(); + coordinator.reconfigure(new_config).await.unwrap(); + + // Wait for the coordinator to see `PrepareAck`s from all but the last + // node + wait_for_condition( + async || { + let Ok(Some(s)) = coordinator.coordinator_status().await else { + return Err(CondCheckError::<()>::NotYet); + }; + if s.acked_prepares.len() == num_nodes - 1 { + Ok(()) + } else { + Err(CondCheckError::<()>::NotYet) + } + }, + &poll_interval, + &poll_max, + ) + .await + .unwrap(); + + // Commit at each node + // + // Nexus retries this idempotent command until each node acks. So we + // simulate that here. + wait_for_condition( + async || { + let mut acked = 0; + for h in &setup.node_handles[0..num_nodes - 1] { + if h.commit(rack_id, Epoch(2)).await.unwrap() { + acked += 1; + } + } + if acked == num_nodes - 1 { + Ok(()) + } else { + Err(CondCheckError::<()>::NotYet) + } + }, + &poll_interval, + &poll_max, + ) + .await + .unwrap(); + + // Now ensure that the last node still hasn't prepared or committed for epoch 2, + // and isn't connected to any other node. + let status = setup.node_handles.last().unwrap().status().await.unwrap(); + assert!(status.connected_peers.is_empty()); + assert!(status.persistent_state.configs.contains(&Epoch(1))); + assert!(status.persistent_state.shares.contains(&Epoch(1))); + assert!(status.persistent_state.commits.contains(&Epoch(1))); + assert!(!status.persistent_state.configs.contains(&Epoch(2))); + assert!(!status.persistent_state.shares.contains(&Epoch(2))); + assert!(!status.persistent_state.commits.contains(&Epoch(2))); + + // Now reconnect the last node. 
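+        //
+        // Give every node the full address set again so the isolated node
+        // rejoins the mesh. Once connected, peers that already committed
+        // epoch 2 can answer its epoch 1 share requests with `CommitAdvance`.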
+ for h in &setup.node_handles { + h.load_peer_addresses(setup.listen_addrs.iter().cloned().collect()) + .await + .unwrap(); + } + + // Clear the rack secrets at the last node to force a request for shares. + let last_node = setup.node_handles.last().unwrap(); + last_node.clear_secrets().await.unwrap(); + + // Load the secret at epoch 1. This should trigger a `CommitAdvance` + // response from nodes that committed at epoch 2. + let res = + setup.node_handles.last().unwrap().load_rack_secret(Epoch(1)).await; + + println!("res = {res:#?}"); + + let rs = last_node.load_rack_secret(Epoch(2)).await.unwrap(); + + // Ensure the rack secret is the same as at another node + let expected = setup + .node_handles + .first() + .unwrap() + .load_rack_secret(Epoch(2)) + .await + .unwrap(); + assert_eq!(rs, expected); + + setup.cleanup_successful(); + } + + #[tokio::test] + pub async fn tq_upgrade_from_lrtq() { + let num_nodes = 4; + let (setup, rack_id) = TestSetup::spawn_nodes_with_lrtq_shares( + "tq_upgrade_from_lrtq", + num_nodes, + ) + .await; + + let msg = LrtqUpgradeMsg { + rack_id, + epoch: Epoch(2), + members: setup.members().cloned().collect(), + threshold: trust_quorum_protocol::Threshold(3), + }; + + // Tell nodes how to reach each other + for h in &setup.node_handles { + h.load_peer_addresses(setup.listen_addrs.iter().cloned().collect()) + .await + .unwrap(); + } + + let coordinator = setup.node_handles.first().unwrap(); + coordinator.upgrade_from_lrtq(msg).await.unwrap(); + + let poll_interval = Duration::from_millis(10); + let poll_max = Duration::from_secs(10); + + // Wait for the coordinator to see `PrepareAck`s from all nodes + wait_for_condition( + async || { + let Ok(Some(s)) = coordinator.coordinator_status().await else { + return Err(CondCheckError::<()>::NotYet); + }; + if s.acked_prepares.len() == num_nodes { + Ok(()) + } else { + Err(CondCheckError::<()>::NotYet) + } + }, + &poll_interval, + &poll_max, + ) + .await + .unwrap(); + + // Commit at each node + // + // Nexus retries this idempotent command until each node acks. So we + // simulate that here. + wait_for_condition( + async || { + let mut acked = 0; + for h in &setup.node_handles { + if h.commit(rack_id, Epoch(2)).await.unwrap() { + acked += 1; + } + } + if acked == num_nodes { + Ok(()) + } else { + Err(CondCheckError::<()>::NotYet) + } + }, + &poll_interval, + &poll_max, + ) + .await + .unwrap(); + + // Now load the rack secret at all nodes + let mut secret = None; + for h in &setup.node_handles { + let rs = h.load_rack_secret(Epoch(1)).await.unwrap(); + if secret.is_none() { + secret = Some(rs.clone()); + } + assert_eq!(&rs, secret.as_ref().unwrap()); + } + + setup.cleanup_successful(); } } From aaad0c4804b4179263dc71657411ecaed7866e78 Mon Sep 17 00:00:00 2001 From: "Andrew J. 
Stone" Date: Fri, 31 Oct 2025 19:34:19 +0000 Subject: [PATCH 19/22] Do not internally retry for `load_rack_secret` --- trust-quorum/src/task.rs | 146 +++++++++++++++++++++------------------ 1 file changed, 79 insertions(+), 67 deletions(-) diff --git a/trust-quorum/src/task.rs b/trust-quorum/src/task.rs index 0bbae4ac142..ee547c39ff3 100644 --- a/trust-quorum/src/task.rs +++ b/trust-quorum/src/task.rs @@ -14,12 +14,10 @@ use slog::{Logger, debug, error, info, o}; use sprockets_tls::keys::SprocketsConfig; use std::collections::BTreeSet; use std::net::SocketAddrV6; -use std::time::Duration; use thiserror::Error; use tokio::sync::mpsc::error::SendError; use tokio::sync::oneshot::error::RecvError; use tokio::sync::{mpsc, oneshot}; -use tokio::time::sleep; use trust_quorum_protocol::{ Alarm, BaseboardId, CommitError, Configuration, Epoch, ExpungedMetadata, LoadRackSecretError, LrtqUpgradeError, LrtqUpgradeMsg, Node, NodeCallerCtx, @@ -27,11 +25,6 @@ use trust_quorum_protocol::{ ReconfigurationError, ReconfigureMsg, ReconstructedRackSecret, }; -#[cfg(not(test))] -const LOAD_RACK_SECRET_RETRY_TIMEOUT: Duration = Duration::from_millis(500); -#[cfg(test)] -const LOAD_RACK_SECRET_RETRY_TIMEOUT: Duration = Duration::from_millis(5); - /// We only expect a handful of messages at a time. const API_CHANNEL_BOUND: usize = 32; @@ -245,20 +238,13 @@ impl NodeTaskHandle { pub async fn load_rack_secret( &self, epoch: Epoch, - ) -> Result { - loop { - let (tx, rx) = oneshot::channel(); - self.tx - .send(NodeApiRequest::LoadRackSecret { epoch, responder: tx }) - .await?; - if let Some(rack_secret) = rx.await?? { - return Ok(rack_secret); - }; - - // The task returns immediately with `None` if the secret is still - // being loaded. We must therefore retry. - sleep(LOAD_RACK_SECRET_RETRY_TIMEOUT).await; - } + ) -> Result, NodeApiError> { + let (tx, rx) = oneshot::channel(); + self.tx + .send(NodeApiRequest::LoadRackSecret { epoch, responder: tx }) + .await?; + let rs = rx.await??; + Ok(rs) } /// Return `Ok(true)` if the configuration has committed, `Ok(false)` if @@ -542,7 +528,7 @@ mod tests { use camino::Utf8PathBuf; use dropshot::test_util::{LogContext, log_prefix_for_test}; use omicron_test_utils::dev::poll::{CondCheckError, wait_for_condition}; - use omicron_test_utils::dev::test_setup_log; + use omicron_test_utils::dev::{self, test_setup_log}; use omicron_uuid_kinds::GenericUuid; use secrecy::ExposeSecretMut; use sled_hardware_types::Baseboard; @@ -741,6 +727,36 @@ mod tests { self.logctx.cleanup_successful(); std::fs::remove_dir_all(self.dir).unwrap(); } + + pub async fn wait_for_rack_secrets_and_assert_equality( + &self, + node_indexes: BTreeSet, + epoch: Epoch, + ) -> Result<(), dev::poll::Error> { + let poll_interval = Duration::from_millis(10); + let poll_max = Duration::from_secs(10); + wait_for_condition( + async || { + let mut secret = None; + for (i, h) in self.node_handles.iter().enumerate() { + if node_indexes.contains(&i) { + let Some(rs) = h.load_rack_secret(epoch).await? 
+ else { + return Err(CondCheckError::NotYet); + }; + if secret.is_none() { + secret = Some(rs.clone()); + } + assert_eq!(&rs, secret.as_ref().unwrap()); + } + } + Ok(()) + }, + &poll_interval, + &poll_max, + ) + .await + } } /// Test that all nodes can connect to each other when given each the full @@ -964,14 +980,13 @@ mod tests { .unwrap(); // Now load the rack secret at all nodes - let mut secret = None; - for h in &setup.node_handles { - let rs = h.load_rack_secret(Epoch(1)).await.unwrap(); - if secret.is_none() { - secret = Some(rs.clone()); - } - assert_eq!(&rs, secret.as_ref().unwrap()); - } + setup + .wait_for_rack_secrets_and_assert_equality( + (0..num_nodes).collect(), + Epoch(1), + ) + .await + .unwrap(); setup.cleanup_successful(); } @@ -1109,14 +1124,13 @@ mod tests { assert!(status.persistent_state.commits.contains(&Epoch(1))); // Now load the rack secret at all nodes - let mut secret = None; - for h in &setup.node_handles { - let rs = h.load_rack_secret(Epoch(1)).await.unwrap(); - if secret.is_none() { - secret = Some(rs.clone()); - } - assert_eq!(&rs, secret.as_ref().unwrap()); - } + setup + .wait_for_rack_secrets_and_assert_equality( + (0..num_nodes).collect(), + Epoch(1), + ) + .await + .unwrap(); setup.cleanup_successful(); } @@ -1202,14 +1216,13 @@ mod tests { .unwrap(); // Now load the rack secret at all nodes - let mut secret = None; - for h in &setup.node_handles { - let rs = h.load_rack_secret(Epoch(1)).await.unwrap(); - if secret.is_none() { - secret = Some(rs.clone()); - } - assert_eq!(&rs, secret.as_ref().unwrap()); - } + setup + .wait_for_rack_secrets_and_assert_equality( + (0..num_nodes).collect(), + Epoch(1), + ) + .await + .unwrap(); // Tell all but the last node how to reach each other // This should disconnect the last node from everybody @@ -1332,22 +1345,22 @@ mod tests { // Load the secret at epoch 1. This should trigger a `CommitAdvance` // response from nodes that committed at epoch 2. - let res = - setup.node_handles.last().unwrap().load_rack_secret(Epoch(1)).await; - - println!("res = {res:#?}"); - - let rs = last_node.load_rack_secret(Epoch(2)).await.unwrap(); + setup + .wait_for_rack_secrets_and_assert_equality( + BTreeSet::from([num_nodes - 1]), + Epoch(1), + ) + .await + .unwrap(); - // Ensure the rack secret is the same as at another node - let expected = setup - .node_handles - .first() - .unwrap() - .load_rack_secret(Epoch(2)) + // Ensure the rack secret at epoch 2 is the same as at another node + setup + .wait_for_rack_secrets_and_assert_equality( + BTreeSet::from([0, num_nodes - 1]), + Epoch(2), + ) .await .unwrap(); - assert_eq!(rs, expected); setup.cleanup_successful(); } @@ -1424,14 +1437,13 @@ mod tests { .unwrap(); // Now load the rack secret at all nodes - let mut secret = None; - for h in &setup.node_handles { - let rs = h.load_rack_secret(Epoch(1)).await.unwrap(); - if secret.is_none() { - secret = Some(rs.clone()); - } - assert_eq!(&rs, secret.as_ref().unwrap()); - } + setup + .wait_for_rack_secrets_and_assert_equality( + (0..num_nodes).collect(), + Epoch(1), + ) + .await + .unwrap(); setup.cleanup_successful(); } From 7beb55d433334ee09d1f4cf066b7fe21688f9183 Mon Sep 17 00:00:00 2001 From: "Andrew J. 
Stone" Date: Fri, 31 Oct 2025 19:56:12 +0000 Subject: [PATCH 20/22] more review fixes for @sunshowers --- trust-quorum/src/task.rs | 131 +++++++++++++++++++++++---------------- 1 file changed, 79 insertions(+), 52 deletions(-) diff --git a/trust-quorum/src/task.rs b/trust-quorum/src/task.rs index ee547c39ff3..24962eb6251 100644 --- a/trust-quorum/src/task.rs +++ b/trust-quorum/src/task.rs @@ -25,6 +25,12 @@ use trust_quorum_protocol::{ ReconfigurationError, ReconfigureMsg, ReconstructedRackSecret, }; +/// Whether or not a configuration has committed or is still underway. +pub enum CommitStatus { + Committed, + Pending, +} + /// We only expect a handful of messages at a time. const API_CHANNEL_BOUND: usize = 32; @@ -96,15 +102,15 @@ pub enum NodeApiRequest { ClearSecrets, /// Retrieve connectivity status via the `ConnMgr` - ConnMgrStatus { responder: oneshot::Sender }, + ConnMgrStatus { tx: oneshot::Sender }, /// Return the status of this node if it is a coordinator - CoordinatorStatus { responder: oneshot::Sender> }, + CoordinatorStatus { tx: oneshot::Sender> }, /// Load a rack secret for the given epoch LoadRackSecret { epoch: Epoch, - responder: oneshot::Sender< + tx: oneshot::Sender< Result, LoadRackSecretError>, >, }, @@ -112,29 +118,29 @@ pub enum NodeApiRequest { /// Coordinate an upgrade from LRTQ at this node LrtqUpgrade { msg: LrtqUpgradeMsg, - responder: oneshot::Sender>, + tx: oneshot::Sender>, }, /// Get the overall status of the node - NodeStatus { responder: oneshot::Sender }, + NodeStatus { tx: oneshot::Sender }, /// `PrepareAndCommit` a configuration at this node PrepareAndCommit { config: Configuration, - responder: oneshot::Sender>, + tx: oneshot::Sender>, }, /// `Commit` a configuration at this node Commit { rack_id: RackUuid, epoch: Epoch, - responder: oneshot::Sender>, + tx: oneshot::Sender>, }, /// Coordinate a reconfiguration at this node Reconfigure { msg: ReconfigureMsg, - responder: oneshot::Sender>, + tx: oneshot::Sender>, }, /// Shutdown the node's tokio tasks @@ -197,9 +203,7 @@ impl NodeTaskHandle { msg: ReconfigureMsg, ) -> Result<(), NodeApiError> { let (tx, rx) = oneshot::channel(); - self.tx - .send(NodeApiRequest::Reconfigure { msg, responder: tx }) - .await?; + self.tx.send(NodeApiRequest::Reconfigure { msg, tx: tx }).await?; rx.await??; Ok(()) } @@ -210,9 +214,7 @@ impl NodeTaskHandle { msg: LrtqUpgradeMsg, ) -> Result<(), NodeApiError> { let (tx, rx) = oneshot::channel(); - self.tx - .send(NodeApiRequest::LrtqUpgrade { msg, responder: tx }) - .await?; + self.tx.send(NodeApiRequest::LrtqUpgrade { msg, tx: tx }).await?; rx.await??; Ok(()) } @@ -224,9 +226,7 @@ impl NodeTaskHandle { &self, ) -> Result, NodeApiError> { let (tx, rx) = oneshot::channel(); - self.tx - .send(NodeApiRequest::CoordinatorStatus { responder: tx }) - .await?; + self.tx.send(NodeApiRequest::CoordinatorStatus { tx: tx }).await?; let res = rx.await?; Ok(res) } @@ -240,9 +240,7 @@ impl NodeTaskHandle { epoch: Epoch, ) -> Result, NodeApiError> { let (tx, rx) = oneshot::channel(); - self.tx - .send(NodeApiRequest::LoadRackSecret { epoch, responder: tx }) - .await?; + self.tx.send(NodeApiRequest::LoadRackSecret { epoch, tx: tx }).await?; let rs = rx.await??; Ok(rs) } @@ -256,10 +254,10 @@ impl NodeTaskHandle { pub async fn prepare_and_commit( &self, config: Configuration, - ) -> Result { + ) -> Result { let (tx, rx) = oneshot::channel(); self.tx - .send(NodeApiRequest::PrepareAndCommit { config, responder: tx }) + .send(NodeApiRequest::PrepareAndCommit { config, tx: tx }) .await?; let res 
= rx.await??; Ok(res) @@ -275,11 +273,9 @@ impl NodeTaskHandle { &self, rack_id: RackUuid, epoch: Epoch, - ) -> Result { + ) -> Result { let (tx, rx) = oneshot::channel(); - self.tx - .send(NodeApiRequest::Commit { rack_id, epoch, responder: tx }) - .await?; + self.tx.send(NodeApiRequest::Commit { rack_id, epoch, tx: tx }).await?; let res = rx.await??; Ok(res) } @@ -303,20 +299,23 @@ impl NodeTaskHandle { Ok(()) } + /// Return information about connectivity to other peers pub async fn conn_mgr_status(&self) -> Result { let (tx, rx) = oneshot::channel(); - self.tx.send(NodeApiRequest::ConnMgrStatus { responder: tx }).await?; + self.tx.send(NodeApiRequest::ConnMgrStatus { tx: tx }).await?; let res = rx.await?; Ok(res) } + /// Return internal information for the [`Node`] pub async fn status(&self) -> Result { let (tx, rx) = oneshot::channel(); - self.tx.send(NodeApiRequest::NodeStatus { responder: tx }).await?; + self.tx.send(NodeApiRequest::NodeStatus { tx: tx }).await?; let res = rx.await?; Ok(res) } + /// Shutdown this [`NodeTask`] and all its child tasks pub async fn shutdown(&self) -> Result<(), NodeApiError> { self.tx.send(NodeApiRequest::Shutdown).await?; Ok(()) @@ -458,58 +457,68 @@ impl NodeTask { NodeApiRequest::ClearSecrets => { self.node.clear_secrets(); } - NodeApiRequest::Commit { rack_id, epoch, responder } => { + NodeApiRequest::Commit { rack_id, epoch, tx } => { let res = self .node .commit_configuration(&mut self.ctx, rack_id, epoch) .map(|_| { - self.ctx.persistent_state().commits.contains(&epoch) + if self.ctx.persistent_state().commits.contains(&epoch) + { + CommitStatus::Committed + } else { + CommitStatus::Pending + } }); - let _ = responder.send(res); + let _ = tx.send(res); } - NodeApiRequest::ConnMgrStatus { responder } => { + NodeApiRequest::ConnMgrStatus { tx } => { debug!(self.log, "Received Request for ConnMgrStatus"); - let _ = responder.send(self.conn_mgr.status()); + let _ = tx.send(self.conn_mgr.status()); } - NodeApiRequest::CoordinatorStatus { responder } => { + NodeApiRequest::CoordinatorStatus { tx } => { let status = self.node.get_coordinator_state().map(|cs| { CoordinatorStatus { config: cs.config().clone(), acked_prepares: cs.op().acked_prepares(), } }); - let _ = responder.send(status); + let _ = tx.send(status); } - NodeApiRequest::LoadRackSecret { epoch, responder } => { + NodeApiRequest::LoadRackSecret { epoch, tx } => { let res = self.node.load_rack_secret(&mut self.ctx, epoch); - let _ = responder.send(res); + let _ = tx.send(res); } - NodeApiRequest::LrtqUpgrade { msg, responder } => { + NodeApiRequest::LrtqUpgrade { msg, tx } => { let res = self.node.coordinate_upgrade_from_lrtq(&mut self.ctx, msg); - let _ = responder.send(res); + let _ = tx.send(res); } - NodeApiRequest::NodeStatus { responder } => { - let _ = responder.send(NodeStatus { + NodeApiRequest::NodeStatus { tx } => { + let _ = tx.send(NodeStatus { connected_peers: self.ctx.connected().clone(), alarms: self.ctx.alarms().clone(), persistent_state: self.ctx.persistent_state().into(), }); } - NodeApiRequest::PrepareAndCommit { config, responder } => { + NodeApiRequest::PrepareAndCommit { config, tx } => { let epoch = config.epoch; let res = self .node .prepare_and_commit(&mut self.ctx, config) .map(|_| { - self.ctx.persistent_state().commits.contains(&epoch) + if self.ctx.persistent_state().commits.contains(&epoch) + { + CommitStatus::Committed + } else { + CommitStatus::Pending + } }); - let _ = responder.send(res); + let _ = tx.send(res); } - NodeApiRequest::Reconfigure { msg, 
responder } => { + NodeApiRequest::Reconfigure { msg, tx } => { let res = self.node.coordinate_reconfiguration(&mut self.ctx, msg); - let _ = responder.send(res); + let _ = tx.send(res); } NodeApiRequest::Shutdown => { info!(self.log, "Shutting down Node tokio tasks"); @@ -963,7 +972,10 @@ mod tests { async || { let mut acked = 0; for h in &setup.node_handles { - if h.commit(rack_id, Epoch(1)).await.unwrap() { + if matches!( + h.commit(rack_id, Epoch(1)).await.unwrap(), + CommitStatus::Committed + ) { acked += 1; } } @@ -1068,7 +1080,10 @@ mod tests { async || { let mut acked = 0; for h in &setup.node_handles[0..num_nodes - 1] { - if h.commit(rack_id, Epoch(1)).await.unwrap() { + if matches!( + h.commit(rack_id, Epoch(1)).await.unwrap(), + CommitStatus::Committed, + ) { acked += 1; } } @@ -1104,7 +1119,10 @@ mod tests { wait_for_condition( async || { let h = &setup.node_handles.last().unwrap(); - if h.prepare_and_commit(config.clone()).await.unwrap() { + if matches!( + h.prepare_and_commit(config.clone()).await.unwrap(), + CommitStatus::Committed + ) { Ok(()) } else { Err(CondCheckError::<()>::NotYet) @@ -1199,7 +1217,10 @@ mod tests { async || { let mut acked = 0; for h in &setup.node_handles { - if h.commit(rack_id, Epoch(1)).await.unwrap() { + if matches!( + h.commit(rack_id, Epoch(1)).await.unwrap(), + CommitStatus::Committed + ) { acked += 1; } } @@ -1305,7 +1326,10 @@ mod tests { async || { let mut acked = 0; for h in &setup.node_handles[0..num_nodes - 1] { - if h.commit(rack_id, Epoch(2)).await.unwrap() { + if matches!( + h.commit(rack_id, Epoch(2)).await.unwrap(), + CommitStatus::Committed + ) { acked += 1; } } @@ -1420,7 +1444,10 @@ mod tests { async || { let mut acked = 0; for h in &setup.node_handles { - if h.commit(rack_id, Epoch(2)).await.unwrap() { + if matches!( + h.commit(rack_id, Epoch(2)).await.unwrap(), + CommitStatus::Committed + ) { acked += 1; } } From 983a453cb5f6dbf90c6e39425211797d4b2961d7 Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Fri, 31 Oct 2025 20:20:56 +0000 Subject: [PATCH 21/22] fix typo --- trust-quorum/src/connection_manager.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/trust-quorum/src/connection_manager.rs b/trust-quorum/src/connection_manager.rs index bcc6bbe4914..5adcc44d747 100644 --- a/trust-quorum/src/connection_manager.rs +++ b/trust-quorum/src/connection_manager.rs @@ -712,7 +712,7 @@ impl ConnMgr { /// listen port, just the ephemeral port. /// /// Return the `BaseboardId` of the peer if an established connection is - // torn down. + /// torn down. async fn disconnect_client( &mut self, addr: SocketAddrV6, From 4b86d876bc68a1a4987c9565f5d7cee9e0516409 Mon Sep 17 00:00:00 2001 From: "Andrew J. 
Stone" Date: Fri, 31 Oct 2025 20:30:18 +0000 Subject: [PATCH 22/22] This is what I get for doing a quick search/replace --- trust-quorum/src/task.rs | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/trust-quorum/src/task.rs b/trust-quorum/src/task.rs index 24962eb6251..179caf26a22 100644 --- a/trust-quorum/src/task.rs +++ b/trust-quorum/src/task.rs @@ -203,7 +203,7 @@ impl NodeTaskHandle { msg: ReconfigureMsg, ) -> Result<(), NodeApiError> { let (tx, rx) = oneshot::channel(); - self.tx.send(NodeApiRequest::Reconfigure { msg, tx: tx }).await?; + self.tx.send(NodeApiRequest::Reconfigure { msg, tx }).await?; rx.await??; Ok(()) } @@ -214,7 +214,7 @@ impl NodeTaskHandle { msg: LrtqUpgradeMsg, ) -> Result<(), NodeApiError> { let (tx, rx) = oneshot::channel(); - self.tx.send(NodeApiRequest::LrtqUpgrade { msg, tx: tx }).await?; + self.tx.send(NodeApiRequest::LrtqUpgrade { msg, tx }).await?; rx.await??; Ok(()) } @@ -226,7 +226,7 @@ impl NodeTaskHandle { &self, ) -> Result, NodeApiError> { let (tx, rx) = oneshot::channel(); - self.tx.send(NodeApiRequest::CoordinatorStatus { tx: tx }).await?; + self.tx.send(NodeApiRequest::CoordinatorStatus { tx }).await?; let res = rx.await?; Ok(res) } @@ -240,7 +240,7 @@ impl NodeTaskHandle { epoch: Epoch, ) -> Result, NodeApiError> { let (tx, rx) = oneshot::channel(); - self.tx.send(NodeApiRequest::LoadRackSecret { epoch, tx: tx }).await?; + self.tx.send(NodeApiRequest::LoadRackSecret { epoch, tx }).await?; let rs = rx.await??; Ok(rs) } @@ -256,9 +256,7 @@ impl NodeTaskHandle { config: Configuration, ) -> Result { let (tx, rx) = oneshot::channel(); - self.tx - .send(NodeApiRequest::PrepareAndCommit { config, tx: tx }) - .await?; + self.tx.send(NodeApiRequest::PrepareAndCommit { config, tx }).await?; let res = rx.await??; Ok(res) } @@ -275,7 +273,7 @@ impl NodeTaskHandle { epoch: Epoch, ) -> Result { let (tx, rx) = oneshot::channel(); - self.tx.send(NodeApiRequest::Commit { rack_id, epoch, tx: tx }).await?; + self.tx.send(NodeApiRequest::Commit { rack_id, epoch, tx }).await?; let res = rx.await??; Ok(res) } @@ -302,7 +300,7 @@ impl NodeTaskHandle { /// Return information about connectivity to other peers pub async fn conn_mgr_status(&self) -> Result { let (tx, rx) = oneshot::channel(); - self.tx.send(NodeApiRequest::ConnMgrStatus { tx: tx }).await?; + self.tx.send(NodeApiRequest::ConnMgrStatus { tx }).await?; let res = rx.await?; Ok(res) } @@ -310,7 +308,7 @@ impl NodeTaskHandle { /// Return internal information for the [`Node`] pub async fn status(&self) -> Result { let (tx, rx) = oneshot::channel(); - self.tx.send(NodeApiRequest::NodeStatus { tx: tx }).await?; + self.tx.send(NodeApiRequest::NodeStatus { tx }).await?; let res = rx.await?; Ok(res) }