Changes from all commits
5 changes: 3 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -501,7 +501,7 @@ hyper = "1.6.0"
hyper-util = "0.1.16"
hyper-rustls = "0.27.7"
hyper-staticfile = "0.10.1"
iddqd = { version = "0.3.13", features = ["daft", "serde", "schemars08"] }
iddqd = { version = "0.3.16", features = ["daft", "serde", "schemars08"] }
id-map = { path = "id-map" }
illumos-utils = { path = "illumos-utils" }
iana-time-zone = "0.1.63"
1 change: 1 addition & 0 deletions trust-quorum/Cargo.toml
@@ -16,6 +16,7 @@ camino.workspace = true
chacha20poly1305.workspace = true
ciborium.workspace = true
daft.workspace = true
debug-ignore.workspace = true
derive_more.workspace = true
futures.workspace = true
gfss.workspace = true
24 changes: 22 additions & 2 deletions trust-quorum/protocol/src/node.rs
@@ -32,7 +32,9 @@ use crate::{
use daft::{Diffable, Leaf};
use gfss::shamir::Share;
use omicron_uuid_kinds::RackUuid;
use serde::{Deserialize, Serialize};
use slog::{Logger, error, info, o, warn};
use slog_error_chain::SlogInlineError;

/// An entity capable of participating in trust quorum
///
@@ -1063,7 +1065,16 @@ impl Node {
}
}

#[derive(Debug, Clone, thiserror::Error, PartialEq, Eq)]
#[derive(
Debug,
Clone,
thiserror::Error,
PartialEq,
Eq,
SlogInlineError,
Serialize,
Deserialize,
)]
pub enum CommitError {
#[error("invalid rack id")]
InvalidRackId(
@@ -1077,7 +1088,16 @@
Expunged { epoch: Epoch, from: BaseboardId },
}

#[derive(Debug, Clone, thiserror::Error, PartialEq, Eq)]
#[derive(
Debug,
Clone,
thiserror::Error,
PartialEq,
Eq,
SlogInlineError,
Serialize,
Deserialize,
)]
pub enum PrepareAndCommitError {
#[error("invalid rack id")]
InvalidRackId(
5 changes: 4 additions & 1 deletion trust-quorum/protocol/src/validators.rs
@@ -12,6 +12,7 @@ use crate::{
};
use daft::{BTreeSetDiff, Diffable, Leaf};
use omicron_uuid_kinds::RackUuid;
use serde::{Deserialize, Serialize};
use slog::{Logger, error, info, warn};
use std::collections::BTreeSet;

Expand Down Expand Up @@ -57,7 +58,9 @@ pub struct SledExpungedError {
last_prepared_epoch: Option<Epoch>,
}

#[derive(Debug, Clone, thiserror::Error, PartialEq, Eq)]
#[derive(
Debug, Clone, thiserror::Error, PartialEq, Eq, Serialize, Deserialize,
)]
#[error("mismatched rack id: expected {expected:?}, got {got:?}")]
pub struct MismatchedRackIdError {
pub expected: RackUuid,
76 changes: 63 additions & 13 deletions trust-quorum/src/connection_manager.rs
@@ -5,13 +5,15 @@
//! A mechanism for maintaining a full mesh of trust quorum node connections

use crate::established_conn::EstablishedConn;
use crate::proxy;
use trust_quorum_protocol::{BaseboardId, Envelope, PeerMsg};

// TODO: Move to this crate
// https://github.com/oxidecomputer/omicron/issues/9311
use bootstore::schemes::v0::NetworkConfig;

use camino::Utf8PathBuf;
use derive_more::From;
Member:

woah 👀

use iddqd::{
BiHashItem, BiHashMap, TriHashItem, TriHashMap, bi_upcast, tri_upcast,
};
Expand Down Expand Up @@ -60,7 +62,7 @@ pub enum MainToConnMsg {
///
/// All `WireMsg`s sent between nodes is prefixed with a 4 byte size header used
/// for framing.
#[derive(Debug, Serialize, Deserialize)]
#[derive(Debug, Serialize, Deserialize, From)]
pub enum WireMsg {
/// Used for connection keep alive
Ping,
@@ -79,6 +81,12 @@ pub enum WireMsg {
/// of tiny information layered on top of trust quorum. You can still think
/// of it as a bootstore, although, we no longer use that name.
NetworkConfig(NetworkConfig),

/// Requests proxied to other nodes
ProxyRequest(proxy::WireRequest),

/// Responses to proxy requests
ProxyResponse(proxy::WireResponse),
}
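
As an illustration of the framing described in the doc comment above (not part of the diff): a minimal sketch of writing one length-prefixed message, assuming CBOR via ciborium and a big-endian u32 header; the helper name and error handling are invented and may not match the crate's actual implementation.

```rust
use serde::Serialize;
use std::io::{self, Write};

// Hypothetical helper: frame one serialized message with a 4-byte size header.
// The real crate's serializer, byte order, and error handling may differ.
fn write_framed<W: Write, T: Serialize>(mut w: W, msg: &T) -> io::Result<()> {
    let mut body = Vec::new();
    ciborium::into_writer(msg, &mut body)
        .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, format!("{e:?}")))?;
    // 4-byte size header followed by the encoded message body.
    w.write_all(&(body.len() as u32).to_be_bytes())?;
    w.write_all(&body)?;
    Ok(())
}
```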

/// Messages sent from connection managing tasks to the main peer task
@@ -99,6 +107,8 @@ pub enum ConnToMainMsgInner {
Received { from: BaseboardId, msg: PeerMsg },
ReceivedNetworkConfig { from: BaseboardId, config: NetworkConfig },
Disconnected { peer_id: BaseboardId },
ProxyRequestReceived { from: BaseboardId, req: proxy::WireRequest },
ProxyResponseReceived { from: BaseboardId, rsp: proxy::WireResponse },
}

pub struct TaskHandle {
@@ -120,15 +130,11 @@ impl TaskHandle {
self.abort_handle.abort()
}

pub async fn send(&self, msg: PeerMsg) {
let _ = self.tx.send(MainToConnMsg::Msg(WireMsg::Tq(msg))).await;
}

pub async fn send_network_config(&self, config: NetworkConfig) {
let _ = self
.tx
.send(MainToConnMsg::Msg(WireMsg::NetworkConfig(config)))
.await;
pub async fn send<T>(&self, msg: T)
where
T: Into<WireMsg>,
{
let _ = self.tx.send(MainToConnMsg::Msg(msg.into())).await;
}
}

@@ -172,7 +178,10 @@ impl EstablishedTaskHandle {
self.task_handle.abort();
}

pub async fn send(&self, msg: PeerMsg) {
pub async fn send<T>(&self, msg: T)
where
T: Into<WireMsg>,
{
let _ = self.task_handle.send(msg).await;
}
}
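
Not part of the diff, just an illustration of what the generic send enables: with `#[derive(From)]` on `WireMsg`, each single-field variant gets a `From` impl, so any of those payload types can be passed straight to `send` via `T: Into<WireMsg>`. The function below is an invented example.

```rust
// Illustrative only: broadcast a NetworkConfig using the new generic send.
// NetworkConfig converts to WireMsg::NetworkConfig through the derived From.
async fn broadcast_config(
    handles: &[EstablishedTaskHandle],
    config: &NetworkConfig,
) {
    for h in handles {
        h.send(config.clone()).await;
    }
}
```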
@@ -235,6 +244,12 @@ pub struct ConnMgrStatus {
pub total_tasks_spawned: u64,
}

/// The state of a proxy connection
pub enum ProxyConnState {
Connected,
Disconnected,
}

/// A structure to manage all sprockets connections to peer nodes
///
/// Each sprockets connection runs in its own task which communicates with the
@@ -399,7 +414,7 @@ impl ConnMgr {
"peer_id" => %h.baseboard_id,
"generation" => network_config.generation
);
h.task_handle.send_network_config(network_config.clone()).await;
h.send(network_config.clone()).await;
}
}

@@ -415,7 +430,42 @@
"peer_id" => %h.baseboard_id,
"generation" => network_config.generation
);
h.task_handle.send_network_config(network_config.clone()).await;
h.send(network_config.clone()).await;
}
}

/// Forward an API request to another node
///
/// Return the state of the connection at this point in time so that the
/// [`proxy::Tracker`] can manage the outstanding request on behalf of the
/// user.
pub async fn proxy_request(
&mut self,
destination: &BaseboardId,
req: proxy::WireRequest,
) -> ProxyConnState {
if let Some(h) = self.established.get1(destination) {
info!(self.log, "Sending {req:?}"; "peer_id" => %destination);
h.send(req).await;
ProxyConnState::Connected
Comment on lines +449 to +450

pietroalbini (Member):
In a previous PR I noted that the send() method ignores all failures and returns a success (well, it doesn't return a Result) even if the connection broke. For the trust quorum protocol that seemed fine, as the protocol is resilient to messages not being delivered.

The proxy APIs defined in another file expect either a response from the server or a Disconnected error before returning: if they receive nothing they will block forever. After spending probably way too much time thinking about failure cases related to this [1], if the message is sent to the established connection actor things will eventually be fine: when it returns, the caller will add the request to the tracker, and a disconnection detected by the established connection actor will eventually be relayed to the tracker.

The failure case I still see is when the channel is busy (with 10 pending requests) and the send method silently discards the message. In that case, we return a ProxyConnState::Connected and the request will be added to the tracker, but we will never get a response unless the connection breaks due to another unrelated request failing (since this request never got to the actor).

The code as is could be fine if the caller of any proxy method takes care of adding timeouts everywhere, but this feels like a problem waiting to happen. I'd feel way more comfortable if this returned a ProxyConnState::Busy if sending a message to the channel failed.

Footnotes

  1. I lost count of how many times I rewrote this comment with other failure cases that it turns out couldn't happen.
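
One possible shape of that suggestion, sketched against the types in this PR; the `Busy` variant and the `try_send`-based handling are hypothetical and not part of the change:

```rust
// Hypothetical extension of ProxyConnState for the "channel full" case.
pub enum ProxyConnState {
    Connected,
    Disconnected,
    /// The per-connection channel was full; the request never reached the task.
    Busy,
}

// ConnMgr::proxy_request could then map a failed try_send, roughly:
//
//     match h.try_send(req) {
//         Ok(()) => ProxyConnState::Connected,
//         Err(TrySendError::Full(_)) => ProxyConnState::Busy,
//         Err(TrySendError::Closed(_)) => ProxyConnState::Disconnected,
//     }
```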

andrewjstone (Contributor Author) commented on Nov 14, 2025:
This is a great analysis @pietroalbini. Thank you so much for taking the time to go through this process and understand the code and design. I actually think that if this were a problem, it would also be somewhat of a problem for the normal trust quorum protocol, as some messages from peers are never re-sent on a timer.

For example: if a coordinator is sending a Prepare msg to a peer and the message got dropped before making it to the connection task, it would not be re-sent unless the channel got disconnected and reconnected for some other reason. Now, that being said, the volume of messages is small and this should not happen. And as you point out, there is some built-in resilience: if the commit occurred, Nexus would PrepareAndCommit this node, or it would get a CommitAdvance message on the next restart (perhaps after an update). But it could still end up as a problem if too many nodes did this and the coordinator couldn't complete the prepare phase. Nexus would try a new configuration, perhaps at a different coordinator, after some time without prepare completing, but the system may still be overloaded.

With all that being said, I don't actually think what you pointed out is entirely true, and therefore this isn't actually a problem here. However, this analysis is also non-trivial. It almost makes me question whether using connection state for retries instead of timers is the right move. So far, I think it continues to work and has the benefit of not re-sending messages already sent over a reliable stream. Ok, so back to the problem.

> The failure case I still see is when the channel is busy (with 10 pending requests) and the send method silently discards the message.

The channel being used in send is a bounded mpsc channel, and send blocks when it is full. The error that is discarded only occurs when the channel itself gets disconnected, presumably because the task exited. In that case the Disconnect callback will eventually fire and all is well.

To help ensure that the disconnect callback occurs when buffers start filling up, there is also a MSG_WRITE_QUEUE_CAPACITY for each established connection that will trigger a disconnect if too many messages are pulled off the channel and serialized before they can be sent. Somewhat importantly, this channel is sized smaller than the queue, so if the queue is full it means that the TCP connection (or serialization) is too slow to move things along. We get backpressure, and eventually a disconnection that should allow things to clear up on a reconnect. I should clean up the TODO suggestion there, as we actually can't drop messages or we will break things, as you point out.
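
For reference, a small standalone sketch of the semantics being relied on here, assuming the per-connection channel is a bounded tokio::sync::mpsc: a full channel makes `send` wait rather than drop, and `send` only errors once the receiver is gone.

```rust
use tokio::sync::mpsc;

#[tokio::main]
async fn main() {
    let (tx, rx) = mpsc::channel::<u32>(2);

    // `send` awaits for capacity when the channel is full (backpressure)
    // instead of discarding the message.
    for i in 0..2 {
        tx.send(i).await.expect("receiver still alive");
    }

    // Only after the receiver is dropped does `send` return an error,
    // which is the case the Disconnected callback eventually covers.
    drop(rx);
    assert!(tx.send(99).await.is_err());
}
```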

andrewjstone (Contributor Author):
It appears there is actually one place where the ConnToMainMsgInner::Disconnected message doesn't get sent to the NodeTask task when an EstablishedConnection closes. That is when the EstablishedConnection task itself panics. I believe that we use panic = abort in production, and so this isn't actually a problem in practice.

However, this has me wondering if instead I should signal to NodeTask from the ConnectionManager::step method when an EstablishedConnection exits rather than sending a Disconnected message from the task itself. That would cover both the successful exit and panic cases for the EstablishedConnection task.

Unfortunately, I also realized that there is another race condition that may get worse if I do this. If a new connection is accepted for the same peer it will trigger an abort of the old connection. In this case the old disconnect will occur after the new connection is established. That could cause problems for the protocol, and I should gate the Node::on_disconnect call by checking whether the task_id matches the current established task ID, as is done for the connection manager's on_disconnected callback.

Another option to solve the latter problem is to always reject a new accepted connection for the same peer if one is already established. Eventually the old one will go away, and the remote peer will retry.

I need to think about this a bit more, but will likely add a few small cleanup patches.
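
A toy, self-contained illustration of that gating idea (types and names invented; the real ConnMgr/Node types differ): a disconnect is only honored when the exiting task is still the registered connection for the peer.

```rust
use std::collections::BTreeMap;

struct Conns {
    // peer -> task id of the currently established connection
    current: BTreeMap<String, u64>,
}

impl Conns {
    /// Returns true only if this exit should be treated as a disconnect.
    fn on_task_exit(&mut self, peer: &str, task_id: u64) -> bool {
        match self.current.get(peer) {
            // The exiting task is still the registered connection.
            Some(&cur) if cur == task_id => {
                self.current.remove(peer);
                true
            }
            // A newer accepted connection already replaced this task.
            _ => false,
        }
    }
}

fn main() {
    let mut conns =
        Conns { current: BTreeMap::from([("sled-a".to_string(), 2)]) };
    assert!(!conns.on_task_exit("sled-a", 1)); // stale task: ignored
    assert!(conns.on_task_exit("sled-a", 2)); // current task: real disconnect
}
```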

Member:
IIUC, part of Emily's concern is that the request tracker inserted into proxy_tracker here https://github.com/oxidecomputer/omicron/pull/9403/files#r2527260678 is leaked in the case where the channel is disconnected but this method returns ProxyConnState::Connected because the disconnect callback has not fired yet. I think we need to either remove the entry from proxy_tracker in that case or have send indicate whether the task is still there so that we could return ProxyConnState::Disconnected and not insert into proxy_tracker?

andrewjstone (Contributor Author):
> IIUC, part of Emily's concern is that the request tracker inserted into proxy_tracker here https://github.com/oxidecomputer/omicron/pull/9403/files#r2527260678 is leaked in the case where the channel is disconnected but this method returns ProxyConnState::Connected because the disconnect callback has not fired yet. I think we need to either remove the entry from proxy_tracker in that case or have send indicate whether the task is still there so that we could return ProxyConnState::Disconnected and not insert into proxy_tracker?

That behavior is intentional. There is an inherent TOCTTOU where the message can be put on the channel and then the socket can disconnect. In this case we return Connected and then get a Disconnected callback sometime later to clear the state. This is also exactly what would happen if the message was pulled off the channel, serialized, sent over the socket, and then the channel disconnected. The key invariant to uphold is: if at any time a message is lost, the disconnect callback must fire a short time afterward. No further messages should be able to be sent over the socket.

What makes this work is that the disconnect callback always fires after the tracked socket is recorded. We know it hasn't fired yet because the EstablishedTaskHandle is still in the main map, which is owned by the same task that is doing the send. Therefore any disconnect will come immediately after the send if the task is gone. If there is no handle in the map then we return Disconnected. Note that it's also possible that the connection has already been re-established but the main task hasn't learned of it yet, so we can discard even if we are actually connected already. TOCTTOUs all around.

} else {
ProxyConnState::Disconnected
}
}

/// Return a response to a proxied request to another node
///
/// There is no need to track whether this succeeds or fails. If the
/// connection goes away the client on the other side will notice it and
/// retry if needed.
pub async fn proxy_response(
&mut self,
destination: &BaseboardId,
rsp: proxy::WireResponse,
) {
if let Some(h) = self.established.get1(destination) {
info!(self.log, "Sending {rsp:?}"; "peer_id" => %destination);
h.send(rsp).await;
}
}

30 changes: 30 additions & 0 deletions trust-quorum/src/established_conn.rs
@@ -233,6 +233,36 @@ impl EstablishedConn {
panic!("Connection to main task channnel full");
}
}
WireMsg::ProxyRequest(req) => {
Member:
Nit: all variants of this enum except for WireMsg::Ping have almost the same body; I wonder whether we could reduce the code duplication?

let msg = match msg {
    WireMsg::Ping => continue,
    WireMsg::NetworkConfig => ConnToMainMsgInner::ReceivedNetworkConfig { ... },
};
if let Err(_) = ... {
    ...
}
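
Spelling the nit out a little further, a sketch against the variants in this diff; it assumes the surrounding receive loop (hence `continue` for Ping) and may not compile as-is against the real types:

```rust
// Build the ConnToMainMsgInner first, then forward through a single path.
let inner = match msg {
    WireMsg::Ping => continue,
    WireMsg::Tq(msg) => ConnToMainMsgInner::Received {
        from: self.peer_id.clone(),
        msg,
    },
    WireMsg::NetworkConfig(config) => {
        ConnToMainMsgInner::ReceivedNetworkConfig {
            from: self.peer_id.clone(),
            config,
        }
    }
    WireMsg::ProxyRequest(req) => ConnToMainMsgInner::ProxyRequestReceived {
        from: self.peer_id.clone(),
        req,
    },
    WireMsg::ProxyResponse(rsp) => ConnToMainMsgInner::ProxyResponseReceived {
        from: self.peer_id.clone(),
        rsp,
    },
};
if let Err(_) = self.main_tx.try_send(ConnToMainMsg {
    task_id: self.task_id,
    msg: inner,
}) {
    error!(self.log, "Failed to send received msg to the main task");
    panic!("Connection to main task channel full");
}
```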

if let Err(_) = self.main_tx.try_send(ConnToMainMsg {
task_id: self.task_id,
msg: ConnToMainMsgInner::ProxyRequestReceived {
from: self.peer_id.clone(),
req,
},
}) {
error!(
self.log,
"Failed to send received proxy msg to the main task"
);
panic!("Connection to main task channel full");
}
}
WireMsg::ProxyResponse(rsp) => {
if let Err(_) = self.main_tx.try_send(ConnToMainMsg {
task_id: self.task_id,
msg: ConnToMainMsgInner::ProxyResponseReceived {
from: self.peer_id.clone(),
rsp,
},
}) {
error!(
self.log,
"Failed to send received proxy msg to the main task"
);
panic!("Connection to main task channel full");
}
Comment on lines +244 to +264

Member:
Hmm, I think that the log line and panic messages here should probably include whether the error indicates that the channel is full or was disconnected because the main task exited. Assuming that try_send returning an error means the channel is full could probably confuse people while debugging; we shouldn't say "channel full" in the disconnected case.
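
A fragment showing one way to do that, assuming main_tx is a bounded tokio::sync::mpsc::Sender (which is what try_send suggests); it would slot into each of the forwarding arms above, and the binding name and exact wording are illustrative:

```rust
use tokio::sync::mpsc::error::TrySendError;

// `conn_to_main_msg` stands for the ConnToMainMsg built in the arm above.
match self.main_tx.try_send(conn_to_main_msg) {
    Ok(()) => {}
    Err(TrySendError::Full(_)) => {
        error!(self.log, "Channel to main task full");
        panic!("Connection to main task channel full");
    }
    Err(TrySendError::Closed(_)) => {
        error!(self.log, "Channel to main task closed; main task exited");
        panic!("Connection to main task channel closed");
    }
}
```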

}
}
}
}
2 changes: 1 addition & 1 deletion trust-quorum/src/ledgers.rs
@@ -17,7 +17,7 @@ use slog::{Logger, info};
use trust_quorum_protocol::PersistentState;

/// A wrapper type around [`PersistentState`] for use as a [`Ledger`]
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PersistentStateLedger {
pub generation: u64,
pub state: PersistentState,
3 changes: 2 additions & 1 deletion trust-quorum/src/lib.rs
@@ -7,9 +7,10 @@
mod connection_manager;
pub(crate) mod established_conn;
mod ledgers;
mod proxy;
mod task;

pub(crate) use connection_manager::{
ConnToMainMsg, ConnToMainMsgInner, MainToConnMsg, WireMsg,
};
pub use task::NodeTask;
pub use task::{CommitStatus, Config, NodeApiError, NodeTask, NodeTaskHandle};