diff --git a/sled-agent/Cargo.toml b/sled-agent/Cargo.toml index f6f210ec2df..ea2b2c77299 100644 --- a/sled-agent/Cargo.toml +++ b/sled-agent/Cargo.toml @@ -63,3 +63,7 @@ doc = false [[bin]] name = "sled-agent" doc = false + +[[bin]] +name = "sled-agent-overlay-files" +doc = false diff --git a/sled-agent/src/bin/sled-agent-overlay-files.rs b/sled-agent/src/bin/sled-agent-overlay-files.rs new file mode 100644 index 00000000000..a784adb2914 --- /dev/null +++ b/sled-agent/src/bin/sled-agent-overlay-files.rs @@ -0,0 +1,60 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! This binary is used to generate files unique to the sled agent running on +//! each server. Specifically, the unique files we care about are key shares +//! used for the trust quorum here. We generate a shared secret then split it, +//! distributing each share to the appropriate server. + +use omicron_sled_agent::bootstrap::trust_quorum::{ + RackSecret, ShareDistribution, +}; + +use anyhow::{anyhow, Result}; +use std::path::PathBuf; +use structopt::StructOpt; + +#[derive(Debug, StructOpt)] +#[structopt( + name = "sled-agent-overlay-files", + about = "Generate server unique files for deployment" +)] +struct Args { + //// The rack secret threshold + #[structopt(short, long)] + threshold: usize, + + /// A directory per server where the files are output + #[structopt(short, long)] + directories: Vec, +} + +// Generate a rack secret and allocate a ShareDistribution to each deployment +// server folder. 
+fn overlay_secret_shares( + threshold: usize, + server_dirs: &[PathBuf], +) -> Result<()> { + let total_shares = server_dirs.len(); + let secret = RackSecret::new(); + let (shares, verifier) = secret + .split(threshold, total_shares) + .map_err(|e| anyhow!("Failed to split rack secret: {:?}", e))?; + for (share, server_dir) in shares.into_iter().zip(server_dirs) { + ShareDistribution { + threshold, + total_shares, + verifier: verifier.clone(), + share, + } + .write(&server_dir)?; + } + Ok(()) +} + +fn main() -> Result<()> { + let args = Args::from_args_safe().map_err(|err| anyhow!(err))?; + overlay_secret_shares(args.threshold, &args.directories)?; + Ok(()) +} diff --git a/sled-agent/src/bootstrap/agent.rs b/sled-agent/src/bootstrap/agent.rs index a0aa63e0893..eb0fa988b4e 100644 --- a/sled-agent/src/bootstrap/agent.rs +++ b/sled-agent/src/bootstrap/agent.rs @@ -4,10 +4,10 @@ //! Bootstrap-related APIs. -use super::client::types as bootstrap_types; -use super::client::Client as BootstrapClient; use super::discovery; -use super::spdm::SpdmError; +use super::trust_quorum::{ + self, RackSecret, ShareDistribution, TrustQuorumError, +}; use super::views::ShareResponse; use omicron_common::api::external::Error as ExternalError; use omicron_common::backoff::{ @@ -18,14 +18,11 @@ use omicron_common::packaging::sha256_digest; use slog::Logger; use std::collections::HashMap; use std::fs::File; -use std::io::{Seek, SeekFrom}; +use std::io::{self, Seek, SeekFrom}; use std::path::Path; use tar::Archive; use thiserror::Error; -const UNLOCK_THRESHOLD: usize = 1; -const BOOTSTRAP_PORT: u16 = 12346; - /// Describes errors which may occur while operating the bootstrap service. 
#[derive(Error, Debug)] pub enum BootstrapError { @@ -47,11 +44,8 @@ pub enum BootstrapError { #[error("Error making HTTP request")] Api(#[from] anyhow::Error), - #[error("Error running SPDM protocol: {0}")] - Spdm(#[from] SpdmError), - - #[error("Not enough peers to unlock storage")] - NotEnoughPeers, + #[error(transparent)] + TrustQuorum(#[from] TrustQuorumError), } impl From for ExternalError { @@ -60,17 +54,41 @@ impl From for ExternalError { } } +// Attempt to read a key share file. If the file does not exist, we return +// `Ok(None)`, indicating the sled is operating in a single node cluster. If +// the file exists, we parse it and return Ok(ShareDistribution). For any +// other error, we return the error. +// +// TODO: Remove after dynamic key generation. See #513. +fn read_key_share() -> Result, BootstrapError> { + let key_share_dir = Path::new("/opt/oxide/sled-agent/pkg"); + + match ShareDistribution::read(&key_share_dir) { + Ok(share) => Ok(Some(share)), + Err(TrustQuorumError::Io(err)) => { + if err.kind() == io::ErrorKind::NotFound { + Ok(None) + } else { + Err(BootstrapError::Io(err)) + } + } + Err(e) => Err(e.into()), + } +} + /// The entity responsible for bootstrapping an Oxide rack. pub(crate) struct Agent { /// Debug log log: Logger, peer_monitor: discovery::PeerMonitor, + share: Option, } impl Agent { pub fn new(log: Logger) -> Result { let peer_monitor = discovery::PeerMonitor::new(&log)?; - Ok(Agent { log, peer_monitor }) + let share = read_key_share()?; + Ok(Agent { log, peer_monitor, share }) } /// Implements the "request share" API. @@ -89,74 +107,84 @@ impl Agent { /// Communicates with peers, sharing secrets, until the rack has been /// sufficiently unlocked. - /// - /// - This method retries until [`UNLOCK_THRESHOLD`] other agents are - /// online, and have successfully responded to "share requests". 
- async fn establish_sled_quorum(&self) -> Result<(), BootstrapError> { - retry_notify( + async fn establish_sled_quorum( + &self, + ) -> Result { + let rack_secret = retry_notify( internal_service_policy(), || async { let other_agents = self.peer_monitor.addrs().await; - info!(&self.log, "Bootstrap: Communicating with peers: {:?}", other_agents); + info!( + &self.log, + "Bootstrap: Communicating with peers: {:?}", other_agents + ); + + let share = self.share.as_ref().unwrap(); // "-1" to account for ourselves. - // - // NOTE: Clippy error exists while the compile-time unlock - // threshold is "1", because we basically don't require any - // peers to unlock. - #[allow(clippy::absurd_extreme_comparisons)] - if other_agents.len() < UNLOCK_THRESHOLD - 1 { - warn!(&self.log, "Not enough peers to start establishing quorum"); + if other_agents.len() < share.threshold - 1 { + warn!( + &self.log, + "Not enough peers to start establishing quorum" + ); return Err(BackoffError::Transient( - BootstrapError::NotEnoughPeers, + TrustQuorumError::NotEnoughPeers, )); } - info!(&self.log, "Bootstrap: Enough peers to start share transfer"); - - // TODO-correctness: - // - Establish trust quorum. - // - Once this is done, "unlock" local storage - // - // The current implementation sends a stub request to all known sled - // agents, but does not actually create a quorum / unlock anything. - let other_agents: Vec = other_agents + info!( + &self.log, + "Bootstrap: Enough peers to start share transfer" + ); + + // Retrieve verified rack_secret shares from a quorum of agents + let other_agents: Vec = other_agents .into_iter() .map(|mut addr| { - addr.set_port(BOOTSTRAP_PORT); - // TODO-correctness: - // - // Many rust crates - such as "URL" - really dislike - // using scopes in IPv6 addresses. Using - // "addr.to_string()" results in an IP address format - // that is rejected when embedded into a URL. 
- // - // Instead, we merely use IP and port for the moment, - // which loses the scope information. Longer-term, if we - // use ULAs (Unique Local Addresses) the scope shouldn't - // be a factor anyway. - let addr_str = format!("[{}]:{}", addr.ip(), addr.port()); - info!(&self.log, "bootstrap: Connecting to {}", addr_str); - BootstrapClient::new( - &format!("http://{}", addr_str), - self.log.new(o!( - "Address" => addr_str, - )), + addr.set_port(trust_quorum::PORT); + trust_quorum::Client::new( + &self.log, + share.verifier.clone(), + addr, ) }) .collect(); + + // TODO: Parallelize this and keep track of whose shares we've already retrieved and + // don't resend. See https://github.com/oxidecomputer/omicron/issues/514 + let mut shares = vec![share.share.clone()]; for agent in &other_agents { - agent - .api_request_share(&bootstrap_types::ShareRequest { - identity: vec![], - }) - .await + let share = agent.get_share().await .map_err(|e| { - info!(&self.log, "Bootstrap: Failed to share request with peer: {:?}", e); - BackoffError::Transient(BootstrapError::Api(e)) + info!(&self.log, "Bootstrap: failed to retreive share from peer: {:?}", e); + BackoffError::Transient(e) })?; - info!(&self.log, "Bootstrap: Shared request with peer"); + info!( + &self.log, + "Bootstrap: retreived share from peer: {}", + agent.addr() + ); + shares.push(share); } - Ok(()) + let rack_secret = RackSecret::combine_shares( + share.threshold, + share.total_shares, + &shares, + ) + .map_err(|e| { + warn!( + &self.log, + "Bootstrap: failed to construct rack secret: {:?}", e + ); + // TODO: We probably need to actually write an error + // handling routine that gives up in some cases based on + // the error returned from `RackSecret::combine_shares`. 
+ // See https://github.com/oxidecomputer/omicron/issues/516 + BackoffError::Transient( + TrustQuorumError::RackSecretConstructionFailed(e), + ) + })?; + info!(self.log, "RackSecret computed from shares."); + Ok(rack_secret) }, |error, duration| { warn!( @@ -169,7 +197,7 @@ impl Agent { ) .await?; - Ok(()) + Ok(rack_secret) } async fn launch_local_services(&self) -> Result<(), BootstrapError> { @@ -200,14 +228,27 @@ impl Agent { Ok(()) } + async fn run_trust_quorum_server(&self) -> Result<(), BootstrapError> { + let my_share = self.share.as_ref().unwrap().share.clone(); + let mut server = trust_quorum::Server::new(&self.log, my_share)?; + tokio::spawn(async move { server.run().await }); + Ok(()) + } + /// Performs device initialization: /// - /// - TODO: Communicates with other sled agents to establish a trust quorum. + /// - Communicates with other sled agents to establish a trust quorum if a + /// ShareDistribution file exists on the host. Otherwise, the sled operates + /// as a single node cluster. /// - Verifies, unpacks, and launches other services. 
pub async fn initialize(&self) -> Result<(), BootstrapError> { info!(&self.log, "bootstrap service initializing"); - self.establish_sled_quorum().await?; + if self.share.is_some() { + self.run_trust_quorum_server().await?; + self.establish_sled_quorum().await?; + } + self.launch_local_services().await?; Ok(()) diff --git a/sled-agent/src/bootstrap/mod.rs b/sled-agent/src/bootstrap/mod.rs index b00f34086b6..14bf4d8da96 100644 --- a/sled-agent/src/bootstrap/mod.rs +++ b/sled-agent/src/bootstrap/mod.rs @@ -11,7 +11,7 @@ mod discovery; mod http_entrypoints; mod multicast; mod params; -pub mod rack_secret; pub mod server; mod spdm; +pub mod trust_quorum; mod views; diff --git a/sled-agent/src/bootstrap/spdm/error.rs b/sled-agent/src/bootstrap/spdm/error.rs index 9e2477ced19..970d4bb17a2 100644 --- a/sled-agent/src/bootstrap/spdm/error.rs +++ b/sled-agent/src/bootstrap/spdm/error.rs @@ -1,3 +1,7 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + //! Wrap errors returned from the `spdm` crate and std::io::Error. use spdm::{requester::RequesterError, responder::ResponderError}; @@ -17,6 +21,9 @@ pub enum SpdmError { #[error("invalid state transition: expected {expected}, got {got}")] InvalidState { expected: &'static str, got: &'static str }, + + #[error("timeout")] + Timeout(#[from] tokio::time::error::Elapsed), } impl From for SpdmError { diff --git a/sled-agent/src/bootstrap/spdm/mod.rs b/sled-agent/src/bootstrap/spdm/mod.rs index 3303249eeed..28a5fae158c 100644 --- a/sled-agent/src/bootstrap/spdm/mod.rs +++ b/sled-agent/src/bootstrap/spdm/mod.rs @@ -1,3 +1,7 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + //! 
Instantiate a SPDM requester and responder with particular capabilities, //! algorithms, and credentials. //! @@ -5,24 +9,29 @@ //! header for framing. mod error; -mod requester; -mod responder; +pub mod requester; +pub mod responder; use std::io::{Error, ErrorKind}; +use std::time::Duration; use bytes::{Bytes, BytesMut}; use futures::{SinkExt, StreamExt}; use slog::Logger; use tokio::net::TcpStream; +use tokio::time::timeout; use tokio_util::codec::{Framed, LengthDelimitedCodec}; // 2^16 - 2 bytes for a header const MAX_BUF_SIZE: usize = 65534; +const TIMEOUT: Duration = Duration::from_secs(5); + pub use error::SpdmError; pub struct Transport { framed: Framed, + log: Logger, } impl Transport { @@ -31,23 +40,25 @@ impl Transport { pub const HEADER_LEN: usize = 2; #[allow(dead_code)] - pub fn new(sock: TcpStream) -> Transport { + pub fn new(sock: TcpStream, log: Logger) -> Transport { Transport { framed: LengthDelimitedCodec::builder() .length_field_length(Self::HEADER_LEN) .new_framed(sock), + log, } } pub async fn send(&mut self, data: &[u8]) -> Result<(), SpdmError> { let data = Bytes::copy_from_slice(data); - self.framed.send(data).await.map_err(|e| e.into()) + timeout(TIMEOUT, self.framed.send(data)).await??; + Ok(()) } - pub async fn recv(&mut self, log: &Logger) -> Result { - if let Some(rsp) = self.framed.next().await { + pub async fn recv(&mut self) -> Result { + if let Some(rsp) = timeout(TIMEOUT, self.framed.next()).await? 
{ let rsp = rsp?; - debug!(log, "Received {:x?}", &rsp[..]); + debug!(self.log, "Received {:x?}", &rsp[..]); Ok(rsp) } else { Err(Error::new(ErrorKind::ConnectionAborted, "SPDM channel closed") @@ -55,3 +66,28 @@ impl Transport { } } } + +#[cfg(test)] +mod tests { + use super::*; + use std::net::SocketAddr; + use tokio::net::TcpListener; + + #[tokio::test] + async fn test_recv_timeout() { + let log = + omicron_test_utils::dev::test_setup_log("test_recv_timeout").log; + let addr: SocketAddr = "127.0.0.1:9898".parse().unwrap(); + let listener = TcpListener::bind(addr.clone()).await.unwrap(); + + let handle = tokio::spawn(async move { + let (sock, _) = listener.accept().await.unwrap(); + let mut transport = Transport::new(sock, log); + transport.recv().await + }); + + let _ = TcpStream::connect(addr).await.unwrap(); + + assert!(handle.await.unwrap().is_err()); + } +} diff --git a/sled-agent/src/bootstrap/spdm/requester.rs b/sled-agent/src/bootstrap/spdm/requester.rs index 32974754f88..374abb17c00 100644 --- a/sled-agent/src/bootstrap/spdm/requester.rs +++ b/sled-agent/src/bootstrap/spdm/requester.rs @@ -1,3 +1,7 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+ use slog::Logger; use spdm::msgs::algorithms::*; @@ -38,7 +42,7 @@ impl Ctx { debug!(self.log, "Requester sending GET_VERSION"); self.transport.send(data).await?; - let rsp = self.transport.recv(&self.log).await?; + let rsp = self.transport.recv().await?; debug!(self.log, "Requester received VERSION"); state.handle_msg(&rsp[..], &mut self.transcript).map_err(|e| e.into()) @@ -66,7 +70,7 @@ impl Ctx { state.write_msg(&req, &mut self.buf, &mut self.transcript)?; self.transport.send(data).await?; - let rsp = self.transport.recv(&self.log).await?; + let rsp = self.transport.recv().await?; debug!(self.log, "Requester received CAPABILITIES"); state.handle_msg(&rsp, &mut self.transcript).map_err(|e| e.into()) } @@ -100,7 +104,7 @@ impl Ctx { let data = state.write_msg(req, &mut self.buf, &mut self.transcript)?; self.transport.send(data).await?; - let rsp = self.transport.recv(&self.log).await?; + let rsp = self.transport.recv().await?; debug!(self.log, "Requester received ALGORITHMS"); state @@ -118,7 +122,10 @@ impl Ctx { /// header. Requesters and Responders are decoupled from whether the endpoint of /// a socket is a TCP client or server. 
#[allow(dead_code)] -pub async fn run(log: Logger, transport: Transport) -> Result<(), SpdmError> { +pub async fn run( + log: Logger, + transport: Transport, +) -> Result { let mut ctx = Ctx::new(log, transport); info!(ctx.log, "Requester starting version negotiation"); @@ -133,14 +140,12 @@ pub async fn run(log: Logger, transport: Transport) -> Result<(), SpdmError> { info!(ctx.log, "Requester completed negotiation phase"); debug!(ctx.log, "Requester transcript: {:x?}", ctx.transcript.get()); - Ok(()) + Ok(ctx.transport) } #[cfg(test)] mod tests { use std::net::SocketAddr; - - use slog::Drain; use tokio::net::{TcpListener, TcpStream}; use super::super::responder; @@ -148,24 +153,23 @@ mod tests { #[tokio::test] async fn negotiation() { - let decorator = slog_term::TermDecorator::new().build(); - let drain = slog_term::FullFormat::new(decorator).build().fuse(); - let drain = slog_async::Async::new(drain).build().fuse(); - let log = slog::Logger::root(drain, o!("component" => "spdm")); + let log = omicron_test_utils::dev::test_setup_log("negotiation").log; let log2 = log.clone(); + let log3 = log.clone(); let addr: SocketAddr = "127.0.0.1:9999".parse().unwrap(); let listener = TcpListener::bind(addr.clone()).await.unwrap(); let handle = tokio::spawn(async move { let (sock, _) = listener.accept().await.unwrap(); - let transport = Transport::new(sock); - responder::run(log, transport).await.unwrap(); + let log2 = log.clone(); + let transport = Transport::new(sock, log); + responder::run(log2, transport).await.unwrap(); }); let sock = TcpStream::connect(addr).await.unwrap(); - let transport = Transport::new(sock); - run(log2, transport).await.unwrap(); + let transport = Transport::new(sock, log2); + run(log3, transport).await.unwrap(); handle.await.unwrap(); } diff --git a/sled-agent/src/bootstrap/spdm/responder.rs b/sled-agent/src/bootstrap/spdm/responder.rs index 780fb4dd1cd..bd51723b29f 100644 --- a/sled-agent/src/bootstrap/spdm/responder.rs +++ 
b/sled-agent/src/bootstrap/spdm/responder.rs @@ -1,3 +1,7 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + use slog::Logger; use spdm::msgs::capabilities::{Capabilities, RspFlags}; @@ -28,7 +32,7 @@ impl Ctx { &mut self, ) -> Result { let state = responder::start(); - let req = self.transport.recv(&self.log).await?; + let req = self.transport.recv().await?; let (data, state) = state.handle_msg(&req[..], &mut self.buf, &mut self.transcript)?; @@ -56,7 +60,7 @@ impl Ctx { | RspFlags::KEY_UPD_CAP, }; - let req = self.transport.recv(&self.log).await?; + let req = self.transport.recv().await?; let (data, transition) = state.handle_msg( supported, &req[..], @@ -87,7 +91,7 @@ impl Ctx { &mut self, state: algorithms::State, ) -> Result { - let req = self.transport.recv(&self.log).await?; + let req = self.transport.recv().await?; let (data, transition) = state.handle_msg(&req[..], &mut self.buf, &mut self.transcript)?; debug!(self.log, "Responder received NEGOTIATE_ALGORITHMS"); @@ -117,7 +121,10 @@ impl Ctx { /// header. Requesters and Responders are decoupled from whether the endpoint of /// a socket is a TCP client or server. 
#[allow(dead_code)] -pub async fn run(log: Logger, transport: Transport) -> Result<(), SpdmError> { +pub async fn run( + log: Logger, + transport: Transport, +) -> Result { let mut ctx = Ctx::new(log, transport); info!(ctx.log, "Responder starting version negotiation"); @@ -131,5 +138,5 @@ pub async fn run(log: Logger, transport: Transport) -> Result<(), SpdmError> { info!(ctx.log, "Responder completed negotiation phase"); debug!(ctx.log, "Responder transcript: {:x?}\n", ctx.transcript.get()); - Ok(()) + Ok(ctx.transport) } diff --git a/sled-agent/src/bootstrap/trust_quorum/client.rs b/sled-agent/src/bootstrap/trust_quorum/client.rs new file mode 100644 index 00000000000..30331bd6238 --- /dev/null +++ b/sled-agent/src/bootstrap/trust_quorum/client.rs @@ -0,0 +1,55 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use std::net::SocketAddr; + +use slog::Logger; +use tokio::net::TcpStream; +use vsss_rs::Share; + +use super::msgs::{Request, Response}; +use super::rack_secret::Verifier; +use super::TrustQuorumError; +use crate::bootstrap::spdm; + +pub struct Client { + log: Logger, + verifier: Verifier, + addr: SocketAddr, +} + +impl Client { + pub fn new(log: &Logger, verifier: Verifier, addr: SocketAddr) -> Client { + Client { log: log.clone(), verifier, addr } + } + + pub fn addr(&self) -> &SocketAddr { + &self.addr + } + + // Connect to a trust quorum server, establish an SPDM channel, and retrieve + // a share. + pub async fn get_share(&self) -> Result { + let sock = TcpStream::connect(&self.addr).await?; + let transport = spdm::Transport::new(sock, self.log.clone()); + + // Complete SPDM negotiation and return a secure transport + let mut transport = + spdm::requester::run(self.log.clone(), transport).await?; + + // Request a share and receive it, validating it's what we expect. 
+ let req = bincode::serialize(&Request::Share)?; + transport.send(&req).await?; + + let rsp = transport.recv().await?; + let rsp: Response = bincode::deserialize(&rsp)?; + + let Response::Share(share) = rsp; + if self.verifier.verify(&share) { + Ok(share) + } else { + Err(TrustQuorumError::InvalidShare(self.addr)) + } + } +} diff --git a/sled-agent/src/bootstrap/trust_quorum/error.rs b/sled-agent/src/bootstrap/trust_quorum/error.rs new file mode 100644 index 00000000000..968e7ee9a25 --- /dev/null +++ b/sled-agent/src/bootstrap/trust_quorum/error.rs @@ -0,0 +1,34 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Error type for trust quorum code + +use super::super::spdm::SpdmError; + +use std::net::SocketAddr; +use thiserror::Error; + +#[derive(Debug, Error)] +pub enum TrustQuorumError { + #[error("Error running SPDM protocol: {0}")] + Spdm(#[from] SpdmError), + + #[error("Not enough peers to unlock storage")] + NotEnoughPeers, + + #[error("Bincode (de)serialization error: {0}")] + Bincode(#[from] Box), + + #[error("JSON (de)serialization error: {0}")] + Json(#[from] serde_json::Error), + + #[error("Invalid secret share received from {0}")] + InvalidShare(SocketAddr), + + #[error("Rack secret construction failed: {0:?}")] + RackSecretConstructionFailed(vsss_rs::Error), + + #[error("IO error: {0}")] + Io(#[from] std::io::Error), +} diff --git a/sled-agent/src/bootstrap/trust_quorum/mod.rs b/sled-agent/src/bootstrap/trust_quorum/mod.rs new file mode 100644 index 00000000000..552faf25986 --- /dev/null +++ b/sled-agent/src/bootstrap/trust_quorum/mod.rs @@ -0,0 +1,44 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
The entry point for the trust quorum code +//! +//! The Trust quorum relies on IPv6 multicast discovery, rack secret handling, +//! and the SPDM protocol. +//! +//! Below is the trust quorum protocol for share retrieval over TCP. +//! +//! The following protocol is shown between two sleds only, but multicast +//! discovery and share requests will continue to run until enough shares +//! have been received to recreate the rack secret. +//! +//! Sled1 Sled2 +//! ===== ===== +//! || ------- Multicast Discovery --------- || +//! || || +//! || ---- Connect to TrustQuorum port ---> || +//! || || +//! || --------- SPDM Requests ------------> || +//! || || +//! || <-------- SPDM Responses ------------ || +//! || || +//! || ----- SPDM Channel Established ------ || +//! || || +//! || --------- Request Share ------------> || +//! || || +//! || <----------- Share ------------------ || +//! + +mod client; +mod error; +mod msgs; +mod rack_secret; +mod server; +mod share_distribution; + +pub use client::Client; +pub use error::TrustQuorumError; +pub use rack_secret::RackSecret; +pub use server::{Server, PORT}; +pub use share_distribution::ShareDistribution; diff --git a/sled-agent/src/bootstrap/trust_quorum/msgs.rs b/sled-agent/src/bootstrap/trust_quorum/msgs.rs new file mode 100644 index 00000000000..64841da5f3b --- /dev/null +++ b/sled-agent/src/bootstrap/trust_quorum/msgs.rs @@ -0,0 +1,16 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+ +use serde::{Deserialize, Serialize}; +use vsss_rs::Share; + +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub enum Request { + Share, +} + +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub enum Response { + Share(Share), +} diff --git a/sled-agent/src/bootstrap/rack_secret.rs b/sled-agent/src/bootstrap/trust_quorum/rack_secret.rs similarity index 99% rename from sled-agent/src/bootstrap/rack_secret.rs rename to sled-agent/src/bootstrap/trust_quorum/rack_secret.rs index 8256cb191c6..ce526613b20 100644 --- a/sled-agent/src/bootstrap/rack_secret.rs +++ b/sled-agent/src/bootstrap/trust_quorum/rack_secret.rs @@ -60,7 +60,7 @@ impl Eq for RackSecret {} /// before the secret is reconstructed. // // This is just a wrapper around a FeldmanVerifier from the vsss-rs crate. -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct Verifier { verifier: FeldmanVerifier, } diff --git a/sled-agent/src/bootstrap/trust_quorum/server.rs b/sled-agent/src/bootstrap/trust_quorum/server.rs new file mode 100644 index 00000000000..7de85da3096 --- /dev/null +++ b/sled-agent/src/bootstrap/trust_quorum/server.rs @@ -0,0 +1,131 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use std::io; +use std::net::{Ipv6Addr, SocketAddr, SocketAddrV6}; + +use slog::Logger; +use tokio::net::{TcpListener, TcpStream}; +use tokio::task::JoinHandle; +use vsss_rs::Share; + +use super::msgs::Response; +use super::TrustQuorumError; +use crate::bootstrap::spdm; + +// TODO: Get port from config +// TODO: Get IpAddr from local router: +// See https://github.com/oxidecomputer/omicron/issues/443 +pub const PORT: u16 = 12347; + +/// A TCP server over which a secure SPDM channel will be established and an +/// application level trust protocol will run. 
+pub struct Server { + log: Logger, + share: Share, + listener: TcpListener, +} + +impl Server { + pub fn new(log: &Logger, share: Share) -> io::Result { + let addr = SocketAddrV6::new(Ipv6Addr::UNSPECIFIED, PORT, 0, 0); + let sock = socket2::Socket::new( + socket2::Domain::IPV6, + socket2::Type::STREAM, + Some(socket2::Protocol::TCP), + )?; + sock.set_only_v6(true)?; + + // Allow rebinding during TIME_WAIT + sock.set_reuse_address(true)?; + + sock.bind(&addr.into())?; + sock.listen(5)?; + sock.set_nonblocking(true)?; + + Ok(Server { + log: log.clone(), + share, + listener: TcpListener::from_std(sock.into())?, + }) + } + + pub async fn run(&mut self) -> Result<(), TrustQuorumError> { + loop { + // TODO: Track the returned handles in a FuturesUnordered and log any errors? + // Alternatively, maintain some shared state across all + // responders that is accessible to the Server. + // See https://github.com/oxidecomputer/omicron/issues/517 + let _ = self.accept().await?; + } + } + + async fn accept( + &mut self, + ) -> Result>, TrustQuorumError> + { + let (sock, addr) = self.listener.accept().await?; + debug!(self.log, "Accepted connection from {}", addr); + let share = self.share.clone(); + let log = self.log.clone(); + + Ok(tokio::spawn( + async move { run_responder(log, addr, sock, share).await }, + )) + } +} + +async fn run_responder( + log: Logger, + addr: SocketAddr, + sock: TcpStream, + share: Share, +) -> Result<(), TrustQuorumError> { + let transport = spdm::Transport::new(sock, log.clone()); + + // TODO: Future code will return a secure SPDM session. For now, we just + // return the framed transport so we can send unencrypted messages. 
+ let mut transport = spdm::responder::run(log.clone(), transport).await?; + + info!(log, "Sending share to {}", addr); + + let req = transport.recv().await?; + + // There's only one possible request + let _ = bincode::deserialize(&req)?; + + let rsp = Response::Share(share); + let rsp = bincode::serialize(&rsp)?; + transport.send(&rsp).await?; + + Ok(()) +} + +#[cfg(test)] +mod test { + use super::super::client::Client; + use super::super::rack_secret::RackSecret; + use super::*; + + #[tokio::test] + async fn send_share() { + // Create a rack secret and some shares + let secret = RackSecret::new(); + let (shares, verifier) = secret.split(2, 2).unwrap(); + + // Start a trust quorum server, but only accept one connection + let log = + omicron_test_utils::dev::test_setup_log("trust_quorum::send_share") + .log; + let mut server = Server::new(&log, shares[0].clone()).unwrap(); + let join_handle = tokio::spawn(async move { server.accept().await }); + + let client = + Client::new(&log, verifier, "[::1]:12347".parse().unwrap()); + let share = client.get_share().await.unwrap(); + assert_eq!(share, shares[0]); + + join_handle.await.unwrap().unwrap(); + } +} diff --git a/sled-agent/src/bootstrap/trust_quorum/share_distribution.rs b/sled-agent/src/bootstrap/trust_quorum/share_distribution.rs new file mode 100644 index 00000000000..73dc2148ee3 --- /dev/null +++ b/sled-agent/src/bootstrap/trust_quorum/share_distribution.rs @@ -0,0 +1,83 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+ +use serde::{Deserialize, Serialize}; +use serde_json; +use std::fs; +use std::path::{Path, PathBuf}; +use vsss_rs::Share; + +use super::rack_secret::Verifier; +use super::TrustQuorumError; + +const FILENAME: &'static str = "share.json"; + +/// A ShareDistribution is an individual share of a secret along with all the +/// metadata required to allow a server in possession of the share to know how +/// to correctly recreate a split secret. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct ShareDistribution { + pub threshold: usize, + pub total_shares: usize, + pub verifier: Verifier, + pub share: Share, +} + +impl ShareDistribution { + pub fn write>( + &self, + dir: P, + ) -> Result<(), TrustQuorumError> { + let mut path = PathBuf::from(dir.as_ref()); + path.push(FILENAME); + let json = serde_json::to_string(&self)?; + fs::write(path, &json)?; + Ok(()) + } + + pub fn read>( + dir: P, + ) -> Result { + let mut path = PathBuf::from(dir.as_ref()); + path.push(FILENAME); + let json = fs::read_to_string(path.to_str().unwrap())?; + serde_json::from_str(&json).map_err(|e| e.into()) + } +} + +#[cfg(test)] +mod tests { + use super::super::RackSecret; + use super::*; + + const THRESHOLD: usize = 3; + const TOTAL: usize = 5; + + fn get_share_and_verifier() -> (Share, Verifier) { + let secret = RackSecret::new(); + let (mut shares, verifier) = secret.split(THRESHOLD, TOTAL).unwrap(); + (shares.pop().unwrap(), verifier) + } + + #[test] + fn write_and_read() { + let dir = std::env::temp_dir(); + + let (share, verifier) = get_share_and_verifier(); + let share_distribution = ShareDistribution { + threshold: THRESHOLD, + total_shares: TOTAL, + verifier, + share, + }; + share_distribution.write(&dir).unwrap(); + + let read = ShareDistribution::read(&dir).unwrap(); + assert_eq!(share_distribution, read); + + let mut file = dir.clone(); + file.push(FILENAME); + std::fs::remove_file(file.as_path()).unwrap(); + } +}