Skip to content
1 change: 0 additions & 1 deletion deploy/src/bin/deployment-example.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ omicron_path = "/remote/path/to/omicron"
# which server is responsible for running the rack setup service; must
# refer to one of the `servers` in the servers table
rss_server = "foo"
rack_secret_threshold = 2
# Location where files to install will be placed before running
# `omicron-package install`
#
Expand Down
39 changes: 0 additions & 39 deletions deploy/src/bin/sled-agent-overlay-files.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,6 @@
//! used for the trust quorum here. We generate a shared secret then split it,
//! distributing each share to the appropriate server.

use omicron_sled_agent::bootstrap::trust_quorum::{
RackSecret, ShareDistribution,
};

use anyhow::{anyhow, Context, Result};
use sp_sim::config::GimletConfig;
use sp_sim::config::SpCommonConfig;
Expand All @@ -23,45 +19,11 @@ use structopt::StructOpt;
about = "Generate server unique files for deployment"
)]
// Command-line arguments for this binary; `main` obtains them via
// `Args::from_args_safe()`.
// NOTE(review): the `///` comments on the fields are presumably picked up by
// structopt as help text, so they are intentionally left untouched -- confirm
// before rewording them.
struct Args {
/// The rack secret threshold
// Forwarded to `overlay_secret_shares`, which passes it to
// `RackSecret::split` and records it in each `ShareDistribution`.
#[structopt(short, long)]
threshold: usize,

/// A directory per server where the files are output
// Handed to both `overlay_secret_shares` and `overlay_sp_configs`.
#[structopt(short, long)]
directories: Vec<PathBuf>,
}

// Create a fresh rack secret, split it into one share per server directory,
// and write a `ShareDistribution` (threshold, share count, verifier and the
// individual share) into each directory.
//
// With fewer than two server directories nothing is written: a notice is
// printed and the function returns `Ok(())`.
//
// Errors: fails if the secret cannot be split for the requested
// `threshold`/share count, or if writing any distribution fails (the first
// write error stops the remaining writes).
fn overlay_secret_shares(
    threshold: usize,
    server_dirs: &[PathBuf],
) -> Result<()> {
    let total_shares = server_dirs.len();
    if total_shares < 2 {
        println!("Skipping secret share distribution: only one server available.");
        return Ok(());
    }

    let rack_secret = RackSecret::new();
    let (shares, verifier) = rack_secret
        .split(threshold, total_shares)
        .map_err(|e| anyhow!("Failed to split rack secret: {:?}", e))?;

    // Pair the i-th share with the i-th server directory.
    shares.into_iter().zip(server_dirs).try_for_each(
        |(share, server_dir)| {
            let distribution = ShareDistribution {
                threshold,
                total_shares,
                verifier: verifier.clone(),
                share,
            };
            distribution.write(&server_dir)
        },
    )?;

    Ok(())
}

// Generate a config file for a simulated SP in each deployment server folder.
fn overlay_sp_configs(server_dirs: &[PathBuf]) -> Result<()> {
// We will eventually need to flesh out more of this config; for now,
Expand Down Expand Up @@ -95,7 +57,6 @@ fn overlay_sp_configs(server_dirs: &[PathBuf]) -> Result<()> {

fn main() -> Result<()> {
let args = Args::from_args_safe().map_err(|err| anyhow!(err))?;
overlay_secret_shares(args.threshold, &args.directories)?;
overlay_sp_configs(&args.directories)?;
Ok(())
}
4 changes: 1 addition & 3 deletions deploy/src/bin/thing-flinger.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ struct Server {
/// Deployment-wide settings, deserialized from the deployment configuration
/// (see `deployment-example.toml`).
#[derive(Deserialize, Debug)]
struct Deployment {
/// Which server is responsible for running the rack setup service; must
/// refer to one of the `servers` in the servers table.
rss_server: String,
/// Rack secret threshold; passed as `--threshold` to
/// `sled-agent-overlay-files` by `overlay_sled_agent`.
rack_secret_threshold: usize,
/// NOTE(review): presumably the location where files to install are placed
/// before running `omicron-package install` -- confirm against the example
/// config.
staging_dir: PathBuf,
}

Expand Down Expand Up @@ -483,11 +482,10 @@ fn overlay_sled_agent(
let cmd = format!(
"sh -c 'for dir in {}; do mkdir -p $dir; done' && \
cd {} && \
cargo run {} --bin sled-agent-overlay-files -- --threshold {} --directories {}",
cargo run {} --bin sled-agent-overlay-files -- --directories {}",
dirs,
config.builder.omicron_path.to_string_lossy(),
config.release_arg(),
config.deployment.rack_secret_threshold,
dirs
);
ssh_exec(builder, &cmd, false)
Expand Down
113 changes: 46 additions & 67 deletions sled-agent/src/bootstrap/agent.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,13 @@

//! Bootstrap-related APIs.

use super::client::Client as BootstrapAgentClient;
use super::config::{Config, BOOTSTRAP_AGENT_PORT};
use super::discovery;
use super::params::SledAgentRequest;
use super::rss_handle::RssHandle;
use super::trust_quorum::{
self, RackSecret, ShareDistribution, TrustQuorumError,
};
use super::views::{ShareResponse, SledAgentResponse};
use super::trust_quorum::{RackSecret, ShareDistribution, TrustQuorumError};
use super::views::SledAgentResponse;
use crate::config::Config as SledConfig;
use crate::illumos::dladm::{self, Dladm, PhysicalLink};
use crate::illumos::zone::Zones;
Expand All @@ -22,13 +21,12 @@ use omicron_common::api::external::{Error as ExternalError, MacAddr};
use omicron_common::backoff::{
internal_service_policy, retry_notify, BackoffError,
};

use slog::Logger;
use std::io;
use std::net::{Ipv6Addr, SocketAddrV6};
use std::path::{Path, PathBuf};
use thiserror::Error;
use tokio::sync::Mutex;
use vsss_rs::Share;

/// Describes errors which may occur while operating the bootstrap service.
#[derive(Error, Debug)]
Expand Down Expand Up @@ -59,28 +57,6 @@ impl From<BootstrapError> for ExternalError {
}
}

// Load this sled's key share from the package directory.
//
// Outcomes:
// - the share is read and parsed: `Ok(Some(share))`;
// - reading fails with an I/O error of kind `NotFound`: `Ok(None)`, which
//   indicates the sled is operating in a single node cluster;
// - any other failure is returned as a `BootstrapError`.
//
// TODO: Remove after dynamic key generation. See #513.
fn read_key_share() -> Result<Option<ShareDistribution>, BootstrapError> {
    let key_share_dir = Path::new("/opt/oxide/sled-agent/pkg");

    match ShareDistribution::read(&key_share_dir) {
        Ok(share) => Ok(Some(share)),
        // A missing share file is not an error.
        Err(TrustQuorumError::Io { err, .. })
            if err.kind() == io::ErrorKind::NotFound =>
        {
            Ok(None)
        }
        Err(TrustQuorumError::Io { message, err }) => {
            Err(BootstrapError::Io { message, err })
        }
        Err(other) => Err(other.into()),
    }
}

/// The entity responsible for bootstrapping an Oxide rack.
pub(crate) struct Agent {
/// Debug log
Expand All @@ -89,7 +65,9 @@ pub(crate) struct Agent {
/// other launched components can set their own value.
parent_log: Logger,
peer_monitor: discovery::PeerMonitor,
share: Option<ShareDistribution>,

/// Our share of the rack secret, if we have one.
share: Mutex<Option<ShareDistribution>>,

rss: Mutex<Option<RssHandle>>,
sled_agent: Mutex<Option<SledServer>>,
Expand Down Expand Up @@ -184,12 +162,12 @@ impl Agent {
message: format!("Monitoring for peers from {address}"),
err,
})?;
let share = read_key_share()?;

let agent = Agent {
log: ba_log,
parent_log: log,
peer_monitor,
share,
share: Mutex::new(None),
rss: Mutex::new(None),
sled_agent: Mutex::new(None),
sled_config,
Expand All @@ -216,19 +194,13 @@ impl Agent {
Ok(agent)
}

/// Implements the "request share" API.
#[allow(dead_code)] // Currently uncalled; will be used soon!
pub async fn request_share(
&self,
identity: Vec<u8>,
) -> Result<ShareResponse, BootstrapError> {
// TODO-correctness: Validate identity, return whatever
// information is necessary to establish trust quorum.
//
// This current implementation is a placeholder.
info!(&self.log, "request_share, received identity: {:x?}", identity);

Ok(ShareResponse { shared_secret: vec![] })
/// Returns a clone of the rack secret share held by this agent, or `None`
/// when no share distribution is currently stored.
pub async fn secret_share(&self) -> Option<Share> {
    let stored = self.share.lock().await;
    stored.as_ref().map(|distribution| distribution.share.clone())
}

/// Initializes the Sled Agent on behalf of the RSS, if one has not already
Expand All @@ -255,6 +227,20 @@ impl Agent {
return Err(BootstrapError::SledError(err_str));
}

// Bail out if this request includes a trust quorum share that
// doesn't match ours. TODO-correctness Do we need to handle a
// partially-initialized rack where we may have a share from a
// previously-started-but-not-completed init process? If rerunning
// it produces different shares this check will fail.
if request.trust_quorum_share != *self.share.lock().await {
let err_str = concat!(
"Sled Agent already running with",
" a different trust quorum share"
)
.to_string();
return Err(BootstrapError::SledError(err_str));
}

return Ok(SledAgentResponse { id: server.id() });
}
// Server does not exist, initialize it.
Expand All @@ -272,6 +258,9 @@ impl Agent {
maybe_agent.replace(server);
info!(&self.log, "Sled Agent loaded; recording configuration");

// Remember our share, allowing us to respond to `request_share()`.
*self.share.lock().await = request.trust_quorum_share.clone();

// Record this request so the sled agent can be automatically
// initialized on the next boot.
let path = get_sled_agent_request_path();
Expand All @@ -296,6 +285,7 @@ impl Agent {
/// sufficiently unlocked.
async fn establish_sled_quorum(
&self,
share: ShareDistribution,
) -> Result<RackSecret, BootstrapError> {
let rack_secret = retry_notify(
internal_service_policy(),
Expand All @@ -306,8 +296,6 @@ impl Agent {
"Bootstrap: Communicating with peers: {:?}", other_agents
);

let share = self.share.as_ref().unwrap();

// "-1" to account for ourselves.
if other_agents.len() < share.threshold - 1 {
warn!(
Expand All @@ -324,19 +312,21 @@ impl Agent {
);

// Retrieve verified rack_secret shares from a quorum of agents
let other_agents: Vec<trust_quorum::Client> = other_agents
let other_agents: Vec<BootstrapAgentClient> = other_agents
.into_iter()
.map(|addr| {
let addr = SocketAddrV6::new(
addr,
trust_quorum::PORT,
BOOTSTRAP_AGENT_PORT,
0,
0,
);
trust_quorum::Client::new(
&self.log,
share.verifier.clone(),
BootstrapAgentClient::new(
addr,
&self.sp,
self.log.new(o!(
"BootstrapAgentClient" => addr.to_string()),
),
)
})
.collect();
Expand All @@ -345,10 +335,10 @@ impl Agent {
// don't resend. See https://github.com/oxidecomputer/omicron/issues/514
let mut shares = vec![share.share.clone()];
for agent in &other_agents {
let share = agent.get_share().await
let share = agent.request_share().await
.map_err(|e| {
info!(&self.log, "Bootstrap: failed to retreive share from peer: {:?}", e);
BackoffError::transient(e)
BackoffError::transient(e.into())
})?;
info!(
&self.log,
Expand Down Expand Up @@ -392,17 +382,6 @@ impl Agent {
Ok(rack_secret)
}

// Builds a trust quorum server from our share and runs it on a spawned
// tokio task. The task handle is discarded, so this returns as soon as the
// task has been spawned.
//
// Panics if `self.share` is `None`; callers must check for a share first.
// Returns `BootstrapError::Io` if the server cannot be created.
async fn run_trust_quorum_server(&self) -> Result<(), BootstrapError> {
    let distribution = self.share.as_ref().unwrap();
    let my_share = distribution.share.clone();

    let mut server = match trust_quorum::Server::new(&self.log, my_share) {
        Ok(server) => server,
        Err(err) => {
            return Err(BootstrapError::Io {
                message: "Cannot run trust quorum server".to_string(),
                err,
            });
        }
    };

    tokio::spawn(async move { server.run().await });
    Ok(())
}

// Initializes the Rack Setup Service.
async fn start_rss(&self, config: &Config) -> Result<(), BootstrapError> {
if let Some(rss_config) = &config.rss_config {
Expand All @@ -429,9 +408,9 @@ impl Agent {
) -> Result<(), BootstrapError> {
info!(&self.log, "bootstrap service initializing");

if self.share.is_some() {
self.run_trust_quorum_server().await?;
self.establish_sled_quorum().await?;
let maybe_share = self.share.lock().await.clone();
if let Some(share) = maybe_share {
self.establish_sled_quorum(share).await?;
}

self.start_rss(config).await?;
Expand Down
19 changes: 18 additions & 1 deletion sled-agent/src/bootstrap/client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,10 @@ use thiserror::Error;
use tokio::io::AsyncReadExt;
use tokio::io::AsyncWriteExt;
use tokio::net::TcpStream;
use vsss_rs::Share;

#[derive(Debug, Error)]
pub(crate) enum Error {
pub enum Error {
#[error("Could not connect to {addr}: {err}")]
Connect { addr: SocketAddrV6, err: io::Error },

Expand Down Expand Up @@ -79,6 +80,10 @@ impl<'a> Client<'a> {
Self { addr, sp, log }
}

/// Returns the socket address stored in this client's `addr` field.
pub(crate) fn addr(&self) -> SocketAddrV6 {
self.addr
}

pub(crate) async fn start_sled(
&self,
request: &SledAgentRequest,
Expand All @@ -94,6 +99,18 @@ impl<'a> Client<'a> {
}
}

/// Sends a `Request::ShareRequest` and returns the share carried by the
/// reply.
///
/// # Errors
///
/// Propagates any error from the request/response exchange, and returns
/// `Error::InvalidResponse` if the reply is a `SledAgentResponse` instead of
/// a `ShareResponse`.
pub(crate) async fn request_share(&self) -> Result<Share, Error> {
    let reply = self.request_response(Request::ShareRequest).await?;

    match reply {
        Response::SledAgentResponse(_) => Err(Error::InvalidResponse {
            expected: "ShareResponse",
            received: "SledAgentResponse",
        }),
        Response::ShareResponse(share) => Ok(share),
    }
}

async fn request_response(
&self,
request: Request<'_>,
Expand Down
2 changes: 0 additions & 2 deletions sled-agent/src/bootstrap/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,9 @@ pub mod agent;
pub mod client;
pub mod config;
pub mod discovery;
//mod http_entrypoints;
pub mod multicast;
pub(crate) mod params;
pub(crate) mod rss_handle;
pub mod server;
mod spdm;
pub mod trust_quorum;
mod views;
Loading