From 7cb0555ff91fa47209c9abf9ba6b615d7b7d6554 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 5 Apr 2022 10:54:47 -0400 Subject: [PATCH 01/19] [sled-agent] Decouple RSS from Bootstrap Agent, run in distinct task --- sled-agent/src/bin/sled-agent.rs | 4 +- sled-agent/src/bootstrap/agent.rs | 189 ++-------------------- sled-agent/src/bootstrap/config.rs | 46 +----- sled-agent/src/lib.rs | 1 + sled-agent/src/rack_setup/config.rs | 52 ++++++ sled-agent/src/rack_setup/mod.rs | 8 + sled-agent/src/rack_setup/service.rs | 227 +++++++++++++++++++++++++++ 7 files changed, 304 insertions(+), 223 deletions(-) create mode 100644 sled-agent/src/rack_setup/config.rs create mode 100644 sled-agent/src/rack_setup/mod.rs create mode 100644 sled-agent/src/rack_setup/service.rs diff --git a/sled-agent/src/bin/sled-agent.rs b/sled-agent/src/bin/sled-agent.rs index 6c24ed541a4..c2989ee0280 100644 --- a/sled-agent/src/bin/sled-agent.rs +++ b/sled-agent/src/bin/sled-agent.rs @@ -13,9 +13,9 @@ use omicron_common::api::external::Error; use omicron_common::cmd::fatal; use omicron_common::cmd::CmdError; use omicron_sled_agent::bootstrap::{ - config::Config as BootstrapConfig, config::SetupServiceConfig as RssConfig, - server as bootstrap_server, + config::Config as BootstrapConfig, server as bootstrap_server, }; +use omicron_sled_agent::rack_setup::config::SetupServiceConfig as RssConfig; use omicron_sled_agent::{config::Config as SledConfig, server as sled_server}; use std::path::PathBuf; use structopt::StructOpt; diff --git a/sled-agent/src/bootstrap/agent.rs b/sled-agent/src/bootstrap/agent.rs index c0576b3788c..9f85cf6d1c6 100644 --- a/sled-agent/src/bootstrap/agent.rs +++ b/sled-agent/src/bootstrap/agent.rs @@ -10,6 +10,7 @@ use super::trust_quorum::{ self, RackSecret, ShareDistribution, TrustQuorumError, }; use super::views::ShareResponse; +use crate::rack_setup::service::Service as RackSetupService; use omicron_common::api::external::Error as ExternalError; use omicron_common::backoff::{ internal_service_policy, retry_notify, BackoffError, @@ -19,13 +20,11 @@ use slog::Logger; use std::io; use std::path::Path; use thiserror::Error; +use tokio::sync::Mutex; /// Describes errors which may occur while operating the bootstrap service. #[derive(Error, Debug)] pub enum BootstrapError { - #[error("Cannot deserialize TOML file")] - Toml(#[from] toml::de::Error), - #[error("Error accessing filesystem: {0}")] Io(#[from] std::io::Error), @@ -35,17 +34,8 @@ pub enum BootstrapError { #[error("Error modifying SMF service: {0}")] SmfAdm(#[from] smf::AdmError), - #[error("Error making HTTP request to Sled Agent: {0}")] - SledApi(#[from] sled_agent_client::Error), - - #[error("Error making HTTP request to Nexus: {0}")] - NexusApi(#[from] nexus_client::Error), - #[error(transparent)] TrustQuorum(#[from] TrustQuorumError), - - #[error("Configuration changed")] - Configuration, } impl From for ExternalError { @@ -82,13 +72,15 @@ pub(crate) struct Agent { log: Logger, peer_monitor: discovery::PeerMonitor, share: Option, + + rss: Mutex>, } impl Agent { pub fn new(log: Logger) -> Result { let peer_monitor = discovery::PeerMonitor::new(&log)?; let share = read_key_share()?; - Ok(Agent { log, peer_monitor, share }) + Ok(Agent { log, peer_monitor, share, rss: Mutex::new(None) }) } /// Implements the "request share" API. @@ -207,169 +199,14 @@ impl Agent { Ok(()) } - // In lieu of having an operator send requests to all sleds via an - // initialization service, the sled-agent configuration may allow for the - // automated injection of setup requests from a sled. - async fn inject_rack_setup_service_requests( - &self, - config: &Config, - ) -> Result<(), BootstrapError> { + // Initializes the Rack Setup Service. + async fn start_rss(&self, config: &Config) -> Result<(), BootstrapError> { if let Some(rss_config) = &config.rss_config { - info!(self.log, "Injecting RSS configuration: {:#?}", rss_config); - - let serialized_config = toml::Value::try_from(&config) - .expect("Cannot serialize configuration"); - let config_str = toml::to_string(&serialized_config) - .expect("Cannot turn config to string"); - - // First, check if this request has previously been made. - // - // Normally, the rack setup service is run with a human-in-the-loop, - // but with this automated injection, we need a way to determine the - // (destructive) initialization has occurred. - // - // We do this by storing the configuration at "rss_config_path" - // after successfully performing initialization. - let rss_config_path = - std::path::Path::new(crate::OMICRON_CONFIG_PATH) - .join("config-rss.toml"); - if rss_config_path.exists() { - info!( - self.log, - "RSS configuration already exists at {}", - rss_config_path.to_string_lossy() - ); - let old_config: Config = toml::from_str( - &tokio::fs::read_to_string(&rss_config_path).await?, - )?; - if &old_config == config { - info!( - self.log, - "RSS config already applied from: {}", - rss_config_path.to_string_lossy() - ); - return Ok(()); - } - - // TODO(https://github.com/oxidecomputer/omicron/issues/724): - // We could potentially handle this case by deleting all - // partitions (in preparation for applying the new - // configuration), but at the moment it's an error. - warn!( - self.log, - "Rack Setup Service Config was already applied, but has changed. - This means that you may have partitions set up on this sled, but they - may not match the ones requested by the supplied configuration.\n - To re-initialize this sled: - - Disable all Oxide services - - Delete all partitions within the attached zpool - - Delete the configuration file ({}) - - Restart the sled agent", - rss_config_path.to_string_lossy() - ); - return Err(BootstrapError::Configuration); - } else { - info!( - self.log, - "No RSS configuration found at {}", - rss_config_path.to_string_lossy() - ); - } - - // Issue the dataset initialization requests to all sleds. - futures::future::join_all( - rss_config.requests.iter().map(|request| async move { - info!(self.log, "observing request: {:#?}", request); - let dur = std::time::Duration::from_secs(60); - let client = reqwest::ClientBuilder::new() - .connect_timeout(dur) - .timeout(dur) - .build() - .map_err(|e| nexus_client::Error::::from(e))?; - let client = sled_agent_client::Client::new_with_client( - &format!("http://{}", request.sled_address), - client, - self.log.new(o!("SledAgentClient" => request.sled_address)), - ); - - info!(self.log, "sending partition requests..."); - for partition in &request.partitions { - let filesystem_put = || async { - info!(self.log, "creating new filesystem: {:?}", partition); - client.filesystem_put(&partition.clone().into()) - .await - .map_err(BackoffError::transient)?; - Ok::< - (), - BackoffError< - sled_agent_client::Error, - >, - >(()) - }; - let log_failure = |error, _| { - warn!(self.log, "failed to create filesystem"; "error" => ?error); - }; - retry_notify( - internal_service_policy(), - filesystem_put, - log_failure, - ).await?; - } - Ok(()) - }) - ).await.into_iter().collect::, BootstrapError>>()?; - - // Issue service initialization requests. - // - // Note that this must happen *after* the partition initialization, - // to ensure that CockroachDB has been initialized before Nexus - // starts. - futures::future::join_all( - rss_config.requests.iter().map(|request| async move { - info!(self.log, "observing request: {:#?}", request); - let dur = std::time::Duration::from_secs(60); - let client = reqwest::ClientBuilder::new() - .connect_timeout(dur) - .timeout(dur) - .build() - .map_err(|e| nexus_client::Error::::from(e))?; - let client = sled_agent_client::Client::new_with_client( - &format!("http://{}", request.sled_address), - client, - self.log.new(o!("SledAgentClient" => request.sled_address)), - ); - - info!(self.log, "sending service requests..."); - let services_put = || async { - info!(self.log, "initializing sled services: {:?}", request.services); - client.services_put( - &sled_agent_client::types::ServiceEnsureBody { - services: request.services.iter().map(|s| s.clone().into()).collect() - }) - .await - .map_err(BackoffError::transient)?; - Ok::< - (), - BackoffError< - sled_agent_client::Error, - >, - >(()) - }; - let log_failure = |error, _| { - warn!(self.log, "failed to initialize services"; "error" => ?error); - }; - retry_notify( - internal_service_policy(), - services_put, - log_failure, - ).await?; - Ok::<(), BootstrapError>(()) - }) - ).await.into_iter().collect::, BootstrapError>>()?; - - // Finally, make sure the configuration is saved so we don't inject - // the requests on the next iteration. - tokio::fs::write(rss_config_path, config_str).await?; + let rss = RackSetupService::new( + self.log.new(o!("component" => "RSS")), + rss_config.clone(), + ); + self.rss.lock().await.replace(rss); } Ok(()) } @@ -391,7 +228,7 @@ impl Agent { self.establish_sled_quorum().await?; } - self.inject_rack_setup_service_requests(config).await?; + self.start_rss(config).await?; Ok(()) } diff --git a/sled-agent/src/bootstrap/config.rs b/sled-agent/src/bootstrap/config.rs index 15ab42f8246..1fec659b5b3 100644 --- a/sled-agent/src/bootstrap/config.rs +++ b/sled-agent/src/bootstrap/config.rs @@ -4,14 +4,10 @@ //! Interfaces for working with bootstrap agent configuration -use crate::config::ConfigError; -use crate::params::{DatasetEnsureBody, ServiceRequest}; use dropshot::ConfigDropshot; use dropshot::ConfigLogging; use serde::Deserialize; use serde::Serialize; -use std::net::SocketAddr; -use std::path::Path; use uuid::Uuid; /// Configuration for a bootstrap agent @@ -21,45 +17,5 @@ pub struct Config { pub dropshot: ConfigDropshot, pub log: ConfigLogging, - pub rss_config: Option, -} - -/// Configuration for the "rack setup service", which is controlled during -/// bootstrap. -/// -/// The Rack Setup Service should be responsible for one-time setup actions, -/// such as CockroachDB placement and initialization. Without operator -/// intervention, however, these actions need a way to be automated in our -/// deployment. -/// -/// By injecting this (optional) configuration into the bootstrap agent, it -/// can act as a stand-in initialization service. -#[derive(Clone, Debug, Deserialize, Serialize, PartialEq)] -pub struct SetupServiceConfig { - #[serde(default, rename = "request")] - pub requests: Vec, -} - -/// A request to initialize a sled. -#[derive(Clone, Debug, Deserialize, Serialize, PartialEq)] -pub struct SledRequest { - /// The Sled Agent address receiving these requests. - pub sled_address: SocketAddr, - - /// Partitions to be created. - #[serde(default, rename = "partition")] - pub partitions: Vec, - - /// Services to be instantiated. - #[serde(default, rename = "service")] - pub services: Vec, -} - -impl SetupServiceConfig { - pub fn from_file>(path: P) -> Result { - let path = path.as_ref(); - let contents = std::fs::read_to_string(path)?; - let config = toml::from_str(&contents)?; - Ok(config) - } + pub rss_config: Option, } diff --git a/sled-agent/src/lib.rs b/sled-agent/src/lib.rs index 245af13ab21..fb93dfc5fe1 100644 --- a/sled-agent/src/lib.rs +++ b/sled-agent/src/lib.rs @@ -27,6 +27,7 @@ mod instance; mod instance_manager; mod nexus; mod params; +pub mod rack_setup; pub mod server; mod services; mod sled_agent; diff --git a/sled-agent/src/rack_setup/config.rs b/sled-agent/src/rack_setup/config.rs new file mode 100644 index 00000000000..4d284cfed7b --- /dev/null +++ b/sled-agent/src/rack_setup/config.rs @@ -0,0 +1,52 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Interfaces for working with RSS config. + +use crate::config::ConfigError; +use crate::params::{DatasetEnsureBody, ServiceRequest}; +use serde::Deserialize; +use serde::Serialize; +use std::net::SocketAddr; +use std::path::Path; + +/// Configuration for the "rack setup service", which is controlled during +/// bootstrap. +/// +/// The Rack Setup Service should be responsible for one-time setup actions, +/// such as CockroachDB placement and initialization. Without operator +/// intervention, however, these actions need a way to be automated in our +/// deployment. +/// +/// By injecting this (optional) configuration into the bootstrap agent, it +/// can act as a stand-in initialization service. +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq)] +pub struct SetupServiceConfig { + #[serde(default, rename = "request")] + pub requests: Vec, +} + +/// A request to initialize a sled. +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq)] +pub struct SledRequest { + /// The Sled Agent address receiving these requests. + pub sled_address: SocketAddr, + + /// Partitions to be created. + #[serde(default, rename = "partition")] + pub partitions: Vec, + + /// Services to be instantiated. + #[serde(default, rename = "service")] + pub services: Vec, +} + +impl SetupServiceConfig { + pub fn from_file>(path: P) -> Result { + let path = path.as_ref(); + let contents = std::fs::read_to_string(path)?; + let config = toml::from_str(&contents)?; + Ok(config) + } +} diff --git a/sled-agent/src/rack_setup/mod.rs b/sled-agent/src/rack_setup/mod.rs new file mode 100644 index 00000000000..e947ff99ef0 --- /dev/null +++ b/sled-agent/src/rack_setup/mod.rs @@ -0,0 +1,8 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Rack Setup Service + +pub mod config; +pub mod service; diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs new file mode 100644 index 00000000000..35d46b536df --- /dev/null +++ b/sled-agent/src/rack_setup/service.rs @@ -0,0 +1,227 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Rack Setup Service implementation + +use super::config::SetupServiceConfig as Config; +use omicron_common::backoff::{ + internal_service_policy, retry_notify, BackoffError, +}; +use slog::Logger; +use thiserror::Error; + +/// Describes errors which may occur while operating the setup service. +#[derive(Error, Debug)] +pub enum SetupServiceError { + #[error("Error accessing filesystem: {0}")] + Io(#[from] std::io::Error), + + #[error("Error making HTTP request to Nexus: {0}")] + NexusApi(#[from] nexus_client::Error), + + #[error("Error making HTTP request to Sled Agent: {0}")] + SledApi(#[from] sled_agent_client::Error), + + #[error("Cannot deserialize TOML file")] + Toml(#[from] toml::de::Error), + + #[error("Configuration changed")] + Configuration, +} + +/// The interface to the Rack Setup Service. +pub struct Service { + handle: tokio::task::JoinHandle>, +} + +impl Service { + pub fn new(log: Logger, config: Config) -> Self { + let handle = tokio::task::spawn(async move { + let svc = ServiceInner::new(log); + svc.inject_rack_setup_requests(&config).await + }); + + Service { handle } + } + + /// Awaits the completion of the RSS service. + pub async fn join(self) -> Result<(), SetupServiceError> { + self.handle.await.expect("Rack Setup Service Task panicked") + } +} + +/// The implementation of the Rack Setup Service. +struct ServiceInner { + log: Logger, +} + +impl ServiceInner { + pub fn new(log: Logger) -> Self { + ServiceInner { log } + } + + // In lieu of having an operator send requests to all sleds via an + // initialization service, the sled-agent configuration may allow for the + // automated injection of setup requests from a sled. + async fn inject_rack_setup_requests( + &self, + config: &Config, + ) -> Result<(), SetupServiceError> { + info!(self.log, "Injecting RSS configuration: {:#?}", config); + + let serialized_config = toml::Value::try_from(&config) + .expect("Cannot serialize configuration"); + let config_str = toml::to_string(&serialized_config) + .expect("Cannot turn config to string"); + + // First, check if this request has previously been made. + // + // Normally, the rack setup service is run with a human-in-the-loop, + // but with this automated injection, we need a way to determine the + // (destructive) initialization has occurred. + // + // We do this by storing the configuration at "rss_config_path" + // after successfully performing initialization. + let rss_config_path = std::path::Path::new(crate::OMICRON_CONFIG_PATH) + .join("config-rss.toml"); + if rss_config_path.exists() { + info!( + self.log, + "RSS configuration already exists at {}", + rss_config_path.to_string_lossy() + ); + let old_config: Config = toml::from_str( + &tokio::fs::read_to_string(&rss_config_path).await?, + )?; + if &old_config == config { + info!( + self.log, + "RSS config already applied from: {}", + rss_config_path.to_string_lossy() + ); + return Ok(()); + } + + // TODO(https://github.com/oxidecomputer/omicron/issues/724): + // We could potentially handle this case by deleting all + // partitions (in preparation for applying the new + // configuration), but at the moment it's an error. + warn!( + self.log, + "Rack Setup Service Config was already applied, but has changed. + This means that you may have partitions set up on this sled, but they + may not match the ones requested by the supplied configuration.\n + To re-initialize this sled: + - Disable all Oxide services + - Delete all partitions within the attached zpool + - Delete the configuration file ({}) + - Restart the sled agent", + rss_config_path.to_string_lossy() + ); + return Err(SetupServiceError::Configuration); + } else { + info!( + self.log, + "No RSS configuration found at {}", + rss_config_path.to_string_lossy() + ); + } + + // Issue the dataset initialization requests to all sleds. + futures::future::join_all( + config.requests.iter().map(|request| async move { + info!(self.log, "observing request: {:#?}", request); + let dur = std::time::Duration::from_secs(60); + let client = reqwest::ClientBuilder::new() + .connect_timeout(dur) + .timeout(dur) + .build() + .map_err(|e| nexus_client::Error::::from(e))?; + let client = sled_agent_client::Client::new_with_client( + &format!("http://{}", request.sled_address), + client, + self.log.new(o!("SledAgentClient" => request.sled_address)), + ); + + info!(self.log, "sending partition requests..."); + for partition in &request.partitions { + let filesystem_put = || async { + info!(self.log, "creating new filesystem: {:?}", partition); + client.filesystem_put(&partition.clone().into()) + .await + .map_err(BackoffError::transient)?; + Ok::< + (), + BackoffError< + sled_agent_client::Error, + >, + >(()) + }; + let log_failure = |error, _| { + warn!(self.log, "failed to create filesystem"; "error" => ?error); + }; + retry_notify( + internal_service_policy(), + filesystem_put, + log_failure, + ).await?; + } + Ok(()) + }) + ).await.into_iter().collect::, SetupServiceError>>()?; + + // Issue service initialization requests. + // + // Note that this must happen *after* the partition initialization, + // to ensure that CockroachDB has been initialized before Nexus + // starts. + futures::future::join_all( + config.requests.iter().map(|request| async move { + info!(self.log, "observing request: {:#?}", request); + let dur = std::time::Duration::from_secs(60); + let client = reqwest::ClientBuilder::new() + .connect_timeout(dur) + .timeout(dur) + .build() + .map_err(|e| nexus_client::Error::::from(e))?; + let client = sled_agent_client::Client::new_with_client( + &format!("http://{}", request.sled_address), + client, + self.log.new(o!("SledAgentClient" => request.sled_address)), + ); + + info!(self.log, "sending service requests..."); + let services_put = || async { + info!(self.log, "initializing sled services: {:?}", request.services); + client.services_put( + &sled_agent_client::types::ServiceEnsureBody { + services: request.services.iter().map(|s| s.clone().into()).collect() + }) + .await + .map_err(BackoffError::transient)?; + Ok::< + (), + BackoffError< + sled_agent_client::Error, + >, + >(()) + }; + let log_failure = |error, _| { + warn!(self.log, "failed to initialize services"; "error" => ?error); + }; + retry_notify( + internal_service_policy(), + services_put, + log_failure, + ).await?; + Ok::<(), SetupServiceError>(()) + }) + ).await.into_iter().collect::, SetupServiceError>>()?; + + // Finally, make sure the configuration is saved so we don't inject + // the requests on the next iteration. + tokio::fs::write(rss_config_path, config_str).await?; + Ok(()) + } +} From 1db163a037de26952d04caf48df629c244b2c969 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 5 Apr 2022 11:59:04 -0400 Subject: [PATCH 02/19] wip - made new endpoint, not impl'd yet --- sled-agent/src/bootstrap/agent.rs | 13 +++++++- sled-agent/src/bootstrap/http_entrypoints.rs | 35 ++++++++++++++++++-- sled-agent/src/bootstrap/params.rs | 12 +++++++ sled-agent/src/bootstrap/views.rs | 7 ++++ 4 files changed, 63 insertions(+), 4 deletions(-) diff --git a/sled-agent/src/bootstrap/agent.rs b/sled-agent/src/bootstrap/agent.rs index 9f85cf6d1c6..76fcf2d91f8 100644 --- a/sled-agent/src/bootstrap/agent.rs +++ b/sled-agent/src/bootstrap/agent.rs @@ -9,7 +9,8 @@ use super::discovery; use super::trust_quorum::{ self, RackSecret, ShareDistribution, TrustQuorumError, }; -use super::views::ShareResponse; +use super::params::SledAgentRequest; +use super::views::{SledAgentResponse, ShareResponse}; use crate::rack_setup::service::Service as RackSetupService; use omicron_common::api::external::Error as ExternalError; use omicron_common::backoff::{ @@ -97,6 +98,16 @@ impl Agent { Ok(ShareResponse { shared_secret: vec![] }) } + /// Initializes the Sled Agent on behalf of the RSS, if one has not already + /// been initialized. + pub async fn request_agent( + &self, + request: SledAgentRequest, + ) -> Result { + + panic!("no"); + } + /// Communicates with peers, sharing secrets, until the rack has been /// sufficiently unlocked. async fn establish_sled_quorum( diff --git a/sled-agent/src/bootstrap/http_entrypoints.rs b/sled-agent/src/bootstrap/http_entrypoints.rs index c8a6bde01ca..86c41f7fd44 100644 --- a/sled-agent/src/bootstrap/http_entrypoints.rs +++ b/sled-agent/src/bootstrap/http_entrypoints.rs @@ -34,14 +34,24 @@ use omicron_common::api::external::Error as ExternalError; use std::sync::Arc; use super::agent::Agent; -use super::{params::ShareRequest, views::ShareResponse}; +use super::{ + params::{ + ShareRequest, + SledAgentRequest, + }, + views::{ + ShareResponse, + SledAgentResponse, + }, +}; /// Returns a description of the bootstrap agent API pub(crate) fn ba_api() -> ApiDescription> { fn register_endpoints( api: &mut ApiDescription>, ) -> Result<(), String> { - api.register(api_request_share)?; + api.register(request_share)?; + api.register(start_sled)?; Ok(()) } @@ -56,7 +66,7 @@ pub(crate) fn ba_api() -> ApiDescription> { method = GET, path = "/request_share", }] -async fn api_request_share( +async fn request_share( rqctx: Arc>>, request: TypedBody, ) -> Result, HttpError> { @@ -70,3 +80,22 @@ async fn api_request_share( .map_err(|e| ExternalError::from(e))?, )) } + +#[endpoint { + method = PUT, + path = "/start_sled", +}] +async fn start_sled( + rqctx: Arc>>, + request: TypedBody, +) -> Result, HttpError> { + let bootstrap_agent = rqctx.context(); + + let request = request.into_inner(); + Ok(HttpResponseOk( + bootstrap_agent + .request_agent(request) + .await + .map_err(|e| ExternalError::from(e))?, + )) +} diff --git a/sled-agent/src/bootstrap/params.rs b/sled-agent/src/bootstrap/params.rs index ea9112c296f..3be2455dbdd 100644 --- a/sled-agent/src/bootstrap/params.rs +++ b/sled-agent/src/bootstrap/params.rs @@ -6,6 +6,8 @@ use schemars::JsonSchema; use serde::{Deserialize, Serialize}; +use uuid::Uuid; +use omicron_common::api::external::Ipv6Net; /// Identity signed by local RoT and Oxide certificate chain. #[derive(Serialize, Deserialize, JsonSchema)] @@ -13,3 +15,13 @@ pub struct ShareRequest { // TODO-completeness: format TBD; currently opaque. pub identity: Vec, } + +/// Configuration information for launching a Sled Agent. +#[derive(Serialize, Deserialize, JsonSchema)] +pub struct SledAgentRequest { + /// ID of the Sled to be initialized. + pub uuid: Uuid, + + /// Portion of the IP space to be managed by the Sled Agnet. + pub ip: Ipv6Net, +} diff --git a/sled-agent/src/bootstrap/views.rs b/sled-agent/src/bootstrap/views.rs index 5787fe0b8ae..56d3a9b80d9 100644 --- a/sled-agent/src/bootstrap/views.rs +++ b/sled-agent/src/bootstrap/views.rs @@ -6,6 +6,7 @@ use schemars::JsonSchema; use serde::{Deserialize, Serialize}; +use uuid::Uuid; /// Sent between bootstrap agents to establish trust quorum. #[derive(Serialize, Deserialize, JsonSchema)] @@ -13,3 +14,9 @@ pub struct ShareResponse { // TODO-completeness: format TBD; currently opaque. pub shared_secret: Vec, } + +/// Describes the Sled Agent running on the device. +#[derive(Serialize, Deserialize, JsonSchema)] +pub struct SledAgentResponse { + pub id: Uuid, +} From 0b492fc55d253a264090245e92a6fe2f0ede0871 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 5 Apr 2022 12:10:42 -0400 Subject: [PATCH 03/19] no nexus errors --- sled-agent/src/rack_setup/service.rs | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index 35d46b536df..470725b961d 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -17,15 +17,15 @@ pub enum SetupServiceError { #[error("Error accessing filesystem: {0}")] Io(#[from] std::io::Error), - #[error("Error making HTTP request to Nexus: {0}")] - NexusApi(#[from] nexus_client::Error), - #[error("Error making HTTP request to Sled Agent: {0}")] SledApi(#[from] sled_agent_client::Error), #[error("Cannot deserialize TOML file")] Toml(#[from] toml::de::Error), + #[error(transparent)] + Http(#[from] reqwest::Error), + #[error("Configuration changed")] Configuration, } @@ -136,8 +136,7 @@ impl ServiceInner { let client = reqwest::ClientBuilder::new() .connect_timeout(dur) .timeout(dur) - .build() - .map_err(|e| nexus_client::Error::::from(e))?; + .build()?; let client = sled_agent_client::Client::new_with_client( &format!("http://{}", request.sled_address), client, @@ -183,8 +182,7 @@ impl ServiceInner { let client = reqwest::ClientBuilder::new() .connect_timeout(dur) .timeout(dur) - .build() - .map_err(|e| nexus_client::Error::::from(e))?; + .build()?; let client = sled_agent_client::Client::new_with_client( &format!("http://{}", request.sled_address), client, From 475553d9f5d7298d23189c8ec11f86d4aab1a29a Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 7 Apr 2022 18:11:11 -0400 Subject: [PATCH 04/19] RSS tells bootstrap agents to start Sled Agents. WIP still. --- openapi/bootstrap-agent.json | 78 ++++++- sled-agent/src/bin/sled-agent.rs | 7 +- sled-agent/src/bootstrap/agent.rs | 60 +++++- sled-agent/src/bootstrap/discovery.rs | 135 ++++++++++++- sled-agent/src/bootstrap/mod.rs | 4 +- sled-agent/src/bootstrap/params.rs | 4 +- sled-agent/src/bootstrap/server.rs | 7 +- sled-agent/src/rack_setup/config.rs | 24 ++- sled-agent/src/rack_setup/service.rs | 279 ++++++++++++++++++-------- sled-agent/src/server.rs | 6 +- sled-agent/src/sled_agent.rs | 13 +- smf/sled-agent/config-rss.toml | 7 +- 12 files changed, 515 insertions(+), 109 deletions(-) diff --git a/openapi/bootstrap-agent.json b/openapi/bootstrap-agent.json index 7e117a3ae17..dec4c7e6e50 100644 --- a/openapi/bootstrap-agent.json +++ b/openapi/bootstrap-agent.json @@ -12,7 +12,7 @@ "paths": { "/request_share": { "get": { - "operationId": "api_request_share", + "operationId": "request_share", "requestBody": { "content": { "application/json": { @@ -42,6 +42,39 @@ } } } + }, + "/start_sled": { + "put": { + "operationId": "start_sled", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SledAgentRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SledAgentResponse" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } } }, "components": { @@ -77,6 +110,13 @@ "request_id" ] }, + "Ipv6Net": { + "title": "An IPv6 subnet", + "description": "An IPv6 subnet, including prefix and subnet mask", + "type": "string", + "pattern": "^(fd|FD)[0-9a-fA-F]{2}:((([0-9a-fA-F]{1,4}\\:){6}[0-9a-fA-F]{1,4})|(([0-9a-fA-F]{1,4}:){1,6}:))/(6[4-9]|[7-9][0-9]|1[0-1][0-9]|12[0-6])$", + "maxLength": 43 + }, "ShareRequest": { "description": "Identity signed by local RoT and Oxide certificate chain.", "type": "object", @@ -110,6 +150,42 @@ "required": [ "shared_secret" ] + }, + "SledAgentRequest": { + "description": "Configuration information for launching a Sled Agent.", + "type": "object", + "properties": { + "ip": { + "description": "Portion of the IP space to be managed by the Sled Agent.", + "allOf": [ + { + "$ref": "#/components/schemas/Ipv6Net" + } + ] + }, + "uuid": { + "description": "ID of the Sled to be initialized.", + "type": "string", + "format": "uuid" + } + }, + "required": [ + "ip", + "uuid" + ] + }, + "SledAgentResponse": { + "description": "Describes the Sled Agent running on the device.", + "type": "object", + "properties": { + "id": { + "type": "string", + "format": "uuid" + } + }, + "required": [ + "id" + ] } } } diff --git a/sled-agent/src/bin/sled-agent.rs b/sled-agent/src/bin/sled-agent.rs index c2989ee0280..b42979eb8fa 100644 --- a/sled-agent/src/bin/sled-agent.rs +++ b/sled-agent/src/bin/sled-agent.rs @@ -127,7 +127,8 @@ async fn do_run() -> Result<(), CmdError> { rss_config, }; let run_bootstrap = async move || -> Result<(), CmdError> { - bootstrap_server::Server::start(&bootstrap_config) + // TODO: It's a little silly to pass the config this way. + bootstrap_server::Server::start(bootstrap_config, config) .await .map_err(CmdError::Failure)? .wait_for_finish() @@ -135,6 +136,7 @@ async fn do_run() -> Result<(), CmdError> { .map_err(CmdError::Failure) }; + /* let run_sled_server = async move || -> Result<(), CmdError> { sled_server::Server::start(&config) .await @@ -143,14 +145,17 @@ async fn do_run() -> Result<(), CmdError> { .await .map_err(CmdError::Failure) }; + */ tokio::select! { Err(e) = run_bootstrap() => { eprintln!("Boot server exited unexpectedly: {:?}", e); }, + /* Err(e) = run_sled_server() => { eprintln!("Sled server exited unexpectedly: {:?}", e); }, + */ } Ok(()) } diff --git a/sled-agent/src/bootstrap/agent.rs b/sled-agent/src/bootstrap/agent.rs index 76fcf2d91f8..e01dcaa2e40 100644 --- a/sled-agent/src/bootstrap/agent.rs +++ b/sled-agent/src/bootstrap/agent.rs @@ -4,6 +4,8 @@ //! Bootstrap-related APIs. +use crate::config::Config as SledConfig; +use crate::server::Server as SledServer; use super::config::Config; use super::discovery; use super::trust_quorum::{ @@ -19,7 +21,7 @@ use omicron_common::backoff::{ use slog::Logger; use std::io; -use std::path::Path; +use std::path::{Path, PathBuf}; use thiserror::Error; use tokio::sync::Mutex; @@ -35,6 +37,12 @@ pub enum BootstrapError { #[error("Error modifying SMF service: {0}")] SmfAdm(#[from] smf::AdmError), + #[error("Error starting sled agent: {0}")] + SledError(String), + + #[error(transparent)] + Toml(#[from] toml::de::Error), + #[error(transparent)] TrustQuorum(#[from] TrustQuorumError), } @@ -75,13 +83,40 @@ pub(crate) struct Agent { share: Option, rss: Mutex>, + sled_agent: Mutex>, + sled_config: SledConfig, +} + +fn get_subnet_path() -> PathBuf { + Path::new(crate::OMICRON_CONFIG_PATH).join("subnet.toml") } impl Agent { - pub fn new(log: Logger) -> Result { + pub async fn new( + log: Logger, + sled_config: SledConfig, + ) -> Result { let peer_monitor = discovery::PeerMonitor::new(&log)?; let share = read_key_share()?; - Ok(Agent { log, peer_monitor, share, rss: Mutex::new(None) }) + let agent = Agent { + log, + peer_monitor, + share, + rss: Mutex::new(None), + sled_agent: Mutex::new(None), + sled_config, + }; + + let subnet_path = get_subnet_path(); + if subnet_path.exists() { + info!(agent.log, "Sled already configured, loading sled agent"); + let sled_request: SledAgentRequest = toml::from_str( + &tokio::fs::read_to_string(&subnet_path).await? + )?; + agent.request_agent(sled_request).await?; + } + + Ok(agent) } /// Implements the "request share" API. @@ -104,8 +139,24 @@ impl Agent { &self, request: SledAgentRequest, ) -> Result { + info!(&self.log, "Loading Sled Agent: {:?}", request); + // TODO: actually use request.ip + // TODO: actually use request.uuid + + let mut maybe_agent = self.sled_agent.lock().await; + if let Some(server) = &*maybe_agent { + // Server already exists, return it. + return Ok(SledAgentResponse { + id: server.id() + }); + } + // Server does not exist, initialize it. + let server = SledServer::start(&self.sled_config).await.map_err(|e| BootstrapError::SledError(e))?; + maybe_agent.replace(server); - panic!("no"); + Ok(SledAgentResponse { + id: self.sled_config.id, + }) } /// Communicates with peers, sharing secrets, until the rack has been @@ -216,6 +267,7 @@ impl Agent { let rss = RackSetupService::new( self.log.new(o!("component" => "RSS")), rss_config.clone(), + self.peer_monitor.observer().await, ); self.rss.lock().await.replace(rss); } diff --git a/sled-agent/src/bootstrap/discovery.rs b/sled-agent/src/bootstrap/discovery.rs index 3c87833a378..b510942ddf0 100644 --- a/sled-agent/src/bootstrap/discovery.rs +++ b/sled-agent/src/bootstrap/discovery.rs @@ -11,13 +11,31 @@ use std::io; use std::net::{Ipv6Addr, SocketAddr, SocketAddrV6}; use std::sync::Arc; use tokio::net::UdpSocket; -use tokio::sync::Mutex; +use tokio::sync::{broadcast, Mutex}; use tokio::task::JoinHandle; +// NOTE: This is larger than the expected number of sleds per rack, as +// peers may change as new sleds are swapped in for old ones. +// +// See the "TODO" below about removal of sleds from the HashSet +const PEER_CAPACITY_MAXIMUM: usize = 128; + /// Manages Sled Discovery - both our announcement to other Sleds, /// as well as our discovery of those sleds. pub struct PeerMonitor { + // TODO: When can we remove sleds from this HashSet? Presumably, if a sled + // has been detached from the bootstrap network, we should drop it. + // + // Without such removal, the set size will be unbounded (though admittedly, + // growing slowly). + // + // Options: + // - Have some sort of expiration mechanism? This could turn the set of + // sleds here into "the sleds which we know were connected within the past + // hour", for example. + // - Have some other interface to identify the detachment of a peer. sleds: Arc>>, + notification_sender: broadcast::Sender, _worker: JoinHandle<()>, } @@ -27,6 +45,7 @@ async fn monitor_worker( sender: UdpSocket, listener: UdpSocket, sleds: Arc>>, + notification_sender: broadcast::Sender, ) { // Let this message be a reminder that this content is *not* // encrypted, authenticated, or otherwise verified. We're just using @@ -46,7 +65,12 @@ async fn monitor_worker( match result { Ok((_, addr)) => { info!(log, "Bootstrap Peer Monitor: Successfully received an address: {}", addr); - sleds.lock().await.insert(addr); + let mut sleds = sleds.lock().await; + if sleds.insert(addr) { + // We don't actually care if no one is listening, so + // drop the error if that's the case. + let _ = notification_sender.send(addr); + } }, Err(e) => warn!(log, "PeerMonitor failed to receive: {}", e), } @@ -76,18 +100,121 @@ impl PeerMonitor { let sleds_for_worker = sleds.clone(); let log = log.clone(); + let (tx, _) = tokio::sync::broadcast::channel(PEER_CAPACITY_MAXIMUM); + + let notification_sender = tx.clone(); let worker = tokio::task::spawn(async move { - monitor_worker(log, address, sender, listener, sleds_for_worker) + monitor_worker(log, address, sender, listener, sleds_for_worker, notification_sender) .await }); - Ok(PeerMonitor { sleds, _worker: worker }) + Ok(PeerMonitor { sleds, notification_sender: tx, _worker: worker }) } /// Returns the addresses of connected sleds. /// + /// For an interface that allows monitoring the connected sleds, rather + /// than just sampling at a single point-in-time, consider using + /// [`Self::observer`]. + /// /// Note: These sleds have not yet been verified. pub async fn addrs(&self) -> Vec { self.sleds.lock().await.iter().map(|addr| *addr).collect() } + + /// Returns a [`PeerMonitorObserver`] which can be used to view the results + /// of monitoring for peers. + pub async fn observer(&self) -> PeerMonitorObserver { + // Subscribe for notifications of new sleds right away, so + // we won't miss any notifications. + let receiver = self.notification_sender.subscribe(); + + // Next, clone the exisitng set of sleds. + // + // It's possible that we get a notification for a sled which + // exists in this set, but we handle that in + // [`PeerMonitorObserver::recv`] to avoid surfacing it to a client. + let sleds = self.sleds.lock().await.clone(); + + PeerMonitorObserver { + their_sleds: self.sleds.clone(), + our_sleds: sleds, + receiver, + } + } +} + +/// Provides a read-only view of monitored peers, with a mechanism for +/// observing the incoming queue of new peers. +pub struct PeerMonitorObserver { + // A shared reference to the "true" set of sleds. + // + // This is only used to re-synchronize our set of sleds + // if we get out-of-sync due to long notification queues. + their_sleds: Arc>>, + // A local copy of the set of sleds. This lets observers + // access + iterate over the set of sleds directly, + // without any possibility of blocking the actual monitoring task. + our_sleds: HashSet, + receiver: broadcast::Receiver, +} + +impl PeerMonitorObserver { + /// Returns the addresses of all connected sleds. + /// + /// This returns the most "up-to-date" view of peers, but a new + /// peer may be added immediately after this function returns. + /// + /// To monitor for changes, a call to [`Self::recv`] + /// can be made, to observe changes beyond an initial call to + /// [`Self::addrs`]. + pub async fn addrs(&mut self) -> &HashSet { + // First, drain the incoming queue of sled updates. + loop { + match self.receiver.try_recv() { + Ok(new_addr) => { + self.our_sleds.insert(new_addr); + } + Err(broadcast::error::TryRecvError::Empty) => break, + Err(broadcast::error::TryRecvError::Closed) => panic!("Remote closed"), + Err(broadcast::error::TryRecvError::Lagged(_)) => { + self.our_sleds = self.their_sleds.lock().await.clone(); + }, + } + } + while let Ok(new_addr) = self.receiver.try_recv() { + self.our_sleds.insert(new_addr); + } + + // Next, return the most up-to-date set of sleds. + // + // Note that this set may change immediately after `addrs()` returns, + // but a caller can see exactly what sleds were added by calling + // `recv()`. + &self.our_sleds + } + + /// Returns information about a new connected sled. + /// + /// Note that this does not provide the "initial set" of connected + /// sleds - to access that information, call [`Self::addrs`]. + /// + /// Returns [`Option::None`] if the notification queue overflowed, + /// and we needed to re-synchronize the set of sleds. + pub async fn recv(&mut self) -> Option { + loop { + match self.receiver.recv().await { + Ok(new_addr) => { + if self.our_sleds.insert(new_addr) { + return Some(new_addr); + } + } + Err(broadcast::error::RecvError::Closed) => panic!("Remote closed"), + Err(broadcast::error::RecvError::Lagged(_)) => { + self.our_sleds = self.their_sleds.lock().await.clone(); + return None; + }, + } + } + } } diff --git a/sled-agent/src/bootstrap/mod.rs b/sled-agent/src/bootstrap/mod.rs index 14bf4d8da96..d55a775c530 100644 --- a/sled-agent/src/bootstrap/mod.rs +++ b/sled-agent/src/bootstrap/mod.rs @@ -5,9 +5,9 @@ //! Bootstrap-related utilities pub mod agent; -mod client; +pub mod client; pub mod config; -mod discovery; +pub mod discovery; mod http_entrypoints; mod multicast; mod params; diff --git a/sled-agent/src/bootstrap/params.rs b/sled-agent/src/bootstrap/params.rs index 3be2455dbdd..7f7cb703666 100644 --- a/sled-agent/src/bootstrap/params.rs +++ b/sled-agent/src/bootstrap/params.rs @@ -17,11 +17,11 @@ pub struct ShareRequest { } /// Configuration information for launching a Sled Agent. -#[derive(Serialize, Deserialize, JsonSchema)] +#[derive(Debug, Serialize, Deserialize, JsonSchema)] pub struct SledAgentRequest { /// ID of the Sled to be initialized. pub uuid: Uuid, - /// Portion of the IP space to be managed by the Sled Agnet. + /// Portion of the IP space to be managed by the Sled Agent. pub ip: Ipv6Net, } diff --git a/sled-agent/src/bootstrap/server.rs b/sled-agent/src/bootstrap/server.rs index 5c8c668b5df..04e9b534e6d 100644 --- a/sled-agent/src/bootstrap/server.rs +++ b/sled-agent/src/bootstrap/server.rs @@ -4,6 +4,7 @@ //! Server API for bootstrap-related functionality. +use crate::config::Config as SledConfig; use super::agent::Agent; use super::config::Config; use super::http_entrypoints::ba_api as http_api; @@ -18,7 +19,7 @@ pub struct Server { } impl Server { - pub async fn start(config: &Config) -> Result { + pub async fn start(config: Config, sled_config: SledConfig) -> Result { let (drain, registration) = slog_dtrace::with_drain( config.log.to_logger("bootstrap-agent").map_err(|message| { format!("initializing logger: {}", message) @@ -39,7 +40,7 @@ impl Server { "server" => config.id.clone().to_string() )); let bootstrap_agent = - Arc::new(Agent::new(ba_log).map_err(|e| e.to_string())?); + Arc::new(Agent::new(ba_log, sled_config).await.map_err(|e| e.to_string())?); let ba = Arc::clone(&bootstrap_agent); let dropshot_log = log.new(o!("component" => "dropshot")); @@ -58,7 +59,7 @@ impl Server { // This ordering allows the bootstrap agent to communicate with // other bootstrap agents on the rack during the initialization // process. - if let Err(e) = server.bootstrap_agent.initialize(config).await { + if let Err(e) = server.bootstrap_agent.initialize(&config).await { let _ = server.close().await; return Err(e.to_string()); } diff --git a/sled-agent/src/rack_setup/config.rs b/sled-agent/src/rack_setup/config.rs index 3174777a00e..c1d157fe913 100644 --- a/sled-agent/src/rack_setup/config.rs +++ b/sled-agent/src/rack_setup/config.rs @@ -8,7 +8,6 @@ use crate::config::ConfigError; use crate::params::{DatasetEnsureBody, ServiceRequest}; use serde::Deserialize; use serde::Serialize; -use std::net::SocketAddr; use std::path::Path; /// Configuration for the "rack setup service", which is controlled during @@ -23,6 +22,8 @@ use std::path::Path; /// can act as a stand-in initialization service. #[derive(Clone, Debug, Deserialize, Serialize, PartialEq)] pub struct SetupServiceConfig { + pub rack_subnet: std::net::Ipv6Addr, + #[serde(default, rename = "request")] pub requests: Vec, } @@ -30,9 +31,6 @@ pub struct SetupServiceConfig { /// A request to initialize a sled. #[derive(Clone, Debug, Deserialize, Serialize, PartialEq)] pub struct SledRequest { - /// The Sled Agent address receiving these requests. - pub sled_address: SocketAddr, - /// Datasets to be created. #[serde(default, rename = "dataset")] pub datasets: Vec, @@ -49,4 +47,22 @@ impl SetupServiceConfig { let config = toml::from_str(&contents)?; Ok(config) } + + pub fn az_subnet(&self) -> ipnetwork::Ipv6Network { + ipnetwork::Ipv6Network::new(self.rack_subnet, 48).unwrap() + } + + pub fn rack_subnet(&self) -> ipnetwork::Ipv6Network { + ipnetwork::Ipv6Network::new(self.rack_subnet, 56).unwrap() + } + + pub fn sled_subnet(&self, index: u8) -> ipnetwork::Ipv6Network { + let mut rack_network = self.rack_subnet().network().octets(); + + // To set bits distinguishing the /64 from the /56, we modify the 7th octet. + // + // 0001:0203:0405:0607:: + rack_network[7] = index; + ipnetwork::Ipv6Network::new(std::net::Ipv6Addr::from(rack_network), 64).unwrap() + } } diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index 90b194cafe6..d2d54cce99d 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -4,12 +4,22 @@ //! Rack Setup Service implementation +use crate::bootstrap::client as bootstrap_agent_client; +use crate::bootstrap::discovery::PeerMonitorObserver; use super::config::SetupServiceConfig as Config; use omicron_common::backoff::{ internal_service_policy, retry_notify, BackoffError, }; use slog::Logger; +use std::net::Ipv6Addr; use thiserror::Error; +use tokio::sync::Mutex; + +const SLED_AGENT_PORT: u16 = 12345; + +fn next_address(addr: Ipv6Addr) -> Ipv6Addr { + Ipv6Addr::from(u128::from(addr) + 1) +} /// Describes errors which may occur while operating the setup service. #[derive(Error, Debug)] @@ -17,6 +27,9 @@ pub enum SetupServiceError { #[error("Error accessing filesystem: {0}")] Io(#[from] std::io::Error), + #[error("Error making HTTP request to Bootstrap Agent: {0}")] + BootstrapApi(#[from] bootstrap_agent_client::Error), + #[error("Error making HTTP request to Sled Agent: {0}")] SledApi(#[from] sled_agent_client::Error), @@ -36,9 +49,16 @@ pub struct Service { } impl Service { - pub fn new(log: Logger, config: Config) -> Self { + /// Creates a new rack setup service, which runs in a background task. + /// + /// Arguments: + /// - `log`: The logger. + /// - `config`: The config file, which is used to setup the rack. + /// - `peer_monitor`: The mechanism by which the setup service discovers + /// bootstrap agents on nearby sleds. + pub fn new(log: Logger, config: Config, peer_monitor: PeerMonitorObserver) -> Self { let handle = tokio::task::spawn(async move { - let svc = ServiceInner::new(log); + let svc = ServiceInner::new(log, peer_monitor); svc.inject_rack_setup_requests(&config).await }); @@ -54,11 +74,140 @@ impl Service { /// The implementation of the Rack Setup Service. struct ServiceInner { log: Logger, + peer_monitor: Mutex, } impl ServiceInner { - pub fn new(log: Logger) -> Self { - ServiceInner { log } + fn new(log: Logger, peer_monitor: PeerMonitorObserver) -> Self { + ServiceInner { log, peer_monitor: Mutex::new(peer_monitor) } + } + + async fn initialize_sled_agent( + &self, + bootstrap_addr: std::net::SocketAddr, + subnet: ipnetwork::Ipv6Network, + ) -> Result<(), SetupServiceError> { + let dur = std::time::Duration::from_secs(60); + + let client = reqwest::ClientBuilder::new() + .connect_timeout(dur) + .timeout(dur) + .build()?; + let client = bootstrap_agent_client::Client::new_with_client( + &format!("http://{}", bootstrap_addr), + client, + self.log.new(o!("BootstrapAgentClient" => bootstrap_addr.clone())), + ); + + let sled_agent_initialize = || async { + client.start_sled(&bootstrap_agent_client::types::SledAgentRequest { + uuid: uuid::Uuid::new_v4(), // TODO: not rando + ip: bootstrap_agent_client::types::Ipv6Net(subnet.to_string()), + }).await.map_err(BackoffError::transient)?; + + Ok::< + (), + BackoffError< + bootstrap_agent_client::Error, + >, + >(()) + }; + + let log_failure = |error, _| { + warn!(self.log, "failed to start sled agent"; "error" => ?error); + }; + retry_notify( + internal_service_policy(), + sled_agent_initialize, + log_failure, + ).await?; + Ok(()) + } + + async fn initialize_datasets( + &self, + sled_address: std::net::SocketAddr, + datasets: &Vec, + ) -> Result<(), SetupServiceError> { + let dur = std::time::Duration::from_secs(60); + + let client = reqwest::ClientBuilder::new() + .connect_timeout(dur) + .timeout(dur) + .build()?; + let client = sled_agent_client::Client::new_with_client( + &format!("http://{}", sled_address), + client, + self.log.new(o!("SledAgentClient" => sled_address)), + ); + + info!(self.log, "sending dataset requests..."); + for dataset in datasets { + let filesystem_put = || async { + info!(self.log, "creating new filesystem: {:?}", dataset); + client.filesystem_put(&dataset.clone().into()) + .await + .map_err(BackoffError::transient)?; + Ok::< + (), + BackoffError< + sled_agent_client::Error, + >, + >(()) + }; + let log_failure = |error, _| { + warn!(self.log, "failed to create filesystem"; "error" => ?error); + }; + retry_notify( + internal_service_policy(), + filesystem_put, + log_failure, + ).await?; + } + Ok(()) + } + + async fn initialize_services( + &self, + sled_address: std::net::SocketAddr, + services: &Vec, + ) -> Result<(), SetupServiceError> { + let dur = std::time::Duration::from_secs(60); + let client = reqwest::ClientBuilder::new() + .connect_timeout(dur) + .timeout(dur) + .build()?; + let client = sled_agent_client::Client::new_with_client( + &format!("http://{}", sled_address), + client, + self.log.new(o!("SledAgentClient" => sled_address)), + ); + + info!(self.log, "sending service requests..."); + let services_put = || async { + info!(self.log, "initializing sled services: {:?}", services); + client.services_put( + &sled_agent_client::types::ServiceEnsureBody { + services: services.iter().map(|s| s.clone().into()).collect() + }) + .await + .map_err(BackoffError::transient)?; + Ok::< + (), + BackoffError< + sled_agent_client::Error, + >, + >(()) + }; + let log_failure = |error, _| { + warn!(self.log, "failed to initialize services"; "error" => ?error); + }; + retry_notify( + internal_service_policy(), + services_put, + log_failure, + ).await?; + Ok(()) } // In lieu of having an operator send requests to all sleds via an @@ -109,14 +258,10 @@ impl ServiceInner { // configuration), but at the moment it's an error. warn!( self.log, - "Rack Setup Service Config was already applied, but has changed. + "Rack Setup Service Config ({}) was already applied, but has changed. This means that you may have datasets set up on this sled, but they may not match the ones requested by the supplied configuration.\n - To re-initialize this sled: - - Disable all Oxide services - - Delete all datasets within the attached zpool - - Delete the configuration file ({}) - - Restart the sled agent", + To re-initialize this sled, re-run 'omicron-package install'.", rss_config_path.to_string_lossy() ); return Err(SetupServiceError::Configuration); @@ -128,47 +273,48 @@ impl ServiceInner { ); } + // Wait until we see enough neighbors to be able to set the + // initial set of requests. + let mut peer_monitor = self.peer_monitor.lock().await; + while peer_monitor.addrs().await.len() < config.requests.len() { + peer_monitor.recv().await; + } + + let peers = peer_monitor.addrs().await.into_iter().enumerate(); + + // XXX Questions to consider: + // - What if a sled comes online *right after* this setup? How does + // it get a /64? + // - What is the RSS fails *after* telling a BA to start a SA? + // How can it reconcile that lost address? The current scheme + // is assigning `/64`s based on the order peers have been seen. + // Issue the dataset initialization requests to all sleds. - futures::future::join_all( - config.requests.iter().map(|request| async move { + let requests = futures::future::join_all( + config.requests.iter().zip(peers).map(|(request, sled)| async move { info!(self.log, "observing request: {:#?}", request); - let dur = std::time::Duration::from_secs(60); - let client = reqwest::ClientBuilder::new() - .connect_timeout(dur) - .timeout(dur) - .build()?; - let client = sled_agent_client::Client::new_with_client( - &format!("http://{}", request.sled_address), - client, - self.log.new(o!("SledAgentClient" => request.sled_address)), + let (idx, bootstrap_addr) = sled; + let sled_subnet_index = u8::try_from(idx + 1).expect("Too many peers!"); + + // First, connect to the Bootstrap Agent and tell it to + // initialize the Sled Agent with the specified subnet. + let subnet = config.sled_subnet(sled_subnet_index); + self.initialize_sled_agent(*bootstrap_addr, subnet).await?; + + let sled_agent_ip = next_address(subnet.ip()); + let sled_address = std::net::SocketAddr::new( + std::net::IpAddr::V6(sled_agent_ip), + SLED_AGENT_PORT, ); - info!(self.log, "sending dataset requests..."); - for dataset in &request.datasets { - let filesystem_put = || async { - info!(self.log, "creating new filesystem: {:?}", dataset); - client.filesystem_put(&dataset.clone().into()) - .await - .map_err(BackoffError::transient)?; - Ok::< - (), - BackoffError< - sled_agent_client::Error, - >, - >(()) - }; - let log_failure = |error, _| { - warn!(self.log, "failed to create filesystem"; "error" => ?error); - }; - retry_notify( - internal_service_policy(), - filesystem_put, - log_failure, - ).await?; - } - Ok(()) + // Next, initialize any datasets on sleds that need it. + self.initialize_datasets( + sled_address, + &request.datasets, + ).await?; + Ok((request, sled_address)) }) - ).await.into_iter().collect::, SetupServiceError>>()?; + ).await.into_iter().collect::, SetupServiceError>>()?; // Issue service initialization requests. // @@ -176,44 +322,9 @@ impl ServiceInner { // to ensure that CockroachDB has been initialized before Nexus // starts. futures::future::join_all( - config.requests.iter().map(|request| async move { - info!(self.log, "observing request: {:#?}", request); - let dur = std::time::Duration::from_secs(60); - let client = reqwest::ClientBuilder::new() - .connect_timeout(dur) - .timeout(dur) - .build()?; - let client = sled_agent_client::Client::new_with_client( - &format!("http://{}", request.sled_address), - client, - self.log.new(o!("SledAgentClient" => request.sled_address)), - ); - - info!(self.log, "sending service requests..."); - let services_put = || async { - info!(self.log, "initializing sled services: {:?}", request.services); - client.services_put( - &sled_agent_client::types::ServiceEnsureBody { - services: request.services.iter().map(|s| s.clone().into()).collect() - }) - .await - .map_err(BackoffError::transient)?; - Ok::< - (), - BackoffError< - sled_agent_client::Error, - >, - >(()) - }; - let log_failure = |error, _| { - warn!(self.log, "failed to initialize services"; "error" => ?error); - }; - retry_notify( - internal_service_policy(), - services_put, - log_failure, - ).await?; - Ok::<(), SetupServiceError>(()) + requests.iter().map(|(request, sled_address)| async move { + self.initialize_services(*sled_address, &request.services).await?; + Ok(()) }) ).await.into_iter().collect::, SetupServiceError>>()?; diff --git a/sled-agent/src/server.rs b/sled-agent/src/server.rs index 0c8c68331a5..917fe2202d6 100644 --- a/sled-agent/src/server.rs +++ b/sled-agent/src/server.rs @@ -9,7 +9,7 @@ use super::http_entrypoints::api as http_api; use super::sled_agent::SledAgent; use crate::nexus::NexusClient; use slog::Drain; - +use uuid::Uuid; use omicron_common::backoff::{ internal_service_policy, retry_notify, BackoffError, }; @@ -23,6 +23,10 @@ pub struct Server { } impl Server { + pub fn id(&self) -> Uuid { + self.http_server.app_private().id() + } + /// Starts a SledAgent server pub async fn start(config: &Config) -> Result { let (drain, registration) = slog_dtrace::with_drain( diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index d15a4f1f468..78df9c20828 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -68,6 +68,9 @@ impl From for omicron_common::api::external::Error { /// /// Contains both a connection to the Nexus, as well as managed instances. pub struct SledAgent { + // ID of the Sled + id: Uuid, + // Component of Sled Agent responsible for storage and dataset management. storage: StorageManager, @@ -164,9 +167,17 @@ impl SledAgent { ServiceManager::new(log.clone(), config.data_link.clone(), None) .await?; - Ok(SledAgent { storage, instances, nexus_client, services }) + Ok(SledAgent { + id: config.id, + storage, + instances, + nexus_client, + services + }) } + pub fn id(&self) -> Uuid { self.id } + /// Ensures that particular services should be initialized. /// /// These services will be instantiated by this function, will be recorded diff --git a/smf/sled-agent/config-rss.toml b/smf/sled-agent/config-rss.toml index 03eb99d01bb..2497d68e416 100644 --- a/smf/sled-agent/config-rss.toml +++ b/smf/sled-agent/config-rss.toml @@ -1,7 +1,10 @@ # RSS (Rack Setup Service) "stand-in" configuration. -[[request]] -sled_address = "[fd00:1de::1]:12345" +# The /56 subnet for the rack. +# Also implies the /48 AZ subnet. +# |............| <- This /48 is the AZ Subnet +# |...............| <- This /56 is the Rack Subnet +rack_subnet = "fd00:1122:3344:01::" # TODO(https://github.com/oxidecomputer/omicron/issues/732): Nexus # should allocate crucible datasets. From e433d41734827f7c06fb12ae7681c5420a0486de Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 8 Apr 2022 17:30:36 -0400 Subject: [PATCH 05/19] Record subnet after initialization, plumb sled agent IP --- sled-agent/src/bin/sled-agent.rs | 4 +--- sled-agent/src/bootstrap/agent.rs | 15 +++++++++++++-- sled-agent/src/config.rs | 17 +++++++++++++---- sled-agent/src/rack_setup/service.rs | 16 +++------------- sled-agent/src/server.rs | 10 +++++++--- sled-agent/src/sled_agent.rs | 3 ++- smf/sled-agent/config-rss.toml | 2 ++ smf/sled-agent/config.toml | 10 ---------- 8 files changed, 41 insertions(+), 36 deletions(-) diff --git a/sled-agent/src/bin/sled-agent.rs b/sled-agent/src/bin/sled-agent.rs index b42979eb8fa..57059090f84 100644 --- a/sled-agent/src/bin/sled-agent.rs +++ b/sled-agent/src/bin/sled-agent.rs @@ -81,10 +81,8 @@ async fn do_run() -> Result<(), CmdError> { } }, Args::Run { config_path } => { - let mut config = SledConfig::from_file(&config_path) + let config = SledConfig::from_file(&config_path) .map_err(|e| CmdError::Failure(e.to_string()))?; - config.dropshot.request_body_max_bytes = 1024 * 1024; - let config = config; // - Sled agent starts with the normal config file - typically // called "config.toml". diff --git a/sled-agent/src/bootstrap/agent.rs b/sled-agent/src/bootstrap/agent.rs index 48305277346..332c12614d0 100644 --- a/sled-agent/src/bootstrap/agent.rs +++ b/sled-agent/src/bootstrap/agent.rs @@ -140,7 +140,6 @@ impl Agent { request: SledAgentRequest, ) -> Result { info!(&self.log, "Loading Sled Agent: {:?}", request); - // TODO: actually use request.ip // TODO: actually use request.uuid let mut maybe_agent = self.sled_agent.lock().await; @@ -151,9 +150,21 @@ impl Agent { }); } // Server does not exist, initialize it. - let server = SledServer::start(&self.sled_config).await.map_err(|e| BootstrapError::SledError(e))?; + let sled_address = crate::config::get_sled_address(request.ip); + let server = SledServer::start(&self.sled_config, sled_address) + .await + .map_err(|e| BootstrapError::SledError(e))?; maybe_agent.replace(server); + // Record the subnet, so the sled agent can be automatically + // initialized on the next boot. + tokio::fs::write( + get_subnet_path(), + &toml::to_string( + &toml::Value::try_from(&request.ip).expect("Cannot serialize IP") + ).expect("Cannot convert toml to string") + ).await?; + Ok(SledAgentResponse { id: self.sled_config.id, }) diff --git a/sled-agent/src/config.rs b/sled-agent/src/config.rs index 5812b89e839..18cb946fa0c 100644 --- a/sled-agent/src/config.rs +++ b/sled-agent/src/config.rs @@ -7,13 +7,24 @@ use crate::common::vlan::VlanID; use crate::illumos::dladm::PhysicalLink; use crate::illumos::zpool::ZpoolName; -use dropshot::ConfigDropshot; use dropshot::ConfigLogging; +use omicron_common::api::external::Ipv6Net; use serde::Deserialize; -use std::net::SocketAddr; +use std::net::{IpAddr, Ipv6Addr, SocketAddr}; use std::path::Path; use uuid::Uuid; +pub const SLED_AGENT_PORT: u16 = 12345; + +/// Given a subnet, return the sled agent address. +pub(crate) fn get_sled_address(subnet: Ipv6Net) -> SocketAddr { + let sled_agent_ip = Ipv6Addr::from(u128::from(subnet.ip()) + 1); + SocketAddr::new( + IpAddr::V6(sled_agent_ip), + SLED_AGENT_PORT, + ) +} + /// Configuration for a sled agent #[derive(Clone, Debug, Deserialize)] pub struct Config { @@ -23,8 +34,6 @@ pub struct Config { pub bootstrap_address: SocketAddr, /// Address of Nexus instance pub nexus_address: SocketAddr, - /// Configuration for the sled agent dropshot server - pub dropshot: ConfigDropshot, /// Configuration for the sled agent debug log pub log: ConfigLogging, /// Optional VLAN ID to be used for tagging guest VNICs. diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index e08b7b3c6e8..46fffe3cecb 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -6,21 +6,16 @@ use crate::bootstrap::client as bootstrap_agent_client; use crate::bootstrap::discovery::PeerMonitorObserver; +use crate::config::get_sled_address; use super::config::SetupServiceConfig as Config; +use omicron_common::api::external::Ipv6Net; use omicron_common::backoff::{ internal_service_policy, retry_notify, BackoffError, }; use slog::Logger; -use std::net::Ipv6Addr; use thiserror::Error; use tokio::sync::Mutex; -const SLED_AGENT_PORT: u16 = 12345; - -fn next_address(addr: Ipv6Addr) -> Ipv6Addr { - Ipv6Addr::from(u128::from(addr) + 1) -} - /// Describes errors which may occur while operating the setup service. #[derive(Error, Debug)] pub enum SetupServiceError { @@ -302,13 +297,8 @@ impl ServiceInner { let subnet = config.sled_subnet(sled_subnet_index); self.initialize_sled_agent(*bootstrap_addr, subnet).await?; - let sled_agent_ip = next_address(subnet.ip()); - let sled_address = std::net::SocketAddr::new( - std::net::IpAddr::V6(sled_agent_ip), - SLED_AGENT_PORT, - ); - // Next, initialize any datasets on sleds that need it. + let sled_address = get_sled_address(Ipv6Net(subnet)); self.initialize_datasets( sled_address, &request.datasets, diff --git a/sled-agent/src/server.rs b/sled-agent/src/server.rs index 917fe2202d6..07fe823866c 100644 --- a/sled-agent/src/server.rs +++ b/sled-agent/src/server.rs @@ -14,6 +14,7 @@ use omicron_common::backoff::{ internal_service_policy, retry_notify, BackoffError, }; use std::sync::Arc; +use std::net::SocketAddr; /// Packages up a [`SledAgent`], running the sled agent API under a Dropshot /// server wired up to the sled agent @@ -28,7 +29,7 @@ impl Server { } /// Starts a SledAgent server - pub async fn start(config: &Config) -> Result { + pub async fn start(config: &Config, addr: SocketAddr) -> Result { let (drain, registration) = slog_dtrace::with_drain( config.log.to_logger("sled-agent").map_err(|message| { format!("initializing logger: {}", message) @@ -54,13 +55,16 @@ impl Server { "component" => "SledAgent", "server" => config.id.clone().to_string() )); - let sled_agent = SledAgent::new(&config, sa_log, nexus_client.clone()) + let sled_agent = SledAgent::new(&config, sa_log, nexus_client.clone(), addr) .await .map_err(|e| e.to_string())?; + let mut dropshot_config = dropshot::ConfigDropshot::default(); + dropshot_config.request_body_max_bytes = 1024 * 1024; + dropshot_config.bind_address = addr; let dropshot_log = log.new(o!("component" => "dropshot")); let http_server = dropshot::HttpServerStarter::new( - &config.dropshot, + &dropshot_config, http_api(), sled_agent, &dropshot_log, diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 988a46de3ba..43e4ff8688d 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -89,6 +89,7 @@ impl SledAgent { config: &Config, log: Logger, nexus_client: Arc, + sled_address: SocketAddr, ) -> Result { let id = &config.id; let vlan = config.vlan; @@ -113,7 +114,7 @@ impl SledAgent { // configuration file. Zones::ensure_has_global_zone_v6_address( config.data_link.clone(), - config.dropshot.bind_address.ip(), + sled_address.ip(), )?; // Identify all existing zones which should be managed by the Sled diff --git a/smf/sled-agent/config-rss.toml b/smf/sled-agent/config-rss.toml index 2497d68e416..b4f0b0fbcd6 100644 --- a/smf/sled-agent/config-rss.toml +++ b/smf/sled-agent/config-rss.toml @@ -6,6 +6,8 @@ # |...............| <- This /56 is the Rack Subnet rack_subnet = "fd00:1122:3344:01::" +[[request]] + # TODO(https://github.com/oxidecomputer/omicron/issues/732): Nexus # should allocate crucible datasets. [[request.dataset]] diff --git a/smf/sled-agent/config.toml b/smf/sled-agent/config.toml index 96dc361254d..027f4fc774f 100644 --- a/smf/sled-agent/config.toml +++ b/smf/sled-agent/config.toml @@ -20,16 +20,6 @@ zpools = [ # $ dladm show-phys -p -o LINK # data_link = "igb0" -# Address of the Sled Agent itself -# -# With the usage of non-global zones, we no longer can use localhost addresses, -# as Nexus (within a Zone) is effectively "on a different machine" from the -# sled agent. -[dropshot] -bind_address = "[fd00:1de::1]:12345" - [log] level = "info" mode = "stderr-terminal" - - From 3b1502024c27b2d97714be9f1ee44b3cc4768622 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 11 Apr 2022 17:47:27 -0400 Subject: [PATCH 06/19] Using MAC-derived bootstrap agent addresses --- Cargo.lock | 1 + sled-agent/Cargo.toml | 1 + sled-agent/src/bin/sled-agent.rs | 57 +++--- sled-agent/src/bootstrap/agent.rs | 98 ++++++++--- sled-agent/src/bootstrap/config.rs | 2 + sled-agent/src/bootstrap/discovery.rs | 118 +++++++------ sled-agent/src/bootstrap/http_entrypoints.rs | 10 +- sled-agent/src/bootstrap/mod.rs | 2 +- sled-agent/src/bootstrap/multicast.rs | 90 ++-------- sled-agent/src/bootstrap/params.rs | 2 +- sled-agent/src/bootstrap/server.rs | 16 +- .../src/bootstrap/trust_quorum/client.rs | 10 +- sled-agent/src/config.rs | 22 ++- sled-agent/src/illumos/dladm.rs | 35 ++++ sled-agent/src/illumos/zone.rs | 17 +- sled-agent/src/lib.rs | 2 +- sled-agent/src/rack_setup/config.rs | 3 +- sled-agent/src/rack_setup/service.rs | 162 ++++++++++++------ sled-agent/src/server.rs | 20 ++- sled-agent/src/sled_agent.rs | 13 +- .../commands.rs} | 0 sled-agent/tests/integration_tests/mod.rs | 7 + .../tests/integration_tests/multicast.rs | 71 ++++++++ sled-agent/tests/mod.rs | 17 ++ smf/sled-agent/config.toml | 3 +- 25 files changed, 492 insertions(+), 287 deletions(-) rename sled-agent/tests/{test_commands.rs => integration_tests/commands.rs} (100%) create mode 100644 sled-agent/tests/integration_tests/mod.rs create mode 100644 sled-agent/tests/integration_tests/multicast.rs create mode 100644 sled-agent/tests/mod.rs diff --git a/Cargo.lock b/Cargo.lock index 0552ea2f4a8..dd9fb302d06 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2515,6 +2515,7 @@ dependencies = [ "futures", "http", "ipnetwork", + "macaddr", "mockall", "nexus-client", "omicron-common", diff --git a/sled-agent/Cargo.toml b/sled-agent/Cargo.toml index 6cc2069fbeb..fa3b189ce7e 100644 --- a/sled-agent/Cargo.toml +++ b/sled-agent/Cargo.toml @@ -17,6 +17,7 @@ crucible-agent-client = { git = "https://github.com/oxidecomputer/crucible", rev dropshot = { git = "https://github.com/oxidecomputer/dropshot", branch = "main", features = [ "usdt-probes" ] } futures = "0.3.21" ipnetwork = "0.18" +macaddr = { version = "1.0.1", features = [ "serde_std" ] } nexus-client = { path = "../nexus-client" } omicron-common = { path = "../common" } p256 = "0.9.0" diff --git a/sled-agent/src/bin/sled-agent.rs b/sled-agent/src/bin/sled-agent.rs index 57059090f84..82000ceb308 100644 --- a/sled-agent/src/bin/sled-agent.rs +++ b/sled-agent/src/bin/sled-agent.rs @@ -4,8 +4,6 @@ //! Executable program to run the sled agent -#![feature(async_closure)] - use dropshot::ConfigDropshot; use dropshot::ConfigLogging; use dropshot::ConfigLoggingLevel; @@ -13,10 +11,12 @@ use omicron_common::api::external::Error; use omicron_common::cmd::fatal; use omicron_common::cmd::CmdError; use omicron_sled_agent::bootstrap::{ - config::Config as BootstrapConfig, server as bootstrap_server, + agent::bootstrap_address, config::Config as BootstrapConfig, + server as bootstrap_server, }; use omicron_sled_agent::rack_setup::config::SetupServiceConfig as RssConfig; use omicron_sled_agent::{config::Config as SledConfig, server as sled_server}; +use std::net::SocketAddr; use std::path::PathBuf; use structopt::StructOpt; @@ -111,11 +111,18 @@ async fn do_run() -> Result<(), CmdError> { None }; + // Derive the bootstrap address from the data link's MAC address. + let link = config + .get_link() + .map_err(|e| CmdError::Failure(e.to_string()))?; + let bootstrap_address = bootstrap_address(link) + .map_err(|e| CmdError::Failure(e.to_string()))?; + // Configure and run the Bootstrap server. let bootstrap_config = BootstrapConfig { id: config.id, dropshot: ConfigDropshot { - bind_address: config.bootstrap_address, + bind_address: SocketAddr::V6(bootstrap_address), request_body_max_bytes: 1024 * 1024, ..Default::default() }, @@ -124,37 +131,21 @@ async fn do_run() -> Result<(), CmdError> { }, rss_config, }; - let run_bootstrap = async move || -> Result<(), CmdError> { - // TODO: It's a little silly to pass the config this way. - bootstrap_server::Server::start(bootstrap_config, config) - .await - .map_err(CmdError::Failure)? - .wait_for_finish() - .await - .map_err(CmdError::Failure) - }; - /* - let run_sled_server = async move || -> Result<(), CmdError> { - sled_server::Server::start(&config) - .await - .map_err(CmdError::Failure)? - .wait_for_finish() - .await - .map_err(CmdError::Failure) - }; - */ + // TODO: It's a little silly to pass the config this way - namely, + // that we construct the bootstrap config from `config`, but then + // pass it separately just so the sled agent can ingest it later on. + bootstrap_server::Server::start( + *bootstrap_address.ip(), + bootstrap_config, + config, + ) + .await + .map_err(CmdError::Failure)? + .wait_for_finish() + .await + .map_err(CmdError::Failure)?; - tokio::select! { - Err(e) = run_bootstrap() => { - eprintln!("Boot server exited unexpectedly: {:?}", e); - }, - /* - Err(e) = run_sled_server() => { - eprintln!("Sled server exited unexpectedly: {:?}", e); - }, - */ - } Ok(()) } } diff --git a/sled-agent/src/bootstrap/agent.rs b/sled-agent/src/bootstrap/agent.rs index 332c12614d0..66e3ad20c08 100644 --- a/sled-agent/src/bootstrap/agent.rs +++ b/sled-agent/src/bootstrap/agent.rs @@ -4,23 +4,26 @@ //! Bootstrap-related APIs. -use crate::config::Config as SledConfig; -use crate::server::Server as SledServer; -use super::config::Config; +use super::config::{Config, BOOTSTRAP_AGENT_PORT}; use super::discovery; +use super::params::SledAgentRequest; use super::trust_quorum::{ self, RackSecret, ShareDistribution, TrustQuorumError, }; -use super::params::SledAgentRequest; -use super::views::{SledAgentResponse, ShareResponse}; +use super::views::{ShareResponse, SledAgentResponse}; +use crate::config::Config as SledConfig; +use crate::illumos::dladm::{self, Dladm, PhysicalLink}; +use crate::illumos::zone::{self, Zones}; use crate::rack_setup::service::Service as RackSetupService; -use omicron_common::api::external::Error as ExternalError; +use crate::server::Server as SledServer; +use omicron_common::api::external::{Error as ExternalError, MacAddr}; use omicron_common::backoff::{ internal_service_policy, retry_notify, BackoffError, }; use slog::Logger; use std::io; +use std::net::{Ipv6Addr, SocketAddrV6}; use std::path::{Path, PathBuf}; use thiserror::Error; use tokio::sync::Mutex; @@ -45,6 +48,9 @@ pub enum BootstrapError { #[error(transparent)] TrustQuorum(#[from] TrustQuorumError), + + #[error(transparent)] + Zone(#[from] zone::Error), } impl From for ExternalError { @@ -91,12 +97,44 @@ fn get_subnet_path() -> PathBuf { Path::new(omicron_common::OMICRON_CONFIG_PATH).join("subnet.toml") } +fn mac_to_socket_addr(mac: MacAddr) -> SocketAddrV6 { + let mac_bytes = mac.into_array(); + assert_eq!(6, mac_bytes.len()); + + let address = Ipv6Addr::new( + 0xfdb0, + ((mac_bytes[0] as u16) << 8) | mac_bytes[1] as u16, + ((mac_bytes[2] as u16) << 8) | mac_bytes[3] as u16, + ((mac_bytes[4] as u16) << 8) | mac_bytes[5] as u16, + 0, + 0, + 0, + 1, + ); + + SocketAddrV6::new(address, BOOTSTRAP_AGENT_PORT, 0, 0) +} + +pub fn bootstrap_address( + link: PhysicalLink, +) -> Result { + let mac = Dladm::get_mac(link)?; + Ok(mac_to_socket_addr(mac)) +} + impl Agent { pub async fn new( log: Logger, sled_config: SledConfig, + address: Ipv6Addr, ) -> Result { - let peer_monitor = discovery::PeerMonitor::new(&log)?; + Zones::ensure_has_global_zone_v6_address( + sled_config.data_link.clone(), + address, + "bootstrap6", + )?; + + let peer_monitor = discovery::PeerMonitor::new(&log, address)?; let share = read_key_share()?; let agent = Agent { log, @@ -111,7 +149,7 @@ impl Agent { if subnet_path.exists() { info!(agent.log, "Sled already configured, loading sled agent"); let sled_request: SledAgentRequest = toml::from_str( - &tokio::fs::read_to_string(&subnet_path).await? + &tokio::fs::read_to_string(&subnet_path).await?, )?; agent.request_agent(sled_request).await?; } @@ -145,9 +183,7 @@ impl Agent { let mut maybe_agent = self.sled_agent.lock().await; if let Some(server) = &*maybe_agent { // Server already exists, return it. - return Ok(SledAgentResponse { - id: server.id() - }); + return Ok(SledAgentResponse { id: server.id() }); } // Server does not exist, initialize it. let sled_address = crate::config::get_sled_address(request.ip); @@ -161,13 +197,14 @@ impl Agent { tokio::fs::write( get_subnet_path(), &toml::to_string( - &toml::Value::try_from(&request.ip).expect("Cannot serialize IP") - ).expect("Cannot convert toml to string") - ).await?; + &toml::Value::try_from(&request.ip) + .expect("Cannot serialize IP"), + ) + .expect("Cannot convert toml to string"), + ) + .await?; - Ok(SledAgentResponse { - id: self.sled_config.id, - }) + Ok(SledAgentResponse { id: self.sled_config.id }) } /// Communicates with peers, sharing secrets, until the rack has been @@ -178,7 +215,7 @@ impl Agent { let rack_secret = retry_notify( internal_service_policy(), || async { - let other_agents = self.peer_monitor.addrs().await; + let other_agents = self.peer_monitor.peer_addrs().await; info!( &self.log, "Bootstrap: Communicating with peers: {:?}", other_agents @@ -204,8 +241,13 @@ impl Agent { // Retrieve verified rack_secret shares from a quorum of agents let other_agents: Vec = other_agents .into_iter() - .map(|mut addr| { - addr.set_port(trust_quorum::PORT); + .map(|addr| { + let addr = SocketAddrV6::new( + addr, + trust_quorum::PORT, + 0, + 0, + ); trust_quorum::Client::new( &self.log, share.verifier.clone(), @@ -307,3 +349,19 @@ impl Agent { Ok(()) } } + +#[cfg(test)] +mod tests { + use super::*; + use macaddr::MacAddr6; + + #[test] + fn test_mac_to_socket_addr() { + let mac = MacAddr("a8:40:25:10:00:01".parse::().unwrap()); + + assert_eq!( + mac_to_socket_addr(mac).ip(), + &"fdb0:a840:2510:1::1".parse::().unwrap(), + ); + } +} diff --git a/sled-agent/src/bootstrap/config.rs b/sled-agent/src/bootstrap/config.rs index 1fec659b5b3..fc8951954ec 100644 --- a/sled-agent/src/bootstrap/config.rs +++ b/sled-agent/src/bootstrap/config.rs @@ -10,6 +10,8 @@ use serde::Deserialize; use serde::Serialize; use uuid::Uuid; +pub const BOOTSTRAP_AGENT_PORT: u16 = 12346; + /// Configuration for a bootstrap agent #[derive(Clone, Debug, Deserialize, Serialize, PartialEq)] pub struct Config { diff --git a/sled-agent/src/bootstrap/discovery.rs b/sled-agent/src/bootstrap/discovery.rs index b510942ddf0..4cb15d75196 100644 --- a/sled-agent/src/bootstrap/discovery.rs +++ b/sled-agent/src/bootstrap/discovery.rs @@ -8,7 +8,7 @@ use super::multicast; use slog::Logger; use std::collections::HashSet; use std::io; -use std::net::{Ipv6Addr, SocketAddr, SocketAddrV6}; +use std::net::{Ipv6Addr, SocketAddr}; use std::sync::Arc; use tokio::net::UdpSocket; use tokio::sync::{broadcast, Mutex}; @@ -34,18 +34,18 @@ pub struct PeerMonitor { // sleds here into "the sleds which we know were connected within the past // hour", for example. // - Have some other interface to identify the detachment of a peer. - sleds: Arc>>, - notification_sender: broadcast::Sender, + our_address: Ipv6Addr, + sleds: Arc>>, + notification_sender: broadcast::Sender, _worker: JoinHandle<()>, } async fn monitor_worker( log: Logger, - address: SocketAddrV6, sender: UdpSocket, listener: UdpSocket, - sleds: Arc>>, - notification_sender: broadcast::Sender, + sleds: Arc>>, + notification_sender: broadcast::Sender, ) { // Let this message be a reminder that this content is *not* // encrypted, authenticated, or otherwise verified. We're just using @@ -56,20 +56,24 @@ async fn monitor_worker( let mut buf = vec![0u8; 128]; tokio::select! { _ = tokio::time::sleep(tokio::time::Duration::from_millis(5000)) => { - trace!(log, "Bootstrap Peer Monitor: Broadcasting our own address: {}", address); - if let Err(e) = sender.try_send_to(message, address.into()) { + if let Err(e) = sender.try_send_to(message, SocketAddr::V6(multicast::multicast_address())) { warn!(log, "PeerMonitor failed to broadcast: {}", e); } } result = listener.recv_from(&mut buf) => { match result { Ok((_, addr)) => { - info!(log, "Bootstrap Peer Monitor: Successfully received an address: {}", addr); - let mut sleds = sleds.lock().await; - if sleds.insert(addr) { - // We don't actually care if no one is listening, so - // drop the error if that's the case. - let _ = notification_sender.send(addr); + match addr { + SocketAddr::V6(addr) => { + let mut sleds = sleds.lock().await; + if sleds.insert(*addr.ip()) { + info!(log, "Bootstrap Peer Monitor: Successfully received an address: {}", addr); + // We don't actually care if no one is listening, so + // drop the error if that's the case. + let _ = notification_sender.send(*addr.ip()); + } + } + _ => continue, } }, Err(e) => warn!(log, "PeerMonitor failed to receive: {}", e), @@ -81,16 +85,7 @@ async fn monitor_worker( impl PeerMonitor { /// Creates a new [`PeerMonitor`]. - // TODO: Address, port, interface, etc, probably should be - // configuration options. - pub fn new(log: &Logger) -> Result { - let scope = multicast::Ipv6MulticastScope::LinkLocal.first_hextet(); - let address = SocketAddrV6::new( - Ipv6Addr::new(scope, 0, 0, 0, 0, 0, 0, 0x1), - 7645, - 0, - 0, - ); + pub fn new(log: &Logger, address: Ipv6Addr) -> Result { let loopback = false; let interface = 0; let (sender, listener) = @@ -104,11 +99,22 @@ impl PeerMonitor { let notification_sender = tx.clone(); let worker = tokio::task::spawn(async move { - monitor_worker(log, address, sender, listener, sleds_for_worker, notification_sender) - .await + monitor_worker( + log, + sender, + listener, + sleds_for_worker, + notification_sender, + ) + .await }); - Ok(PeerMonitor { sleds, notification_sender: tx, _worker: worker }) + Ok(PeerMonitor { + our_address: address, + sleds, + notification_sender: tx, + _worker: worker, + }) } /// Returns the addresses of connected sleds. @@ -118,7 +124,7 @@ impl PeerMonitor { /// [`Self::observer`]. /// /// Note: These sleds have not yet been verified. - pub async fn addrs(&self) -> Vec { + pub async fn peer_addrs(&self) -> Vec { self.sleds.lock().await.iter().map(|addr| *addr).collect() } @@ -137,8 +143,9 @@ impl PeerMonitor { let sleds = self.sleds.lock().await.clone(); PeerMonitorObserver { - their_sleds: self.sleds.clone(), - our_sleds: sleds, + our_address: self.our_address, + actual_sleds: self.sleds.clone(), + observed_sleds: sleds, receiver, } } @@ -147,73 +154,86 @@ impl PeerMonitor { /// Provides a read-only view of monitored peers, with a mechanism for /// observing the incoming queue of new peers. pub struct PeerMonitorObserver { + our_address: Ipv6Addr, // A shared reference to the "true" set of sleds. // // This is only used to re-synchronize our set of sleds // if we get out-of-sync due to long notification queues. - their_sleds: Arc>>, + actual_sleds: Arc>>, // A local copy of the set of sleds. This lets observers // access + iterate over the set of sleds directly, // without any possibility of blocking the actual monitoring task. - our_sleds: HashSet, - receiver: broadcast::Receiver, + observed_sleds: HashSet, + receiver: broadcast::Receiver, } impl PeerMonitorObserver { - /// Returns the addresses of all connected sleds. + /// Returns the address of this sled. + pub fn our_address(&self) -> Ipv6Addr { + self.our_address + } + + /// Returns the addresses of all connected sleds, excluding + /// our own. /// /// This returns the most "up-to-date" view of peers, but a new /// peer may be added immediately after this function returns. /// /// To monitor for changes, a call to [`Self::recv`] /// can be made, to observe changes beyond an initial call to - /// [`Self::addrs`]. - pub async fn addrs(&mut self) -> &HashSet { + /// [`Self::peer_addrs`]. + pub async fn peer_addrs(&mut self) -> &HashSet { // First, drain the incoming queue of sled updates. loop { match self.receiver.try_recv() { Ok(new_addr) => { - self.our_sleds.insert(new_addr); + self.observed_sleds.insert(new_addr); } Err(broadcast::error::TryRecvError::Empty) => break, - Err(broadcast::error::TryRecvError::Closed) => panic!("Remote closed"), + Err(broadcast::error::TryRecvError::Closed) => { + panic!("Remote closed") + } Err(broadcast::error::TryRecvError::Lagged(_)) => { - self.our_sleds = self.their_sleds.lock().await.clone(); - }, + self.observed_sleds = + self.actual_sleds.lock().await.clone(); + } } } while let Ok(new_addr) = self.receiver.try_recv() { - self.our_sleds.insert(new_addr); + self.observed_sleds.insert(new_addr); } // Next, return the most up-to-date set of sleds. // - // Note that this set may change immediately after `addrs()` returns, + // Note that this set may change immediately after `peer_addrs()` returns, // but a caller can see exactly what sleds were added by calling // `recv()`. - &self.our_sleds + &self.observed_sleds } /// Returns information about a new connected sled. /// /// Note that this does not provide the "initial set" of connected - /// sleds - to access that information, call [`Self::addrs`]. + /// sleds - to access that information, call [`Self::peer_addrs`]. /// /// Returns [`Option::None`] if the notification queue overflowed, /// and we needed to re-synchronize the set of sleds. - pub async fn recv(&mut self) -> Option { + pub async fn recv(&mut self) -> Option { loop { match self.receiver.recv().await { Ok(new_addr) => { - if self.our_sleds.insert(new_addr) { + if self.observed_sleds.insert(new_addr) { return Some(new_addr); } } - Err(broadcast::error::RecvError::Closed) => panic!("Remote closed"), + Err(broadcast::error::RecvError::Closed) => { + panic!("Remote closed") + } Err(broadcast::error::RecvError::Lagged(_)) => { - self.our_sleds = self.their_sleds.lock().await.clone(); + self.observed_sleds = + self.actual_sleds.lock().await.clone(); return None; - }, + } } } } diff --git a/sled-agent/src/bootstrap/http_entrypoints.rs b/sled-agent/src/bootstrap/http_entrypoints.rs index 86c41f7fd44..8c2952a8054 100644 --- a/sled-agent/src/bootstrap/http_entrypoints.rs +++ b/sled-agent/src/bootstrap/http_entrypoints.rs @@ -35,14 +35,8 @@ use std::sync::Arc; use super::agent::Agent; use super::{ - params::{ - ShareRequest, - SledAgentRequest, - }, - views::{ - ShareResponse, - SledAgentResponse, - }, + params::{ShareRequest, SledAgentRequest}, + views::{ShareResponse, SledAgentResponse}, }; /// Returns a description of the bootstrap agent API diff --git a/sled-agent/src/bootstrap/mod.rs b/sled-agent/src/bootstrap/mod.rs index d55a775c530..fd1c317d338 100644 --- a/sled-agent/src/bootstrap/mod.rs +++ b/sled-agent/src/bootstrap/mod.rs @@ -9,7 +9,7 @@ pub mod client; pub mod config; pub mod discovery; mod http_entrypoints; -mod multicast; +pub mod multicast; mod params; pub mod server; mod spdm; diff --git a/sled-agent/src/bootstrap/multicast.rs b/sled-agent/src/bootstrap/multicast.rs index 78a611ff978..5bf69d3cc10 100644 --- a/sled-agent/src/bootstrap/multicast.rs +++ b/sled-agent/src/bootstrap/multicast.rs @@ -81,13 +81,7 @@ fn new_ipv6_udp_listener( socket.set_reuse_address(true)?; socket.join_multicast_v6(addr.ip(), interface)?; - - // TODO: I tried binding on the input value of "addr.ip()", but doing so - // returns errno 22 ("Invalid Input"). - // - // This may be binding to a larger address range than we want. - let bind_address = - SocketAddrV6::new(Ipv6Addr::UNSPECIFIED, addr.port(), 0, 0); + let bind_address = SocketAddrV6::new(*addr.ip(), addr.port(), 0, 0); socket.bind(&(bind_address).into())?; // Convert from: socket2 -> std -> tokio @@ -96,99 +90,41 @@ fn new_ipv6_udp_listener( /// Create a new sending socket, capable of sending IPv6 multicast traffic. fn new_ipv6_udp_sender( + addr: &Ipv6Addr, loopback: bool, interface: u32, ) -> io::Result { let socket = new_ipv6_udp_socket()?; socket.set_multicast_loop_v6(loopback)?; socket.set_multicast_if_v6(interface)?; - let address = SocketAddrV6::new(Ipv6Addr::UNSPECIFIED, 0, 0, 0); + let address = SocketAddrV6::new(*addr, 0, 0, 0); socket.bind(&address.into())?; UdpSocket::from_std(std::net::UdpSocket::from(socket)) } +pub fn multicast_address() -> SocketAddrV6 { + let scope = Ipv6MulticastScope::LinkLocal.first_hextet(); + SocketAddrV6::new(Ipv6Addr::new(scope, 0, 0, 0, 0, 0, 0, 0x1), 7645, 0, 0) +} + /// Returns the (sender, receiver) sockets of an IPv6 UDP multicast group. /// -/// * `address`: The address to use. Consider a value from: -/// , -/// and the [`Ipv6MulticastScope`] helper to provide the first hextet. +/// * `address`: The address to use for sending. /// * `loopback`: If true, the receiver packet will see multicast packets sent /// on our sender, in addition to those sent by everyone else in the multicast /// group. /// * `interface`: The index of the interface to join (zero indicates "any /// interface"). pub fn new_ipv6_udp_pair( - address: &SocketAddrV6, + address: &Ipv6Addr, loopback: bool, interface: u32, ) -> io::Result<(UdpSocket, UdpSocket)> { - let sender = new_ipv6_udp_sender(loopback, interface)?; - let listener = new_ipv6_udp_listener(address, interface)?; + let sender = new_ipv6_udp_sender(&address, loopback, interface)?; + let listener = new_ipv6_udp_listener(&multicast_address(), interface)?; Ok((sender, listener)) } -#[cfg(test)] -mod test { - use super::*; - - // NOTE: This test is ignored by default - it relies on a networking - // setup that isn't consistent between our automated test infrastructure. - // It can still be run locally with: - // - // $ cargo test -p omicron-sled-agent -- --ignored - #[tokio::test] - #[ignore] - async fn test_multicast_ipv6() { - let message = b"Hello World!"; - let scope = Ipv6MulticastScope::LinkLocal.first_hextet(); - let address = SocketAddrV6::new( - Ipv6Addr::new(scope, 0, 0, 0, 0, 0, 0, 0x1), - 7645, - 0, - 0, - ); - - // For this test, we want to see our own transmission. - // Unlike most usage in the Sled Agent, this means we want - // loopback to be enabled. - let loopback = true; - let interface = 0; - let (sender, listener) = - new_ipv6_udp_pair(&address, loopback, interface).unwrap(); - - // Create a receiver task which reads for messages that have - // been broadcast, verifies the message, and returns the - // calling address. - let receiver_task_handle = tokio::task::spawn(async move { - let mut buf = vec![0u8; 32]; - let (len, addr) = listener.recv_from(&mut buf).await?; - assert_eq!(message.len(), len); - assert_eq!(message, &buf[..message.len()]); - Ok::<_, io::Error>(addr) - }); - - // Send a message repeatedly, and exit successfully if we - // manage to receive the response. - tokio::pin!(receiver_task_handle); - let mut send_count = 0; - loop { - tokio::select! { - result = sender.send_to(message, address) => { - assert_eq!(message.len(), result.unwrap()); - send_count += 1; - if send_count > 10 { - panic!("10 multicast UDP messages sent with no response"); - } - tokio::time::sleep(tokio::time::Duration::from_millis(10)).await; - } - result = &mut receiver_task_handle => { - let addr = result.unwrap().unwrap(); - eprintln!("Receiver received message: {:#?}", addr); - break; - } - } - } - } -} +// Refer to sled-agent/tests/integration_tests/multicast.rs for tests. diff --git a/sled-agent/src/bootstrap/params.rs b/sled-agent/src/bootstrap/params.rs index 7f7cb703666..d439bdadba4 100644 --- a/sled-agent/src/bootstrap/params.rs +++ b/sled-agent/src/bootstrap/params.rs @@ -4,10 +4,10 @@ //! Request body types for the bootstrap agent +use omicron_common::api::external::Ipv6Net; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use uuid::Uuid; -use omicron_common::api::external::Ipv6Net; /// Identity signed by local RoT and Oxide certificate chain. #[derive(Serialize, Deserialize, JsonSchema)] diff --git a/sled-agent/src/bootstrap/server.rs b/sled-agent/src/bootstrap/server.rs index 04e9b534e6d..ab34d13b1dd 100644 --- a/sled-agent/src/bootstrap/server.rs +++ b/sled-agent/src/bootstrap/server.rs @@ -4,11 +4,12 @@ //! Server API for bootstrap-related functionality. -use crate::config::Config as SledConfig; use super::agent::Agent; use super::config::Config; use super::http_entrypoints::ba_api as http_api; +use crate::config::Config as SledConfig; use slog::Drain; +use std::net::Ipv6Addr; use std::sync::Arc; /// Wraps a [Agent] object, and provides helper methods for exposing it @@ -19,7 +20,11 @@ pub struct Server { } impl Server { - pub async fn start(config: Config, sled_config: SledConfig) -> Result { + pub async fn start( + address: Ipv6Addr, + config: Config, + sled_config: SledConfig, + ) -> Result { let (drain, registration) = slog_dtrace::with_drain( config.log.to_logger("bootstrap-agent").map_err(|message| { format!("initializing logger: {}", message) @@ -39,8 +44,11 @@ impl Server { "component" => "BootstrapAgent", "server" => config.id.clone().to_string() )); - let bootstrap_agent = - Arc::new(Agent::new(ba_log, sled_config).await.map_err(|e| e.to_string())?); + let bootstrap_agent = Arc::new( + Agent::new(ba_log, sled_config, address) + .await + .map_err(|e| e.to_string())?, + ); let ba = Arc::clone(&bootstrap_agent); let dropshot_log = log.new(o!("component" => "dropshot")); diff --git a/sled-agent/src/bootstrap/trust_quorum/client.rs b/sled-agent/src/bootstrap/trust_quorum/client.rs index 30331bd6238..7eb1ff2808b 100644 --- a/sled-agent/src/bootstrap/trust_quorum/client.rs +++ b/sled-agent/src/bootstrap/trust_quorum/client.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -use std::net::SocketAddr; +use std::net::{SocketAddr, SocketAddrV6}; use slog::Logger; use tokio::net::TcpStream; @@ -16,15 +16,15 @@ use crate::bootstrap::spdm; pub struct Client { log: Logger, verifier: Verifier, - addr: SocketAddr, + addr: SocketAddrV6, } impl Client { - pub fn new(log: &Logger, verifier: Verifier, addr: SocketAddr) -> Client { + pub fn new(log: &Logger, verifier: Verifier, addr: SocketAddrV6) -> Client { Client { log: log.clone(), verifier, addr } } - pub fn addr(&self) -> &SocketAddr { + pub fn addr(&self) -> &SocketAddrV6 { &self.addr } @@ -49,7 +49,7 @@ impl Client { if self.verifier.verify(&share) { Ok(share) } else { - Err(TrustQuorumError::InvalidShare(self.addr)) + Err(TrustQuorumError::InvalidShare(SocketAddr::V6(self.addr))) } } } diff --git a/sled-agent/src/config.rs b/sled-agent/src/config.rs index 18cb946fa0c..ea0b4db9ead 100644 --- a/sled-agent/src/config.rs +++ b/sled-agent/src/config.rs @@ -5,24 +5,21 @@ //! Interfaces for working with sled agent configuration use crate::common::vlan::VlanID; -use crate::illumos::dladm::PhysicalLink; +use crate::illumos::dladm::{self, Dladm, PhysicalLink}; use crate::illumos::zpool::ZpoolName; use dropshot::ConfigLogging; use omicron_common::api::external::Ipv6Net; use serde::Deserialize; -use std::net::{IpAddr, Ipv6Addr, SocketAddr}; +use std::net::{Ipv6Addr, SocketAddr, SocketAddrV6}; use std::path::Path; use uuid::Uuid; pub const SLED_AGENT_PORT: u16 = 12345; /// Given a subnet, return the sled agent address. -pub(crate) fn get_sled_address(subnet: Ipv6Net) -> SocketAddr { +pub(crate) fn get_sled_address(subnet: Ipv6Net) -> SocketAddrV6 { let sled_agent_ip = Ipv6Addr::from(u128::from(subnet.ip()) + 1); - SocketAddr::new( - IpAddr::V6(sled_agent_ip), - SLED_AGENT_PORT, - ) + SocketAddrV6::new(sled_agent_ip, SLED_AGENT_PORT, 0, 0) } /// Configuration for a sled agent @@ -30,8 +27,6 @@ pub(crate) fn get_sled_address(subnet: Ipv6Net) -> SocketAddr { pub struct Config { /// Unique id for the sled pub id: Uuid, - /// Address of the Bootstrap Agent interface. - pub bootstrap_address: SocketAddr, /// Address of Nexus instance pub nexus_address: SocketAddr, /// Configuration for the sled agent debug log @@ -62,4 +57,13 @@ impl Config { let config = toml::from_str(&contents)?; Ok(config) } + + pub fn get_link(&self) -> Result { + let link = if let Some(link) = self.data_link.clone() { + link + } else { + Dladm::find_physical()? + }; + Ok(link) + } } diff --git a/sled-agent/src/illumos/dladm.rs b/sled-agent/src/illumos/dladm.rs index ed99ad46cbb..eeb70358428 100644 --- a/sled-agent/src/illumos/dladm.rs +++ b/sled-agent/src/illumos/dladm.rs @@ -8,6 +8,7 @@ use crate::common::vlan::VlanID; use crate::illumos::{execute, ExecutionError, PFEXEC}; use omicron_common::api::external::MacAddr; use serde::{Deserialize, Serialize}; +use std::str::FromStr; pub const VNIC_PREFIX: &str = "ox"; pub const VNIC_PREFIX_CONTROL: &str = "oxControl"; @@ -24,6 +25,9 @@ pub enum Error { #[error("Failed to parse output: {0}")] Parse(#[from] std::string::FromUtf8Error), + + #[error("Failed to parse MAC: {0}")] + ParseMac(#[from] macaddr::ParseError), } /// The name of a physical datalink. @@ -52,6 +56,37 @@ impl Dladm { Ok(PhysicalLink(name)) } + /// Returns the MAC address of a physical link. + pub fn get_mac(link: PhysicalLink) -> Result { + let mut command = std::process::Command::new(PFEXEC); + let cmd = command.args(&[ + DLADM, + "show-phys", + "-m", + "-p", + "-o", + "ADDRESS", + &link.0, + ]); + let output = execute(cmd)?; + let name = String::from_utf8(output.stdout)? + .lines() + .next() + .map(|s| s.trim()) + .ok_or_else(|| Error::NotFound)? + .to_string(); + + // Ensure the MAC address is zero-padded, so it may be parsed as a + // MacAddr. This converts segments like ":a" to ":0a". + let name = name + .split(":") + .map(|segment| format!("{:0>2}", segment)) + .collect::>() + .join(":"); + let mac = MacAddr::from_str(&name)?; + Ok(mac) + } + /// Creates a new VNIC atop a physical device. /// /// * `physical`: The physical link on top of which a device will be diff --git a/sled-agent/src/illumos/zone.rs b/sled-agent/src/illumos/zone.rs index 6ce91132b64..24338f99a82 100644 --- a/sled-agent/src/illumos/zone.rs +++ b/sled-agent/src/illumos/zone.rs @@ -6,7 +6,7 @@ use ipnetwork::IpNetwork; use slog::Logger; -use std::net::IpAddr; +use std::net::{IpAddr, Ipv6Addr}; use crate::illumos::addrobj::AddrObject; use crate::illumos::dladm::{Dladm, PhysicalLink, VNIC_PREFIX_CONTROL}; @@ -413,15 +413,10 @@ impl Zones { // from RSS. pub fn ensure_has_global_zone_v6_address( physical_link: Option, - address: IpAddr, + address: Ipv6Addr, + name: &str, ) -> Result<(), Error> { - if !address.is_ipv6() { - return Err(Error::Ip(address.into())); - } - - // Ensure that addrconf has been set up in the Global - // Zone. - + // Ensure that addrconf has been set up in the Global Zone. let link = if let Some(link) = physical_link { link } else { @@ -438,8 +433,8 @@ impl Zones { // prefix. Anything else must be routed through Sidecar. Self::ensure_address( None, - &gz_link_local_addrobj.on_same_interface("sled6")?, - AddressRequest::new_static(address, Some(64)), + &gz_link_local_addrobj.on_same_interface(name)?, + AddressRequest::new_static(IpAddr::V6(address), Some(64)), )?; Ok(()) } diff --git a/sled-agent/src/lib.rs b/sled-agent/src/lib.rs index 89eec21bd7f..c1ecae6a5e2 100644 --- a/sled-agent/src/lib.rs +++ b/sled-agent/src/lib.rs @@ -22,7 +22,7 @@ pub mod common; pub mod bootstrap; pub mod config; mod http_entrypoints; -mod illumos; +pub mod illumos; mod instance; mod instance_manager; mod nexus; diff --git a/sled-agent/src/rack_setup/config.rs b/sled-agent/src/rack_setup/config.rs index c1d157fe913..e036ace1b8f 100644 --- a/sled-agent/src/rack_setup/config.rs +++ b/sled-agent/src/rack_setup/config.rs @@ -63,6 +63,7 @@ impl SetupServiceConfig { // // 0001:0203:0405:0607:: rack_network[7] = index; - ipnetwork::Ipv6Network::new(std::net::Ipv6Addr::from(rack_network), 64).unwrap() + ipnetwork::Ipv6Network::new(std::net::Ipv6Addr::from(rack_network), 64) + .unwrap() } } diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index 46fffe3cecb..9213bb9b686 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -4,15 +4,18 @@ //! Rack Setup Service implementation -use crate::bootstrap::client as bootstrap_agent_client; +use super::config::SetupServiceConfig as Config; use crate::bootstrap::discovery::PeerMonitorObserver; +use crate::bootstrap::{ + client as bootstrap_agent_client, config::BOOTSTRAP_AGENT_PORT, +}; use crate::config::get_sled_address; -use super::config::SetupServiceConfig as Config; use omicron_common::api::external::Ipv6Net; use omicron_common::backoff::{ internal_service_policy, retry_notify, BackoffError, }; use slog::Logger; +use std::net::{SocketAddr, SocketAddrV6}; use thiserror::Error; use tokio::sync::Mutex; @@ -23,7 +26,10 @@ pub enum SetupServiceError { Io(#[from] std::io::Error), #[error("Error making HTTP request to Bootstrap Agent: {0}")] - BootstrapApi(#[from] bootstrap_agent_client::Error), + BootstrapApi( + #[from] + bootstrap_agent_client::Error, + ), #[error("Error making HTTP request to Sled Agent: {0}")] SledApi(#[from] sled_agent_client::Error), @@ -51,7 +57,11 @@ impl Service { /// - `config`: The config file, which is used to setup the rack. /// - `peer_monitor`: The mechanism by which the setup service discovers /// bootstrap agents on nearby sleds. - pub fn new(log: Logger, config: Config, peer_monitor: PeerMonitorObserver) -> Self { + pub fn new( + log: Logger, + config: Config, + peer_monitor: PeerMonitorObserver, + ) -> Self { let handle = tokio::task::spawn(async move { let svc = ServiceInner::new(log, peer_monitor); svc.inject_rack_setup_requests(&config).await @@ -79,7 +89,7 @@ impl ServiceInner { async fn initialize_sled_agent( &self, - bootstrap_addr: std::net::SocketAddr, + bootstrap_addr: SocketAddrV6, subnet: ipnetwork::Ipv6Network, ) -> Result<(), SetupServiceError> { let dur = std::time::Duration::from_secs(60); @@ -88,22 +98,38 @@ impl ServiceInner { .connect_timeout(dur) .timeout(dur) .build()?; + + // TODO: Can we just use a type that avoids the need for this + // conversion? + let url = format!( + "http://[{}]:{}", + bootstrap_addr.ip(), + BOOTSTRAP_AGENT_PORT, + ); + info!(self.log, "Sending request to peer agent: {}", url); let client = bootstrap_agent_client::Client::new_with_client( - &format!("http://{}", bootstrap_addr), + &url, client, - self.log.new(o!("BootstrapAgentClient" => bootstrap_addr.clone())), + self.log.new(o!("BootstrapAgentClient" => url.clone())), ); let sled_agent_initialize = || async { - client.start_sled(&bootstrap_agent_client::types::SledAgentRequest { - uuid: uuid::Uuid::new_v4(), // TODO: not rando - ip: bootstrap_agent_client::types::Ipv6Net(subnet.to_string()), - }).await.map_err(BackoffError::transient)?; + client + .start_sled(&bootstrap_agent_client::types::SledAgentRequest { + uuid: uuid::Uuid::new_v4(), // TODO: not rando + ip: bootstrap_agent_client::types::Ipv6Net( + subnet.to_string(), + ), + }) + .await + .map_err(BackoffError::transient)?; Ok::< (), BackoffError< - bootstrap_agent_client::Error, + bootstrap_agent_client::Error< + bootstrap_agent_client::types::Error, + >, >, >(()) }; @@ -115,13 +141,15 @@ impl ServiceInner { internal_service_policy(), sled_agent_initialize, log_failure, - ).await?; + ) + .await?; + info!(self.log, "Peer agent at {} initialized", url); Ok(()) } async fn initialize_datasets( &self, - sled_address: std::net::SocketAddr, + sled_address: SocketAddr, datasets: &Vec, ) -> Result<(), SetupServiceError> { let dur = std::time::Duration::from_secs(60); @@ -140,13 +168,16 @@ impl ServiceInner { for dataset in datasets { let filesystem_put = || async { info!(self.log, "creating new filesystem: {:?}", dataset); - client.filesystem_put(&dataset.clone().into()) + client + .filesystem_put(&dataset.clone().into()) .await .map_err(BackoffError::transient)?; Ok::< (), BackoffError< - sled_agent_client::Error, + sled_agent_client::Error< + sled_agent_client::types::Error, + >, >, >(()) }; @@ -157,14 +188,15 @@ impl ServiceInner { internal_service_policy(), filesystem_put, log_failure, - ).await?; + ) + .await?; } Ok(()) } async fn initialize_services( &self, - sled_address: std::net::SocketAddr, + sled_address: SocketAddr, services: &Vec, ) -> Result<(), SetupServiceError> { let dur = std::time::Duration::from_secs(60); @@ -181,9 +213,12 @@ impl ServiceInner { info!(self.log, "sending service requests..."); let services_put = || async { info!(self.log, "initializing sled services: {:?}", services); - client.services_put( - &sled_agent_client::types::ServiceEnsureBody { - services: services.iter().map(|s| s.clone().into()).collect() + client + .services_put(&sled_agent_client::types::ServiceEnsureBody { + services: services + .iter() + .map(|s| s.clone().into()) + .collect(), }) .await .map_err(BackoffError::transient)?; @@ -197,11 +232,8 @@ impl ServiceInner { let log_failure = |error, _| { warn!(self.log, "failed to initialize services"; "error" => ?error); }; - retry_notify( - internal_service_policy(), - services_put, - log_failure, - ).await?; + retry_notify(internal_service_policy(), services_put, log_failure) + .await?; Ok(()) } @@ -272,11 +304,21 @@ impl ServiceInner { // Wait until we see enough neighbors to be able to set the // initial set of requests. let mut peer_monitor = self.peer_monitor.lock().await; - while peer_monitor.addrs().await.len() < config.requests.len() { + let our_address = peer_monitor.our_address(); + let mut addrs = peer_monitor.peer_addrs().await; + while addrs.len() + 1 < config.requests.len() { + info!( + self.log, + "# of peers ({}) < # of requests ({}), waiting for more to join...", + addrs.len(), config.requests.len() + ); peer_monitor.recv().await; + addrs = peer_monitor.peer_addrs().await; } + info!(self.log, "Enough peers to start configuring rack: {:?}", addrs); - let peers = peer_monitor.addrs().await.into_iter().enumerate(); + let addrs = + addrs.into_iter().chain([&our_address].into_iter()).enumerate(); // XXX Questions to consider: // - What if a sled comes online *right after* this setup? How does @@ -286,38 +328,52 @@ impl ServiceInner { // is assigning `/64`s based on the order peers have been seen. // Issue the dataset initialization requests to all sleds. - let requests = futures::future::join_all( - config.requests.iter().zip(peers).map(|(request, sled)| async move { - info!(self.log, "observing request: {:#?}", request); - let (idx, bootstrap_addr) = sled; - let sled_subnet_index = u8::try_from(idx + 1).expect("Too many peers!"); - - // First, connect to the Bootstrap Agent and tell it to - // initialize the Sled Agent with the specified subnet. - let subnet = config.sled_subnet(sled_subnet_index); - self.initialize_sled_agent(*bootstrap_addr, subnet).await?; - - // Next, initialize any datasets on sleds that need it. - let sled_address = get_sled_address(Ipv6Net(subnet)); - self.initialize_datasets( - sled_address, - &request.datasets, - ).await?; - Ok((request, sled_address)) - }) - ).await.into_iter().collect::, SetupServiceError>>()?; + let requests = + futures::future::join_all(config.requests.iter().zip(addrs).map( + |(request, sled)| async move { + info!(self.log, "observing request: {:#?}", request); + let (idx, bootstrap_addr) = sled; + let bootstrap_addr = SocketAddrV6::new( + *bootstrap_addr, + BOOTSTRAP_AGENT_PORT, + 0, + 0, + ); + let sled_subnet_index = + u8::try_from(idx + 1).expect("Too many peers!"); + + // First, connect to the Bootstrap Agent and tell it to + // initialize the Sled Agent with the specified subnet. + let subnet = config.sled_subnet(sled_subnet_index); + self.initialize_sled_agent(bootstrap_addr, subnet).await?; + + // Next, initialize any datasets on sleds that need it. + let sled_address = + SocketAddr::V6(get_sled_address(Ipv6Net(subnet))); + self.initialize_datasets(sled_address, &request.datasets) + .await?; + Ok((request, sled_address)) + }, + )) + .await + .into_iter() + .collect::, SetupServiceError>>()?; // Issue service initialization requests. // // Note that this must happen *after* the dataset initialization, // to ensure that CockroachDB has been initialized before Nexus // starts. - futures::future::join_all( - requests.iter().map(|(request, sled_address)| async move { - self.initialize_services(*sled_address, &request.services).await?; + futures::future::join_all(requests.iter().map( + |(request, sled_address)| async move { + self.initialize_services(*sled_address, &request.services) + .await?; Ok(()) - }) - ).await.into_iter().collect::, SetupServiceError>>()?; + }, + )) + .await + .into_iter() + .collect::, SetupServiceError>>()?; // Finally, make sure the configuration is saved so we don't inject // the requests on the next iteration. diff --git a/sled-agent/src/server.rs b/sled-agent/src/server.rs index 07fe823866c..baa28a7b7ff 100644 --- a/sled-agent/src/server.rs +++ b/sled-agent/src/server.rs @@ -8,13 +8,13 @@ use super::config::Config; use super::http_entrypoints::api as http_api; use super::sled_agent::SledAgent; use crate::nexus::NexusClient; -use slog::Drain; -use uuid::Uuid; use omicron_common::backoff::{ internal_service_policy, retry_notify, BackoffError, }; +use slog::Drain; +use std::net::{SocketAddr, SocketAddrV6}; use std::sync::Arc; -use std::net::SocketAddr; +use uuid::Uuid; /// Packages up a [`SledAgent`], running the sled agent API under a Dropshot /// server wired up to the sled agent @@ -29,7 +29,10 @@ impl Server { } /// Starts a SledAgent server - pub async fn start(config: &Config, addr: SocketAddr) -> Result { + pub async fn start( + config: &Config, + addr: SocketAddrV6, + ) -> Result { let (drain, registration) = slog_dtrace::with_drain( config.log.to_logger("sled-agent").map_err(|message| { format!("initializing logger: {}", message) @@ -55,13 +58,14 @@ impl Server { "component" => "SledAgent", "server" => config.id.clone().to_string() )); - let sled_agent = SledAgent::new(&config, sa_log, nexus_client.clone(), addr) - .await - .map_err(|e| e.to_string())?; + let sled_agent = + SledAgent::new(&config, sa_log, nexus_client.clone(), addr) + .await + .map_err(|e| e.to_string())?; let mut dropshot_config = dropshot::ConfigDropshot::default(); dropshot_config.request_body_max_bytes = 1024 * 1024; - dropshot_config.bind_address = addr; + dropshot_config.bind_address = SocketAddr::V6(addr); let dropshot_log = log.new(o!("component" => "dropshot")); let http_server = dropshot::HttpServerStarter::new( &dropshot_config, diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 43e4ff8688d..c7a989085ac 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -21,7 +21,7 @@ use omicron_common::api::{ internal::nexus::UpdateArtifact, }; use slog::Logger; -use std::net::SocketAddr; +use std::net::{SocketAddr, SocketAddrV6}; use std::sync::Arc; use uuid::Uuid; @@ -89,7 +89,7 @@ impl SledAgent { config: &Config, log: Logger, nexus_client: Arc, - sled_address: SocketAddr, + sled_address: SocketAddrV6, ) -> Result { let id = &config.id; let vlan = config.vlan; @@ -114,7 +114,8 @@ impl SledAgent { // configuration file. Zones::ensure_has_global_zone_v6_address( config.data_link.clone(), - sled_address.ip(), + *sled_address.ip(), + "sled6", )?; // Identify all existing zones which should be managed by the Sled @@ -177,11 +178,13 @@ impl SledAgent { storage, instances, nexus_client, - services + services, }) } - pub fn id(&self) -> Uuid { self.id } + pub fn id(&self) -> Uuid { + self.id + } /// Ensures that particular services should be initialized. /// diff --git a/sled-agent/tests/test_commands.rs b/sled-agent/tests/integration_tests/commands.rs similarity index 100% rename from sled-agent/tests/test_commands.rs rename to sled-agent/tests/integration_tests/commands.rs diff --git a/sled-agent/tests/integration_tests/mod.rs b/sled-agent/tests/integration_tests/mod.rs new file mode 100644 index 00000000000..6c6686f6543 --- /dev/null +++ b/sled-agent/tests/integration_tests/mod.rs @@ -0,0 +1,7 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +mod commands; +#[cfg(target_os = "illumos")] +mod multicast; diff --git a/sled-agent/tests/integration_tests/multicast.rs b/sled-agent/tests/integration_tests/multicast.rs new file mode 100644 index 00000000000..381f2fc3ab3 --- /dev/null +++ b/sled-agent/tests/integration_tests/multicast.rs @@ -0,0 +1,71 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use omicron_sled_agent::bootstrap; +use omicron_sled_agent::illumos::{dladm, zone}; +use std::io; +use std::net::IpAddr; + +#[tokio::test] +async fn test_multicast_bootstrap_address() { + // Setup the bootstrap address. + // + // This modifies global state of the target machine, creating + // an address named "bootstrap6", akin to what the bootstrap + // agent should do. + let link = dladm::Dladm::find_physical().unwrap(); + let address = bootstrap::agent::bootstrap_address(link.clone()).unwrap(); + zone::Zones::ensure_has_global_zone_v6_address( + Some(link), + *address.ip(), + "bootstrap6", + ) + .unwrap(); + + // Create the multicast pair. + let loopback = true; + let interface = 0; + let (sender, listener) = bootstrap::multicast::new_ipv6_udp_pair( + address.ip(), + loopback, + interface, + ) + .unwrap(); + + // Create a receiver task which reads for messages that have + // been broadcast, verifies the message, and returns the + // calling address. + let message = b"Hello World!"; + let receiver_task_handle = tokio::task::spawn(async move { + let mut buf = vec![0u8; 32]; + let (len, addr) = listener.recv_from(&mut buf).await?; + assert_eq!(message.len(), len); + assert_eq!(message, &buf[..message.len()]); + assert_eq!(addr.ip(), IpAddr::V6(*address.ip())); + Ok::<_, io::Error>(addr) + }); + + // Send a message repeatedly, and exit successfully if we + // manage to receive the response. + tokio::pin!(receiver_task_handle); + let mut send_count = 0; + loop { + tokio::select! { + result = sender.send_to(message, bootstrap::multicast::multicast_address()) => { + assert_eq!(message.len(), result.unwrap()); + send_count += 1; + if send_count > 10 { + panic!("10 multicast UDP messages sent with no response"); + } + tokio::time::sleep(tokio::time::Duration::from_millis(10)).await; + } + result = &mut receiver_task_handle => { + let addr = result.unwrap().unwrap(); + eprintln!("Receiver received message: {:#?}", addr); + assert_eq!(addr.ip(), IpAddr::V6(*address.ip())); + break; + } + } + } +} diff --git a/sled-agent/tests/mod.rs b/sled-agent/tests/mod.rs new file mode 100644 index 00000000000..325a59bfe25 --- /dev/null +++ b/sled-agent/tests/mod.rs @@ -0,0 +1,17 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Integration tests for the Sled Agent. +//! +//! Why use this weird layer of indirection, you might ask? Cargo chooses to +//! compile *each file* within the "tests/" subdirectory as a separate crate. +//! This means that doing "file-granularity" conditional compilation is +//! difficult, since a file like "test_for_illumos_only.rs" would get compiled +//! and tested regardless of the contents of "mod.rs". +//! +//! However, by lumping all tests into a submodule, all integration tests are +//! joined into a single crate, which itself can filter individual files +//! by (for example) choice of target OS. + +mod integration_tests; diff --git a/smf/sled-agent/config.toml b/smf/sled-agent/config.toml index 027f4fc774f..f96d2072148 100644 --- a/smf/sled-agent/config.toml +++ b/smf/sled-agent/config.toml @@ -2,7 +2,8 @@ id = "fb0f7546-4d46-40ca-9d56-cbb810684ca7" -bootstrap_address = "[::]:12346" +# TODO: Remove this address + # Internal address of Nexus nexus_address = "[fd00:1de::7]:12221" From f2eb4c8b5c641aa283fb8dcb6fb61afa2c38da0f Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 11 Apr 2022 22:03:17 -0400 Subject: [PATCH 07/19] Bugfixes: Don't wait for nexus notify, allocate addrs within sled subnet --- sled-agent/src/bootstrap/agent.rs | 2 + sled-agent/src/server.rs | 79 +++++++++++++++++-------------- sled-agent/src/storage_manager.rs | 9 ++++ smf/nexus/config.toml | 8 ++-- smf/oximeter/config.toml | 6 +-- smf/sled-agent/config-rss.toml | 21 ++++---- smf/sled-agent/config.toml | 2 +- 7 files changed, 75 insertions(+), 52 deletions(-) diff --git a/sled-agent/src/bootstrap/agent.rs b/sled-agent/src/bootstrap/agent.rs index 66e3ad20c08..0d09d7c20aa 100644 --- a/sled-agent/src/bootstrap/agent.rs +++ b/sled-agent/src/bootstrap/agent.rs @@ -183,6 +183,7 @@ impl Agent { let mut maybe_agent = self.sled_agent.lock().await; if let Some(server) = &*maybe_agent { // Server already exists, return it. + info!(&self.log, "Sled Agent already loaded"); return Ok(SledAgentResponse { id: server.id() }); } // Server does not exist, initialize it. @@ -191,6 +192,7 @@ impl Agent { .await .map_err(|e| BootstrapError::SledError(e))?; maybe_agent.replace(server); + info!(&self.log, "Sled Agent loaded; recording configuration"); // Record the subnet, so the sled agent can be automatically // initialized on the next boot. diff --git a/sled-agent/src/server.rs b/sled-agent/src/server.rs index baa28a7b7ff..ab8fc401e3f 100644 --- a/sled-agent/src/server.rs +++ b/sled-agent/src/server.rs @@ -21,6 +21,7 @@ use uuid::Uuid; pub struct Server { /// Dropshot server for the API. http_server: dropshot::HttpServer, + _nexus_notifier_handle: tokio::task::JoinHandle<()>, } impl Server { @@ -76,42 +77,48 @@ impl Server { .map_err(|error| format!("initializing server: {}", error))? .start(); - // Notify the control plane that we're up, and continue trying this - // until it succeeds. We retry with an randomized, capped exponential - // backoff. - // - // TODO-robustness if this returns a 400 error, we probably want to - // return a permanent error from the `notify_nexus` closure. - let sa_address = http_server.local_addr(); - let notify_nexus = || async { - info!( - log, - "contacting server nexus, registering sled: {}", config.id - ); - nexus_client - .cpapi_sled_agents_post( - &config.id, - &nexus_client::types::SledAgentStartupInfo { - sa_address: sa_address.to_string(), - }, - ) - .await - .map_err(BackoffError::transient) - }; - let log_notification_failure = |_, delay| { - warn!( - log, - "failed to contact nexus, will retry in {:?}", delay; - ); - }; - retry_notify( - internal_service_policy(), - notify_nexus, - log_notification_failure, - ) - .await - .expect("Expected an infinite retry loop contacting Nexus"); - Ok(Server { http_server }) + let sled_address = http_server.local_addr(); + let sled_id = config.id; + let nexus_notifier_handle = tokio::task::spawn(async move { + // Notify the control plane that we're up, and continue trying this + // until it succeeds. We retry with an randomized, capped exponential + // backoff. + // + // TODO-robustness if this returns a 400 error, we probably want to + // return a permanent error from the `notify_nexus` closure. + let notify_nexus = || async { + info!( + log, + "contacting server nexus, registering sled: {}", sled_id + ); + nexus_client + .cpapi_sled_agents_post( + &sled_id, + &nexus_client::types::SledAgentStartupInfo { + sa_address: sled_address.to_string(), + }, + ) + .await + .map_err(BackoffError::transient) + }; + let log_notification_failure = |_, delay| { + warn!( + log, + "failed to contact nexus, will retry in {:?}", delay; + ); + }; + retry_notify( + internal_service_policy(), + notify_nexus, + log_notification_failure, + ) + .await + .expect("Expected an infinite retry loop contacting Nexus"); + }); + Ok(Server { + http_server, + _nexus_notifier_handle: nexus_notifier_handle, + }) } /// Wait for the given server to shut down diff --git a/sled-agent/src/storage_manager.rs b/sled-agent/src/storage_manager.rs index ab98a688b54..a130e0e6a22 100644 --- a/sled-agent/src/storage_manager.rs +++ b/sled-agent/src/storage_manager.rs @@ -208,6 +208,7 @@ impl DatasetInfo { ) -> Result<(), Error> { match self.kind { DatasetKind::CockroachDb { .. } => { + info!(log, "start_zone: Loading CRDB manifest"); // Load the CRDB manifest. zone.run_cmd(&[ crate::illumos::zone::SVCCFG, @@ -216,6 +217,7 @@ impl DatasetInfo { ])?; // Set parameters which are passed to the CRDB binary. + info!(log, "start_zone: setting CRDB's config/listen_addr: {}", address); zone.run_cmd(&[ crate::illumos::zone::SVCCFG, "-s", @@ -223,6 +225,8 @@ impl DatasetInfo { "setprop", &format!("config/listen_addr={}", address), ])?; + + info!(log, "start_zone: setting CRDB's config/store"); zone.run_cmd(&[ crate::illumos::zone::SVCCFG, "-s", @@ -234,6 +238,7 @@ impl DatasetInfo { // // Set these addresses, use "start" instead of // "start-single-node". + info!(log, "start_zone: setting CRDB's config/join_addrs"); zone.run_cmd(&[ crate::illumos::zone::SVCCFG, "-s", @@ -244,6 +249,7 @@ impl DatasetInfo { // Refresh the manifest with the new properties we set, // so they become "effective" properties when the service is enabled. + info!(log, "start_zone: refreshing manifest"); zone.run_cmd(&[ crate::illumos::zone::SVCCFG, "-s", @@ -251,6 +257,7 @@ impl DatasetInfo { "refresh", ])?; + info!(log, "start_zone: enabling CRDB service"); zone.run_cmd(&[ crate::illumos::zone::SVCADM, "enable", @@ -259,6 +266,7 @@ impl DatasetInfo { ])?; // Await liveness of the cluster. + info!(log, "start_zone: awaiting liveness of CRDB"); let check_health = || async { let http_addr = SocketAddr::new(address.ip(), 8080); reqwest::get(format!("http://{}/health?ready=1", http_addr)) @@ -658,6 +666,7 @@ impl StorageWorker { nexus_notifications: &mut FuturesOrdered>>, request: &NewFilesystemRequest, ) -> Result<(), Error> { + info!(self.log, "add_dataset: {:?}", request); let mut pools = self.pools.lock().await; let name = ZpoolName::new(request.zpool_id); let pool = pools.get_mut(&name).ok_or_else(|| { diff --git a/smf/nexus/config.toml b/smf/nexus/config.toml index 02f663f7c94..422ee387b91 100644 --- a/smf/nexus/config.toml +++ b/smf/nexus/config.toml @@ -18,15 +18,15 @@ schemes_external = ["spoof", "session_cookie"] [database] # URL for connecting to the database -url = "postgresql://root@[fd00:1de::5]:32221/omicron?sslmode=disable" +url = "postgresql://root@[fd00:1122:3344:01::5]:32221/omicron?sslmode=disable" [dropshot_external] # IP address and TCP port on which to listen for the external API -bind_address = "[fd00:1de::7]:12220" +bind_address = "[fd00:1122:3344:01::7]:12220" [dropshot_internal] # IP address and TCP port on which to listen for the internal API -bind_address = "[fd00:1de::7]:12221" +bind_address = "[fd00:1122:3344:01::7]:12221" [log] # Show log messages of this level and more severe @@ -42,4 +42,4 @@ mode = "stderr-terminal" # Configuration for interacting with the timeseries database [timeseries_db] -address = "[fd00:1de::8]:8123" +address = "[fd00:1122:3344:01::8]:8123" diff --git a/smf/oximeter/config.toml b/smf/oximeter/config.toml index 78eb1027842..aab7938684e 100644 --- a/smf/oximeter/config.toml +++ b/smf/oximeter/config.toml @@ -2,10 +2,10 @@ id = "1da65e5b-210c-4859-a7d7-200c1e659972" # Internal address of nexus -nexus_address = "[fd00:1de::7]:12221" +nexus_address = "[fd00:1122:3344:01::7]:12221" [db] -address = "[fd00:1de::8]:8123" +address = "[fd00:1122:3344:01::8]:8123" batch_size = 1000 batch_interval = 5 # In seconds @@ -14,4 +14,4 @@ level = "debug" mode = "stderr-terminal" [dropshot] -bind_address = "[fd00:1de::6]:12223" +bind_address = "[fd00:1122:3344:01::6]:12223" diff --git a/smf/sled-agent/config-rss.toml b/smf/sled-agent/config-rss.toml index b4f0b0fbcd6..d6353b4f195 100644 --- a/smf/sled-agent/config-rss.toml +++ b/smf/sled-agent/config-rss.toml @@ -12,40 +12,45 @@ rack_subnet = "fd00:1122:3344:01::" # should allocate crucible datasets. [[request.dataset]] zpool_uuid = "d462a7f7-b628-40fe-80ff-4e4189e2d62b" -address = "[fd00:1de::9]:32345" +address = "[fd00:1122:3344:01::9]:32345" dataset_kind.type = "crucible" [[request.dataset]] zpool_uuid = "e4b4dc87-ab46-49fb-a4b4-d361ae214c03" -address = "[fd00:1de::10]:32345" +address = "[fd00:1122:3344:01::10]:32345" dataset_kind.type = "crucible" [[request.dataset]] zpool_uuid = "f4b4dc87-ab46-49fb-a4b4-d361ae214c03" -address = "[fd00:1de::11]:32345" +address = "[fd00:1122:3344:01::11]:32345" dataset_kind.type = "crucible" [[request.dataset]] zpool_uuid = "d462a7f7-b628-40fe-80ff-4e4189e2d62b" -address = "[fd00:1de::5]:32221" +address = "[fd00:1122:3344:01::5]:32221" dataset_kind.type = "cockroach_db" dataset_kind.all_addresses = [ - "[fd00:1de::5]:32221", + "[fd00:1122:3344:01::5]:32221", ] # TODO(https://github.com/oxidecomputer/omicron/issues/732): Nexus # should allocate clickhouse datasets. [[request.dataset]] zpool_uuid = "d462a7f7-b628-40fe-80ff-4e4189e2d62b" -address = "[fd00:1de::8]:8123" +address = "[fd00:1122:3344:01::8]:8123" dataset_kind.type = "clickhouse" [[request.service]] name = "nexus" -addresses = [ "[fd00:1de::7]:12220", "[fd00:1de::7]:12221" ] +addresses = [ + "[fd00:1122:3344:01::7]:12220", + "[fd00:1122:3344:01::7]:12221", +] # TODO(https://github.com/oxidecomputer/omicron/issues/732): Nexus # should allocate Oximeter services. [[request.service]] name = "oximeter" -addresses = [ "[fd00:1de::6]:12223" ] +addresses = [ + "[fd00:1122:3344:01::6]:12223", +] diff --git a/smf/sled-agent/config.toml b/smf/sled-agent/config.toml index f96d2072148..06d812bd197 100644 --- a/smf/sled-agent/config.toml +++ b/smf/sled-agent/config.toml @@ -5,7 +5,7 @@ id = "fb0f7546-4d46-40ca-9d56-cbb810684ca7" # TODO: Remove this address # Internal address of Nexus -nexus_address = "[fd00:1de::7]:12221" +nexus_address = "[fd00:1122:3344:01::7]:12221" # A file-backed zpool can be manually created with the following: # $ truncate -s 10GB testpool.vdev From 8c8bdcd34b9954b55352a475299f93e5b5188001 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 11 Apr 2022 22:28:26 -0400 Subject: [PATCH 08/19] fmt --- sled-agent/src/storage_manager.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sled-agent/src/storage_manager.rs b/sled-agent/src/storage_manager.rs index a130e0e6a22..ff09437993e 100644 --- a/sled-agent/src/storage_manager.rs +++ b/sled-agent/src/storage_manager.rs @@ -217,7 +217,11 @@ impl DatasetInfo { ])?; // Set parameters which are passed to the CRDB binary. - info!(log, "start_zone: setting CRDB's config/listen_addr: {}", address); + info!( + log, + "start_zone: setting CRDB's config/listen_addr: {}", + address + ); zone.run_cmd(&[ crate::illumos::zone::SVCCFG, "-s", From bac5d72c49cce3781cd7c821776075419cc0f223 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 11 Apr 2022 22:35:15 -0400 Subject: [PATCH 09/19] clippy --- sled-agent/src/illumos/dladm.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sled-agent/src/illumos/dladm.rs b/sled-agent/src/illumos/dladm.rs index eeb70358428..92c1ed03916 100644 --- a/sled-agent/src/illumos/dladm.rs +++ b/sled-agent/src/illumos/dladm.rs @@ -79,7 +79,7 @@ impl Dladm { // Ensure the MAC address is zero-padded, so it may be parsed as a // MacAddr. This converts segments like ":a" to ":0a". let name = name - .split(":") + .split(':') .map(|segment| format!("{:0>2}", segment)) .collect::>() .join(":"); From 4abf0114542f071de89fcd54ae897c2cac7575c6 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 12 Apr 2022 09:59:53 -0400 Subject: [PATCH 10/19] Update docs, addresses --- docs/how-to-run.adoc | 22 ++++++++++++---------- smf/nexus/config.toml | 8 ++++---- smf/oximeter/config.toml | 6 +++--- smf/sled-agent/config-rss.toml | 20 ++++++++++---------- smf/sled-agent/config.toml | 2 +- 5 files changed, 30 insertions(+), 28 deletions(-) diff --git a/docs/how-to-run.adoc b/docs/how-to-run.adoc index 11505406a89..79d93595e17 100644 --- a/docs/how-to-run.adoc +++ b/docs/how-to-run.adoc @@ -95,19 +95,21 @@ we'll assign addresses as per RFD 63 as well as incorporating DNS based service discovery. For the purposes of local development today, we specify some hardcoded IPv6 -unique local addresses in `fd00:1de::/16`: +unique local addresses in the subnet of the first Sled Agent: `fd00:1122:3344:1::/64`: [options="header"] |=================================================================================================== -| Service | Endpoint -| Sled Agent: Bootstrap | `[::]:12346` -| Sled Agent: Dropshot API | `[fd00:1de::]:12345` -| Cockroach DB | `[fd00:1de::5]:32221` -| Oximeter | `[fd00:1de::6]:12223` -| Nexus: External API | `[fd00:1de::7]:12220` -| Nexus: Internal API | `[fd00:1de::7]:12221` -| Clickhouse | `[fd00:1de::8]:8123` -| Crucible Downstairs | `[fd00:1de::9]:32345`, `[fd00:1de::10]:32345`, `[fd00:1de::11]:32345` +| Service | Endpoint +| Sled Agent: Bootstrap | Derived from MAC address of physical data link. +| Sled Agent: Dropshot API | `[fd00:1122:3344:1::1]:12345` +| Cockroach DB | `[fd00:1122:3344:1::2]:32221` +| Nexus: External API | `[fd00:1122:3344:1::3]:12220` +| Nexus: Internal API | `[fd00:1122:3344:1::3]:12221` +| Oximeter | `[fd00:1122:3344:1::4]:12223` +| Clickhouse | `[fd00:1122:3344:1::5]:8123` +| Crucible Downstairs 1 | `[fd00:1122:3344:1::6]:32345` +| Crucible Downstairs 2 | `[fd00:1122:3344:1::7]:32345` +| Crucible Downstairs 3 | `[fd00:1122:3344:1::8]:32345` |=================================================================================================== Note that Sled Agent runs in the global zone and is the one responsible for bringing up all the other diff --git a/smf/nexus/config.toml b/smf/nexus/config.toml index 422ee387b91..d4030efc9d4 100644 --- a/smf/nexus/config.toml +++ b/smf/nexus/config.toml @@ -18,15 +18,15 @@ schemes_external = ["spoof", "session_cookie"] [database] # URL for connecting to the database -url = "postgresql://root@[fd00:1122:3344:01::5]:32221/omicron?sslmode=disable" +url = "postgresql://root@[fd00:1122:3344:1::2]:32221/omicron?sslmode=disable" [dropshot_external] # IP address and TCP port on which to listen for the external API -bind_address = "[fd00:1122:3344:01::7]:12220" +bind_address = "[fd00:1122:3344:1::3]:12220" [dropshot_internal] # IP address and TCP port on which to listen for the internal API -bind_address = "[fd00:1122:3344:01::7]:12221" +bind_address = "[fd00:1122:3344:1::3]:12221" [log] # Show log messages of this level and more severe @@ -42,4 +42,4 @@ mode = "stderr-terminal" # Configuration for interacting with the timeseries database [timeseries_db] -address = "[fd00:1122:3344:01::8]:8123" +address = "[fd00:1122:3344:1::5]:8123" diff --git a/smf/oximeter/config.toml b/smf/oximeter/config.toml index aab7938684e..a4812d01fd1 100644 --- a/smf/oximeter/config.toml +++ b/smf/oximeter/config.toml @@ -2,10 +2,10 @@ id = "1da65e5b-210c-4859-a7d7-200c1e659972" # Internal address of nexus -nexus_address = "[fd00:1122:3344:01::7]:12221" +nexus_address = "[fd00:1122:3344:1::3]:12221" [db] -address = "[fd00:1122:3344:01::8]:8123" +address = "[fd00:1122:3344:1::5]:8123" batch_size = 1000 batch_interval = 5 # In seconds @@ -14,4 +14,4 @@ level = "debug" mode = "stderr-terminal" [dropshot] -bind_address = "[fd00:1122:3344:01::6]:12223" +bind_address = "[fd00:1122:3344:1::4]:12223" diff --git a/smf/sled-agent/config-rss.toml b/smf/sled-agent/config-rss.toml index d6353b4f195..ad8993c1ae3 100644 --- a/smf/sled-agent/config-rss.toml +++ b/smf/sled-agent/config-rss.toml @@ -4,7 +4,7 @@ # Also implies the /48 AZ subnet. # |............| <- This /48 is the AZ Subnet # |...............| <- This /56 is the Rack Subnet -rack_subnet = "fd00:1122:3344:01::" +rack_subnet = "fd00:1122:3344:1::" [[request]] @@ -12,39 +12,39 @@ rack_subnet = "fd00:1122:3344:01::" # should allocate crucible datasets. [[request.dataset]] zpool_uuid = "d462a7f7-b628-40fe-80ff-4e4189e2d62b" -address = "[fd00:1122:3344:01::9]:32345" +address = "[fd00:1122:3344:1::6]:32345" dataset_kind.type = "crucible" [[request.dataset]] zpool_uuid = "e4b4dc87-ab46-49fb-a4b4-d361ae214c03" -address = "[fd00:1122:3344:01::10]:32345" +address = "[fd00:1122:3344:1::7]:32345" dataset_kind.type = "crucible" [[request.dataset]] zpool_uuid = "f4b4dc87-ab46-49fb-a4b4-d361ae214c03" -address = "[fd00:1122:3344:01::11]:32345" +address = "[fd00:1122:3344:1::8]:32345" dataset_kind.type = "crucible" [[request.dataset]] zpool_uuid = "d462a7f7-b628-40fe-80ff-4e4189e2d62b" -address = "[fd00:1122:3344:01::5]:32221" +address = "[fd00:1122:3344:1::2]:32221" dataset_kind.type = "cockroach_db" dataset_kind.all_addresses = [ - "[fd00:1122:3344:01::5]:32221", + "[fd00:1122:3344:1::2]:32221", ] # TODO(https://github.com/oxidecomputer/omicron/issues/732): Nexus # should allocate clickhouse datasets. [[request.dataset]] zpool_uuid = "d462a7f7-b628-40fe-80ff-4e4189e2d62b" -address = "[fd00:1122:3344:01::8]:8123" +address = "[fd00:1122:3344:1::5]:8123" dataset_kind.type = "clickhouse" [[request.service]] name = "nexus" addresses = [ - "[fd00:1122:3344:01::7]:12220", - "[fd00:1122:3344:01::7]:12221", + "[fd00:1122:3344:1::3]:12220", + "[fd00:1122:3344:1::3]:12221", ] # TODO(https://github.com/oxidecomputer/omicron/issues/732): Nexus @@ -52,5 +52,5 @@ addresses = [ [[request.service]] name = "oximeter" addresses = [ - "[fd00:1122:3344:01::6]:12223", + "[fd00:1122:3344:1::4]:12223", ] diff --git a/smf/sled-agent/config.toml b/smf/sled-agent/config.toml index 06d812bd197..6dfe87fe9bf 100644 --- a/smf/sled-agent/config.toml +++ b/smf/sled-agent/config.toml @@ -5,7 +5,7 @@ id = "fb0f7546-4d46-40ca-9d56-cbb810684ca7" # TODO: Remove this address # Internal address of Nexus -nexus_address = "[fd00:1122:3344:01::7]:12221" +nexus_address = "[fd00:1122:3344:01::3]:12221" # A file-backed zpool can be manually created with the following: # $ truncate -s 10GB testpool.vdev From 60b9e48561976f60f7b3bf0a02c01e9ba25ecae5 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 13 Apr 2022 14:28:17 -0400 Subject: [PATCH 11/19] Make RSS fault-tolerant --- openapi/bootstrap-agent.json | 10 +- sled-agent/src/bootstrap/agent.rs | 2 - sled-agent/src/bootstrap/mod.rs | 2 +- sled-agent/src/bootstrap/params.rs | 6 +- sled-agent/src/rack_setup/config.rs | 2 - sled-agent/src/rack_setup/service.rs | 400 +++++++++++++++++++-------- 6 files changed, 284 insertions(+), 138 deletions(-) diff --git a/openapi/bootstrap-agent.json b/openapi/bootstrap-agent.json index dec4c7e6e50..7cb4833dcfa 100644 --- a/openapi/bootstrap-agent.json +++ b/openapi/bootstrap-agent.json @@ -156,22 +156,16 @@ "type": "object", "properties": { "ip": { - "description": "Portion of the IP space to be managed by the Sled Agent.", + "description": "ID of the Sled to be initialized. Portion of the IP space to be managed by the Sled Agent.", "allOf": [ { "$ref": "#/components/schemas/Ipv6Net" } ] - }, - "uuid": { - "description": "ID of the Sled to be initialized.", - "type": "string", - "format": "uuid" } }, "required": [ - "ip", - "uuid" + "ip" ] }, "SledAgentResponse": { diff --git a/sled-agent/src/bootstrap/agent.rs b/sled-agent/src/bootstrap/agent.rs index 0d09d7c20aa..60fddf2c4b2 100644 --- a/sled-agent/src/bootstrap/agent.rs +++ b/sled-agent/src/bootstrap/agent.rs @@ -178,8 +178,6 @@ impl Agent { request: SledAgentRequest, ) -> Result { info!(&self.log, "Loading Sled Agent: {:?}", request); - // TODO: actually use request.uuid - let mut maybe_agent = self.sled_agent.lock().await; if let Some(server) = &*maybe_agent { // Server already exists, return it. diff --git a/sled-agent/src/bootstrap/mod.rs b/sled-agent/src/bootstrap/mod.rs index fd1c317d338..da4df934d77 100644 --- a/sled-agent/src/bootstrap/mod.rs +++ b/sled-agent/src/bootstrap/mod.rs @@ -10,7 +10,7 @@ pub mod config; pub mod discovery; mod http_entrypoints; pub mod multicast; -mod params; +pub(crate) mod params; pub mod server; mod spdm; pub mod trust_quorum; diff --git a/sled-agent/src/bootstrap/params.rs b/sled-agent/src/bootstrap/params.rs index d439bdadba4..7386521728d 100644 --- a/sled-agent/src/bootstrap/params.rs +++ b/sled-agent/src/bootstrap/params.rs @@ -7,7 +7,6 @@ use omicron_common::api::external::Ipv6Net; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; -use uuid::Uuid; /// Identity signed by local RoT and Oxide certificate chain. #[derive(Serialize, Deserialize, JsonSchema)] @@ -17,11 +16,8 @@ pub struct ShareRequest { } /// Configuration information for launching a Sled Agent. -#[derive(Debug, Serialize, Deserialize, JsonSchema)] +#[derive(Clone, Debug, Serialize, Deserialize, JsonSchema, PartialEq)] pub struct SledAgentRequest { - /// ID of the Sled to be initialized. - pub uuid: Uuid, - /// Portion of the IP space to be managed by the Sled Agent. pub ip: Ipv6Net, } diff --git a/sled-agent/src/rack_setup/config.rs b/sled-agent/src/rack_setup/config.rs index e036ace1b8f..64981c78cdf 100644 --- a/sled-agent/src/rack_setup/config.rs +++ b/sled-agent/src/rack_setup/config.rs @@ -60,8 +60,6 @@ impl SetupServiceConfig { let mut rack_network = self.rack_subnet().network().octets(); // To set bits distinguishing the /64 from the /56, we modify the 7th octet. - // - // 0001:0203:0405:0607:: rack_network[7] = index; ipnetwork::Ipv6Network::new(std::net::Ipv6Addr::from(rack_network), 64) .unwrap() diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index 9213bb9b686..99aa5f2f95b 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -4,18 +4,20 @@ //! Rack Setup Service implementation -use super::config::SetupServiceConfig as Config; -use crate::bootstrap::discovery::PeerMonitorObserver; +use super::config::{SetupServiceConfig as Config, SledRequest}; use crate::bootstrap::{ client as bootstrap_agent_client, config::BOOTSTRAP_AGENT_PORT, + discovery::PeerMonitorObserver, params::SledAgentRequest, }; use crate::config::get_sled_address; use omicron_common::api::external::Ipv6Net; use omicron_common::backoff::{ internal_service_policy, retry_notify, BackoffError, }; +use serde::{Deserialize, Serialize}; use slog::Logger; -use std::net::{SocketAddr, SocketAddrV6}; +use std::collections::{HashMap, HashSet}; +use std::net::{Ipv6Addr, SocketAddr, SocketAddrV6}; use thiserror::Error; use tokio::sync::Mutex; @@ -44,6 +46,13 @@ pub enum SetupServiceError { Configuration, } +// The workload / information allocated to a single sled. +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq)] +struct SledAllocation { + initialization_request: SledAgentRequest, + services_request: SledRequest, +} + /// The interface to the Rack Setup Service. pub struct Service { handle: tokio::task::JoinHandle>, @@ -63,8 +72,13 @@ impl Service { peer_monitor: PeerMonitorObserver, ) -> Self { let handle = tokio::task::spawn(async move { - let svc = ServiceInner::new(log, peer_monitor); - svc.inject_rack_setup_requests(&config).await + let svc = ServiceInner::new(log.clone(), peer_monitor); + if let Err(e) = svc.inject_rack_setup_requests(&config).await { + warn!(log, "RSS injection failed: {}", e); + Err(e) + } else { + Ok(()) + } }); Service { handle } @@ -76,6 +90,34 @@ impl Service { } } +fn rss_plan_path() -> std::path::PathBuf { + std::path::Path::new(omicron_common::OMICRON_CONFIG_PATH) + .join("rss-plan.toml") +} + +fn rss_completed_plan_path() -> std::path::PathBuf { + std::path::Path::new(omicron_common::OMICRON_CONFIG_PATH) + .join("rss-plan-completed.toml") +} + +// Describes the options when awaiting for peers. +enum PeerExpectation { + // Await a set of peers that matches this group of IPv6 addresses exactly. + // + // TODO: We currently don't deal with the case where: + // + // - RSS boots, sees some sleds, comes up with a plan. + // - RSS reboots, sees a *different* set of sleds, and needs + // to adjust the plan. + // + // This case is fairly tricky because some sleds may have + // already received requests to initialize - modifying the + // allocated subnets would be non-trivial. + Precise(HashSet), + // Await any peers, as long as there are at least enough to make a new plan. + Arbitrary(usize), +} + /// The implementation of the Rack Setup Service. struct ServiceInner { log: Logger, @@ -90,7 +132,7 @@ impl ServiceInner { async fn initialize_sled_agent( &self, bootstrap_addr: SocketAddrV6, - subnet: ipnetwork::Ipv6Network, + request: &SledAgentRequest, ) -> Result<(), SetupServiceError> { let dur = std::time::Duration::from_secs(60); @@ -99,13 +141,7 @@ impl ServiceInner { .timeout(dur) .build()?; - // TODO: Can we just use a type that avoids the need for this - // conversion? - let url = format!( - "http://[{}]:{}", - bootstrap_addr.ip(), - BOOTSTRAP_AGENT_PORT, - ); + let url = format!("http://{}", bootstrap_addr); info!(self.log, "Sending request to peer agent: {}", url); let client = bootstrap_agent_client::Client::new_with_client( &url, @@ -116,9 +152,8 @@ impl ServiceInner { let sled_agent_initialize = || async { client .start_sled(&bootstrap_agent_client::types::SledAgentRequest { - uuid: uuid::Uuid::new_v4(), // TODO: not rando ip: bootstrap_agent_client::types::Ipv6Net( - subnet.to_string(), + request.ip.0.to_string(), ), }) .await @@ -237,137 +272,255 @@ impl ServiceInner { Ok(()) } + async fn load_plan( + &self, + ) -> Result>, SetupServiceError> + { + // If we already created a plan for this RSS to allocate + // subnets/requests to sleds, re-use that existing plan. + let rss_plan_path = rss_plan_path(); + if rss_plan_path.exists() { + info!(self.log, "RSS plan already created, loading from file"); + + let plan: std::collections::HashMap = + toml::from_str( + &tokio::fs::read_to_string(&rss_plan_path).await?, + )?; + Ok(Some(plan)) + } else { + Ok(None) + } + } + + async fn create_plan( + &self, + config: &Config, + addrs: impl IntoIterator, + ) -> Result, SetupServiceError> { + let addrs = addrs.into_iter().enumerate(); + + // TODO: The use of "zip" here means that if we have more addrs than + // requests, we won't initialize some of them. Maybe that's okay? + // Maybe that's the responsibility of Nexus? + let requests_and_sleds = config.requests.iter().zip(addrs); + + let allocations = requests_and_sleds.map(|(request, sled)| { + let (idx, bootstrap_addr) = sled; + info!( + self.log, + "Creating plan for the sled at {:?}", bootstrap_addr + ); + let bootstrap_addr = + SocketAddrV6::new(bootstrap_addr, BOOTSTRAP_AGENT_PORT, 0, 0); + let sled_subnet_index = + u8::try_from(idx + 1).expect("Too many peers!"); + let subnet = config.sled_subnet(sled_subnet_index); + + ( + bootstrap_addr, + SledAllocation { + initialization_request: SledAgentRequest { + ip: Ipv6Net(subnet), + }, + services_request: request.clone(), + }, + ) + }); + + info!(self.log, "Serializing plan"); + + let mut plan = std::collections::HashMap::new(); + for (addr, allocation) in allocations { + plan.insert(addr, allocation); + } + + // Once we've constructed a plan, write it down to durable storage. + let serialized_plan = toml::Value::try_from(&plan) + .expect("Cannot serialize configuration"); + let plan_str = toml::to_string(&serialized_plan) + .expect("Cannot turn config to string"); + + info!(self.log, "Plan serialized as: {}", plan_str); + tokio::fs::write(&rss_plan_path(), plan_str).await?; + info!(self.log, "Plan written to storage"); + + Ok(plan) + } + + // Waits for sufficient neighbors to exist so the initial set of requests + // can be send out. + async fn wait_for_peers( + &self, + expectation: PeerExpectation, + ) -> Result, SetupServiceError> { + let mut peer_monitor = self.peer_monitor.lock().await; + let our_address = peer_monitor.our_address(); + + // TODO: We could likely optimize this, avoid re-making the sets + loop { + { + let peers = peer_monitor.peer_addrs().await; + let all_addrs = peers.iter().chain([&our_address].into_iter()); + match expectation { + PeerExpectation::Precise(ref expected) => { + let addr_set = all_addrs + .map(|a| *a) + .collect::>(); + if addr_set.is_superset(expected) { + return Ok(addr_set + .into_iter() + .collect::>()); + } + info!(self.log, "Waiting for a precise set of peers; not found yet."); + } + PeerExpectation::Arbitrary(count) => { + if peers.len() + 1 >= count { + return Ok(all_addrs + .map(|a| *a) + .collect::>()); + } + info!( + self.log, + "Waiting for {} peers (currently have {})", + count, + peers.len() + 1 + ); + } + } + } + + info!(self.log, "Waiting for more peers"); + peer_monitor.recv().await; + } + } + // In lieu of having an operator send requests to all sleds via an // initialization service, the sled-agent configuration may allow for the // automated injection of setup requests from a sled. + // + // This method has a few distinct phases, identified by files in durable + // storage: + // + // 1. ALLOCATION PLAN CREATION. When the RSS starts up for the first time, + // it creates an allocation plan to provision subnets and services + // to an initial set of sleds. + // + // This plan is stored at "rss_plan_path()". + // + // 2. ALLOCATION PLAN EXECUTION. The RSS then carries out this plan, making + // requests to the sleds enumerated within the "allocation plan". + // + // 3. MARKING SETUP COMPLETE. Once the RSS has successfully initialized the + // rack, the "rss_plan_path()" file is renamed to + // "rss_completed_plan_path()". This indicates that the plan executed + // successfully, and no work remains. async fn inject_rack_setup_requests( &self, config: &Config, ) -> Result<(), SetupServiceError> { info!(self.log, "Injecting RSS configuration: {:#?}", config); - let serialized_config = toml::Value::try_from(&config) - .expect("Cannot serialize configuration"); - let config_str = toml::to_string(&serialized_config) - .expect("Cannot turn config to string"); + // We expect this directory to exist - ensure that it does, before any + // subsequent operations which may write configs here. + tokio::fs::create_dir_all(omicron_common::OMICRON_CONFIG_PATH).await?; - // First, check if this request has previously been made. - // - // Normally, the rack setup service is run with a human-in-the-loop, - // but with this automated injection, we need a way to determine the - // (destructive) initialization has occurred. + // Check if a previous RSS plan has completed successfully. // - // We do this by storing the configuration at "rss_config_path" - // after successfully performing initialization. - let rss_config_path = - std::path::Path::new(omicron_common::OMICRON_CONFIG_PATH) - .join("config-rss.toml"); - if rss_config_path.exists() { + // If it has, the system should be up-and-running. + let rss_completed_plan_path = rss_completed_plan_path(); + if rss_completed_plan_path.exists() { + // TODO(https://github.com/oxidecomputer/omicron/issues/724): If the + // running configuration doesn't match Config, we could try to + // update things. info!( self.log, - "RSS configuration already exists at {}", - rss_config_path.to_string_lossy() - ); - let old_config: Config = toml::from_str( - &tokio::fs::read_to_string(&rss_config_path).await?, - )?; - if &old_config == config { - info!( - self.log, - "RSS config already applied from: {}", - rss_config_path.to_string_lossy() - ); - return Ok(()); - } - - // TODO(https://github.com/oxidecomputer/omicron/issues/724): - // We could potentially handle this case by deleting all - // datasets (in preparation for applying the new - // configuration), but at the moment it's an error. - warn!( - self.log, - "Rack Setup Service Config ({}) was already applied, but has changed. - This means that you may have datasets set up on this sled, but they - may not match the ones requested by the supplied configuration.\n - To re-initialize this sled, re-run 'omicron-package install'.", - rss_config_path.to_string_lossy() + "RSS configuration looks like it has already been applied", ); - return Err(SetupServiceError::Configuration); + return Ok(()); } else { - info!( - self.log, - "No RSS configuration found at {}", - rss_config_path.to_string_lossy() - ); + info!(self.log, "RSS configuration has not been fully applied yet",); } - // Wait until we see enough neighbors to be able to set the - // initial set of requests. - let mut peer_monitor = self.peer_monitor.lock().await; - let our_address = peer_monitor.our_address(); - let mut addrs = peer_monitor.peer_addrs().await; - while addrs.len() + 1 < config.requests.len() { - info!( - self.log, - "# of peers ({}) < # of requests ({}), waiting for more to join...", - addrs.len(), config.requests.len() - ); - peer_monitor.recv().await; - addrs = peer_monitor.peer_addrs().await; - } - info!(self.log, "Enough peers to start configuring rack: {:?}", addrs); + // Wait for either: + // - All the peers to re-load an old plan (if one exists) + // - Enough peers to create a new plan (if one does not exist) + let maybe_plan = self.load_plan().await?; + let expectation = if let Some(plan) = &maybe_plan { + PeerExpectation::Precise(plan.keys().map(|a| *a.ip()).collect()) + } else { + PeerExpectation::Arbitrary(config.requests.len()) + }; + let addrs = self.wait_for_peers(expectation).await?; + info!(self.log, "Enough peers exist to enact RSS plan"); - let addrs = - addrs.into_iter().chain([&our_address].into_iter()).enumerate(); + // If we created a plan, reuse it. Otherwise, create a new plan. + let plan = if let Some(plan) = maybe_plan { + info!(self.log, "Re-using existing allocation plan"); + plan + } else { + info!(self.log, "Creating new allocation plan"); + self.create_plan(config, addrs).await? + }; - // XXX Questions to consider: - // - What if a sled comes online *right after* this setup? How does - // it get a /64? - // - What is the RSS fails *after* telling a BA to start a SA? - // How can it reconcile that lost address? The current scheme - // is assigning `/64`s based on the order peers have been seen. + // NOTE: This is a "point-of-no-return" -- before sending any requests + // to neighboring sleds, ensure that we've recorded our plan to durable + // storage. This way, if the RSS power-cycles, it can idempotently + // execute the same allocation plan. // Issue the dataset initialization requests to all sleds. - let requests = - futures::future::join_all(config.requests.iter().zip(addrs).map( - |(request, sled)| async move { - info!(self.log, "observing request: {:#?}", request); - let (idx, bootstrap_addr) = sled; - let bootstrap_addr = SocketAddrV6::new( - *bootstrap_addr, - BOOTSTRAP_AGENT_PORT, - 0, - 0, - ); - let sled_subnet_index = - u8::try_from(idx + 1).expect("Too many peers!"); - - // First, connect to the Bootstrap Agent and tell it to - // initialize the Sled Agent with the specified subnet. - let subnet = config.sled_subnet(sled_subnet_index); - self.initialize_sled_agent(bootstrap_addr, subnet).await?; - - // Next, initialize any datasets on sleds that need it. - let sled_address = - SocketAddr::V6(get_sled_address(Ipv6Net(subnet))); - self.initialize_datasets(sled_address, &request.datasets) - .await?; - Ok((request, sled_address)) - }, - )) - .await - .into_iter() - .collect::, SetupServiceError>>()?; + futures::future::join_all(plan.iter().map( + |(bootstrap_addr, allocation)| async move { + info!( + self.log, + "Sending request: {:#?}", allocation.initialization_request + ); + + // First, connect to the Bootstrap Agent and tell it to + // initialize the Sled Agent with the specified subnet. + self.initialize_sled_agent( + *bootstrap_addr, + &allocation.initialization_request, + ) + .await?; + info!( + self.log, + "Initialized sled agent on sled with bootstrap address: {}", + bootstrap_addr + ); + + // Next, initialize any datasets on sleds that need it. + let sled_address = SocketAddr::V6(get_sled_address( + allocation.initialization_request.ip, + )); + self.initialize_datasets( + sled_address, + &allocation.services_request.datasets, + ) + .await?; + Ok(()) + }, + )) + .await + .into_iter() + .collect::>()?; + + info!(self.log, "Finished setting up agents and datasets"); // Issue service initialization requests. // // Note that this must happen *after* the dataset initialization, // to ensure that CockroachDB has been initialized before Nexus // starts. - futures::future::join_all(requests.iter().map( - |(request, sled_address)| async move { - self.initialize_services(*sled_address, &request.services) - .await?; + futures::future::join_all(plan.iter().map( + |(_, allocation)| async move { + let sled_address = SocketAddr::V6(get_sled_address( + allocation.initialization_request.ip, + )); + self.initialize_services( + sled_address, + &allocation.services_request.services, + ) + .await?; Ok(()) }, )) @@ -375,9 +528,16 @@ impl ServiceInner { .into_iter() .collect::, SetupServiceError>>()?; + info!(self.log, "Finished setting up services"); + // Finally, make sure the configuration is saved so we don't inject // the requests on the next iteration. - tokio::fs::write(rss_config_path, config_str).await?; + tokio::fs::rename(rss_plan_path(), rss_completed_plan_path).await?; + + // TODO Questions to consider: + // - What if a sled comes online *right after* this setup? How does + // it get a /64? + Ok(()) } } From e89f444044098c029d8a8b69936545dbdd04c35c Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 14 Apr 2022 14:57:21 -0400 Subject: [PATCH 12/19] Simplify peer monitor --- sled-agent/src/bootstrap/agent.rs | 14 ++++- sled-agent/src/bootstrap/discovery.rs | 91 +++------------------------ sled-agent/src/rack_setup/service.rs | 29 +++++---- sled-agent/src/server.rs | 4 ++ 4 files changed, 41 insertions(+), 97 deletions(-) diff --git a/sled-agent/src/bootstrap/agent.rs b/sled-agent/src/bootstrap/agent.rs index 60fddf2c4b2..897393ed915 100644 --- a/sled-agent/src/bootstrap/agent.rs +++ b/sled-agent/src/bootstrap/agent.rs @@ -178,14 +178,26 @@ impl Agent { request: SledAgentRequest, ) -> Result { info!(&self.log, "Loading Sled Agent: {:?}", request); + + let sled_address = crate::config::get_sled_address(request.ip); + let mut maybe_agent = self.sled_agent.lock().await; if let Some(server) = &*maybe_agent { // Server already exists, return it. info!(&self.log, "Sled Agent already loaded"); + + if &server.address().ip() != sled_address.ip() { + let err_str = format!( + "Sled Agent already running on address {}, but {} was requested", + server.address().ip(), + sled_address.ip(), + ); + return Err(BootstrapError::SledError(err_str)); + } + return Ok(SledAgentResponse { id: server.id() }); } // Server does not exist, initialize it. - let sled_address = crate::config::get_sled_address(request.ip); let server = SledServer::start(&self.sled_config, sled_address) .await .map_err(|e| BootstrapError::SledError(e))?; diff --git a/sled-agent/src/bootstrap/discovery.rs b/sled-agent/src/bootstrap/discovery.rs index 4cb15d75196..384b8fd027e 100644 --- a/sled-agent/src/bootstrap/discovery.rs +++ b/sled-agent/src/bootstrap/discovery.rs @@ -131,22 +131,10 @@ impl PeerMonitor { /// Returns a [`PeerMonitorObserver`] which can be used to view the results /// of monitoring for peers. pub async fn observer(&self) -> PeerMonitorObserver { - // Subscribe for notifications of new sleds right away, so - // we won't miss any notifications. - let receiver = self.notification_sender.subscribe(); - - // Next, clone the exisitng set of sleds. - // - // It's possible that we get a notification for a sled which - // exists in this set, but we handle that in - // [`PeerMonitorObserver::recv`] to avoid surfacing it to a client. - let sleds = self.sleds.lock().await.clone(); - PeerMonitorObserver { our_address: self.our_address, actual_sleds: self.sleds.clone(), - observed_sleds: sleds, - receiver, + sender: self.notification_sender.clone(), } } } @@ -160,11 +148,7 @@ pub struct PeerMonitorObserver { // This is only used to re-synchronize our set of sleds // if we get out-of-sync due to long notification queues. actual_sleds: Arc>>, - // A local copy of the set of sleds. This lets observers - // access + iterate over the set of sleds directly, - // without any possibility of blocking the actual monitoring task. - observed_sleds: HashSet, - receiver: broadcast::Receiver, + sender: broadcast::Sender, } impl PeerMonitorObserver { @@ -173,68 +157,13 @@ impl PeerMonitorObserver { self.our_address } - /// Returns the addresses of all connected sleds, excluding - /// our own. - /// - /// This returns the most "up-to-date" view of peers, but a new - /// peer may be added immediately after this function returns. - /// - /// To monitor for changes, a call to [`Self::recv`] - /// can be made, to observe changes beyond an initial call to - /// [`Self::peer_addrs`]. - pub async fn peer_addrs(&mut self) -> &HashSet { - // First, drain the incoming queue of sled updates. - loop { - match self.receiver.try_recv() { - Ok(new_addr) => { - self.observed_sleds.insert(new_addr); - } - Err(broadcast::error::TryRecvError::Empty) => break, - Err(broadcast::error::TryRecvError::Closed) => { - panic!("Remote closed") - } - Err(broadcast::error::TryRecvError::Lagged(_)) => { - self.observed_sleds = - self.actual_sleds.lock().await.clone(); - } - } - } - while let Ok(new_addr) = self.receiver.try_recv() { - self.observed_sleds.insert(new_addr); - } - - // Next, return the most up-to-date set of sleds. - // - // Note that this set may change immediately after `peer_addrs()` returns, - // but a caller can see exactly what sleds were added by calling - // `recv()`. - &self.observed_sleds - } - - /// Returns information about a new connected sled. - /// - /// Note that this does not provide the "initial set" of connected - /// sleds - to access that information, call [`Self::peer_addrs`]. - /// - /// Returns [`Option::None`] if the notification queue overflowed, - /// and we needed to re-synchronize the set of sleds. - pub async fn recv(&mut self) -> Option { - loop { - match self.receiver.recv().await { - Ok(new_addr) => { - if self.observed_sleds.insert(new_addr) { - return Some(new_addr); - } - } - Err(broadcast::error::RecvError::Closed) => { - panic!("Remote closed") - } - Err(broadcast::error::RecvError::Lagged(_)) => { - self.observed_sleds = - self.actual_sleds.lock().await.clone(); - return None; - } - } - } + /// Returns the current set of sleds and a receiver to hear about + /// new ones. + pub async fn subscribe( + &mut self, + ) -> (HashSet, broadcast::Receiver) { + let sleds = self.actual_sleds.lock().await; + let receiver = self.sender.subscribe(); + (sleds.clone(), receiver) } } diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index 99aa5f2f95b..8aeceb526d2 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -39,6 +39,9 @@ pub enum SetupServiceError { #[error("Cannot deserialize TOML file")] Toml(#[from] toml::de::Error), + #[error("Failed to monitor for peers: {0}")] + PeerMonitor(#[from] tokio::sync::broadcast::error::RecvError), + #[error(transparent)] Http(#[from] reqwest::Error), @@ -354,43 +357,39 @@ impl ServiceInner { expectation: PeerExpectation, ) -> Result, SetupServiceError> { let mut peer_monitor = self.peer_monitor.lock().await; - let our_address = peer_monitor.our_address(); + let (mut all_addrs, mut peer_rx) = peer_monitor.subscribe().await; + all_addrs.insert(peer_monitor.our_address()); - // TODO: We could likely optimize this, avoid re-making the sets loop { { - let peers = peer_monitor.peer_addrs().await; - let all_addrs = peers.iter().chain([&our_address].into_iter()); match expectation { PeerExpectation::Precise(ref expected) => { - let addr_set = all_addrs - .map(|a| *a) - .collect::>(); - if addr_set.is_superset(expected) { - return Ok(addr_set + if all_addrs.is_superset(expected) { + return Ok(all_addrs .into_iter() .collect::>()); } info!(self.log, "Waiting for a precise set of peers; not found yet."); } - PeerExpectation::Arbitrary(count) => { - if peers.len() + 1 >= count { + PeerExpectation::Arbitrary(wanted_peer_count) => { + if all_addrs.len() >= wanted_peer_count { return Ok(all_addrs - .map(|a| *a) + .into_iter() .collect::>()); } info!( self.log, "Waiting for {} peers (currently have {})", - count, - peers.len() + 1 + wanted_peer_count, + all_addrs.len(), ); } } } info!(self.log, "Waiting for more peers"); - peer_monitor.recv().await; + let new_peer = peer_rx.recv().await?; + all_addrs.insert(new_peer); } } diff --git a/sled-agent/src/server.rs b/sled-agent/src/server.rs index ab8fc401e3f..00c141a0358 100644 --- a/sled-agent/src/server.rs +++ b/sled-agent/src/server.rs @@ -25,6 +25,10 @@ pub struct Server { } impl Server { + pub fn address(&self) -> SocketAddr { + self.http_server.local_addr() + } + pub fn id(&self) -> Uuid { self.http_server.app_private().id() } From 05ef31be827a3d646c83f69505c5f80349747131 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 14 Apr 2022 16:18:03 -0400 Subject: [PATCH 13/19] openapi --- openapi/bootstrap-agent.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openapi/bootstrap-agent.json b/openapi/bootstrap-agent.json index 7cb4833dcfa..99e35d38727 100644 --- a/openapi/bootstrap-agent.json +++ b/openapi/bootstrap-agent.json @@ -156,7 +156,7 @@ "type": "object", "properties": { "ip": { - "description": "ID of the Sled to be initialized. Portion of the IP space to be managed by the Sled Agent.", + "description": "Portion of the IP space to be managed by the Sled Agent.", "allOf": [ { "$ref": "#/components/schemas/Ipv6Net" From 55ab2365f705763e1427cdd5042278c3981aa0fc Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 18 Apr 2022 15:00:20 -0400 Subject: [PATCH 14/19] better subnet masking, tests, deal with IP mismatch --- sled-agent/src/illumos/addrobj.rs | 8 ++-- sled-agent/src/illumos/zone.rs | 37 ++++++++++++--- sled-agent/src/rack_setup/config.rs | 73 +++++++++++++++++++++++++---- 3 files changed, 100 insertions(+), 18 deletions(-) diff --git a/sled-agent/src/illumos/addrobj.rs b/sled-agent/src/illumos/addrobj.rs index 46ac2a5933c..80f41fd9010 100644 --- a/sled-agent/src/illumos/addrobj.rs +++ b/sled-agent/src/illumos/addrobj.rs @@ -13,7 +13,7 @@ /// ^ ^ /// | | AddrObject name /// | Interface name -#[derive(Debug, PartialEq)] +#[derive(Debug, PartialEq, Clone)] pub struct AddrObject { interface: String, name: String, @@ -50,8 +50,8 @@ impl AddrObject { } } -impl ToString for AddrObject { - fn to_string(&self) -> String { - format!("{}/{}", self.interface, self.name) +impl std::fmt::Display for AddrObject { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "{}/{}", self.interface, self.name) } } diff --git a/sled-agent/src/illumos/zone.rs b/sled-agent/src/illumos/zone.rs index bec47e1b818..c667f851de4 100644 --- a/sled-agent/src/illumos/zone.rs +++ b/sled-agent/src/illumos/zone.rs @@ -66,9 +66,6 @@ pub enum Error { #[error("Error accessing filesystem: {0}")] Filesystem(std::io::Error), - #[error("Unexpected IP address: {0}")] - Ip(IpNetwork), - #[error("Value not found")] NotFound, } @@ -268,7 +265,10 @@ impl Zones { .ok_or(Error::NotFound) } - /// Gets the address if one exists, creates one if one does not exist. + /// Ensures that an IP address on an interface matches the requested value. + /// + /// - If the address exists, ensure it has the desired value. + /// - If the address does not exist, create it. /// /// This address may be optionally within a zone `zone`. /// If `None` is supplied, the address is queried from the Global Zone. @@ -281,8 +281,13 @@ impl Zones { match Self::get_address(zone, addrobj) { Ok(addr) => { if let AddressRequest::Static(expected_addr) = addrtype { + // If the address is static, we need to validate that it + // matches the value we asked for. if addr != expected_addr { - return Err(Error::Ip(addr)); + // If the address doesn't match, try removing the old + // value before using the new one. + Self::delete_address(zone, addrobj)?; + return Self::create_address(zone, addrobj, addrtype); } } Ok(addr) @@ -357,7 +362,6 @@ impl Zones { addrobj: &AddrObject, addrtype: AddressRequest, ) -> Result<(), Error> { - // No link-local address was found, attempt to make one. let mut command = std::process::Command::new(PFEXEC); let mut args = vec![]; if let Some(zone) = zone { @@ -388,6 +392,27 @@ impl Zones { Ok(()) } + #[allow(clippy::needless_lifetimes)] + fn delete_address<'a>( + zone: Option<&'a str>, + addrobj: &AddrObject, + ) -> Result<(), Error> { + let mut command = std::process::Command::new(PFEXEC); + let mut args = vec![]; + if let Some(zone) = zone { + args.push(ZLOGIN.to_string()); + args.push(zone.to_string()); + }; + + args.push(IPADM.to_string()); + args.push("delete-addr".to_string()); + args.push(addrobj.to_string()); + + let cmd = command.args(args); + execute(cmd)?; + Ok(()) + } + // Ensures a link local IPv6 exists for the object. // // This is necessary for allocating IPv6 addresses on illumos. diff --git a/sled-agent/src/rack_setup/config.rs b/sled-agent/src/rack_setup/config.rs index 64981c78cdf..130acbab1e9 100644 --- a/sled-agent/src/rack_setup/config.rs +++ b/sled-agent/src/rack_setup/config.rs @@ -6,8 +6,10 @@ use crate::config::ConfigError; use crate::params::{DatasetEnsureBody, ServiceRequest}; +use ipnetwork::Ipv6Network; use serde::Deserialize; use serde::Serialize; +use std::net::Ipv6Addr; use std::path::Path; /// Configuration for the "rack setup service", which is controlled during @@ -22,7 +24,7 @@ use std::path::Path; /// can act as a stand-in initialization service. #[derive(Clone, Debug, Deserialize, Serialize, PartialEq)] pub struct SetupServiceConfig { - pub rack_subnet: std::net::Ipv6Addr, + pub rack_subnet: Ipv6Addr, #[serde(default, rename = "request")] pub requests: Vec, @@ -40,6 +42,18 @@ pub struct SledRequest { pub services: Vec, } +fn new_network(addr: Ipv6Addr, prefix: u8) -> Ipv6Network { + let net = Ipv6Network::new(addr, prefix).unwrap(); + + // ipnetwork inputs/outputs the provided IPv6 address, unmodified by the + // prefix. We manually mask `addr` based on `prefix` ourselves. + Ipv6Network::new( + Ipv6Addr::from(u128::from(addr) & u128::from(net.mask())), + prefix, + ) + .unwrap() +} + impl SetupServiceConfig { pub fn from_file>(path: P) -> Result { let path = path.as_ref(); @@ -48,20 +62,63 @@ impl SetupServiceConfig { Ok(config) } - pub fn az_subnet(&self) -> ipnetwork::Ipv6Network { - ipnetwork::Ipv6Network::new(self.rack_subnet, 48).unwrap() + pub fn az_subnet(&self) -> Ipv6Network { + new_network(self.rack_subnet, 48) } - pub fn rack_subnet(&self) -> ipnetwork::Ipv6Network { - ipnetwork::Ipv6Network::new(self.rack_subnet, 56).unwrap() + pub fn rack_subnet(&self) -> Ipv6Network { + new_network(self.rack_subnet, 56) } - pub fn sled_subnet(&self, index: u8) -> ipnetwork::Ipv6Network { + pub fn sled_subnet(&self, index: u8) -> Ipv6Network { let mut rack_network = self.rack_subnet().network().octets(); // To set bits distinguishing the /64 from the /56, we modify the 7th octet. rack_network[7] = index; - ipnetwork::Ipv6Network::new(std::net::Ipv6Addr::from(rack_network), 64) - .unwrap() + Ipv6Network::new(Ipv6Addr::from(rack_network), 64).unwrap() + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_subnets() { + let cfg = SetupServiceConfig { + rack_subnet: "fd00:1122:3344:0100::".parse().unwrap(), + requests: vec![], + }; + + assert_eq!( + // Masked out in AZ Subnet + // vv + "fd00:1122:3344:0000::/48".parse::().unwrap(), + cfg.az_subnet() + ); + assert_eq!( + // Shows up from Rack Subnet + // vv + "fd00:1122:3344:0100::/56".parse::().unwrap(), + cfg.rack_subnet() + ); + assert_eq!( + // 0th Sled Subnet + // vv + "fd00:1122:3344:0100::/64".parse::().unwrap(), + cfg.sled_subnet(0) + ); + assert_eq!( + // 1st Sled Subnet + // vv + "fd00:1122:3344:0101::/64".parse::().unwrap(), + cfg.sled_subnet(1) + ); + assert_eq!( + // Last Sled Subnet + // vv + "fd00:1122:3344:01ff::/64".parse::().unwrap(), + cfg.sled_subnet(255) + ); } } From e6c64eb96ee14d8630023c115799115d0b8a25dd Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 18 Apr 2022 15:28:27 -0400 Subject: [PATCH 15/19] Cleanup docs, test cleanup --- sled-agent/src/illumos/zone.rs | 2 +- sled-agent/src/rack_setup/service.rs | 10 +++++----- .../tests/integration_tests/multicast.rs | 20 +++++++++++++++++-- 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/sled-agent/src/illumos/zone.rs b/sled-agent/src/illumos/zone.rs index c667f851de4..ceb7cb5c57c 100644 --- a/sled-agent/src/illumos/zone.rs +++ b/sled-agent/src/illumos/zone.rs @@ -393,7 +393,7 @@ impl Zones { } #[allow(clippy::needless_lifetimes)] - fn delete_address<'a>( + pub fn delete_address<'a>( zone: Option<&'a str>, addrobj: &AddrObject, ) -> Result<(), Error> { diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index 8aeceb526d2..c8c4b7d2d2d 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -453,6 +453,11 @@ impl ServiceInner { info!(self.log, "Enough peers exist to enact RSS plan"); // If we created a plan, reuse it. Otherwise, create a new plan. + // + // NOTE: This is a "point-of-no-return" -- before sending any requests + // to neighboring sleds, the plan must be recorded to durable storage. + // This way, if the RSS power-cycles, it can idempotently execute the + // same allocation plan. let plan = if let Some(plan) = maybe_plan { info!(self.log, "Re-using existing allocation plan"); plan @@ -461,11 +466,6 @@ impl ServiceInner { self.create_plan(config, addrs).await? }; - // NOTE: This is a "point-of-no-return" -- before sending any requests - // to neighboring sleds, ensure that we've recorded our plan to durable - // storage. This way, if the RSS power-cycles, it can idempotently - // execute the same allocation plan. - // Issue the dataset initialization requests to all sleds. futures::future::join_all(plan.iter().map( |(bootstrap_addr, allocation)| async move { diff --git a/sled-agent/tests/integration_tests/multicast.rs b/sled-agent/tests/integration_tests/multicast.rs index 381f2fc3ab3..1aa9f8b103f 100644 --- a/sled-agent/tests/integration_tests/multicast.rs +++ b/sled-agent/tests/integration_tests/multicast.rs @@ -3,26 +3,42 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. use omicron_sled_agent::bootstrap; +use omicron_sled_agent::illumos::addrobj::AddrObject; use omicron_sled_agent::illumos::{dladm, zone}; use std::io; use std::net::IpAddr; +struct AddressCleanup { + addrobj: AddrObject, +} + +impl Drop for AddressCleanup { + fn drop(&mut self) { + let _ = zone::Zones::delete_address(None, &self.addrobj); + } +} + #[tokio::test] async fn test_multicast_bootstrap_address() { // Setup the bootstrap address. // // This modifies global state of the target machine, creating - // an address named "bootstrap6", akin to what the bootstrap + // an address named "testbootstrap6", akin to what the bootstrap // agent should do. let link = dladm::Dladm::find_physical().unwrap(); let address = bootstrap::agent::bootstrap_address(link.clone()).unwrap(); + let address_name = "testbootstrap6"; + let addrobj = AddrObject::new(&link.0, address_name).unwrap(); zone::Zones::ensure_has_global_zone_v6_address( Some(link), *address.ip(), - "bootstrap6", + address_name, ) .unwrap(); + // Cleanup-on-drop removal of the bootstrap address. + let _cleanup = AddressCleanup { addrobj }; + // Create the multicast pair. let loopback = true; let interface = 0; From c02e773a275bf6814dbd43041aa6293316894473 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 19 Apr 2022 10:31:06 -0400 Subject: [PATCH 16/19] SledSubnet types, rename peer expectations, better ipnetwork helper usage --- common/src/api/external/mod.rs | 6 ++++ openapi/bootstrap-agent.json | 14 ++++++-- sled-agent/src/bootstrap/agent.rs | 9 +++-- sled-agent/src/bootstrap/params.rs | 53 +++++++++++++++++++++++++++- sled-agent/src/config.rs | 4 +-- sled-agent/src/rack_setup/config.rs | 6 +--- sled-agent/src/rack_setup/service.rs | 34 +++++++++--------- 7 files changed, 96 insertions(+), 30 deletions(-) diff --git a/common/src/api/external/mod.rs b/common/src/api/external/mod.rs index 72433768241..8ecbf493db0 100644 --- a/common/src/api/external/mod.rs +++ b/common/src/api/external/mod.rs @@ -1072,6 +1072,12 @@ impl std::fmt::Display for Ipv6Net { } } +impl From for Ipv6Net { + fn from(n: ipnetwork::Ipv6Network) -> Ipv6Net { + Self(n) + } +} + impl JsonSchema for Ipv6Net { fn schema_name() -> String { "Ipv6Net".to_string() diff --git a/openapi/bootstrap-agent.json b/openapi/bootstrap-agent.json index 99e35d38727..cf04477f449 100644 --- a/openapi/bootstrap-agent.json +++ b/openapi/bootstrap-agent.json @@ -155,17 +155,17 @@ "description": "Configuration information for launching a Sled Agent.", "type": "object", "properties": { - "ip": { + "subnet": { "description": "Portion of the IP space to be managed by the Sled Agent.", "allOf": [ { - "$ref": "#/components/schemas/Ipv6Net" + "$ref": "#/components/schemas/SledSubnet" } ] } }, "required": [ - "ip" + "subnet" ] }, "SledAgentResponse": { @@ -180,6 +180,14 @@ "required": [ "id" ] + }, + "SledSubnet": { + "description": "Represents subnets belonging to Sleds.\n\nThis is a thin wrapper around the [`Ipv6Net`] type - which may be accessed by [`AsRef`] - which adds additional validation that this is a /64 subnet with an expected prefix.", + "allOf": [ + { + "$ref": "#/components/schemas/Ipv6Net" + } + ] } } } diff --git a/sled-agent/src/bootstrap/agent.rs b/sled-agent/src/bootstrap/agent.rs index 897393ed915..25801f31315 100644 --- a/sled-agent/src/bootstrap/agent.rs +++ b/sled-agent/src/bootstrap/agent.rs @@ -28,6 +28,8 @@ use std::path::{Path, PathBuf}; use thiserror::Error; use tokio::sync::Mutex; +pub(crate) const SLED_SUBNET_SEGMENT0: u16 = 0xFDB0; + /// Describes errors which may occur while operating the bootstrap service. #[derive(Error, Debug)] pub enum BootstrapError { @@ -102,7 +104,7 @@ fn mac_to_socket_addr(mac: MacAddr) -> SocketAddrV6 { assert_eq!(6, mac_bytes.len()); let address = Ipv6Addr::new( - 0xfdb0, + SLED_SUBNET_SEGMENT0, ((mac_bytes[0] as u16) << 8) | mac_bytes[1] as u16, ((mac_bytes[2] as u16) << 8) | mac_bytes[3] as u16, ((mac_bytes[4] as u16) << 8) | mac_bytes[5] as u16, @@ -179,7 +181,8 @@ impl Agent { ) -> Result { info!(&self.log, "Loading Sled Agent: {:?}", request); - let sled_address = crate::config::get_sled_address(request.ip); + let sled_address = + crate::config::get_sled_address(*request.subnet.as_ref()); let mut maybe_agent = self.sled_agent.lock().await; if let Some(server) = &*maybe_agent { @@ -209,7 +212,7 @@ impl Agent { tokio::fs::write( get_subnet_path(), &toml::to_string( - &toml::Value::try_from(&request.ip) + &toml::Value::try_from(&request.subnet) .expect("Cannot serialize IP"), ) .expect("Cannot convert toml to string"), diff --git a/sled-agent/src/bootstrap/params.rs b/sled-agent/src/bootstrap/params.rs index 7386521728d..e0e190f3560 100644 --- a/sled-agent/src/bootstrap/params.rs +++ b/sled-agent/src/bootstrap/params.rs @@ -15,9 +15,60 @@ pub struct ShareRequest { pub identity: Vec, } +#[derive(thiserror::Error, Debug)] +pub enum SubnetError { + #[error("Subnet has unexpected prefix length: {0}")] + BadPrefixLength(u8), + + #[error("Subnet has unexpected prefix value: {0}")] + BadPrefixValue(Ipv6Net), +} + +/// Represents subnets belonging to Sleds. +/// +/// This is a thin wrapper around the [`Ipv6Net`] type - which may be accessed +/// by [`AsRef`] - which adds additional validation that this is a /64 +/// subnet with an expected prefix. +// Note: The inner field is intentionally non-pub; this makes it +// more difficult to construct a sled subnet which avoids the +// validation performed by the constructor. +#[derive(Clone, Debug, Serialize, JsonSchema, PartialEq)] +pub struct SledSubnet(Ipv6Net); + +impl SledSubnet { + pub fn new(ip: Ipv6Net) -> Result { + let prefix = ip.0.prefix(); + if prefix != 64 { + return Err(SubnetError::BadPrefixLength(prefix)); + } + if ip.0.ip().segments()[0] + != crate::bootstrap::agent::SLED_SUBNET_SEGMENT0 + { + return Err(SubnetError::BadPrefixValue(ip)); + } + Ok(SledSubnet(ip)) + } +} + +impl<'de> serde::Deserialize<'de> for SledSubnet { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + let net = Ipv6Net::deserialize(deserializer)?; + SledSubnet::new(net).map_err(serde::de::Error::custom) + } +} + +impl AsRef for SledSubnet { + fn as_ref(&self) -> &Ipv6Net { + &self.0 + } +} + /// Configuration information for launching a Sled Agent. #[derive(Clone, Debug, Serialize, Deserialize, JsonSchema, PartialEq)] pub struct SledAgentRequest { /// Portion of the IP space to be managed by the Sled Agent. - pub ip: Ipv6Net, + pub subnet: SledSubnet, } diff --git a/sled-agent/src/config.rs b/sled-agent/src/config.rs index ea0b4db9ead..eb5c3f5da00 100644 --- a/sled-agent/src/config.rs +++ b/sled-agent/src/config.rs @@ -10,7 +10,7 @@ use crate::illumos::zpool::ZpoolName; use dropshot::ConfigLogging; use omicron_common::api::external::Ipv6Net; use serde::Deserialize; -use std::net::{Ipv6Addr, SocketAddr, SocketAddrV6}; +use std::net::{SocketAddr, SocketAddrV6}; use std::path::Path; use uuid::Uuid; @@ -18,7 +18,7 @@ pub const SLED_AGENT_PORT: u16 = 12345; /// Given a subnet, return the sled agent address. pub(crate) fn get_sled_address(subnet: Ipv6Net) -> SocketAddrV6 { - let sled_agent_ip = Ipv6Addr::from(u128::from(subnet.ip()) + 1); + let sled_agent_ip = subnet.iter().next().unwrap(); SocketAddrV6::new(sled_agent_ip, SLED_AGENT_PORT, 0, 0) } diff --git a/sled-agent/src/rack_setup/config.rs b/sled-agent/src/rack_setup/config.rs index 130acbab1e9..53545b28984 100644 --- a/sled-agent/src/rack_setup/config.rs +++ b/sled-agent/src/rack_setup/config.rs @@ -47,11 +47,7 @@ fn new_network(addr: Ipv6Addr, prefix: u8) -> Ipv6Network { // ipnetwork inputs/outputs the provided IPv6 address, unmodified by the // prefix. We manually mask `addr` based on `prefix` ourselves. - Ipv6Network::new( - Ipv6Addr::from(u128::from(addr) & u128::from(net.mask())), - prefix, - ) - .unwrap() + Ipv6Network::new(net.network(), prefix).unwrap() } impl SetupServiceConfig { diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index c8c4b7d2d2d..c3de81fd451 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -8,9 +8,9 @@ use super::config::{SetupServiceConfig as Config, SledRequest}; use crate::bootstrap::{ client as bootstrap_agent_client, config::BOOTSTRAP_AGENT_PORT, discovery::PeerMonitorObserver, params::SledAgentRequest, + params::SledSubnet, }; use crate::config::get_sled_address; -use omicron_common::api::external::Ipv6Net; use omicron_common::backoff::{ internal_service_policy, retry_notify, BackoffError, }; @@ -116,9 +116,9 @@ enum PeerExpectation { // This case is fairly tricky because some sleds may have // already received requests to initialize - modifying the // allocated subnets would be non-trivial. - Precise(HashSet), + LoadOldPlan(HashSet), // Await any peers, as long as there are at least enough to make a new plan. - Arbitrary(usize), + CreateNewPlan(usize), } /// The implementation of the Rack Setup Service. @@ -155,8 +155,10 @@ impl ServiceInner { let sled_agent_initialize = || async { client .start_sled(&bootstrap_agent_client::types::SledAgentRequest { - ip: bootstrap_agent_client::types::Ipv6Net( - request.ip.0.to_string(), + subnet: bootstrap_agent_client::types::SledSubnet( + bootstrap_agent_client::types::Ipv6Net( + request.subnet.as_ref().to_string(), + ), ), }) .await @@ -317,14 +319,14 @@ impl ServiceInner { SocketAddrV6::new(bootstrap_addr, BOOTSTRAP_AGENT_PORT, 0, 0); let sled_subnet_index = u8::try_from(idx + 1).expect("Too many peers!"); - let subnet = config.sled_subnet(sled_subnet_index); + let subnet = + SledSubnet::new(config.sled_subnet(sled_subnet_index).into()) + .expect("Created Invalid Subnet"); ( bootstrap_addr, SledAllocation { - initialization_request: SledAgentRequest { - ip: Ipv6Net(subnet), - }, + initialization_request: SledAgentRequest { subnet }, services_request: request.clone(), }, ) @@ -363,15 +365,15 @@ impl ServiceInner { loop { { match expectation { - PeerExpectation::Precise(ref expected) => { + PeerExpectation::LoadOldPlan(ref expected) => { if all_addrs.is_superset(expected) { return Ok(all_addrs .into_iter() .collect::>()); } - info!(self.log, "Waiting for a precise set of peers; not found yet."); + info!(self.log, "Waiting for a LoadOldPlan set of peers; not found yet."); } - PeerExpectation::Arbitrary(wanted_peer_count) => { + PeerExpectation::CreateNewPlan(wanted_peer_count) => { if all_addrs.len() >= wanted_peer_count { return Ok(all_addrs .into_iter() @@ -445,9 +447,9 @@ impl ServiceInner { // - Enough peers to create a new plan (if one does not exist) let maybe_plan = self.load_plan().await?; let expectation = if let Some(plan) = &maybe_plan { - PeerExpectation::Precise(plan.keys().map(|a| *a.ip()).collect()) + PeerExpectation::LoadOldPlan(plan.keys().map(|a| *a.ip()).collect()) } else { - PeerExpectation::Arbitrary(config.requests.len()) + PeerExpectation::CreateNewPlan(config.requests.len()) }; let addrs = self.wait_for_peers(expectation).await?; info!(self.log, "Enough peers exist to enact RSS plan"); @@ -489,7 +491,7 @@ impl ServiceInner { // Next, initialize any datasets on sleds that need it. let sled_address = SocketAddr::V6(get_sled_address( - allocation.initialization_request.ip, + *allocation.initialization_request.subnet.as_ref(), )); self.initialize_datasets( sled_address, @@ -513,7 +515,7 @@ impl ServiceInner { futures::future::join_all(plan.iter().map( |(_, allocation)| async move { let sled_address = SocketAddr::V6(get_sled_address( - allocation.initialization_request.ip, + *allocation.initialization_request.subnet.as_ref(), )); self.initialize_services( sled_address, From f64a51ff81da08046777884736a34b2bdb842acc Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 19 Apr 2022 10:42:11 -0400 Subject: [PATCH 17/19] oops, don't check for bootstrap prefix on sled subnet --- sled-agent/src/bootstrap/agent.rs | 4 +--- sled-agent/src/bootstrap/params.rs | 8 -------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/sled-agent/src/bootstrap/agent.rs b/sled-agent/src/bootstrap/agent.rs index 25801f31315..5171c30a647 100644 --- a/sled-agent/src/bootstrap/agent.rs +++ b/sled-agent/src/bootstrap/agent.rs @@ -28,8 +28,6 @@ use std::path::{Path, PathBuf}; use thiserror::Error; use tokio::sync::Mutex; -pub(crate) const SLED_SUBNET_SEGMENT0: u16 = 0xFDB0; - /// Describes errors which may occur while operating the bootstrap service. #[derive(Error, Debug)] pub enum BootstrapError { @@ -104,7 +102,7 @@ fn mac_to_socket_addr(mac: MacAddr) -> SocketAddrV6 { assert_eq!(6, mac_bytes.len()); let address = Ipv6Addr::new( - SLED_SUBNET_SEGMENT0, + 0xfdb0, ((mac_bytes[0] as u16) << 8) | mac_bytes[1] as u16, ((mac_bytes[2] as u16) << 8) | mac_bytes[3] as u16, ((mac_bytes[4] as u16) << 8) | mac_bytes[5] as u16, diff --git a/sled-agent/src/bootstrap/params.rs b/sled-agent/src/bootstrap/params.rs index e0e190f3560..b6c55bb1479 100644 --- a/sled-agent/src/bootstrap/params.rs +++ b/sled-agent/src/bootstrap/params.rs @@ -19,9 +19,6 @@ pub struct ShareRequest { pub enum SubnetError { #[error("Subnet has unexpected prefix length: {0}")] BadPrefixLength(u8), - - #[error("Subnet has unexpected prefix value: {0}")] - BadPrefixValue(Ipv6Net), } /// Represents subnets belonging to Sleds. @@ -41,11 +38,6 @@ impl SledSubnet { if prefix != 64 { return Err(SubnetError::BadPrefixLength(prefix)); } - if ip.0.ip().segments()[0] - != crate::bootstrap::agent::SLED_SUBNET_SEGMENT0 - { - return Err(SubnetError::BadPrefixValue(ip)); - } Ok(SledSubnet(ip)) } } From 0add3331457b4ed6f77c038537950651f801d1d8 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 19 Apr 2022 11:07:29 -0400 Subject: [PATCH 18/19] Reference to #945 --- sled-agent/src/bootstrap/agent.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sled-agent/src/bootstrap/agent.rs b/sled-agent/src/bootstrap/agent.rs index 5171c30a647..8e3ed304b94 100644 --- a/sled-agent/src/bootstrap/agent.rs +++ b/sled-agent/src/bootstrap/agent.rs @@ -115,6 +115,8 @@ fn mac_to_socket_addr(mac: MacAddr) -> SocketAddrV6 { SocketAddrV6::new(address, BOOTSTRAP_AGENT_PORT, 0, 0) } +// TODO(https://github.com/oxidecomputer/omicron/issues/945): This address +// could be randomly generated when it no longer needs to be durable. pub fn bootstrap_address( link: PhysicalLink, ) -> Result { From ca6771c8ea94c630d116d30167d12107c3ee7fc5 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 19 Apr 2022 13:50:14 -0400 Subject: [PATCH 19/19] oops, stop trying to allocate the anycast address --- sled-agent/src/config.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sled-agent/src/config.rs b/sled-agent/src/config.rs index eb5c3f5da00..b02fbdabcd9 100644 --- a/sled-agent/src/config.rs +++ b/sled-agent/src/config.rs @@ -18,7 +18,9 @@ pub const SLED_AGENT_PORT: u16 = 12345; /// Given a subnet, return the sled agent address. pub(crate) fn get_sled_address(subnet: Ipv6Net) -> SocketAddrV6 { - let sled_agent_ip = subnet.iter().next().unwrap(); + let mut iter = subnet.iter(); + let _anycast_ip = iter.next().unwrap(); + let sled_agent_ip = iter.next().unwrap(); SocketAddrV6::new(sled_agent_ip, SLED_AGENT_PORT, 0, 0) }