diff --git a/sled-agent/src/bootstrap/agent.rs b/sled-agent/src/bootstrap/agent.rs index e93b18161c5..c83b56274c8 100644 --- a/sled-agent/src/bootstrap/agent.rs +++ b/sled-agent/src/bootstrap/agent.rs @@ -13,7 +13,7 @@ use super::trust_quorum::{ use super::views::{ShareResponse, SledAgentResponse}; use crate::config::Config as SledConfig; use crate::illumos::dladm::{self, Dladm, PhysicalLink}; -use crate::illumos::zone::{self, Zones}; +use crate::illumos::zone::Zones; use crate::rack_setup::service::Service as RackSetupService; use crate::server::Server as SledServer; use omicron_common::address::get_sled_address; @@ -32,26 +32,24 @@ use tokio::sync::Mutex; /// Describes errors which may occur while operating the bootstrap service. #[derive(Error, Debug)] pub enum BootstrapError { - #[error("Error accessing filesystem: {0}")] - Io(#[from] std::io::Error), - - #[error("Error configuring SMF: {0}")] - SmfConfig(#[from] smf::ConfigError), - - #[error("Error modifying SMF service: {0}")] - SmfAdm(#[from] smf::AdmError), + #[error("IO error: {message}: {err}")] + Io { + message: String, + #[source] + err: std::io::Error, + }, #[error("Error starting sled agent: {0}")] SledError(String), - #[error(transparent)] - Toml(#[from] toml::de::Error), + #[error("Error deserializing toml from {path}: {err}")] + Toml { path: PathBuf, err: toml::de::Error }, #[error(transparent)] TrustQuorum(#[from] TrustQuorumError), - #[error(transparent)] - Zone(#[from] zone::Error), + #[error("Failed to initialize bootstrap address: {err}")] + BootstrapAddress { err: crate::illumos::zone::EnsureGzAddressError }, } impl From for ExternalError { @@ -71,11 +69,11 @@ fn read_key_share() -> Result, BootstrapError> { match ShareDistribution::read(&key_share_dir) { Ok(share) => Ok(Some(share)), - Err(TrustQuorumError::Io(err)) => { + Err(TrustQuorumError::Io { message, err }) => { if err.kind() == io::ErrorKind::NotFound { Ok(None) } else { - Err(BootstrapError::Io(err)) + Err(BootstrapError::Io { message, err }) } } Err(e) => Err(e.into()), @@ -121,7 +119,7 @@ fn mac_to_socket_addr(mac: MacAddr) -> SocketAddrV6 { // could be randomly generated when it no longer needs to be durable. pub fn bootstrap_address( link: PhysicalLink, -) -> Result { +) -> Result { let mac = Dladm::get_mac(link)?; Ok(mac_to_socket_addr(mac)) } @@ -132,13 +130,31 @@ impl Agent { sled_config: SledConfig, address: Ipv6Addr, ) -> Result { + let data_link = if let Some(link) = sled_config.data_link.clone() { + link + } else { + Dladm::find_physical().map_err(|err| { + BootstrapError::SledError(format!( + "Can't access physical link, and none in config: {}", + err + )) + })? + }; + Zones::ensure_has_global_zone_v6_address( - sled_config.data_link.clone(), + data_link, address, "bootstrap6", - )?; + ) + .map_err(|err| BootstrapError::BootstrapAddress { err })?; - let peer_monitor = discovery::PeerMonitor::new(&log, address)?; + let peer_monitor = + discovery::PeerMonitor::new(&log, address).map_err(|err| { + BootstrapError::Io { + message: format!("Monitoring for peers from {address}"), + err, + } + })?; let share = read_key_share()?; let agent = Agent { log, @@ -153,8 +169,16 @@ impl Agent { if request_path.exists() { info!(agent.log, "Sled already configured, loading sled agent"); let sled_request: SledAgentRequest = toml::from_str( - &tokio::fs::read_to_string(&request_path).await?, - )?; + &tokio::fs::read_to_string(&request_path).await.map_err( + |err| BootstrapError::Io { + message: format!( + "Reading subnet path from {request_path:?}" + ), + err, + }, + )?, + ) + .map_err(|err| BootstrapError::Toml { path: request_path, err })?; agent.request_agent(sled_request).await?; } @@ -204,21 +228,30 @@ impl Agent { // Server does not exist, initialize it. let server = SledServer::start(&self.sled_config, sled_address) .await - .map_err(|e| BootstrapError::SledError(e))?; + .map_err(|e| { + BootstrapError::SledError(format!( + "Could not start sled agent server: {e}" + )) + })?; maybe_agent.replace(server); info!(&self.log, "Sled Agent loaded; recording configuration"); // Record this request so the sled agent can be automatically // initialized on the next boot. + let path = get_sled_agent_request_path(); tokio::fs::write( - get_sled_agent_request_path(), + &path, &toml::to_string( &toml::Value::try_from(&request) .expect("Cannot serialize request"), ) .expect("Cannot convert toml to string"), ) - .await?; + .await + .map_err(|err| BootstrapError::Io { + message: format!("Recording Sled Agent request to {path:?}"), + err, + })?; Ok(SledAgentResponse { id: self.sled_config.id }) } @@ -325,7 +358,11 @@ impl Agent { async fn run_trust_quorum_server(&self) -> Result<(), BootstrapError> { let my_share = self.share.as_ref().unwrap().share.clone(); - let mut server = trust_quorum::Server::new(&self.log, my_share)?; + let mut server = trust_quorum::Server::new(&self.log, my_share) + .map_err(|err| BootstrapError::Io { + message: "Cannot run trust quorum server".to_string(), + err, + })?; tokio::spawn(async move { server.run().await }); Ok(()) } diff --git a/sled-agent/src/bootstrap/trust_quorum/client.rs b/sled-agent/src/bootstrap/trust_quorum/client.rs index 7eb1ff2808b..0d6cdaf2c1d 100644 --- a/sled-agent/src/bootstrap/trust_quorum/client.rs +++ b/sled-agent/src/bootstrap/trust_quorum/client.rs @@ -31,7 +31,12 @@ impl Client { // Connect to a trust quorum server, establish an SPDM channel, and retrieve // a share. pub async fn get_share(&self) -> Result { - let sock = TcpStream::connect(&self.addr).await?; + let sock = TcpStream::connect(&self.addr).await.map_err(|err| { + TrustQuorumError::Io { + message: format!("Connecting to {}", self.addr), + err, + } + })?; let transport = spdm::Transport::new(sock, self.log.clone()); // Complete SPDM negotiation and return a secure transport diff --git a/sled-agent/src/bootstrap/trust_quorum/error.rs b/sled-agent/src/bootstrap/trust_quorum/error.rs index 968e7ee9a25..69c98bc6c31 100644 --- a/sled-agent/src/bootstrap/trust_quorum/error.rs +++ b/sled-agent/src/bootstrap/trust_quorum/error.rs @@ -29,6 +29,10 @@ pub enum TrustQuorumError { #[error("Rack secret construction failed: {0:?}")] RackSecretConstructionFailed(vsss_rs::Error), - #[error("IO error: {0}")] - Io(#[from] std::io::Error), + #[error("IO error {message}: {err}")] + Io { + message: String, + #[source] + err: std::io::Error, + }, } diff --git a/sled-agent/src/bootstrap/trust_quorum/server.rs b/sled-agent/src/bootstrap/trust_quorum/server.rs index 6dfbe4206bb..9016bc7e9e1 100644 --- a/sled-agent/src/bootstrap/trust_quorum/server.rs +++ b/sled-agent/src/bootstrap/trust_quorum/server.rs @@ -65,7 +65,12 @@ impl Server { &mut self, ) -> Result>, TrustQuorumError> { - let (sock, addr) = self.listener.accept().await?; + let (sock, addr) = self.listener.accept().await.map_err(|err| { + TrustQuorumError::Io { + message: "Accepting a connection from TCP listener".to_string(), + err, + } + })?; debug!(self.log, "Accepted connection from {}", addr); let share = self.share.clone(); let log = self.log.clone(); diff --git a/sled-agent/src/bootstrap/trust_quorum/share_distribution.rs b/sled-agent/src/bootstrap/trust_quorum/share_distribution.rs index 73dc2148ee3..799e94fc5d7 100644 --- a/sled-agent/src/bootstrap/trust_quorum/share_distribution.rs +++ b/sled-agent/src/bootstrap/trust_quorum/share_distribution.rs @@ -32,7 +32,10 @@ impl ShareDistribution { let mut path = PathBuf::from(dir.as_ref()); path.push(FILENAME); let json = serde_json::to_string(&self)?; - fs::write(path, &json)?; + fs::write(&path, &json).map_err(|err| TrustQuorumError::Io { + message: format!("Writing share to {path:?}"), + err, + })?; Ok(()) } @@ -41,7 +44,13 @@ impl ShareDistribution { ) -> Result { let mut path = PathBuf::from(dir.as_ref()); path.push(FILENAME); - let json = fs::read_to_string(path.to_str().unwrap())?; + let json = + fs::read_to_string(path.to_str().unwrap()).map_err(|err| { + TrustQuorumError::Io { + message: format!("Reading share from {path:?}"), + err, + } + })?; serde_json::from_str(&json).map_err(|e| e.into()) } } diff --git a/sled-agent/src/config.rs b/sled-agent/src/config.rs index ca332b2c07c..cfa6e84927b 100644 --- a/sled-agent/src/config.rs +++ b/sled-agent/src/config.rs @@ -10,7 +10,7 @@ use crate::illumos::zpool::ZpoolName; use dropshot::ConfigLogging; use serde::Deserialize; use std::net::SocketAddr; -use std::path::Path; +use std::path::{Path, PathBuf}; use uuid::Uuid; /// Configuration for a sled agent @@ -35,21 +35,33 @@ pub struct Config { #[derive(Debug, thiserror::Error)] pub enum ConfigError { - #[error("Failed to read config: {0}")] - Io(#[from] std::io::Error), - #[error("Failed to parse config: {0}")] - Parse(#[from] toml::de::Error), + #[error("Failed to read config from {path}: {err}")] + Io { + path: PathBuf, + #[source] + err: std::io::Error, + }, + #[error("Failed to parse config from {path}: {err}")] + Parse { + path: PathBuf, + #[source] + err: toml::de::Error, + }, } impl Config { pub fn from_file>(path: P) -> Result { let path = path.as_ref(); - let contents = std::fs::read_to_string(path)?; - let config = toml::from_str(&contents)?; + let contents = std::fs::read_to_string(&path) + .map_err(|err| ConfigError::Io { path: path.into(), err })?; + let config = toml::from_str(&contents) + .map_err(|err| ConfigError::Parse { path: path.into(), err })?; Ok(config) } - pub fn get_link(&self) -> Result { + pub fn get_link( + &self, + ) -> Result { let link = if let Some(link) = self.data_link.clone() { link } else { diff --git a/sled-agent/src/illumos/addrobj.rs b/sled-agent/src/illumos/addrobj.rs index 80f41fd9010..5b1d3668da3 100644 --- a/sled-agent/src/illumos/addrobj.rs +++ b/sled-agent/src/illumos/addrobj.rs @@ -19,28 +19,48 @@ pub struct AddrObject { name: String, } +#[derive(Debug, PartialEq, Clone)] +enum BadName { + Interface(String), + Object(String), +} + +impl std::fmt::Display for BadName { + fn fmt( + &self, + f: &mut std::fmt::Formatter<'_>, + ) -> Result<(), std::fmt::Error> { + match self { + BadName::Interface(s) => write!(f, "Bad interface name: {}", s), + BadName::Object(s) => write!(f, "Bad object name: {}", s), + } + } +} + /// Errors which may be returned from constructing an [`AddrObject`]. #[derive(Debug, thiserror::Error)] -pub enum Error { - #[error("Failed to parse addrobj name: {0}")] - Parse(String), +#[error("Failed to parse addrobj name: {name}")] +pub struct ParseError { + name: BadName, } impl AddrObject { - pub fn new_control(interface: &str) -> Result { + pub fn new_control(interface: &str) -> Result { Self::new(interface, "omicron") } - pub fn on_same_interface(&self, name: &str) -> Result { + pub fn on_same_interface(&self, name: &str) -> Result { Self::new(&self.interface, name) } - pub fn new(interface: &str, name: &str) -> Result { + pub fn new(interface: &str, name: &str) -> Result { if interface.contains('/') { - return Err(Error::Parse(interface.to_string())); + return Err(ParseError { + name: BadName::Interface(interface.to_string()), + }); } if name.contains('/') { - return Err(Error::Parse(name.to_string())); + return Err(ParseError { name: BadName::Object(name.to_string()) }); } Ok(Self { interface: interface.to_string(), name: name.to_string() }) } diff --git a/sled-agent/src/illumos/dladm.rs b/sled-agent/src/illumos/dladm.rs index 92c1ed03916..6bf9c182414 100644 --- a/sled-agent/src/illumos/dladm.rs +++ b/sled-agent/src/illumos/dladm.rs @@ -15,21 +15,56 @@ pub const VNIC_PREFIX_CONTROL: &str = "oxControl"; pub const DLADM: &str = "/usr/sbin/dladm"; +/// Errors returned from [`Dladm::find_physical`]. #[derive(thiserror::Error, Debug)] -pub enum Error { - #[error("Device not found")] - NotFound, - - #[error("Subcommand failure: {0}")] +pub enum FindPhysicalLinkError { + #[error("Failed to find physical link: {0}")] Execution(#[from] ExecutionError), - #[error("Failed to parse output: {0}")] - Parse(#[from] std::string::FromUtf8Error), + #[error("No Physical Link devices found")] + NoPhysicalLinkFound, +} + +/// Errors returned from [`Dladm::get_mac`]. +#[derive(thiserror::Error, Debug)] +pub enum GetMacError { + #[error("Mac Address cannot be looked up; Link not found: {0:?}")] + NotFound(PhysicalLink), + + #[error("Failed to get MAC address: {0}")] + Execution(#[from] ExecutionError), #[error("Failed to parse MAC: {0}")] ParseMac(#[from] macaddr::ParseError), } +/// Errors returned from [`Dladm::create_vnic`]. +#[derive(thiserror::Error, Debug)] +#[error("Failed to create VNIC {name} on link {link:?}: {err}")] +pub struct CreateVnicError { + name: String, + link: PhysicalLink, + #[source] + err: ExecutionError, +} + +/// Errors returned from [`Dladm::get_vnics`]. +#[derive(thiserror::Error, Debug)] +#[error("Failed to get vnics: {err}")] +pub struct GetVnicError { + #[source] + err: ExecutionError, +} + +/// Errors returned from [`Dladm::delete_vnic`]. +#[derive(thiserror::Error, Debug)] +#[error("Failed to delete vnic {name}: {err}")] +pub struct DeleteVnicError { + name: String, + #[source] + err: ExecutionError, +} + /// The name of a physical datalink. #[derive(Debug, Clone, Deserialize, Serialize, PartialEq)] pub struct PhysicalLink(pub String); @@ -40,24 +75,24 @@ pub struct Dladm {} #[cfg_attr(test, mockall::automock, allow(dead_code))] impl Dladm { /// Returns the name of the first observed physical data link. - pub fn find_physical() -> Result { + pub fn find_physical() -> Result { let mut command = std::process::Command::new(PFEXEC); let cmd = command.args(&[DLADM, "show-phys", "-p", "-o", "LINK"]); let output = execute(cmd)?; - let name = String::from_utf8(output.stdout)? + let name = String::from_utf8_lossy(&output.stdout) .lines() // TODO: This is arbitrary, but we're currently grabbing the first // physical device. Should we have a more sophisticated method for // selection? .next() .map(|s| s.trim()) - .ok_or_else(|| Error::NotFound)? + .ok_or_else(|| FindPhysicalLinkError::NoPhysicalLinkFound)? .to_string(); Ok(PhysicalLink(name)) } /// Returns the MAC address of a physical link. - pub fn get_mac(link: PhysicalLink) -> Result { + pub fn get_mac(link: PhysicalLink) -> Result { let mut command = std::process::Command::new(PFEXEC); let cmd = command.args(&[ DLADM, @@ -69,11 +104,11 @@ impl Dladm { &link.0, ]); let output = execute(cmd)?; - let name = String::from_utf8(output.stdout)? + let name = String::from_utf8_lossy(&output.stdout) .lines() .next() .map(|s| s.trim()) - .ok_or_else(|| Error::NotFound)? + .ok_or_else(|| GetMacError::NotFound(link))? .to_string(); // Ensure the MAC address is zero-padded, so it may be parsed as a @@ -99,7 +134,7 @@ impl Dladm { vnic_name: &str, mac: Option, vlan: Option, - ) -> Result<(), Error> { + ) -> Result<(), CreateVnicError> { let mut command = std::process::Command::new(PFEXEC); let mut args = vec![ DLADM.to_string(), @@ -121,17 +156,21 @@ impl Dladm { args.push(vnic_name.to_string()); let cmd = command.args(&args); - execute(cmd)?; + execute(cmd).map_err(|err| CreateVnicError { + name: vnic_name.to_string(), + link: physical.clone(), + err, + })?; Ok(()) } /// Returns all VNICs that may be managed by the Sled Agent. - pub fn get_vnics() -> Result, Error> { + pub fn get_vnics() -> Result, GetVnicError> { let mut command = std::process::Command::new(PFEXEC); let cmd = command.args(&[DLADM, "show-vnic", "-p", "-o", "LINK"]); - let output = execute(cmd)?; + let output = execute(cmd).map_err(|err| GetVnicError { err })?; - let vnics = String::from_utf8(output.stdout)? + let vnics = String::from_utf8_lossy(&output.stdout) .lines() .filter(|vnic| vnic.starts_with(VNIC_PREFIX)) .map(|s| s.to_owned()) @@ -140,10 +179,11 @@ impl Dladm { } /// Remove a vnic from the sled. - pub fn delete_vnic(name: &str) -> Result<(), Error> { + pub fn delete_vnic(name: &str) -> Result<(), DeleteVnicError> { let mut command = std::process::Command::new(PFEXEC); let cmd = command.args(&[DLADM, "delete-vnic", name]); - execute(cmd)?; + execute(cmd) + .map_err(|err| DeleteVnicError { name: name.to_string(), err })?; Ok(()) } } diff --git a/sled-agent/src/illumos/mod.rs b/sled-agent/src/illumos/mod.rs index bdec8e7e702..a59521579ad 100644 --- a/sled-agent/src/illumos/mod.rs +++ b/sled-agent/src/illumos/mod.rs @@ -19,8 +19,8 @@ const PFEXEC: &str = "/usr/bin/pfexec"; #[derive(thiserror::Error, Debug)] pub enum ExecutionError { - #[error("Failed to start execution of process: {0}")] - ExecutionStart(std::io::Error), + #[error("Failed to start execution of [{command}]: {err}")] + ExecutionStart { command: String, err: std::io::Error }, #[error( "Command [{command}] executed and failed with status: {status}. Output: {stderr}" @@ -38,13 +38,22 @@ pub enum ExecutionError { mod inner { use super::*; + fn to_string(command: &mut std::process::Command) -> String { + command + .get_args() + .map(|s| s.to_string_lossy().into()) + .collect::>() + .join(" ") + } + // Helper function for starting the process and checking the // exit code result. pub fn execute( command: &mut std::process::Command, ) -> Result { - let output = - command.output().map_err(|e| ExecutionError::ExecutionStart(e))?; + let output = command.output().map_err(|err| { + ExecutionError::ExecutionStart { command: to_string(command), err } + })?; if !output.status.success() { return Err(ExecutionError::CommandFailure { diff --git a/sled-agent/src/illumos/running_zone.rs b/sled-agent/src/illumos/running_zone.rs index fda2165a714..7dbfbc05cd8 100644 --- a/sled-agent/src/illumos/running_zone.rs +++ b/sled-agent/src/illumos/running_zone.rs @@ -17,31 +17,79 @@ use crate::illumos::zone::MockZones as Zones; #[cfg(not(test))] use crate::illumos::zone::Zones; +/// Errors returned from [`RunningZone::run_cmd`]. #[derive(thiserror::Error, Debug)] -pub enum Error { - #[error("Zone not found")] - NotFound, - - #[error("Zone is not running; it is in the {0:?} state instead")] - NotRunning(zone::State), - - #[error("Execution error: {0}")] - Execution(#[from] crate::illumos::ExecutionError), +#[error("Error running command in zone '{zone}': {err}")] +pub struct RunCommandError { + zone: String, + #[source] + err: crate::illumos::ExecutionError, +} - #[error("Failed to parse output: {0}")] - Parse(#[from] std::string::FromUtf8Error), +/// Errors returned from [`RunningZone::boot`]. +#[derive(thiserror::Error, Debug)] +pub enum BootError { + #[error("Error booting zone: {0}")] + Booting(#[from] crate::illumos::zone::AdmError), - #[error("Zone operation failed: {0}")] - Operation(#[from] crate::illumos::zone::Error), + #[error("Zone booted, but timed out waiting for {service} in {zone}")] + Timeout { service: String, zone: String }, +} - #[error("Zone error accessing datalink: {0}")] - Datalink(#[from] crate::illumos::dladm::Error), +/// Errors returned from [`RunningZone::ensure_address`]. +#[derive(thiserror::Error, Debug)] +pub enum EnsureAddressError { + #[error("Failed ensuring address {request:?} in {zone}: could not construct addrobj name: {err}")] + AddrObject { + request: AddressRequest, + zone: String, + err: crate::illumos::addrobj::ParseError, + }, #[error(transparent)] - AddrObject(#[from] crate::illumos::addrobj::Error), + EnsureAddressError(#[from] crate::illumos::zone::EnsureAddressError), +} - #[error("Timeout waiting for a service: {0}")] - Timeout(String), +/// Erros returned from [`RunningZone::get_zone`]. +#[derive(thiserror::Error, Debug)] +pub enum GetZoneError { + #[error("While looking up zones with prefix '{prefix}', could not get zones: {err}")] + GetZones { + prefix: String, + #[source] + err: crate::illumos::zone::AdmError, + }, + + #[error("Zone with prefix '{prefix}' not found")] + NotFound { prefix: String }, + + #[error("Cannot get zone '{name}': it is in the {state:?} state instead of running")] + NotRunning { name: String, state: zone::State }, + + #[error( + "Cannot get zone '{name}': Failed to acquire control interface {err}" + )] + ControlInterface { + name: String, + #[source] + err: crate::illumos::zone::GetControlInterfaceError, + }, + + #[error("Cannot get zone '{name}': Failed to create addrobj: {err}")] + AddrObject { + name: String, + #[source] + err: crate::illumos::addrobj::ParseError, + }, + + #[error( + "Cannot get zone '{name}': Failed to ensure address exists: {err}" + )] + EnsureAddress { + name: String, + #[source] + err: crate::illumos::zone::EnsureAddressError, + }, } /// Represents a running zone. @@ -55,7 +103,7 @@ impl RunningZone { } /// Runs a command within the Zone, return the output. - pub fn run_cmd(&self, args: I) -> Result + pub fn run_cmd(&self, args: I) -> Result where I: IntoIterator, S: AsRef, @@ -71,15 +119,16 @@ impl RunningZone { .chain(suffix.iter().map(|a| a.as_ref())); let cmd = command.args(full_args); - let output = crate::illumos::execute(cmd)?; - let stdout = String::from_utf8(output.stdout)?; - Ok(stdout) + let output = crate::illumos::execute(cmd) + .map_err(|err| RunCommandError { zone: name.to_string(), err })?; + let stdout = String::from_utf8_lossy(&output.stdout); + Ok(stdout.to_string()) } /// Boots a new zone. /// /// Note that the zone must already be configured to be booted. - pub async fn boot(zone: InstalledZone) -> Result { + pub async fn boot(zone: InstalledZone) -> Result { // Boot the zone. info!(zone.log, "Zone booting"); @@ -88,9 +137,12 @@ impl RunningZone { // Wait for the network services to come online, so future // requests to create addresses can operate immediately. let fmri = "svc:/milestone/network:default"; - wait_for_service(Some(&zone.name), fmri) - .await - .map_err(|_| Error::Timeout(fmri.to_string()))?; + wait_for_service(Some(&zone.name), fmri).await.map_err(|_| { + BootError::Timeout { + service: fmri.to_string(), + zone: zone.name.to_string(), + } + })?; Ok(RunningZone { inner: zone }) } @@ -98,7 +150,7 @@ impl RunningZone { pub async fn ensure_address( &self, addrtype: AddressRequest, - ) -> Result { + ) -> Result { info!(self.inner.log, "Adding address: {:?}", addrtype); let name = match addrtype { AddressRequest::Dhcp => "omicron", @@ -107,7 +159,12 @@ impl RunningZone { std::net::IpAddr::V6(_) => "omicron6", }, }; - let addrobj = AddrObject::new(self.inner.control_vnic.name(), name)?; + let addrobj = AddrObject::new(self.inner.control_vnic.name(), name) + .map_err(|err| EnsureAddressError::AddrObject { + request: addrtype, + zone: self.inner.name.clone(), + err, + })?; let network = Zones::ensure_address(Some(&self.inner.name), &addrobj, addrtype)?; Ok(network) @@ -126,20 +183,42 @@ impl RunningZone { log: &Logger, zone_prefix: &str, addrtype: AddressRequest, - ) -> Result { - let zone_info = Zones::get()? + ) -> Result { + let zone_info = Zones::get() + .map_err(|err| GetZoneError::GetZones { + prefix: zone_prefix.to_string(), + err, + })? .into_iter() .find(|zone_info| zone_info.name().starts_with(&zone_prefix)) - .ok_or_else(|| Error::NotFound)?; + .ok_or_else(|| GetZoneError::NotFound { + prefix: zone_prefix.to_string(), + })?; if zone_info.state() != zone::State::Running { - return Err(Error::NotRunning(zone_info.state())); + return Err(GetZoneError::NotRunning { + name: zone_info.name().to_string(), + state: zone_info.state(), + }); } let zone_name = zone_info.name(); - let vnic_name = Zones::get_control_interface(zone_name)?; - let addrobj = AddrObject::new_control(&vnic_name)?; - Zones::ensure_address(Some(zone_name), &addrobj, addrtype)?; + let vnic_name = + Zones::get_control_interface(zone_name).map_err(|err| { + GetZoneError::ControlInterface { + name: zone_name.to_string(), + err, + } + })?; + let addrobj = AddrObject::new_control(&vnic_name).map_err(|err| { + GetZoneError::AddrObject { name: zone_name.to_string(), err } + })?; + Zones::ensure_address(Some(zone_name), &addrobj, addrtype).map_err( + |err| GetZoneError::EnsureAddress { + name: zone_name.to_string(), + err, + }, + )?; Ok(Self { inner: InstalledZone { @@ -172,6 +251,25 @@ impl Drop for RunningZone { } } +/// Errors returned from [`InstalledZone::install`]. +#[derive(thiserror::Error, Debug)] +pub enum InstallZoneError { + #[error("Cannot create '{service}': failed to create control VNIC: {err}")] + CreateVnic { + service: String, + #[source] + err: crate::illumos::dladm::CreateVnicError, + }, + + #[error("Failed to install zone '{zone}' from '{image_path}': {err}")] + InstallZone { + zone: String, + image_path: PathBuf, + #[source] + err: crate::illumos::zone::AdmError, + }, +} + pub struct InstalledZone { log: Logger, @@ -215,8 +313,13 @@ impl InstalledZone { datasets: &[zone::Dataset], devices: &[zone::Device], vnics: Vec, - ) -> Result { - let control_vnic = vnic_allocator.new_control(None)?; + ) -> Result { + let control_vnic = vnic_allocator.new_control(None).map_err(|err| { + InstallZoneError::CreateVnic { + service: service_name.to_string(), + err, + } + })?; let zone_name = Self::get_zone_name(service_name, unique_name); let zone_image_path = @@ -235,7 +338,12 @@ impl InstalledZone { &datasets, &devices, vnic_names, - )?; + ) + .map_err(|err| InstallZoneError::InstallZone { + zone: zone_name.to_string(), + image_path: zone_image_path.clone(), + err, + })?; Ok(InstalledZone { log: log.new(o!("zone" => zone_name.clone())), diff --git a/sled-agent/src/illumos/vnic.rs b/sled-agent/src/illumos/vnic.rs index 5d5d8292923..d200e4f4bc0 100644 --- a/sled-agent/src/illumos/vnic.rs +++ b/sled-agent/src/illumos/vnic.rs @@ -5,7 +5,10 @@ //! API for controlling a single instance. use crate::common::vlan::VlanID; -use crate::illumos::dladm::{PhysicalLink, VNIC_PREFIX, VNIC_PREFIX_CONTROL}; +use crate::illumos::dladm::{ + CreateVnicError, DeleteVnicError, PhysicalLink, VNIC_PREFIX, + VNIC_PREFIX_CONTROL, +}; use omicron_common::api::external::MacAddr; use std::sync::{ atomic::{AtomicU64, Ordering}, @@ -17,8 +20,6 @@ use crate::illumos::dladm::Dladm; #[cfg(test)] use crate::illumos::dladm::MockDladm as Dladm; -type Error = crate::illumos::dladm::Error; - /// A shareable wrapper around an atomic counter. /// May be used to allocate runtime-unique IDs for objects /// which have naming constraints - such as VNICs. @@ -41,20 +42,12 @@ impl VnicAllocator { /// /// VnicAllocator::new("Storage") produces /// - oxControlStorage[NNN] - pub fn new>( - scope: S, - physical_link: Option, - ) -> Result { - let data_link = if let Some(link) = physical_link { - link - } else { - Dladm::find_physical()? - }; - Ok(Self { + pub fn new>(scope: S, physical_link: PhysicalLink) -> Self { + Self { value: Arc::new(AtomicU64::new(0)), scope: scope.as_ref().to_string(), - data_link, - }) + data_link: physical_link, + } } /// Creates a new NIC, intended for usage by the guest. @@ -62,7 +55,7 @@ impl VnicAllocator { &self, mac: Option, vlan: Option, - ) -> Result { + ) -> Result { let allocator = self.new_superscope("Guest"); let name = allocator.next(); debug_assert!(name.starts_with(VNIC_PREFIX)); @@ -72,7 +65,10 @@ impl VnicAllocator { /// Creates a new NIC, intended for allowing Propolis to communicate /// with the control plane. - pub fn new_control(&self, mac: Option) -> Result { + pub fn new_control( + &self, + mac: Option, + ) -> Result { let allocator = self.new_superscope("Control"); let name = allocator.next(); debug_assert!(name.starts_with(VNIC_PREFIX)); @@ -119,7 +115,7 @@ impl Vnic { } /// Deletes a NIC (if it has not already been deleted). - pub fn delete(&mut self) -> Result<(), Error> { + pub fn delete(&mut self) -> Result<(), DeleteVnicError> { if self.deleted { Ok(()) } else { @@ -149,8 +145,7 @@ mod test { #[test] fn test_allocate() { let allocator = - VnicAllocator::new("Foo", Some(PhysicalLink("mylink".to_string()))) - .unwrap(); + VnicAllocator::new("Foo", PhysicalLink("mylink".to_string())); assert_eq!("oxFoo0", allocator.next()); assert_eq!("oxFoo1", allocator.next()); assert_eq!("oxFoo2", allocator.next()); @@ -159,8 +154,7 @@ mod test { #[test] fn test_allocate_within_scopes() { let allocator = - VnicAllocator::new("Foo", Some(PhysicalLink("mylink".to_string()))) - .unwrap(); + VnicAllocator::new("Foo", PhysicalLink("mylink".to_string())); assert_eq!("oxFoo0", allocator.next()); let allocator = allocator.new_superscope("Baz"); assert_eq!("oxBazFoo1", allocator.next()); diff --git a/sled-agent/src/illumos/zfs.rs b/sled-agent/src/illumos/zfs.rs index 61b468afaa6..e47825d0585 100644 --- a/sled-agent/src/illumos/zfs.rs +++ b/sled-agent/src/illumos/zfs.rs @@ -12,25 +12,74 @@ pub const ZONE_ZFS_DATASET_MOUNTPOINT: &str = "/zone"; pub const ZONE_ZFS_DATASET: &str = "rpool/zone"; const ZFS: &str = "/usr/sbin/zfs"; +/// Error returned by [`Zfs::list_filesystems`]. #[derive(thiserror::Error, Debug)] -pub enum Error { +#[error("Could not list filesystems within zpool {name}: {err}")] +pub struct ListFilesystemsError { + name: String, + #[source] + err: crate::illumos::ExecutionError, +} + +#[derive(thiserror::Error, Debug)] +enum EnsureFilesystemErrorRaw { #[error("ZFS execution error: {0}")] Execution(#[from] crate::illumos::ExecutionError), - #[error("Does not exist: {0}")] - NotFound(String), + #[error("Filesystem does not exist, and formatting was not requested")] + NotFoundNotFormatted, #[error("Unexpected output from ZFS commands: {0}")] Output(String), +} + +/// Error returned by [`Zfs::ensure_zoned_filesystem`]. +#[derive(thiserror::Error, Debug)] +#[error( + "Failed to ensure filesystem '{name}' exists at '{mountpoint:?}': {err}" +)] +pub struct EnsureFilesystemError { + name: String, + mountpoint: Mountpoint, + #[source] + err: EnsureFilesystemErrorRaw, +} - #[error("Failed to parse output: {0}")] - Parse(#[from] std::string::FromUtf8Error), +/// Error returned by [`Zfs::set_oxide_value`] +#[derive(thiserror::Error, Debug)] +#[error( + "Failed to set value '{name}={value}' on filesystem {filesystem}: {err}" +)] +pub struct SetValueError { + filesystem: String, + name: String, + value: String, + err: crate::illumos::ExecutionError, +} + +#[derive(thiserror::Error, Debug)] +enum GetValueErrorRaw { + #[error(transparent)] + Execution(#[from] crate::illumos::ExecutionError), + + #[error("No value found with that name")] + MissingValue, +} + +/// Error returned by [`Zfs:get_oxide_value`]. +#[derive(thiserror::Error, Debug)] +#[error("Failed to get value '{name}' from filesystem {filesystem}: {err}")] +pub struct GetValueError { + filesystem: String, + name: String, + err: GetValueErrorRaw, } /// Wraps commands for interacting with ZFS. pub struct Zfs {} /// Describes a mountpoint for a ZFS filesystem. +#[derive(Debug, Clone)] pub enum Mountpoint { #[allow(dead_code)] Legacy, @@ -48,13 +97,18 @@ impl fmt::Display for Mountpoint { #[cfg_attr(test, mockall::automock, allow(dead_code))] impl Zfs { - /// Lists all filesystems within a dataset. - pub fn list_filesystems(name: &str) -> Result, Error> { + /// Lists all filesystems within a zpool. + pub fn list_filesystems( + name: &str, + ) -> Result, ListFilesystemsError> { let mut command = std::process::Command::new(ZFS); let cmd = command.args(&["list", "-d", "1", "-rHpo", "name", name]); - let output = execute(cmd)?; - let stdout = String::from_utf8(output.stdout)?; + let output = execute(cmd).map_err(|err| ListFilesystemsError { + name: name.to_string(), + err, + })?; + let stdout = String::from_utf8_lossy(&output.stdout); let filesystems: Vec = stdout .trim() .split('\n') @@ -71,26 +125,31 @@ impl Zfs { name: &str, mountpoint: Mountpoint, do_format: bool, - ) -> Result<(), Error> { + ) -> Result<(), EnsureFilesystemError> { // If the dataset exists, we're done. let mut command = std::process::Command::new(ZFS); let cmd = command.args(&["list", "-Hpo", "name,type,mountpoint", name]); // If the list command returns any valid output, validate it. if let Ok(output) = execute(cmd) { - let stdout = String::from_utf8(output.stdout)?; + let stdout = String::from_utf8_lossy(&output.stdout); let values: Vec<&str> = stdout.trim().split('\t').collect(); if values != &[name, "filesystem", &mountpoint.to_string()] { - return Err(Error::Output(stdout)); + return Err(EnsureFilesystemError { + name: name.to_string(), + mountpoint, + err: EnsureFilesystemErrorRaw::Output(stdout.to_string()), + }); } return Ok(()); } if !do_format { - return Err(Error::NotFound(format!( - "Filesystem {} not found", - name - ))); + return Err(EnsureFilesystemError { + name: name.to_string(), + mountpoint, + err: EnsureFilesystemErrorRaw::NotFoundNotFormatted, + }); } // If it doesn't exist, make it. @@ -105,7 +164,11 @@ impl Zfs { &format!("mountpoint={}", mountpoint), name, ]); - execute(cmd)?; + execute(cmd).map_err(|err| EnsureFilesystemError { + name: name.to_string(), + mountpoint, + err: err.into(), + })?; Ok(()) } @@ -113,7 +176,7 @@ impl Zfs { filesystem_name: &str, name: &str, value: &str, - ) -> Result<(), Error> { + ) -> Result<(), SetValueError> { Zfs::set_value(filesystem_name, &format!("oxide:{}", name), value) } @@ -121,33 +184,46 @@ impl Zfs { filesystem_name: &str, name: &str, value: &str, - ) -> Result<(), Error> { + ) -> Result<(), SetValueError> { let mut command = std::process::Command::new(PFEXEC); let value_arg = format!("{}={}", name, value); let cmd = command.args(&[ZFS, "set", &value_arg, filesystem_name]); - execute(cmd)?; + execute(cmd).map_err(|err| SetValueError { + filesystem: filesystem_name.to_string(), + name: name.to_string(), + value: value.to_string(), + err, + })?; Ok(()) } pub fn get_oxide_value( filesystem_name: &str, name: &str, - ) -> Result { + ) -> Result { Zfs::get_value(filesystem_name, &format!("oxide:{}", name)) } - fn get_value(filesystem_name: &str, name: &str) -> Result { + fn get_value( + filesystem_name: &str, + name: &str, + ) -> Result { let mut command = std::process::Command::new(PFEXEC); let cmd = command.args(&[ZFS, "get", "-Ho", "value", &name, filesystem_name]); - let output = execute(cmd)?; - let stdout = String::from_utf8(output.stdout)?; + let output = execute(cmd).map_err(|err| GetValueError { + filesystem: filesystem_name.to_string(), + name: name.to_string(), + err: err.into(), + })?; + let stdout = String::from_utf8_lossy(&output.stdout); let value = stdout.trim(); if value == "-" { - return Err(Error::NotFound(format!( - "Property {}, within filesystem {}", - name, filesystem_name - ))); + return Err(GetValueError { + filesystem: filesystem_name.to_string(), + name: name.to_string(), + err: GetValueErrorRaw::MissingValue, + }); } Ok(value.to_string()) } diff --git a/sled-agent/src/illumos/zone.rs b/sled-agent/src/illumos/zone.rs index c3d5e47f3cf..35f2edbe45a 100644 --- a/sled-agent/src/illumos/zone.rs +++ b/sled-agent/src/illumos/zone.rs @@ -4,12 +4,13 @@ //! API for interacting with Zones running Propolis. +use anyhow::anyhow; use ipnetwork::IpNetwork; use slog::Logger; use std::net::{IpAddr, Ipv6Addr}; use crate::illumos::addrobj::AddrObject; -use crate::illumos::dladm::{Dladm, PhysicalLink, VNIC_PREFIX_CONTROL}; +use crate::illumos::dladm::{PhysicalLink, VNIC_PREFIX_CONTROL}; use crate::illumos::zfs::ZONE_ZFS_DATASET_MOUNTPOINT; use crate::illumos::{execute, PFEXEC}; @@ -24,50 +25,86 @@ pub const ZONE_PREFIX: &str = "oxz_"; pub const PROPOLIS_ZONE_PREFIX: &str = "oxz_propolis-server_"; #[derive(thiserror::Error, Debug)] -pub enum Error { - // TODO: These could be grouped into an "operation" error with an enum - // variant, if we want... - #[error("Cannot halt zone: {0}")] - Halt(zone::ZoneError), - - #[error("Cannot uninstall zone: {0}")] - Uninstall(zone::ZoneError), - - #[error("Cannot delete zone: {0}")] - Delete(zone::ZoneError), - - #[error("Cannot install zone: {0}")] - Install(zone::ZoneError), - - #[error("Cannot configure zone: {0}")] - Configure(zone::ZoneError), - - #[error("Cannot clone zone: {0}")] - Clone(zone::ZoneError), +enum Error { + #[error("Zone execution error: {0}")] + Execution(#[from] crate::illumos::ExecutionError), - #[error("Cannot boot zone: {0}")] - Boot(zone::ZoneError), + #[error(transparent)] + AddrObject(#[from] crate::illumos::addrobj::ParseError), - #[error("Cannot list zones: {0}")] - List(zone::ZoneError), + #[error("Address not found: {addrobj}")] + AddressNotFound { addrobj: AddrObject }, +} - #[error("Zone execution error: {0}")] - Execution(#[from] crate::illumos::ExecutionError), +/// Operations issued via [`zone::Adm`]. +#[derive(Debug, Clone)] +pub enum Operation { + Boot, + Configure, + Delete, + Halt, + Install, + List, + Uninstall, +} - #[error("Failed to parse output: {0}")] - Parse(#[from] std::string::FromUtf8Error), +/// Errors from issuing [`zone::Adm`] commands. +#[derive(thiserror::Error, Debug)] +#[error("Failed to execute zoneadm command '{op:?}' for zone '{zone}': {err}")] +pub struct AdmError { + op: Operation, + zone: String, + #[source] + err: zone::ZoneError, +} - #[error(transparent)] - Dladm(#[from] crate::illumos::dladm::Error), +/// Errors which may be encountered when deleting addresses. +#[derive(thiserror::Error, Debug)] +#[error("Failed to delete address '{addrobj}' in zone '{zone}': {err}")] +pub struct DeleteAddressError { + zone: String, + addrobj: AddrObject, + #[source] + err: crate::illumos::ExecutionError, +} - #[error(transparent)] - AddrObject(#[from] crate::illumos::addrobj::Error), +/// Errors from [`Zones::get_control_interface`]. +/// Error which may be returned accessing the control interface of a zone. +#[derive(thiserror::Error, Debug)] +pub enum GetControlInterfaceError { + #[error("Failed to query zone '{zone}' for control interface: {err}")] + Execution { + zone: String, + #[source] + err: crate::illumos::ExecutionError, + }, + + #[error("VNIC starting with 'oxControl' not found in {zone}")] + NotFound { zone: String }, +} - #[error("Error accessing filesystem: {0}")] - Filesystem(std::io::Error), +/// Errors which may be encountered ensuring addresses. +#[derive(thiserror::Error, Debug)] +#[error( + "Failed to create address {request:?} with name {name} in {zone}: {err}" +)] +pub struct EnsureAddressError { + zone: String, + request: AddressRequest, + name: AddrObject, + #[source] + err: anyhow::Error, +} - #[error("Value not found")] - NotFound, +/// Errors from [`Zones::ensure_has_global_zone_v6_address`]. +#[derive(thiserror::Error, Debug)] +#[error("Failed to create address {address} with name {name} in the GZ on {link:?}: {err}")] +pub struct EnsureGzAddressError { + address: Ipv6Addr, + link: PhysicalLink, + name: String, + #[source] + err: anyhow::Error, } /// Describes the type of addresses which may be requested from a zone. @@ -101,7 +138,9 @@ impl Zones { /// /// Returns the state the zone was in before it was removed, or None if the /// zone did not exist. - pub fn halt_and_remove(name: &str) -> Result, Error> { + pub fn halt_and_remove( + name: &str, + ) -> Result, AdmError> { match Self::find(name)? { None => Ok(None), Some(zone) => { @@ -117,17 +156,29 @@ impl Zones { }; if halt { - zone::Adm::new(name).halt().map_err(Error::Halt)?; + zone::Adm::new(name).halt().map_err(|err| AdmError { + op: Operation::Halt, + zone: name.to_string(), + err, + })?; } if uninstall { - zone::Adm::new(name) - .uninstall(/* force= */ true) - .map_err(Error::Uninstall)?; + zone::Adm::new(name).uninstall(/* force= */ true).map_err( + |err| AdmError { + op: Operation::Uninstall, + zone: name.to_string(), + err, + }, + )?; } zone::Config::new(name) .delete(/* force= */ true) .run() - .map_err(Error::Delete)?; + .map_err(|err| AdmError { + op: Operation::Delete, + zone: name.to_string(), + err, + })?; Ok(Some(state)) } } @@ -137,7 +188,7 @@ impl Zones { pub fn halt_and_remove_logged( log: &Logger, name: &str, - ) -> Result<(), Error> { + ) -> Result<(), AdmError> { if let Some(state) = Self::halt_and_remove(name)? { info!( log, @@ -154,7 +205,7 @@ impl Zones { datasets: &[zone::Dataset], devices: &[zone::Device], vnics: Vec, - ) -> Result<(), Error> { + ) -> Result<(), AdmError> { if let Some(zone) = Self::find(zone_name)? { info!( log, @@ -204,28 +255,44 @@ impl Zones { ..Default::default() }); } - cfg.run().map_err(Error::Configure)?; + cfg.run().map_err(|err| AdmError { + op: Operation::Configure, + zone: zone_name.to_string(), + err, + })?; info!(log, "Installing Omicron zone: {}", zone_name); - zone::Adm::new(zone_name) - .install(&[zone_image.as_ref()]) - .map_err(Error::Install)?; + zone::Adm::new(zone_name).install(&[zone_image.as_ref()]).map_err( + |err| AdmError { + op: Operation::Install, + zone: zone_name.to_string(), + err, + }, + )?; Ok(()) } /// Boots a zone (named `name`). - pub fn boot(name: &str) -> Result<(), Error> { - zone::Adm::new(name).boot().map_err(Error::Boot)?; + pub fn boot(name: &str) -> Result<(), AdmError> { + zone::Adm::new(name).boot().map_err(|err| AdmError { + op: Operation::Boot, + zone: name.to_string(), + err, + })?; Ok(()) } /// Returns all zones that may be managed by the Sled Agent. /// /// These zones must have names starting with [`ZONE_PREFIX`]. - pub fn get() -> Result, Error> { + pub fn get() -> Result, AdmError> { Ok(zone::Adm::list() - .map_err(Error::List)? + .map_err(|err| AdmError { + op: Operation::List, + zone: "".to_string(), + err, + })? .into_iter() .filter(|z| z.name().starts_with(ZONE_PREFIX)) .collect()) @@ -235,12 +302,14 @@ impl Zones { /// /// Can only return zones that start with [`ZONE_PREFIX`], as they /// are managed by the Sled Agent. - pub fn find(name: &str) -> Result, Error> { + pub fn find(name: &str) -> Result, AdmError> { Ok(Self::get()?.into_iter().find(|zone| zone.name() == name)) } /// Returns the name of the VNIC used to communicate with the control plane. - pub fn get_control_interface(zone: &str) -> Result { + pub fn get_control_interface( + zone: &str, + ) -> Result { let mut command = std::process::Command::new(PFEXEC); let cmd = command.args(&[ ZLOGIN, @@ -251,9 +320,10 @@ impl Zones { "-o", "LINK", ]); - let output = execute(cmd)?; - String::from_utf8(output.stdout) - .map_err(Error::Parse)? + let output = execute(cmd).map_err(|err| { + GetControlInterfaceError::Execution { zone: zone.to_string(), err } + })?; + String::from_utf8_lossy(&output.stdout) .lines() .find_map(|name| { if name.starts_with(VNIC_PREFIX_CONTROL) { @@ -262,7 +332,9 @@ impl Zones { None } }) - .ok_or(Error::NotFound) + .ok_or(GetControlInterfaceError::NotFound { + zone: zone.to_string(), + }) } /// Ensures that an IP address on an interface matches the requested value. @@ -277,23 +349,36 @@ impl Zones { zone: Option<&'a str>, addrobj: &AddrObject, addrtype: AddressRequest, - ) -> Result { - match Self::get_address(zone, addrobj) { - Ok(addr) => { - if let AddressRequest::Static(expected_addr) = addrtype { - // If the address is static, we need to validate that it - // matches the value we asked for. - if addr != expected_addr { - // If the address doesn't match, try removing the old - // value before using the new one. - Self::delete_address(zone, addrobj)?; - return Self::create_address(zone, addrobj, addrtype); + ) -> Result { + |zone, addrobj, addrtype| -> Result { + match Self::get_address(zone, addrobj) { + Ok(addr) => { + if let AddressRequest::Static(expected_addr) = addrtype { + // If the address is static, we need to validate that it + // matches the value we asked for. + if addr != expected_addr { + // If the address doesn't match, try removing the old + // value before using the new one. + Self::delete_address(zone, addrobj) + .map_err(|e| anyhow!(e))?; + return Self::create_address( + zone, addrobj, addrtype, + ) + .map_err(|e| anyhow!(e)); + } } + Ok(addr) } - Ok(addr) + Err(_) => Self::create_address(zone, addrobj, addrtype) + .map_err(|e| anyhow!(e)), } - Err(_) => Self::create_address(zone, addrobj, addrtype), - } + }(zone, addrobj, addrtype) + .map_err(|err| EnsureAddressError { + zone: zone.unwrap_or("global").to_string(), + request: addrtype, + name: addrobj.clone(), + err, + }) } /// Gets the IP address of an interface. @@ -317,10 +402,10 @@ impl Zones { let cmd = command.args(args); let output = execute(cmd)?; - String::from_utf8(output.stdout)? + String::from_utf8_lossy(&output.stdout) .lines() .find_map(|s| s.parse().ok()) - .ok_or(Error::NotFound) + .ok_or(Error::AddressNotFound { addrobj: addrobj.clone() }) } /// Returns Ok(()) if `addrobj` has a corresponding link-local IPv6 address. @@ -344,13 +429,13 @@ impl Zones { let args = prefix.iter().chain(show_addr_args); let cmd = command.args(args); let output = execute(cmd)?; - if let Some(_) = String::from_utf8(output.stdout)? + if let Some(_) = String::from_utf8_lossy(&output.stdout) .lines() .find(|s| s.trim() == "addrconf") { return Ok(()); } - Err(Error::NotFound) + Err(Error::AddressNotFound { addrobj: addrobj.clone() }) } // Attempts to create the requested address. @@ -361,7 +446,7 @@ impl Zones { zone: Option<&'a str>, addrobj: &AddrObject, addrtype: AddressRequest, - ) -> Result<(), Error> { + ) -> Result<(), crate::illumos::ExecutionError> { let mut command = std::process::Command::new(PFEXEC); let mut args = vec![]; if let Some(zone) = zone { @@ -396,7 +481,7 @@ impl Zones { pub fn delete_address<'a>( zone: Option<&'a str>, addrobj: &AddrObject, - ) -> Result<(), Error> { + ) -> Result<(), DeleteAddressError> { let mut command = std::process::Command::new(PFEXEC); let mut args = vec![]; if let Some(zone) = zone { @@ -409,7 +494,11 @@ impl Zones { args.push(addrobj.to_string()); let cmd = command.args(args); - execute(cmd)?; + execute(cmd).map_err(|err| DeleteAddressError { + zone: zone.unwrap_or("global").to_string(), + addrobj: addrobj.clone(), + err, + })?; Ok(()) } @@ -423,12 +512,8 @@ impl Zones { fn ensure_has_link_local_v6_address<'a>( zone: Option<&'a str>, addrobj: &AddrObject, - ) -> Result<(), Error> { - let link_local_addrobj = addrobj.on_same_interface("linklocal")?; - - if let Ok(()) = - Self::has_link_local_v6_address(zone, &link_local_addrobj) - { + ) -> Result<(), crate::illumos::ExecutionError> { + if let Ok(()) = Self::has_link_local_v6_address(zone, &addrobj) { return Ok(()); } @@ -444,7 +529,7 @@ impl Zones { "-t", "-T", "addrconf", - &link_local_addrobj.to_string(), + &addrobj.to_string(), ]; let args = prefix.iter().chain(create_addr_args); @@ -457,33 +542,46 @@ impl Zones { // should remove this function when Sled Agents are provided IPv6 addresses // from RSS. pub fn ensure_has_global_zone_v6_address( - physical_link: Option, + link: PhysicalLink, address: Ipv6Addr, name: &str, - ) -> Result<(), Error> { - // Ensure that addrconf has been set up in the Global Zone. - let link = if let Some(link) = physical_link { - link - } else { - Dladm::find_physical()? - }; - let gz_link_local_addrobj = AddrObject::new(&link.0, "linklocal")?; - Self::ensure_has_link_local_v6_address(None, &gz_link_local_addrobj)?; - - // Ensure that a static IPv6 address has been allocated - // to the Global Zone. Without this, we don't have a way - // to route to IP addresses that we want to create in - // the non-GZ. Note that we use a `/64` prefix, as all addresses - // allocated for services on this sled itself are within the underlay - // prefix. Anything else must be routed through Sidecar. - Self::ensure_address( - None, - &gz_link_local_addrobj.on_same_interface(name)?, - AddressRequest::new_static( - IpAddr::V6(address), - Some(omicron_common::address::SLED_PREFIX), - ), - )?; + ) -> Result<(), EnsureGzAddressError> { + // Call the guts of this function within a closure to make it easier + // to wrap the error with appropriate context. + |link: PhysicalLink, address, name| -> Result<(), anyhow::Error> { + let gz_link_local_addrobj = AddrObject::new(&link.0, "linklocal") + .map_err(|err| anyhow!(err))?; + Self::ensure_has_link_local_v6_address( + None, + &gz_link_local_addrobj, + ) + .map_err(|err| anyhow!(err))?; + + // Ensure that a static IPv6 address has been allocated + // to the Global Zone. Without this, we don't have a way + // to route to IP addresses that we want to create in + // the non-GZ. Note that we use a `/64` prefix, as all addresses + // allocated for services on this sled itself are within the underlay + // prefix. Anything else must be routed through Sidecar. + Self::ensure_address( + None, + &gz_link_local_addrobj + .on_same_interface(name) + .map_err(|err| anyhow!(err))?, + AddressRequest::new_static( + IpAddr::V6(address), + Some(omicron_common::address::SLED_PREFIX), + ), + ) + .map_err(|err| anyhow!(err))?; + Ok(()) + }(link.clone(), address, name) + .map_err(|err| EnsureGzAddressError { + address, + link, + name: name.to_string(), + err, + })?; Ok(()) } @@ -506,9 +604,11 @@ impl Zones { if addr.is_ipv6() { // Finally, actually ensure that the v6 address we want // exists within the zone. + let link_local_addrobj = + addrobj.on_same_interface("linklocal")?; Self::ensure_has_link_local_v6_address( Some(zone), - addrobj, + &link_local_addrobj, )?; } } diff --git a/sled-agent/src/illumos/zpool.rs b/sled-agent/src/illumos/zpool.rs index 67c1c0eaf88..b1d528c05db 100644 --- a/sled-agent/src/illumos/zpool.rs +++ b/sled-agent/src/illumos/zpool.rs @@ -5,7 +5,6 @@ //! Utilities for managing Zpools. use crate::illumos::execute; -use omicron_common::api::external::Error as ExternalError; use serde::{Deserialize, Deserializer}; use std::str::FromStr; use uuid::Uuid; @@ -13,24 +12,24 @@ use uuid::Uuid; const ZPOOL: &str = "/usr/sbin/zpool"; #[derive(thiserror::Error, Debug, PartialEq, Eq)] -pub enum ParseError { - #[error("Failed to parse output as UTF-8: {0}")] - Utf8(#[from] std::string::FromUtf8Error), - - #[error("Failed to parse output: {0}")] - Parse(String), -} +#[error("Failed to parse output: {0}")] +pub struct ParseError(String); #[derive(thiserror::Error, Debug)] -pub enum Error { +enum Error { #[error("Zpool execution error: {0}")] Execution(#[from] crate::illumos::ExecutionError), #[error(transparent)] Parse(#[from] ParseError), +} - #[error("Failed to execute subcommand: {0}")] - Command(ExternalError), +#[derive(thiserror::Error, Debug)] +#[error("Failed to get info for zpool '{name}': {err}")] +pub struct GetInfoError { + name: String, + #[source] + err: Error, } #[derive(Clone, Copy, Debug, PartialEq, Eq)] @@ -62,10 +61,7 @@ impl FromStr for ZpoolHealth { "OFFLINE" => Ok(ZpoolHealth::Offline), "REMOVED" => Ok(ZpoolHealth::Removed), "UNAVAIL" => Ok(ZpoolHealth::Unavailable), - _ => Err(ParseError::Parse(format!( - "Unrecognized zpool 'health': {}", - s - ))), + _ => Err(ParseError(format!("Unrecognized zpool 'health': {}", s))), } } } @@ -111,16 +107,10 @@ impl FromStr for ZpoolInfo { fn from_str(s: &str) -> Result { // Lambda helpers for error handling. let expected_field = |name| { - ParseError::Parse(format!( - "Missing '{}' value in zpool list output", - name - )) + ParseError(format!("Missing '{}' value in zpool list output", name)) }; let failed_to_parse = |name, err| { - ParseError::Parse(format!( - "Failed to parse field '{}': {}", - name, err - )) + ParseError(format!("Failed to parse field '{}': {}", name, err)) }; let mut values = s.trim().split_whitespace(); @@ -155,7 +145,7 @@ pub struct Zpool {} #[cfg_attr(test, mockall::automock, allow(dead_code))] impl Zpool { - pub fn get_info(name: &str) -> Result { + pub fn get_info(name: &str) -> Result { let mut command = std::process::Command::new(ZPOOL); let cmd = command.args(&[ "list", @@ -164,11 +154,14 @@ impl Zpool { name, ]); - let output = execute(cmd)?; - let stdout = String::from_utf8(output.stdout) - .map_err(|e| ParseError::Utf8(e))?; - - let zpool = stdout.parse::()?; + let output = execute(cmd).map_err(|err| GetInfoError { + name: name.to_string(), + err: err.into(), + })?; + let stdout = String::from_utf8_lossy(&output.stdout); + let zpool = stdout.parse::().map_err(|err| { + GetInfoError { name: name.to_string(), err: err.into() } + })?; Ok(zpool) } } @@ -294,7 +287,7 @@ mod test { let input = format!("{} {} {} {}", name, size, allocated, free); let result: Result = input.parse(); - let expected_err = ParseError::Parse( + let expected_err = ParseError( "Missing 'health' value in zpool list output".to_owned(), ); assert_eq!(result.unwrap_err(), expected_err,); diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 142037515c3..bf0cce1fa57 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -41,11 +41,8 @@ pub enum Error { #[error("Failed to wait for service: {0}")] Timeout(String), - #[error("Failure accessing data links: {0}")] - Datalink(#[from] crate::illumos::dladm::Error), - - #[error("Error accessing zones: {0}")] - Zone(#[from] crate::illumos::zone::Error), + #[error("Failed to create VNIC: {0}")] + VnicCreation(#[from] crate::illumos::dladm::CreateVnicError), #[error("Failure from Propolis Client: {0}")] Propolis(#[from] propolis_client::Error), @@ -63,10 +60,16 @@ pub enum Error { Migration(anyhow::Error), #[error(transparent)] - RunningZone(#[from] crate::illumos::running_zone::Error), + ZoneCommand(#[from] crate::illumos::running_zone::RunCommandError), + + #[error(transparent)] + ZoneBoot(#[from] crate::illumos::running_zone::BootError), + + #[error(transparent)] + ZoneEnsureAddress(#[from] crate::illumos::running_zone::EnsureAddressError), - #[error("serde_json failure: {0}")] - SerdeJsonError(#[from] serde_json::Error), + #[error(transparent)] + ZoneInstall(#[from] crate::illumos::running_zone::InstallZoneError), } // Issues read-only, idempotent HTTP requests at propolis until it responds with @@ -717,9 +720,8 @@ mod test { let log = logger(); let vnic_allocator = VnicAllocator::new( "Test".to_string(), - Some(PhysicalLink("mylink".to_string())), - ) - .unwrap(); + PhysicalLink("mylink".to_string()), + ); let nexus_client = MockNexusClient::default(); let inst = Instance::new( diff --git a/sled-agent/src/instance_manager.rs b/sled-agent/src/instance_manager.rs index 9a86b3ef62a..2e5970374c6 100644 --- a/sled-agent/src/instance_manager.rs +++ b/sled-agent/src/instance_manager.rs @@ -26,12 +26,6 @@ use crate::instance::MockInstance as Instance; pub enum Error { #[error("Instance error: {0}")] Instance(#[from] crate::instance::Error), - - #[error(transparent)] - Dladm(#[from] crate::illumos::dladm::Error), - - #[error(transparent)] - Zone(#[from] crate::illumos::zone::Error), } struct InstanceManagerInternal { @@ -59,17 +53,17 @@ impl InstanceManager { log: Logger, vlan: Option, nexus_client: Arc, - physical_link: Option, - ) -> Result { - Ok(InstanceManager { + physical_link: PhysicalLink, + ) -> InstanceManager { + InstanceManager { inner: Arc::new(InstanceManagerInternal { log, nexus_client, instances: Mutex::new(BTreeMap::new()), vlan, - vnic_allocator: VnicAllocator::new("Instance", physical_link)?, + vnic_allocator: VnicAllocator::new("Instance", physical_link), }), - }) + } } /// Idempotently ensures that the given Instance (described by @@ -266,9 +260,8 @@ mod test { log, None, nexus_client, - Some(PhysicalLink("mylink".to_string())), - ) - .unwrap(); + PhysicalLink("mylink".to_string()), + ); // Verify that no instances exist. assert!(im.inner.instances.lock().unwrap().is_empty()); @@ -347,9 +340,8 @@ mod test { log, None, nexus_client, - Some(PhysicalLink("mylink".to_string())), - ) - .unwrap(); + PhysicalLink("mylink".to_string()), + ); let ticket = Arc::new(std::sync::Mutex::new(None)); let ticket_clone = ticket.clone(); diff --git a/sled-agent/src/rack_setup/config.rs b/sled-agent/src/rack_setup/config.rs index 6aa8e61df6d..26f3ce8a321 100644 --- a/sled-agent/src/rack_setup/config.rs +++ b/sled-agent/src/rack_setup/config.rs @@ -51,8 +51,10 @@ pub struct SledRequest { impl SetupServiceConfig { pub fn from_file>(path: P) -> Result { let path = path.as_ref(); - let contents = std::fs::read_to_string(path)?; - let config = toml::from_str(&contents)?; + let contents = std::fs::read_to_string(&path) + .map_err(|err| ConfigError::Io { path: path.into(), err })?; + let config = toml::from_str(&contents) + .map_err(|err| ConfigError::Parse { path: path.into(), err })?; Ok(config) } diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index 0cba727b598..9591aec07b6 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -18,14 +18,19 @@ use serde::{Deserialize, Serialize}; use slog::Logger; use std::collections::{HashMap, HashSet}; use std::net::{Ipv6Addr, SocketAddr, SocketAddrV6}; +use std::path::PathBuf; use thiserror::Error; use tokio::sync::Mutex; /// Describes errors which may occur while operating the setup service. #[derive(Error, Debug)] pub enum SetupServiceError { - #[error("Error accessing filesystem: {0}")] - Io(#[from] std::io::Error), + #[error("I/O error while {message}: {err}")] + Io { + message: String, + #[source] + err: std::io::Error, + }, #[error("Error making HTTP request to Bootstrap Agent: {0}")] BootstrapApi( @@ -36,17 +41,14 @@ pub enum SetupServiceError { #[error("Error making HTTP request to Sled Agent: {0}")] SledApi(#[from] sled_agent_client::Error), - #[error("Cannot deserialize TOML file")] - Toml(#[from] toml::de::Error), + #[error("Cannot deserialize TOML file at {path}: {err}")] + Toml { path: PathBuf, err: toml::de::Error }, #[error("Failed to monitor for peers: {0}")] PeerMonitor(#[from] tokio::sync::broadcast::error::RecvError), - #[error(transparent)] - Http(#[from] reqwest::Error), - - #[error("Configuration changed")] - Configuration, + #[error("Failed to construct an HTTP client: {0}")] + HttpClient(reqwest::Error), } // The workload / information allocated to a single sled. @@ -142,7 +144,8 @@ impl ServiceInner { let client = reqwest::ClientBuilder::new() .connect_timeout(dur) .timeout(dur) - .build()?; + .build() + .map_err(SetupServiceError::HttpClient)?; let url = format!("http://{}", bootstrap_addr); info!(self.log, "Sending request to peer agent: {}", url); @@ -197,7 +200,8 @@ impl ServiceInner { let client = reqwest::ClientBuilder::new() .connect_timeout(dur) .timeout(dur) - .build()?; + .build() + .map_err(SetupServiceError::HttpClient)?; let client = sled_agent_client::Client::new_with_client( &format!("http://{}", sled_address), client, @@ -243,7 +247,8 @@ impl ServiceInner { let client = reqwest::ClientBuilder::new() .connect_timeout(dur) .timeout(dur) - .build()?; + .build() + .map_err(SetupServiceError::HttpClient)?; let client = sled_agent_client::Client::new_with_client( &format!("http://{}", sled_address), client, @@ -289,8 +294,19 @@ impl ServiceInner { let plan: std::collections::HashMap = toml::from_str( - &tokio::fs::read_to_string(&rss_plan_path).await?, - )?; + &tokio::fs::read_to_string(&rss_plan_path).await.map_err( + |err| SetupServiceError::Io { + message: format!( + "Loading RSS plan {rss_plan_path:?}" + ), + err, + }, + )?, + ) + .map_err(|err| SetupServiceError::Toml { + path: rss_plan_path, + err, + })?; Ok(Some(plan)) } else { Ok(None) @@ -370,7 +386,13 @@ impl ServiceInner { .expect("Cannot turn config to string"); info!(self.log, "Plan serialized as: {}", plan_str); - tokio::fs::write(&rss_plan_path(), plan_str).await?; + let path = rss_plan_path(); + tokio::fs::write(&path, plan_str).await.map_err(|err| { + SetupServiceError::Io { + message: format!("Storing RSS plan to {path:?}"), + err, + } + })?; info!(self.log, "Plan written to storage"); Ok(plan) @@ -447,7 +469,15 @@ impl ServiceInner { // We expect this directory to exist - ensure that it does, before any // subsequent operations which may write configs here. - tokio::fs::create_dir_all(omicron_common::OMICRON_CONFIG_PATH).await?; + tokio::fs::create_dir_all(omicron_common::OMICRON_CONFIG_PATH) + .await + .map_err(|err| SetupServiceError::Io { + message: format!( + "Creating config directory {}", + omicron_common::OMICRON_CONFIG_PATH + ), + err, + })?; // Check if a previous RSS plan has completed successfully. // @@ -595,7 +625,15 @@ impl ServiceInner { // Finally, make sure the configuration is saved so we don't inject // the requests on the next iteration. - tokio::fs::rename(rss_plan_path(), rss_completed_plan_path).await?; + let plan_path = rss_plan_path(); + tokio::fs::rename(&plan_path, &rss_completed_plan_path).await.map_err( + |err| SetupServiceError::Io { + message: format!( + "renaming {plan_path:?} to {rss_completed_plan_path:?}" + ), + err, + }, + )?; // TODO Questions to consider: // - What if a sled comes online *right after* this setup? How does diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index f47da988d35..6e588b7fe4f 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -19,25 +19,38 @@ use tokio::sync::Mutex; #[derive(thiserror::Error, Debug)] pub enum Error { - #[error("Cannot serialize TOML file: {0}")] - TomlSerialize(#[from] toml::ser::Error), + #[error("Cannot serialize TOML to file {path}: {err}")] + TomlSerialize { path: PathBuf, err: toml::ser::Error }, - #[error("Cannot deserialize TOML file: {0}")] - TomlDeserialize(#[from] toml::de::Error), + #[error("Cannot deserialize TOML from file {path}: {err}")] + TomlDeserialize { path: PathBuf, err: toml::de::Error }, - #[error("Error accessing filesystem: {0}")] - Io(#[from] std::io::Error), + #[error("I/O Error accessing {path}: {err}")] + Io { path: PathBuf, err: std::io::Error }, - #[error(transparent)] - RunningZone(#[from] crate::illumos::running_zone::Error), + #[error("Failed to do '{intent}' by running command in zone: {err}")] + ZoneCommand { + intent: String, + #[source] + err: crate::illumos::running_zone::RunCommandError, + }, + + #[error("Failed to boot zone: {0}")] + ZoneBoot(#[from] crate::illumos::running_zone::BootError), - #[error("Failed to add address to the global zone: {0}")] - GzAddressFailure(crate::illumos::zone::Error), + #[error(transparent)] + ZoneEnsureAddress(#[from] crate::illumos::running_zone::EnsureAddressError), #[error(transparent)] - Dladm(#[from] crate::illumos::dladm::Error), + ZoneInstall(#[from] crate::illumos::running_zone::InstallZoneError), - #[error("Could not initialize service as requested: {message}")] + #[error("Failed to add GZ addresses: {message}: {err}")] + GzAddress { + message: String, + err: crate::illumos::zone::EnsureGzAddressError, + }, + + #[error("Could not initialize service {service} as requested: {message}")] BadServiceRequest { service: String, message: String }, #[error("Services already configured for this Sled Agent")] @@ -64,7 +77,7 @@ pub struct ServiceManager { config_path: Option, zones: Mutex>, vnic_allocator: VnicAllocator, - physical_link: Option, + physical_link: PhysicalLink, } impl ServiceManager { @@ -73,14 +86,13 @@ impl ServiceManager { /// /// Args: /// - `log`: The logger - /// - `physical_link`: An optional physical link on which to allocate - /// datalinks. By default, the first physical link is used. + /// - `physical_link`: A physical link on which to allocate datalinks. /// - `config_path`: An optional path to a configuration file to store /// the record of services. By default, [`default_services_config_path`] /// is used. pub async fn new( log: Logger, - physical_link: Option, + physical_link: PhysicalLink, config_path: Option, ) -> Result { debug!(log, "Creating new ServiceManager"); @@ -91,7 +103,7 @@ impl ServiceManager { vnic_allocator: VnicAllocator::new( "Service", physical_link.clone(), - )?, + ), physical_link, }; @@ -103,8 +115,14 @@ impl ServiceManager { config_path.to_string_lossy() ); let cfg: ServiceEnsureBody = toml::from_str( - &tokio::fs::read_to_string(&config_path).await?, - )?; + &tokio::fs::read_to_string(&config_path).await.map_err( + |err| Error::Io { path: config_path.clone(), err }, + )?, + ) + .map_err(|err| Error::TomlDeserialize { + path: config_path.clone(), + err, + })?; let mut existing_zones = mgr.zones.lock().await; mgr.initialize_services_locked(&mut existing_zones, &cfg.services) .await?; @@ -198,19 +216,100 @@ impl ServiceManager { *addr, &addr_name, ) - .map_err(|e| Error::GzAddressFailure(e))?; + .map_err(|err| Error::GzAddress { + message: format!( + "adding address on behalf of service '{}'", + service.name + ), + err, + })?; } debug!(self.log, "importing manifest"); - running_zone.run_cmd(&[ - crate::illumos::zone::SVCCFG, - "import", - &format!( - "/var/svc/manifest/site/{}/manifest.xml", - service.name - ), - ])?; + running_zone + .run_cmd(&[ + crate::illumos::zone::SVCCFG, + "import", + &format!( + "/var/svc/manifest/site/{}/manifest.xml", + service.name + ), + ]) + .map_err(|err| Error::ZoneCommand { + intent: "importing manifest".to_string(), + err, + })?; + + let smf_name = format!("svc:/system/illumos/{}", service.name); + let default_smf_name = format!("{}:default", smf_name); + + match service.name.as_str() { + "internal-dns" => { + info!(self.log, "Setting up internal-dns service"); + let address = + service.addresses.get(0).ok_or_else(|| { + Error::BadServiceRequest { + service: service.name.clone(), + message: "Not enough addresses".to_string(), + } + })?; + running_zone + .run_cmd(&[ + crate::illumos::zone::SVCCFG, + "-s", + &smf_name, + "setprop", + &format!( + "config/server_address=[{}]:{}", + address, DNS_SERVER_PORT + ), + ]) + .map_err(|err| Error::ZoneCommand { + intent: "set server address".to_string(), + err, + })?; + + running_zone + .run_cmd(&[ + crate::illumos::zone::SVCCFG, + "-s", + &smf_name, + "setprop", + &format!( + "config/dns_address=[{}]:{}", + address, DNS_PORT + ), + ]) + .map_err(|err| Error::ZoneCommand { + intent: "Set DNS address".to_string(), + err, + })?; + + // Refresh the manifest with the new properties we set, + // so they become "effective" properties when the service is enabled. + running_zone + .run_cmd(&[ + crate::illumos::zone::SVCCFG, + "-s", + &default_smf_name, + "refresh", + ]) + .map_err(|err| Error::ZoneCommand { + intent: format!( + "Refresh SMF manifest {}", + default_smf_name + ), + err, + })?; + } + _ => { + info!( + self.log, + "Service name {} did not match", service.name + ); + } + } let smf_name = format!("svc:/system/illumos/{}", service.name); let default_smf_name = format!("{}:default", smf_name); @@ -225,36 +324,60 @@ impl ServiceManager { message: "Not enough addresses".to_string(), } })?; - running_zone.run_cmd(&[ - crate::illumos::zone::SVCCFG, - "-s", - &smf_name, - "setprop", - &format!( - "config/server_address=[{}]:{}", - address, DNS_SERVER_PORT - ), - ])?; - - running_zone.run_cmd(&[ - crate::illumos::zone::SVCCFG, - "-s", - &smf_name, - "setprop", - &format!( - "config/dns_address=[{}]:{}", - address, DNS_PORT - ), - ])?; + running_zone + .run_cmd(&[ + crate::illumos::zone::SVCCFG, + "-s", + &smf_name, + "setprop", + &format!( + "config/server_address=[{}]:{}", + address, DNS_SERVER_PORT + ), + ]) + .map_err(|err| Error::ZoneCommand { + intent: format!( + "Setting DNS server address [{}]:{}", + address, DNS_SERVER_PORT + ), + err, + })?; + + running_zone + .run_cmd(&[ + crate::illumos::zone::SVCCFG, + "-s", + &smf_name, + "setprop", + &format!( + "config/dns_address=[{}]:{}", + address, DNS_PORT + ), + ]) + .map_err(|err| Error::ZoneCommand { + intent: format!( + "Setting DNS address [{}]:{}", + address, DNS_SERVER_PORT + ), + err, + })?; // Refresh the manifest with the new properties we set, // so they become "effective" properties when the service is enabled. - running_zone.run_cmd(&[ - crate::illumos::zone::SVCCFG, - "-s", - &default_smf_name, - "refresh", - ])?; + running_zone + .run_cmd(&[ + crate::illumos::zone::SVCCFG, + "-s", + &default_smf_name, + "refresh", + ]) + .map_err(|err| Error::ZoneCommand { + intent: format!( + "Refreshing DNS service config for {}", + default_smf_name + ), + err, + })?; } _ => { info!( @@ -266,12 +389,17 @@ impl ServiceManager { debug!(self.log, "enabling service"); - running_zone.run_cmd(&[ - crate::illumos::zone::SVCADM, - "enable", - "-t", - &default_smf_name, - ])?; + running_zone + .run_cmd(&[ + crate::illumos::zone::SVCADM, + "enable", + "-t", + &default_smf_name, + ]) + .map_err(|err| Error::ZoneCommand { + intent: format!("Enable {} service", default_smf_name), + err, + })?; existing_zones.push(running_zone); } @@ -292,8 +420,14 @@ impl ServiceManager { let services_to_initialize = { if config_path.exists() { let cfg: ServiceEnsureBody = toml::from_str( - &tokio::fs::read_to_string(&config_path).await?, - )?; + &tokio::fs::read_to_string(&config_path).await.map_err( + |err| Error::Io { path: config_path.clone(), err }, + )?, + ) + .map_err(|err| Error::TomlDeserialize { + path: config_path.clone(), + err, + })?; let known_services = cfg.services; let known_set: HashSet<&ServiceRequest> = @@ -330,8 +464,13 @@ impl ServiceManager { let serialized_services = toml::Value::try_from(&request) .expect("Cannot serialize service list"); - tokio::fs::write(&config_path, toml::to_string(&serialized_services)?) - .await?; + let services_str = + toml::to_string(&serialized_services).map_err(|err| { + Error::TomlSerialize { path: config_path.clone(), err } + })?; + tokio::fs::write(&config_path, services_str) + .await + .map_err(|err| Error::Io { path: config_path.clone(), err })?; Ok(()) } @@ -451,7 +590,7 @@ mod test { let config = config_dir.path().join("services.toml"); let mgr = ServiceManager::new( log, - Some(PhysicalLink(EXPECTED_LINK_NAME.to_string())), + PhysicalLink(EXPECTED_LINK_NAME.to_string()), Some(config), ) .await @@ -475,7 +614,7 @@ mod test { let config = config_dir.path().join("services.toml"); let mgr = ServiceManager::new( log, - Some(PhysicalLink(EXPECTED_LINK_NAME.to_string())), + PhysicalLink(EXPECTED_LINK_NAME.to_string()), Some(config), ) .await @@ -502,7 +641,7 @@ mod test { // down. let mgr = ServiceManager::new( logctx.log.clone(), - Some(PhysicalLink(EXPECTED_LINK_NAME.to_string())), + PhysicalLink(EXPECTED_LINK_NAME.to_string()), Some(config.clone()), ) .await @@ -515,7 +654,7 @@ mod test { let _expectations = expect_new_service(); let mgr = ServiceManager::new( logctx.log.clone(), - Some(PhysicalLink(EXPECTED_LINK_NAME.to_string())), + PhysicalLink(EXPECTED_LINK_NAME.to_string()), Some(config.clone()), ) .await @@ -539,7 +678,7 @@ mod test { // down. let mgr = ServiceManager::new( logctx.log.clone(), - Some(PhysicalLink(EXPECTED_LINK_NAME.to_string())), + PhysicalLink(EXPECTED_LINK_NAME.to_string()), Some(config.clone()), ) .await @@ -554,7 +693,7 @@ mod test { // Observe that the old service is not re-initialized. let mgr = ServiceManager::new( logctx.log.clone(), - Some(PhysicalLink(EXPECTED_LINK_NAME.to_string())), + PhysicalLink(EXPECTED_LINK_NAME.to_string()), Some(config.clone()), ) .await diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index df3e9e816a4..46090319783 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -34,17 +34,26 @@ use crate::illumos::{ #[derive(thiserror::Error, Debug)] pub enum Error { - #[error(transparent)] - Datalink(#[from] crate::illumos::dladm::Error), + #[error("Physical link not in config, nor found automatically: {0}")] + FindPhysicalLink(#[from] crate::illumos::dladm::FindPhysicalLinkError), + + #[error("Failed to lookup VNICs on boot: {0}")] + GetVnics(#[from] crate::illumos::dladm::GetVnicError), + + #[error("Failed to delete VNIC on boot: {0}")] + DeleteVnic(#[from] crate::illumos::dladm::DeleteVnicError), #[error(transparent)] Services(#[from] crate::services::Error), #[error(transparent)] - Zone(#[from] crate::illumos::zone::Error), + ZoneOperation(#[from] crate::illumos::zone::AdmError), + + #[error("Failed to create Sled Subnet: {err}")] + SledSubnet { err: crate::illumos::zone::EnsureGzAddressError }, #[error(transparent)] - Zfs(#[from] crate::illumos::zfs::Error), + ZfsEnsureFilesystem(#[from] crate::illumos::zfs::EnsureFilesystemError), #[error("Error managing instances: {0}")] Instance(#[from] crate::instance_manager::Error), @@ -95,6 +104,12 @@ impl SledAgent { let vlan = config.vlan; info!(&log, "created sled agent"; "id" => ?id); + let data_link = if let Some(link) = config.data_link.clone() { + link + } else { + Dladm::find_physical()? + }; + // Before we start creating zones, we need to ensure that the // necessary ZFS and Zone resources are ready. Zfs::ensure_zoned_filesystem( @@ -113,10 +128,11 @@ impl SledAgent { // RSS-provided IP address. In the meantime, we use one from the // configuration file. Zones::ensure_has_global_zone_v6_address( - config.data_link.clone(), + data_link.clone(), *sled_address.ip(), "sled6", - )?; + ) + .map_err(|err| Error::SledSubnet { err })?; // Identify all existing zones which should be managed by the Sled // Agent. @@ -150,9 +166,9 @@ impl SledAgent { &log, *id, nexus_client.clone(), - config.data_link.clone(), + data_link.clone(), ) - .await?; + .await; if let Some(pools) = &config.zpools { for pool in pools { info!( @@ -167,11 +183,10 @@ impl SledAgent { log.clone(), vlan, nexus_client.clone(), - config.data_link.clone(), - )?; + data_link.clone(), + ); let services = - ServiceManager::new(log.clone(), config.data_link.clone(), None) - .await?; + ServiceManager::new(log.clone(), data_link.clone(), None).await?; Ok(SledAgent { id: config.id, diff --git a/sled-agent/src/storage_manager.rs b/sled-agent/src/storage_manager.rs index ff09437993e..24a4d01b330 100644 --- a/sled-agent/src/storage_manager.rs +++ b/sled-agent/src/storage_manager.rs @@ -5,9 +5,7 @@ //! Management of sled-local storage. use crate::illumos::dladm::PhysicalLink; -use crate::illumos::running_zone::{ - Error as RunningZoneError, InstalledZone, RunningZone, -}; +use crate::illumos::running_zone::{InstalledZone, RunningZone}; use crate::illumos::vnic::VnicAllocator; use crate::illumos::zone::AddressRequest; use crate::illumos::zpool::ZpoolName; @@ -51,41 +49,71 @@ const CRUCIBLE_AGENT_DEFAULT_SVC: &str = "svc:/oxide/crucible/agent:default"; #[derive(thiserror::Error, Debug)] pub enum Error { + // TODO: We could add the context of "why are we doint this op", maybe? #[error(transparent)] - Datalink(#[from] crate::illumos::dladm::Error), + ZfsListFilesystems(#[from] crate::illumos::zfs::ListFilesystemsError), #[error(transparent)] - Zfs(#[from] crate::illumos::zfs::Error), + ZfsEnsureFilesystem(#[from] crate::illumos::zfs::EnsureFilesystemError), #[error(transparent)] - Zpool(#[from] crate::illumos::zpool::Error), + ZfsSetValue(#[from] crate::illumos::zfs::SetValueError), - #[error("Failed to configure a zone: {0}")] - ZoneConfiguration(crate::illumos::zone::Error), - - #[error("Failed to manage a running zone: {0}")] - ZoneManagement(#[from] crate::illumos::running_zone::Error), - - #[error("Error parsing pool size: {0}")] - BadPoolSize(#[from] ByteCountRangeError), - - #[error("Failed to parse as UUID: {0}")] - Parse(#[from] uuid::Error), + #[error(transparent)] + ZfsGetValue(#[from] crate::illumos::zfs::GetValueError), - #[error("Timed out waiting for service: {0}")] - Timeout(String), + #[error(transparent)] + GetZpoolInfo(#[from] crate::illumos::zpool::GetInfoError), - #[error("Object Not Found: {0}")] - NotFound(String), + #[error(transparent)] + ZoneCommand(#[from] crate::illumos::running_zone::RunCommandError), - #[error("Failed to serialize toml: {0}")] - Serialize(#[from] toml::ser::Error), + #[error(transparent)] + ZoneBoot(#[from] crate::illumos::running_zone::BootError), - #[error("Failed to deserialize toml: {0}")] - Deserialize(#[from] toml::de::Error), + #[error(transparent)] + ZoneEnsureAddress(#[from] crate::illumos::running_zone::EnsureAddressError), - #[error("Failed to perform I/O: {0}")] - Io(#[from] std::io::Error), + #[error(transparent)] + ZoneInstall(#[from] crate::illumos::running_zone::InstallZoneError), + + #[error("Error parsing pool {name}'s size: {err}")] + BadPoolSize { + name: String, + #[source] + err: ByteCountRangeError, + }, + + #[error("Failed to parse the dataset {name}'s UUID: {err}")] + ParseDatasetUuid { + name: String, + #[source] + err: uuid::Error, + }, + + #[error("Zpool Not Found: {0}")] + ZpoolNotFound(String), + + #[error("Failed to serialize toml (intended for {path:?}): {err}")] + Serialize { + path: PathBuf, + #[source] + err: toml::ser::Error, + }, + + #[error("Failed to deserialize toml from {path:?}: {err}")] + Deserialize { + path: PathBuf, + #[source] + err: toml::de::Error, + }, + + #[error("Failed to perform I/O: {message}: {err}")] + Io { + message: String, + #[source] + err: std::io::Error, + }, } /// A ZFS storage pool. @@ -137,7 +165,10 @@ impl Pool { ) -> Result { let path = std::path::Path::new(omicron_common::OMICRON_CONFIG_PATH) .join(self.id.to_string()); - create_dir_all(&path).await?; + create_dir_all(&path).await.map_err(|err| Error::Io { + message: format!("creating config dir {path:?}, which would contain config for {dataset_id}"), + err, + })?; let mut path = path.join(dataset_id.to_string()); path.set_extension("toml"); Ok(path) @@ -428,14 +459,17 @@ async fn ensure_running_zone( let address_request = AddressRequest::new_static(dataset_info.address.ip(), None); - match RunningZone::get(log, &dataset_info.zone_prefix(), address_request) - .await - { + let err = + RunningZone::get(log, &dataset_info.zone_prefix(), address_request) + .await; + match err { Ok(zone) => { info!(log, "Zone for {} is already running", dataset_name.full()); return Ok(zone); } - Err(RunningZoneError::NotFound) => { + Err(crate::illumos::running_zone::GetZoneError::NotFound { + .. + }) => { info!(log, "Zone for {} was not found", dataset_name.full()); let installed_zone = InstalledZone::install( @@ -458,14 +492,17 @@ async fn ensure_running_zone( Ok(zone) } - Err(RunningZoneError::NotRunning(_state)) => { + Err(crate::illumos::running_zone::GetZoneError::NotRunning { + name, + state, + }) => { // TODO(https://github.com/oxidecomputer/omicron/issues/725): - unimplemented!("Handle a zone which exists, but is not running"); + unimplemented!("Handle a zone which exists, but is not running: {name}, in {state:?}"); } - Err(_) => { + Err(err) => { // TODO(https://github.com/oxidecomputer/omicron/issues/725): unimplemented!( - "Handle a zone which exists, has some other problem" + "Handle a zone which exists, has some other problem: {err}" ); } } @@ -674,11 +711,15 @@ impl StorageWorker { let mut pools = self.pools.lock().await; let name = ZpoolName::new(request.zpool_id); let pool = pools.get_mut(&name).ok_or_else(|| { - Error::NotFound(format!("zpool: {}", request.zpool_id)) + Error::ZpoolNotFound(format!( + "{}, looked up while trying to add dataset", + request.zpool_id + )) })?; + let pool_name = pool.info.name(); let dataset_info = DatasetInfo::new( - pool.info.name(), + pool_name, request.dataset_kind.clone(), request.address, ); @@ -697,10 +738,18 @@ impl StorageWorker { // Now that the dataset has been initialized, record the configuration // so it can re-initialize itself after a reboot. - let info_str = toml::to_string(&dataset_info)?; let path = pool.dataset_config_path(id).await?; - let mut file = File::create(path).await?; - file.write_all(info_str.as_bytes()).await?; + let info_str = toml::to_string(&dataset_info) + .map_err(|err| Error::Serialize { path: path.clone(), err })?; + let pool_name = pool.info.name(); + let mut file = File::create(&path).await.map_err(|err| Error::Io { + message: format!("Failed creating config file at {path:?} for pool {pool_name}, dataset: {id}"), + err, + })?; + file.write_all(info_str.as_bytes()).await.map_err(|err| Error::Io { + message: format!("Failed writing config to {path:?} for pool {pool_name}, dataset: {id}"), + err, + })?; self.add_datasets_notify( nexus_notifications, @@ -716,16 +765,29 @@ impl StorageWorker { pool: &mut Pool, dataset_name: &DatasetName, ) -> Result<(Uuid, SocketAddr, DatasetKind), Error> { - let id = Zfs::get_oxide_value(&dataset_name.full(), "uuid")? - .parse::()?; + let name = dataset_name.full(); + let id = Zfs::get_oxide_value(&name, "uuid")? + .parse::() + .map_err(|err| Error::ParseDatasetUuid { name, err })?; let config_path = pool.dataset_config_path(id).await?; info!( self.log, "Loading Dataset from {}", config_path.to_string_lossy() ); + let pool_name = pool.info.name(); let dataset_info: DatasetInfo = - toml::from_slice(&tokio::fs::read(config_path).await?)?; + toml::from_slice( + &tokio::fs::read(&config_path).await.map_err(|err| Error::Io { + message: format!("read config for pool {pool_name}, dataset {dataset_name:?} from {config_path:?}"), + err, + })? + ).map_err(|err| { + Error::Deserialize { + path: config_path, + err, + } + })?; self.initialize_dataset_and_zone( pool, &dataset_info, @@ -766,7 +828,8 @@ impl StorageWorker { "Storage manager processing zpool: {:#?}", pool.info ); - let size = ByteCount::try_from(pool.info.size())?; + let size = ByteCount::try_from(pool.info.size()) + .map_err(|err| Error::BadPoolSize { name: pool_name.to_string(), err })?; // If we find filesystems within our datasets, ensure their // zones are up-and-running. @@ -826,8 +889,8 @@ impl StorageManager { log: &Logger, sled_id: Uuid, nexus_client: Arc, - physical_link: Option, - ) -> Result { + physical_link: PhysicalLink, + ) -> Self { let log = log.new(o!("component" => "sled agent storage manager")); let pools = Arc::new(Mutex::new(HashMap::new())); let (new_pools_tx, new_pools_rx) = mpsc::channel(10); @@ -839,14 +902,14 @@ impl StorageManager { pools: pools.clone(), new_pools_rx, new_filesystems_rx, - vnic_allocator: VnicAllocator::new("Storage", physical_link)?, + vnic_allocator: VnicAllocator::new("Storage", physical_link), }; - Ok(StorageManager { + StorageManager { pools, new_pools_tx, new_filesystems_tx, task: tokio::task::spawn(async move { worker.do_work().await }), - }) + } } /// Adds a zpool to the storage manager. diff --git a/sled-agent/src/updates.rs b/sled-agent/src/updates.rs index 38c132a92ab..721861ca074 100644 --- a/sled-agent/src/updates.rs +++ b/sled-agent/src/updates.rs @@ -12,8 +12,12 @@ use std::path::PathBuf; #[derive(thiserror::Error, Debug)] pub enum Error { - #[error("I/O Error: {0}")] - Io(#[from] std::io::Error), + #[error("I/O Error: {message}: {err}")] + Io { + message: String, + #[source] + err: std::io::Error, + }, #[error("Failed to contact nexus: {0}")] Nexus(anyhow::Error), @@ -29,7 +33,12 @@ pub async fn download_artifact( match artifact.kind { UpdateArtifactKind::Zone => { let directory = PathBuf::from("/var/tmp/zones"); - tokio::fs::create_dir_all(&directory).await?; + tokio::fs::create_dir_all(&directory).await.map_err(|err| { + Error::Io { + message: format!("creating diretory {directory:?}"), + err, + } + })?; // We download the file to a location named "-". // We then rename it to "" after it has successfully @@ -57,10 +66,25 @@ pub async fn download_artifact( .map_err(Error::Response)?; let contents = response.bytes().await.map_err(|e| Error::Response(e))?; - tokio::fs::write(&tmp_path, contents).await?; + tokio::fs::write(&tmp_path, contents).await.map_err(|err| { + Error::Io { + message: format!( + "Downloading artifact to temporary path: {tmp_path:?}" + ), + err, + } + })?; // Write the file to its final path. - tokio::fs::rename(&tmp_path, directory.join(artifact.name)).await?; + let destination = directory.join(artifact.name); + tokio::fs::rename(&tmp_path, &destination).await.map_err( + |err| Error::Io { + message: format!( + "Renaming {tmp_path:?} to {destination:?}" + ), + err, + }, + )?; Ok(()) } } diff --git a/sled-agent/tests/integration_tests/multicast.rs b/sled-agent/tests/integration_tests/multicast.rs index 1aa9f8b103f..61a0d8284ab 100644 --- a/sled-agent/tests/integration_tests/multicast.rs +++ b/sled-agent/tests/integration_tests/multicast.rs @@ -30,7 +30,7 @@ async fn test_multicast_bootstrap_address() { let address_name = "testbootstrap6"; let addrobj = AddrObject::new(&link.0, address_name).unwrap(); zone::Zones::ensure_has_global_zone_v6_address( - Some(link), + link, *address.ip(), address_name, )