diff --git a/Cargo.lock b/Cargo.lock index fd2bbead354..2e3c405174f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -670,6 +670,21 @@ dependencies = [ "libc", ] +[[package]] +name = "crc" +version = "3.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53757d12b596c16c78b83458d732a5d1a17ab3f53f2f7412f6fb57cc8a140ab3" +dependencies = [ + "crc-catalog", +] + +[[package]] +name = "crc-catalog" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d0165d2900ae6778e36e80bbc4da3b5eefccee9ba939761f9c2882a5d9af3ff" + [[package]] name = "crc32fast" version = "1.3.2" @@ -1704,6 +1719,7 @@ dependencies = [ "serde_with", "slog", "thiserror", + "tlvc", "tokio", "tokio-stream", "usdt", @@ -5673,6 +5689,16 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" +[[package]] +name = "tlvc" +version = "0.1.0" +source = "git+https://github.com/oxidecomputer/tlvc.git#2643765eb7775d1f5e8ec56910f1ab15e9c75170" +dependencies = [ + "byteorder", + "crc", + "zerocopy 0.6.1", +] + [[package]] name = "tokio" version = "1.21.1" diff --git a/gateway-sp-comms/Cargo.toml b/gateway-sp-comms/Cargo.toml index 767dfe0edef..658c0b2adff 100644 --- a/gateway-sp-comms/Cargo.toml +++ b/gateway-sp-comms/Cargo.toml @@ -10,6 +10,7 @@ once_cell = "1.15.0" serde = { version = "1.0", features = ["derive"] } serde_with = "2.0.1" thiserror = "1.0.36" +tlvc = {git = "https://github.com/oxidecomputer/tlvc.git"} tokio-stream = "0.1.10" usdt = "0.3.1" uuid = "1.1.0" diff --git a/gateway-sp-comms/src/error.rs b/gateway-sp-comms/src/error.rs index 2f3d8add5db..06231fbccb8 100644 --- a/gateway-sp-comms/src/error.rs +++ b/gateway-sp-comms/src/error.rs @@ -41,6 +41,10 @@ pub enum UpdateError { SpUpdateFileNotFound { path: String, err: zip::result::ZipError }, #[error("failed to decompress `{path}` within SP update: {err}")] SpUpdateDecompressionFailed { path: String, err: io::Error }, + #[error("error reading aux flash image: {0:?}")] + TlvcError(tlvc::TlvcReadError), + #[error("corrupt aux flash image: {0}")] + CorruptTlvc(String), #[error("failed to send update message to SP: {0}")] Communication(#[from] SpCommunicationError), } diff --git a/gateway-sp-comms/src/hubris_archive.rs b/gateway-sp-comms/src/hubris_archive.rs index 7709d97e626..86bd8f6c9f0 100644 --- a/gateway-sp-comms/src/hubris_archive.rs +++ b/gateway-sp-comms/src/hubris_archive.rs @@ -28,6 +28,10 @@ impl HubrisArchive { self.extract_by_name("img/final.bin") } + pub(crate) fn aux_image(&mut self) -> Result, UpdateError> { + self.extract_by_name("img/auxi.tlvc") + } + fn extract_by_name(&mut self, name: &str) -> Result, UpdateError> { let mut f = self.archive.by_name(name).map_err(|err| { UpdateError::SpUpdateFileNotFound { path: name.to_string(), err } diff --git a/gateway-sp-comms/src/single_sp.rs b/gateway-sp-comms/src/single_sp.rs index dd9ac4c41a8..8996036b214 100644 --- a/gateway-sp-comms/src/single_sp.rs +++ b/gateway-sp-comms/src/single_sp.rs @@ -10,11 +10,9 @@ use crate::communicator::ResponseKindExt; use crate::error::BadResponseType; use crate::error::SpCommunicationError; use crate::error::UpdateError; -use crate::hubris_archive::HubrisArchive; use gateway_messages::sp_impl; use gateway_messages::version; use gateway_messages::BulkIgnitionState; -use gateway_messages::ComponentUpdatePrepare; use gateway_messages::IgnitionCommand; use gateway_messages::IgnitionState; use gateway_messages::PowerState; @@ -27,18 +25,14 @@ use gateway_messages::SpMessage; use gateway_messages::SpMessageKind; use gateway_messages::SpPort; use gateway_messages::SpState; -use gateway_messages::UpdateChunk; -use gateway_messages::UpdateId; use gateway_messages::UpdateStatus; use omicron_common::backoff; use omicron_common::backoff::Backoff; use slog::debug; use slog::error; -use slog::info; use slog::trace; use slog::warn; use slog::Logger; -use std::convert::TryInto; use std::io::Cursor; use std::io::Seek; use std::io::SeekFrom; @@ -55,6 +49,12 @@ use tokio::time; use tokio::time::timeout; use uuid::Uuid; +mod update; + +use self::update::start_component_update; +use self::update::start_sp_update; +use self::update::update_status; + pub const DISCOVERY_MULTICAST_ADDR: Ipv6Addr = Ipv6Addr::new(0xff15, 0, 0, 0, 0, 0, 0x1de, 0); @@ -183,111 +183,30 @@ impl SingleSp { return Err(UpdateError::ImageEmpty); } - // If we're updating the SP, we expect `image` to be a hubris archive; - // extract the SP image from it. - // - // TODO 1: We will need to pull other data out of the archive (aux flash - // images). - // TODO 2: Are we sticking with hubris archives as the delivery format? - let image = if component == SpComponent::SP_ITSELF { - let mut archive = HubrisArchive::new(image)?; - archive.final_bin()? - } else { - image - }; - - let total_size = image - .len() - .try_into() - .map_err(|_err| UpdateError::ImageTooLarge)?; - - info!( - self.log, "starting update"; - "component" => component.as_str(), - "id" => %update_id, - "total_size" => total_size, - ); - let id = update_id.into(); - self.update_prepare(component, id, slot, total_size).await?; - - let log = self.log.clone(); - let inner = self.cmds_tx.clone(); - tokio::spawn(async move { - // Wait until the SP has finished preparing for this update. - match poll_until_update_prep_complete(&inner, component, id, &log) - .await - { - Ok(()) => { - info!( - log, "update preparation complete"; - "update_id" => %update_id, - ); - } - Err(message) => { - error!( - log, "update preparation failed"; - "err" => message, - "update_id" => %update_id, - ); - return; - } + // SP updates are special (`image` is a hubris archive and may include + // an aux flash image in addition to the SP image). + if component == SpComponent::SP_ITSELF { + if slot != 0 { + // We know the SP only has one possible slot, so fail fast if + // the caller requested a slot other than 0. + return Err(UpdateError::Communication( + SpCommunicationError::SpError( + ResponseError::InvalidSlotForComponent, + ), + )); } - - // Deliver the update in chunks. - let mut image = Cursor::new(image); - let mut offset = 0; - while !CursorExt::is_empty(&image) { - let prior_pos = image.position(); - debug!( - log, "sending update chunk"; - "id" => %update_id, - "offset" => offset, - ); - - image = match update_chunk(&inner, component, id, offset, image) - .await - { - Ok(image) => image, - Err(err) => { - error!( - log, "update failed"; - "id" => %update_id, - "err" => %err, - ); - return; - } - }; - - // Update our offset according to how far our cursor advanced. - offset += (image.position() - prior_pos) as u32; - } - info!(log, "update complete"; "id" => %update_id); - }); - - Ok(()) - } - - /// Instruct the SP to begin the update process. - /// - /// This should be followed by a series of `update_chunk()` calls totalling - /// `total_size` bytes of data. - async fn update_prepare( - &self, - component: SpComponent, - id: UpdateId, - slot: u16, - total_size: u32, - ) -> Result<()> { - self.rpc(RequestKind::ComponentUpdatePrepare(ComponentUpdatePrepare { - component, - id, - slot, - total_size, - })) - .await - .and_then(|(_peer, response)| { - response.expect_component_update_prepare_ack().map_err(Into::into) - }) + start_sp_update(&self.cmds_tx, update_id, image, &self.log).await + } else { + start_component_update( + &self.cmds_tx, + component, + update_id, + slot, + image, + &self.log, + ) + .await + } } /// Get the status of any update being applied to the given component. @@ -414,120 +333,6 @@ impl SingleSp { } } -/// Poll an SP until it indicates that preparation for update identified by `id` -/// has completed. -async fn poll_until_update_prep_complete( - inner_tx: &mpsc::Sender, - component: SpComponent, - id: UpdateId, - log: &Logger, -) -> Result<(), String> { - // The choice of interval is relatively arbitrary; we expect update - // preparation to generally fall in one of two cases: - // - // 1. No prep is necessary, and the update can happen immediately - // (we'll never sleep) - // 2. Prep is relatively slow (e.g., erasing a flash part) - // - // We choose a few seconds assuming this polling interval is - // primarily hit when the SP is doing something slow. - const POLL_UPDATE_STATUS_INTERVAL: Duration = Duration::from_secs(2); - - // Poll SP until update preparation is complete. - loop { - // Get update status from the SP or give up. - let status = match update_status(inner_tx, component).await { - Ok(status) => status, - Err(err) => { - return Err(format!("could not get status from SP: {err}")); - } - }; - - // Either sleep and retry (if still preparing), break out of our - // loop (if prep complete), or fail (anything else). - match status { - UpdateStatus::Preparing(sub_status) => { - if sub_status.id == id { - debug!( - log, - "SP still preparing; sleeping for {:?}", - POLL_UPDATE_STATUS_INTERVAL - ); - tokio::time::sleep(POLL_UPDATE_STATUS_INTERVAL).await; - continue; - } - } - UpdateStatus::SpUpdateAuxFlashChckScan { .. } => { - return Err("SP returned unexpected status (aux flash scan?!)" - .to_string()); - } - UpdateStatus::InProgress(sub_status) => { - if sub_status.id == id { - return Ok(()); - } - } - UpdateStatus::None - | UpdateStatus::Complete(_) - | UpdateStatus::Aborted(_) => (), - UpdateStatus::Failed { id: failed_id, code } => { - if id == failed_id { - return Err(format!("updated failed (SP code {code})")); - } else { - let failed_id = Uuid::from(failed_id); - return Err(format!( - "different SP update failed ({failed_id})" - )); - } - } - } - - return Err(format!("update preparation failed; status = {status:?}")); - } -} - -/// Get the status of any update being applied to the given component. -async fn update_status( - inner_tx: &mpsc::Sender, - component: SpComponent, -) -> Result { - rpc(inner_tx, RequestKind::UpdateStatus(component), None) - .await - .result - .and_then(|(_peer, response)| { - response.expect_update_status().map_err(Into::into) - }) -} - -/// Send a portion of an update to the SP. -/// -/// Must be preceded by a call to `update_prepare()` (and may be preceded by -/// earlier chunks of this update)`. -/// -/// The completion of an update is implicit, and is detected by the SP based -/// on size of the update (specified by the `total_size` given when the -/// update starts). -async fn update_chunk( - inner_tx: &mpsc::Sender, - component: SpComponent, - id: UpdateId, - offset: u32, - data: Cursor>, -) -> Result>> { - let update_chunk = UpdateChunk { component, id, offset }; - let (result, data) = rpc_with_trailing_data( - inner_tx, - RequestKind::UpdateChunk(update_chunk), - data, - ) - .await; - - result.and_then(|(_peer, response)| { - response.expect_update_chunk_ack().map_err(Into::into) - })?; - - Ok(data) -} - async fn rpc_with_trailing_data( inner_tx: &mpsc::Sender, kind: RequestKind, diff --git a/gateway-sp-comms/src/single_sp/update.rs b/gateway-sp-comms/src/single_sp/update.rs new file mode 100644 index 00000000000..90e723f843b --- /dev/null +++ b/gateway-sp-comms/src/single_sp/update.rs @@ -0,0 +1,473 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// Copyright 2022 Oxide Computer Company + +use super::CursorExt; +use super::InnerCommand; +use super::Result; +use crate::communicator::ResponseKindExt; +use crate::error::UpdateError; +use crate::hubris_archive::HubrisArchive; +use gateway_messages::ComponentUpdatePrepare; +use gateway_messages::RequestKind; +use gateway_messages::SpComponent; +use gateway_messages::SpUpdatePrepare; +use gateway_messages::UpdateChunk; +use gateway_messages::UpdateId; +use gateway_messages::UpdateStatus; +use slog::debug; +use slog::error; +use slog::info; +use slog::Logger; +use std::convert::TryInto; +use std::io::Cursor; +use std::time::Duration; +use tlvc::TlvcReader; +use tokio::sync::mpsc; +use uuid::Uuid; + +/// Start an update to the SP itself. +/// +/// If the SP acks that the update can begin, spawns a task to deliver the +/// update. +pub(super) async fn start_sp_update( + cmds_tx: &mpsc::Sender, + update_id: Uuid, + image: Vec, + log: &Logger, +) -> Result<(), UpdateError> { + let mut archive = HubrisArchive::new(image)?; + + let sp_image = archive.final_bin()?; + let sp_image_size = + sp_image.len().try_into().map_err(|_err| UpdateError::ImageTooLarge)?; + + let aux_image = match archive.aux_image() { + Ok(aux_image) => Some(aux_image), + Err(UpdateError::SpUpdateFileNotFound { .. }) => None, + Err(err) => return Err(err), + }; + + let (aux_flash_size, aux_flash_chck) = match &aux_image { + Some(data) => { + let size = data + .len() + .try_into() + .map_err(|_err| UpdateError::ImageTooLarge)?; + let chck = read_auxi_check_from_tlvc(data)?; + (size, chck) + } + None => (0, [0; 32]), + }; + + info!( + log, "starting SP update"; + "id" => %update_id, + "aux_flash_chck" => ?aux_flash_chck, + "aux_flash_size" => aux_flash_size, + "sp_image_size" => sp_image_size, + ); + super::rpc( + cmds_tx, + RequestKind::SpUpdatePrepare(SpUpdatePrepare { + id: update_id.into(), + aux_flash_size, + aux_flash_chck, + sp_image_size, + }), + None, + ) + .await + .result + .and_then(|(_peer, response)| { + response.expect_sp_update_prepare_ack().map_err(Into::into) + })?; + + tokio::spawn(drive_sp_update( + cmds_tx.clone(), + update_id, + aux_image, + sp_image, + log.clone(), + )); + + Ok(()) +} + +/// Function that should be `tokio::spawn`'d to drive an SP update to +/// completion. +async fn drive_sp_update( + cmds_tx: mpsc::Sender, + update_id: Uuid, + aux_image: Option>, + sp_image: Vec, + log: Logger, +) { + let id = update_id.into(); + + // Wait until the SP has finished preparing for this update. + let sp_matched_chck = match poll_until_update_prep_complete( + &cmds_tx, + SpComponent::SP_ITSELF, + id, + aux_image.is_some(), + &log, + ) + .await + { + Ok(sp_matched_chck) => { + info!( + log, "update preparation complete"; + "update_id" => %update_id, + ); + sp_matched_chck + } + Err(message) => { + error!( + log, "update preparation failed"; + "err" => message, + "update_id" => %update_id, + ); + return; + } + }; + + // Send the aux flash image, if necessary. + if !sp_matched_chck { + // `poll_until_update_prep_complete` can only return `Ok(false)` if we + // told it we had an aux flash update (i.e., if `aux_image.is_some()`). + // Therefore, we can safely unwrap here. + let data = aux_image.unwrap(); + match send_update_in_chunks( + &cmds_tx, + SpComponent::SP_AUX_FLASH, + update_id, + data, + &log, + ) + .await + { + Ok(()) => { + info!(log, "aux flash update complete"; "id" => %update_id); + } + Err(err) => { + error!( + log, "aux flash update failed"; + "id" => %update_id, + "err" => %err, + ); + return; + } + } + } + + // Deliver the SP image. + match send_update_in_chunks( + &cmds_tx, + SpComponent::SP_ITSELF, + update_id, + sp_image, + &log, + ) + .await + { + Ok(()) => { + info!(log, "update complete"; "id" => %update_id); + } + Err(err) => { + error!( + log, "update failed"; + "id" => %update_id, + "err" => %err, + ); + } + } +} + +fn read_auxi_check_from_tlvc(data: &[u8]) -> Result<[u8; 32], UpdateError> { + let mut reader = TlvcReader::begin(data).map_err(UpdateError::TlvcError)?; + let mut chck = None; + + while let Some(chunk) = reader.next().map_err(UpdateError::TlvcError)? { + if chunk.header().tag != *b"CHCK" { + // We could recompute the hash on AUXI and make sure it + // matches, but the SP has to do that itself anyway. We don't expect + // them to be mismatched more or less ever, so we won't bother + // checking here and will just let the SP do it. + continue; + } + if chunk.len() != 32 { + return Err(UpdateError::CorruptTlvc(format!( + "expected 32-long chck, got {}", + chunk.len() + ))); + } + if chck.is_some() { + return Err(UpdateError::CorruptTlvc( + "multiple CHCK entries".to_string(), + )); + } + + let mut data = [0; 32]; + chunk.read_exact(0, &mut data[..]).map_err(UpdateError::TlvcError)?; + chck = Some(data); + } + + chck.ok_or_else(|| { + UpdateError::CorruptTlvc("missing CHCK entry".to_string()) + }) +} + +/// Start an update to a component of the SP. +/// +/// If the SP acks that the update can begin, spawns a task to deliver the +/// update. +pub(super) async fn start_component_update( + cmds_tx: &mpsc::Sender, + component: SpComponent, + update_id: Uuid, + slot: u16, + image: Vec, + log: &Logger, +) -> Result<(), UpdateError> { + let total_size = + image.len().try_into().map_err(|_err| UpdateError::ImageTooLarge)?; + + info!( + log, "starting update"; + "component" => component.as_str(), + "id" => %update_id, + "total_size" => total_size, + ); + super::rpc( + cmds_tx, + RequestKind::ComponentUpdatePrepare(ComponentUpdatePrepare { + component, + id: update_id.into(), + slot, + total_size, + }), + None, + ) + .await + .result + .and_then(|(_peer, response)| { + response.expect_component_update_prepare_ack().map_err(Into::into) + })?; + + tokio::spawn(drive_component_update( + cmds_tx.clone(), + component, + update_id, + image, + log.clone(), + )); + + Ok(()) +} + +/// Function that should be `tokio::spawn`'d to drive a component update to +/// completion. +async fn drive_component_update( + cmds_tx: mpsc::Sender, + component: SpComponent, + update_id: Uuid, + image: Vec, + log: Logger, +) { + let id = update_id.into(); + + // Wait until the SP has finished preparing for this update. + match poll_until_update_prep_complete(&cmds_tx, component, id, false, &log) + .await + { + Ok(_) => { + info!( + log, "update preparation complete"; + "update_id" => %update_id, + ); + } + Err(message) => { + error!( + log, "update preparation failed"; + "err" => message, + "update_id" => %update_id, + ); + return; + } + } + + // Deliver the update in chunks. + match send_update_in_chunks(&cmds_tx, component, update_id, image, &log) + .await + { + Ok(()) => { + info!(log, "update complete"; "id" => %update_id); + } + Err(err) => { + error!( + log, "update failed"; + "id" => %update_id, + "err" => %err, + ); + } + } +} + +/// Poll an SP until it indicates that preparation for update identified by `id` +/// has completed. +/// +/// If `update_has_aux_image` is `true` (i.e., the update we're waiting on is an +/// SP update with an aux flash image), we poll until we see the +/// `SpUpdateAuxFlashChckScan` status from the SP, and then return `true` or +/// `false` indicating whether the SP found a matching CHCK (i.e., returning +/// `Ok(true)` means the SP found a matching CHCK, and we don't need to send the +/// aux flash image). Receiving an `InProgress` status will result in an error +/// being returned, as we don't expect to see that state until we start sending +/// data. +/// +/// If `update_has_aux_image` is `false`, we poll until we see the `InProgress` +/// status from the SP. Receiving an `SpUpdateAuxFlashChckScan` status will +/// result in an error being returned. We always return `Ok(true)` upon seeing +/// `InProgress` (i.e., if `update_has_aux_image` is `false`, we will either +/// return `Ok(true)` or an error, never `Ok(false)`). +async fn poll_until_update_prep_complete( + cmds_tx: &mpsc::Sender, + component: SpComponent, + id: UpdateId, + update_has_aux_image: bool, + log: &Logger, +) -> Result { + // The choice of interval is relatively arbitrary; we expect update + // preparation to generally fall in one of two cases: + // + // 1. No prep is necessary, and the update can happen immediately + // (we'll never sleep) + // 2. Prep is relatively slow (e.g., erasing a flash part) + // + // We choose a few seconds assuming this polling interval is + // primarily hit when the SP is doing something slow. + const POLL_UPDATE_STATUS_INTERVAL: Duration = Duration::from_secs(2); + + // Poll SP until update preparation is complete. + loop { + // Get update status from the SP or give up. + let status = match update_status(cmds_tx, component).await { + Ok(status) => status, + Err(err) => { + return Err(format!("could not get status from SP: {err}")); + } + }; + + // Either sleep and retry (if still preparing), break out of our + // loop (if prep complete), or fail (anything else). + match status { + UpdateStatus::Preparing(sub_status) => { + if sub_status.id == id { + debug!( + log, + "SP still preparing; sleeping for {:?}", + POLL_UPDATE_STATUS_INTERVAL + ); + tokio::time::sleep(POLL_UPDATE_STATUS_INTERVAL).await; + continue; + } + // Else: fall through to returning an error. + } + UpdateStatus::InProgress(sub_status) => { + if sub_status.id == id && !update_has_aux_image { + return Ok(true); + } + // Else: fall through to returning an error. + } + UpdateStatus::SpUpdateAuxFlashChckScan { + id: sp_id, + found_match, + .. + } => { + if sp_id == id && update_has_aux_image { + return Ok(found_match); + } + // Else: fall through to returning an error. + } + UpdateStatus::None + | UpdateStatus::Complete(_) + | UpdateStatus::Failed { .. } + | UpdateStatus::Aborted(_) => { + // Fall through to returning an error below. + } + } + + return Err(format!("update preparation failed; status = {status:?}")); + } +} + +/// Get the status of any update being applied to the given component. +pub(super) async fn update_status( + cmds_tx: &mpsc::Sender, + component: SpComponent, +) -> Result { + super::rpc(cmds_tx, RequestKind::UpdateStatus(component), None) + .await + .result + .and_then(|(_peer, response)| { + response.expect_update_status().map_err(Into::into) + }) +} + +/// Send an update image to the SP in chunks. +async fn send_update_in_chunks( + cmds_tx: &mpsc::Sender, + component: SpComponent, + update_id: Uuid, + data: Vec, + log: &Logger, +) -> Result<()> { + let mut image = Cursor::new(data); + let mut offset = 0; + let id = update_id.into(); + while !CursorExt::is_empty(&image) { + let prior_pos = image.position(); + debug!( + log, "sending update chunk"; + "id" => %update_id, + "offset" => offset, + ); + + image = + send_single_update_chunk(&cmds_tx, component, id, offset, image) + .await?; + + // Update our offset according to how far our cursor advanced. + offset += (image.position() - prior_pos) as u32; + } + Ok(()) +} + +/// Send a portion of an update to the SP. +/// +/// `data` is moved into this function, updated based on the amount delivered in +/// this chunk, and returned. +async fn send_single_update_chunk( + cmds_tx: &mpsc::Sender, + component: SpComponent, + id: UpdateId, + offset: u32, + data: Cursor>, +) -> Result>> { + let update_chunk = UpdateChunk { component, id, offset }; + let (result, data) = super::rpc_with_trailing_data( + cmds_tx, + RequestKind::UpdateChunk(update_chunk), + data, + ) + .await; + + result.and_then(|(_peer, response)| { + response.expect_update_chunk_ack().map_err(Into::into) + })?; + + Ok(data) +}