From 6d3b503c6dc86af09661a098b3e159156bc70cd9 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 17 Oct 2025 10:49:17 -0700 Subject: [PATCH 1/8] [support bundle] Refactor into tasks --- .../tasks/support_bundle_collector.rs | 682 ++++++++++++------ 1 file changed, 457 insertions(+), 225 deletions(-) diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 8dc13e7ab42..ee2224edba8 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -59,12 +59,11 @@ use std::future::Future; use std::io::Write; use std::num::NonZeroU64; use std::sync::Arc; -use std::sync::atomic::{AtomicUsize, Ordering}; use tokio::io::AsyncReadExt; use tokio::io::AsyncSeekExt; use tokio::io::AsyncWriteExt; use tokio::io::SeekFrom; -use tokio_util::task::AbortOnDropHandle; +use tokio::sync::OnceCell; use tufaceous_artifact::ArtifactHash; use uuid::Uuid; use zip::ZipArchive; @@ -428,8 +427,6 @@ impl SupportBundleCollector { request: request.clone(), bundle: bundle.clone(), transfer_chunk_size: request.transfer_chunk_size, - host_ereports_collected: AtomicUsize::new(0), - sp_ereports_collected: AtomicUsize::new(0), }); let authz_bundle = authz_support_bundle_from_id(bundle.id.into()); @@ -475,8 +472,60 @@ struct BundleCollection { request: BundleRequest, bundle: SupportBundle, transfer_chunk_size: NonZeroU64, - host_ereports_collected: AtomicUsize, - sp_ereports_collected: AtomicUsize, +} + +type CollectionStepFn = Box< + dyn for<'b> FnOnce( + &'b Arc, + &'b Utf8Path, + ) + -> BoxFuture<'b, anyhow::Result> + + Send, +>; + +enum CollectionStepOutput { + HostEreports(SupportBundleEreportStatus), + SpEreports(SupportBundleEreportStatus), + SavingSpDumps { listed_sps: bool }, + // NOTE: The ditinction between this and "Spawn" is pretty artificial - + // it's just to preserve a part of the report which says "we tried to + // list 
in-service sleds". + // + // If we changed the collection report, this could easily be combined + // with the "Spawn" variant. + SpawnSleds { extra_steps: Vec<(&'static str, CollectionStepFn)> }, + Spawn { extra_steps: Vec<(&'static str, CollectionStepFn)> }, + None, +} + +impl CollectionStepOutput { + // Updates the collection report based on the output of a collection step, + // and possibly extends the set of all steps to be executed. + fn process( + self, + report: &mut SupportBundleCollectionReport, + steps: &mut Vec<(&'static str, CollectionStepFn)>, + ) { + match self { + CollectionStepOutput::HostEreports(status) => { + report.host_ereports = status; + } + CollectionStepOutput::SpEreports(status) => { + report.sp_ereports = status; + } + CollectionStepOutput::SavingSpDumps { listed_sps } => { + report.listed_sps = listed_sps; + } + CollectionStepOutput::SpawnSleds { extra_steps } => { + report.listed_in_service_sleds = true; + steps.extend(extra_steps); + } + CollectionStepOutput::Spawn { extra_steps } => { + steps.extend(extra_steps); + } + CollectionStepOutput::None => (), + } + } } impl BundleCollection { @@ -656,37 +705,72 @@ impl BundleCollection { Ok(()) } - // Perform the work of collecting the support bundle into a temporary directory - // - // - "dir" is a directory where data can be stored. - // - "bundle" is metadata about the bundle being collected. - // - // If a partial bundle can be collected, it should be returned as - // an Ok(SupportBundleCollectionReport). Any failures from this function - // will prevent the support bundle from being collected altogether. - // - // NOTE: The background task infrastructure will periodically check to see - // if the bundle has been cancelled by a user while it is being collected. - // If that happens, this function will be CANCELLED at an await point. - // - // As a result, it is important that this function be implemented as - // cancel-safe. 
- async fn collect_bundle_as_file( + async fn run_collect_bundle_steps( self: &Arc, - dir: &Utf8TempDir, - ) -> anyhow::Result { - let log = &self.log; - - info!(&log, "Collecting bundle as local file"); + output: &Utf8TempDir, + mut steps: Vec<(&'static str, CollectionStepFn)>, + ) -> SupportBundleCollectionReport { let mut report = SupportBundleCollectionReport::new(self.bundle.id.into()); - tokio::fs::write( - dir.path().join("bundle_id.txt"), - self.bundle.id.to_string(), - ) - .await?; + const MAX_CONCURRENT_STEPS: usize = 16; + let mut tasks = + ParallelTaskSet::new_with_parallelism(MAX_CONCURRENT_STEPS); + + loop { + // Process all the currently-planned steps + while let Some((step_name, step)) = steps.pop() { + let previous_result = tasks.spawn({ + let collection = self.clone(); + let dir = output.path().to_path_buf(); + async move { + debug!(collection.log, "Running step"; "name" => &step_name); + step(&collection, dir.as_path()).await.inspect_err(|err| { + warn!( + collection.log, + "Step failed"; + "name" => &step_name, + InlineErrorChain::new(err.as_ref()), + ); + }) + } + }).await; + + if let Some(Ok(output)) = previous_result { + output.process(&mut report, &mut steps); + }; + } + + // If we've run out of tasks to spawn, join all the existing steps. + while let Some(previous_result) = tasks.join_next().await { + if let Ok(output) = previous_result { + output.process(&mut report, &mut steps); + }; + } + + // Executing steps may create additional steps, as follow-up work. + // + // Only finish if we've exhausted all possible steps and joined all spawned work. 
+ if steps.is_empty() { + return report; + } + } + } + async fn collect_bundle_id( + &self, + dir: &Utf8Path, + ) -> anyhow::Result { + tokio::fs::write(dir.join("bundle_id.txt"), self.bundle.id.to_string()) + .await?; + + Ok(CollectionStepOutput::None) + } + + async fn collect_reconfigurator_state( + &self, + dir: &Utf8Path, + ) -> anyhow::Result { // Collect reconfigurator state const NMAX_BLUEPRINTS: usize = 300; match reconfigurator_state_load( @@ -697,7 +781,7 @@ impl BundleCollection { .await { Ok(state) => { - let file_path = dir.path().join("reconfigurator_state.json"); + let file_path = dir.join("reconfigurator_state.json"); let file = std::fs::OpenOptions::new() .create(true) .write(true) @@ -713,7 +797,7 @@ impl BundleCollection { }, )?; info!( - log, + self.log, "Support bundle: collected reconfigurator state"; "target_blueprint" => ?state.target_blueprint, "num_blueprints" => state.blueprints.len(), @@ -722,152 +806,322 @@ impl BundleCollection { } Err(err) => { warn!( - log, + self.log, "Support bundle: failed to collect reconfigurator state"; "err" => ?err, ); } - } + }; + + Ok(CollectionStepOutput::None) + } - let ereport_collection = if let Some(ref ereport_filters) = - self.request.ereport_query + async fn collect_host_ereports( + self: &Arc, + dir: &Utf8Path, + ) -> anyhow::Result { + let Some(ref ereport_filters) = self.request.ereport_query else { + debug!(self.log, "Support bundle: ereports not requested"); + return Ok(CollectionStepOutput::None); + }; + let ereports_dir = dir.join("ereports"); + let status = match self + .save_host_ereports(ereport_filters.clone(), ereports_dir.clone()) + .await { - // If ereports are to be included in the bundle, have someone go do - // that in the background while we're gathering up other stuff. 
Note - // that the `JoinHandle`s for these tasks are wrapped in - // `AbortOnDropHandle`s for cancellation correctness; this ensures - // that if collecting the bundle is cancelled and this future is - // dropped, the tasks that we've spawned to collect ereports are - // aborted as well. - let dir = dir.path().join("ereports"); - let host = AbortOnDropHandle::new(tokio::spawn( - self.clone().collect_host_ereports( - ereport_filters.clone(), - dir.clone(), - ), - )); - let sp = AbortOnDropHandle::new(tokio::spawn( - self.clone().collect_sp_ereports(ereport_filters.clone(), dir), - )); - Some((host, sp)) - } else { - debug!(log, "Support bundle: ereports not requested"); - None + Ok(n_collected) => { + SupportBundleEreportStatus::Collected { n_collected } + } + Err((n_collected, err)) => { + warn!( + &self.log, + "Support bundle: host ereport collection failed \ + ({n_collected} collected successfully)"; + InlineErrorChain::new(err.as_ref()), + ); + + SupportBundleEreportStatus::Failed { + n_collected, + error: err.to_string(), + } + } }; - let all_sleds = self - .datastore - .sled_list_all_batched(&self.opctx, SledFilter::InService) - .await; + Ok(CollectionStepOutput::HostEreports(status)) + } - if let Ok(mgs_client) = self.create_mgs_client().await { - if let Err(e) = write_sled_info( - &self.log, - &mgs_client, - all_sleds.as_deref().ok(), - dir.path(), - ) + async fn collect_sp_ereports( + self: &Arc, + dir: &Utf8Path, + ) -> anyhow::Result { + let Some(ref ereport_filters) = self.request.ereport_query else { + debug!(self.log, "Support bundle: ereports not requested"); + return Ok(CollectionStepOutput::None); + }; + let ereports_dir = dir.join("ereports"); + let status = match self + .save_sp_ereports(ereport_filters.clone(), ereports_dir.clone()) .await - { - error!(log, "Failed to write sled_info.json"; "error" => InlineErrorChain::new(e.as_ref())); + { + Ok(n_collected) => { + SupportBundleEreportStatus::Collected { n_collected } } + Err((n_collected, err)) 
=> { + warn!( + &self.log, + "Support bundle: sp ereport collection failed \ + ({n_collected} collected successfully)"; + InlineErrorChain::new(err.as_ref()), + ); - let sp_dumps_dir = dir.path().join("sp_task_dumps"); - tokio::fs::create_dir_all(&sp_dumps_dir).await.with_context( - || { - format!( - "Failed to create SP task dump directory {sp_dumps_dir}" - ) - }, - )?; + SupportBundleEreportStatus::Failed { + n_collected, + error: err.to_string(), + } + } + }; - if let Err(e) = - save_all_sp_dumps(log, &mgs_client, &sp_dumps_dir).await - { - error!(log, "Failed to capture SP task dumps"; "error" => InlineErrorChain::new(e.as_ref())); - } else { - report.listed_sps = true; - }; - } else { - warn!(log, "No MGS client, skipping SP task dump collection"); - } + Ok(CollectionStepOutput::SpEreports(status)) + } - if let Ok(all_sleds) = all_sleds { - report.listed_in_service_sleds = true; + async fn get_or_initialize_mgs_client<'a>( + &self, + mgs_client: &'a OnceCell>>, + ) -> &'a Arc> { + mgs_client + .get_or_init(|| async { + Arc::new(self.create_mgs_client().await.ok()) + }) + .await + } - const MAX_CONCURRENT_SLED_REQUESTS: usize = 16; - const FAILURE_MESSAGE: &str = - "Failed to fully collect support bundle info from sled"; - let mut set = ParallelTaskSet::new_with_parallelism( - MAX_CONCURRENT_SLED_REQUESTS, + async fn get_or_initialize_all_sleds<'a>( + &self, + all_sleds: &'a OnceCell>>>, + ) -> &'a Arc>> { + all_sleds + .get_or_init(|| async { + Arc::new( + self.datastore + .sled_list_all_batched( + &self.opctx, + SledFilter::InService, + ) + .await + .ok(), + ) + }) + .await + } + + async fn collect_sled_cubby_info( + &self, + all_sleds: &OnceCell>>>, + mgs_client: &OnceCell>>, + dir: &Utf8Path, + ) -> anyhow::Result { + let Some(mgs_client) = + &**self.get_or_initialize_mgs_client(mgs_client).await + else { + warn!( + self.log, + "No MGS client, skipping sled cubby info collection" ); + return Ok(CollectionStepOutput::None); + }; + let nexus_sleds = self + 
.get_or_initialize_all_sleds(all_sleds) + .await + .as_deref() + .unwrap_or_default(); + + write_sled_cubby_info(&self.log, mgs_client, nexus_sleds, dir).await?; + + Ok(CollectionStepOutput::None) + } + + async fn spawn_sp_dump_collection( + &self, + mgs_client: &OnceCell>>, + dir: &Utf8Path, + ) -> anyhow::Result { + let Some(mgs_client) = + &**self.get_or_initialize_mgs_client(mgs_client).await + else { + warn!(self.log, "No MGS client, skipping SP task dump collection"); + return Ok(CollectionStepOutput::None); + }; - for sled in all_sleds { - let prev_result = set - .spawn({ - let collection: Arc = self.clone(); - let dir = dir.path().to_path_buf(); + let sp_dumps_dir = dir.join("sp_task_dumps"); + tokio::fs::create_dir_all(&sp_dumps_dir).await.with_context(|| { + format!("Failed to create SP task dump directory {sp_dumps_dir}") + })?; + + let mut extra_steps: Vec<(&'static str, CollectionStepFn)> = vec![]; + for sp in get_available_sps(&mgs_client).await? { + extra_steps.push(( + "sp dump", + Box::new({ + let mgs_client = mgs_client.clone(); + move |collection, dir| { async move { - collection.collect_data_from_sled(&sled, &dir).await + collection + .collect_sp_dump(&mgs_client, sp, dir) + .await } - }) - .await; - if let Some(Err(err)) = prev_result { - warn!(&self.log, "{FAILURE_MESSAGE}"; "err" => ?err); - } - } - while let Some(result) = set.join_next().await { - if let Err(err) = result { - warn!(&self.log, "{FAILURE_MESSAGE}"; "err" => ?err); - } - } + .boxed() + } + }), + )); } - if let Some((host, sp)) = ereport_collection { - let (host, sp) = tokio::join!(host, sp); - const TASK_FAILURE_MSG: &str = "task failed"; - let n_collected = - self.host_ereports_collected.load(Ordering::Acquire); - report.host_ereports = match host - .map_err(|e| anyhow::anyhow!("{TASK_FAILURE_MSG}: {e}")) - .and_then(|x| x) - { - Ok(_) => SupportBundleEreportStatus::Collected { n_collected }, - Err(err) => { - warn!( - &self.log, - "Support bundle: host ereport collection 
failed \ - ({n_collected} collected successfully)"; - "err" => ?err, - ); - SupportBundleEreportStatus::Failed { - n_collected, - error: err.to_string(), + Ok(CollectionStepOutput::Spawn { extra_steps }) + } + + async fn collect_sp_dump( + &self, + mgs_client: &MgsClient, + sp: SpIdentifier, + dir: &Utf8Path, + ) -> anyhow::Result { + save_sp_dumps(mgs_client, sp, dir) + .await + .with_context(|| format!("SP {} {}", sp.type_, sp.slot))?; + + Ok(CollectionStepOutput::SavingSpDumps { listed_sps: true }) + } + + // Perform the work of collecting the support bundle into a temporary directory + // + // - "dir" is a directory where data can be stored. + // - "bundle" is metadata about the bundle being collected. + // + // If a partial bundle can be collected, it should be returned as + // an Ok(SupportBundleCollectionReport). Any failures from this function + // will prevent the support bundle from being collected altogether. + // + // NOTE: The background task infrastructure will periodically check to see + // if the bundle has been cancelled by a user while it is being collected. + // If that happens, this function will be CANCELLED at an await point. + // + // As a result, it is important that this function be implemented as + // cancel-safe. 
+ async fn collect_bundle_as_file( + self: &Arc, + dir: &Utf8TempDir, + ) -> anyhow::Result { + let log = &self.log; + + info!(&log, "Collecting bundle as local file"); + + // Shared, lazy, fallible initialization for sleds + let all_sleds: OnceCell>>> = OnceCell::new(); + // Shared, lazy, fallible initialization for MGS client + let mgs_client: OnceCell>> = OnceCell::new(); + + let steps: Vec<(&str, CollectionStepFn)> = vec![ + ( + "bundle id", + Box::new(|collection, dir| { + collection.collect_bundle_id(dir).boxed() + }), + ), + ( + "reconfigurator state", + Box::new(|collection, dir| { + collection.collect_reconfigurator_state(dir).boxed() + }), + ), + ( + "host ereports", + Box::new(|collection, dir| { + collection.collect_host_ereports(dir).boxed() + }), + ), + ( + "sp ereports", + Box::new(|collection, dir| { + collection.collect_sp_ereports(dir).boxed() + }), + ), + ( + "sled cubby info", + Box::new({ + let all_sleds = all_sleds.clone(); + let mgs_client = mgs_client.clone(); + move |collection, dir| { + async move { + collection + .collect_sled_cubby_info( + &all_sleds, + &mgs_client, + dir, + ) + .await + } + .boxed() } - } - }; - let n_collected = - self.sp_ereports_collected.load(Ordering::Acquire); - report.sp_ereports = match sp - .map_err(|e| anyhow::anyhow!("{TASK_FAILURE_MSG}: {e}")) - .and_then(|x| x) - { - Ok(_) => SupportBundleEreportStatus::Collected { n_collected }, - Err(err) => { - warn!( - &self.log, - "Support bundle: SP ereport collection failed \ - ({n_collected} collected successfully)"; - "err" => ?err, - ); - SupportBundleEreportStatus::Failed { - n_collected, - error: err.to_string(), + }), + ), + ( + "spawn steps to query all sp dumps", + Box::new({ + let mgs_client = mgs_client.clone(); + move |collection, dir| { + async move { + collection + .spawn_sp_dump_collection(&mgs_client, dir) + .await + } + .boxed() } - } - }; + }), + ), + ( + "spawn steps to query all sleds", + Box::new({ + let all_sleds = all_sleds.clone(); + move 
|collection, _| { + async move { + collection.spawn_query_all_sleds(&all_sleds).await + } + .boxed() + } + }), + ), + ]; + + Ok(self.run_collect_bundle_steps(dir, steps).await) + } + + async fn spawn_query_all_sleds( + &self, + all_sleds: &OnceCell>>>, + ) -> anyhow::Result { + let Some(all_sleds) = + self.get_or_initialize_all_sleds(all_sleds).await.as_deref() + else { + warn!(self.log, "Could not read list of sleds"); + return Ok(CollectionStepOutput::None); + }; + + let mut extra_steps: Vec<(&'static str, CollectionStepFn)> = vec![]; + for sled in all_sleds { + extra_steps.push(( + "sled data", + Box::new({ + let sled = sled.clone(); + move |collection, dir| { + async move { + collection.collect_data_from_sled(&sled, dir).await + } + .boxed() + } + }), + )); } - Ok(report) + + return Ok(CollectionStepOutput::SpawnSleds { extra_steps }); } // Collect data from a sled, storing it into a directory that will @@ -880,7 +1134,7 @@ impl BundleCollection { &self, sled: &nexus_db_model::Sled, dir: &Utf8Path, - ) -> anyhow::Result<()> { + ) -> anyhow::Result { let log = &self.log; info!(&log, "Collecting bundle info from sled"; "sled" => %sled.id()); let sled_path = dir @@ -893,7 +1147,7 @@ impl BundleCollection { .await?; if self.request.skip_sled_info { - return Ok(()); + return Ok(CollectionStepOutput::None); } let Ok(sled_client) = nexus_networking::sled_client( @@ -909,7 +1163,7 @@ impl BundleCollection { "Could not contact sled", ) .await?; - return Ok(()); + return Ok(CollectionStepOutput::None); }; // NB: As new sled-diagnostic commands are added they should @@ -1014,14 +1268,15 @@ impl BundleCollection { error!(&self.log, "failed to write logs output: {e}"); } } - return Ok(()); + return Ok(CollectionStepOutput::None); } - async fn collect_sp_ereports( - self: Arc, + async fn save_host_ereports( + self: &Arc, filters: EreportFilters, dir: Utf8PathBuf, - ) -> anyhow::Result<()> { + ) -> Result { + let mut reports = 0; let mut paginator = Paginator::new( 
datastore::SQL_BATCH_SIZE, dropshot::PaginationOrder::Ascending, @@ -1029,40 +1284,50 @@ impl BundleCollection { while let Some(p) = paginator.next() { let ereports = self .datastore - .sp_ereports_fetch_matching( + .host_ereports_fetch_matching( &self.opctx, &filters, &p.current_pagparams(), ) .await .map_err(|e| { - e.internal_context("failed to query for SP ereports") + ( + reports, + e.internal_context( + "failed to query for host OS ereports", + ) + .into(), + ) })?; paginator = p.found_batch(&ereports, &|ereport| { (ereport.restart_id.into_untyped_uuid(), ereport.ena) }); - let n_ereports = ereports.len(); for ereport in ereports { - write_ereport(ereport.into(), &dir).await?; - self.sp_ereports_collected.fetch_add(1, Ordering::Release); + write_ereport(ereport.into(), &dir) + .await + .map_err(|e| (reports, e))?; + reports += 1; } - debug!(self.log, "Support bundle: added {n_ereports} SP ereports"); + debug!( + self.log, + "Support bundle: added {n_ereports} host OS ereports" + ); } info!( self.log, - "Support bundle: collected {} total SP ereports", - self.sp_ereports_collected.load(Ordering::Relaxed) + "Support bundle: collected {} total host ereports", reports ); - Ok(()) + Ok(reports) } - async fn collect_host_ereports( - self: Arc, + async fn save_sp_ereports( + self: &Arc, filters: EreportFilters, dir: Utf8PathBuf, - ) -> anyhow::Result<()> { + ) -> Result { + let mut reports = 0; let mut paginator = Paginator::new( datastore::SQL_BATCH_SIZE, dropshot::PaginationOrder::Ascending, @@ -1070,35 +1335,37 @@ impl BundleCollection { while let Some(p) = paginator.next() { let ereports = self .datastore - .host_ereports_fetch_matching( + .sp_ereports_fetch_matching( &self.opctx, &filters, &p.current_pagparams(), ) .await .map_err(|e| { - e.internal_context("failed to query for host OS ereports") + ( + reports, + e.internal_context("failed to query for SP ereports") + .into(), + ) })?; paginator = p.found_batch(&ereports, &|ereport| { 
(ereport.restart_id.into_untyped_uuid(), ereport.ena) }); let n_ereports = ereports.len(); for ereport in ereports { - write_ereport(ereport.into(), &dir).await?; - self.host_ereports_collected.fetch_add(1, Ordering::Release); + write_ereport(ereport.into(), &dir) + .await + .map_err(|e| (reports, e))?; + reports += 1; } - debug!( - self.log, - "Support bundle: added {n_ereports} host OS ereports" - ); + debug!(self.log, "Support bundle: added {n_ereports} SP ereports"); } info!( self.log, - "Support bundle: collected {} total host ereports", - self.host_ereports_collected.load(Ordering::Relaxed) + "Support bundle: collected {} total SP ereports", reports ); - Ok(()) + Ok(reports) } async fn create_mgs_client(&self) -> anyhow::Result { @@ -1396,40 +1663,6 @@ where Ok(()) } -/// Collect task dumps from all SPs via MGS and save them to a directory. -async fn save_all_sp_dumps( - log: &slog::Logger, - mgs_client: &MgsClient, - sp_dumps_dir: &Utf8Path, -) -> anyhow::Result<()> { - let available_sps = get_available_sps(&mgs_client).await?; - - let mut tasks = ParallelTaskSet::new(); - for sp in available_sps { - let mgs_client = mgs_client.clone(); - let sp_dumps_dir = sp_dumps_dir.to_owned(); - - tasks - .spawn(async move { - save_sp_dumps(mgs_client, sp, sp_dumps_dir) - .await - .with_context(|| format!("SP {} {}", sp.type_, sp.slot)) - }) - .await; - } - for result in tasks.join_all().await { - if let Err(e) = result { - error!( - log, - "failed to capture task dumps"; - "error" => InlineErrorChain::new(e.as_ref()) - ); - } - } - - Ok(()) -} - /// Use MGS ignition info to find active SPs. async fn get_available_sps( mgs_client: &MgsClient, @@ -1455,9 +1688,9 @@ async fn get_available_sps( /// Fetch and save task dumps from a single SP. 
async fn save_sp_dumps( - mgs_client: MgsClient, + mgs_client: &MgsClient, sp: SpIdentifier, - sp_dumps_dir: Utf8PathBuf, + sp_dumps_dir: &Utf8Path, ) -> anyhow::Result<()> { let dump_count = mgs_client .sp_task_dump_count(&sp.type_, sp.slot) @@ -1488,10 +1721,10 @@ async fn save_sp_dumps( /// Write a file with a JSON mapping of sled serial numbers to cubby and UUIDs for easier /// identification of sleds present in a bundle. -async fn write_sled_info( +async fn write_sled_cubby_info( log: &slog::Logger, mgs_client: &MgsClient, - nexus_sleds: Option<&[Sled]>, + nexus_sleds: &[Sled], dir: &Utf8Path, ) -> anyhow::Result<()> { #[derive(Serialize)] @@ -1506,7 +1739,6 @@ async fn write_sled_info( // We can still get a useful mapping of cubby to serial using just the data from MGS. let mut nexus_map: BTreeMap<_, _> = nexus_sleds - .unwrap_or_default() .into_iter() .map(|sled| (sled.serial_number(), sled)) .collect(); From 5abe573ba3ab32dd68dc74afb29691b9b6d4636e Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 17 Oct 2025 16:06:33 -0700 Subject: [PATCH 2/8] [support bundle] More structured data filtering --- .../tasks/support_bundle_collector.rs | 135 +++++++++++++++--- 1 file changed, 115 insertions(+), 20 deletions(-) diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index ee2224edba8..1e29e05b2d2 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -55,6 +55,7 @@ use serde_json::json; use sha2::{Digest, Sha256}; use slog_error_chain::InlineErrorChain; use std::collections::BTreeMap; +use std::collections::HashSet; use std::future::Future; use std::io::Write; use std::num::NonZeroU64; @@ -82,26 +83,83 @@ fn authz_support_bundle_from_id(id: SupportBundleUuid) -> authz::SupportBundle { authz::SupportBundle::new(authz::FLEET, id, LookupType::by_id(id)) } +// Describes how support bundle data is 
selected. +// +// Multiple values of this enum are joined together into a HashSet. +// Categories should be additive. +#[derive(Debug, Clone, Hash, Eq, PartialEq)] +enum BundleDataCategory { + Reconfigurator, + HostInfo, + SledCubbyInfo, + SpDumps, +} + +// The set of sleds to include +#[derive(Debug, Clone, Hash, Eq, PartialEq)] +enum SledSelection { + All, + Specific(SledUuid), +} + // Specifies the data to be collected within the Support Bundle. #[derive(Clone)] struct BundleRequest { - // If "false": Skip collecting host-specific info from each sled. - skip_sled_info: bool, - // The size of chunks to use when transferring a bundle from Nexus // to a sled agent. // // Typically, this is CHUNK_SIZE, but can be modified for testing. transfer_chunk_size: NonZeroU64, + // The set of data to be included within this bundle. + data_selection: HashSet, + + // The set of sets to be included within this bundle. + // + // NOTE: This selection is only considered if "data_selection" requests + // data from specific sleds. + sled_selection: HashSet, + + // The set of ereports to be included within this bundle. + // + // "None" causes ereports to be skipped. 
ereport_query: Option, } +impl BundleRequest { + fn include_reconfigurator_data(&self) -> bool { + self.data_selection.contains(&BundleDataCategory::Reconfigurator) + } + + fn include_host_info(&self) -> bool { + self.data_selection.contains(&BundleDataCategory::HostInfo) + } + + fn include_sled(&self, id: SledUuid) -> bool { + self.sled_selection.contains(&SledSelection::Specific(id)) + || self.sled_selection.contains(&SledSelection::All) + } + + fn include_sled_cubby_info(&self) -> bool { + self.data_selection.contains(&BundleDataCategory::SledCubbyInfo) + } + + fn include_sp_dumps(&self) -> bool { + self.data_selection.contains(&BundleDataCategory::SpDumps) + } +} + impl Default for BundleRequest { fn default() -> Self { Self { - skip_sled_info: false, transfer_chunk_size: CHUNK_SIZE, + data_selection: HashSet::from([ + BundleDataCategory::Reconfigurator, + BundleDataCategory::HostInfo, + BundleDataCategory::SledCubbyInfo, + BundleDataCategory::SpDumps, + ]), + sled_selection: HashSet::from([SledSelection::All]), ereport_query: Some(EreportFilters { start_time: Some(chrono::Utc::now() - chrono::Days::new(7)), ..EreportFilters::default() @@ -771,6 +829,10 @@ impl BundleCollection { &self, dir: &Utf8Path, ) -> anyhow::Result { + if !self.request.include_reconfigurator_data() { + return Ok(CollectionStepOutput::None); + } + // Collect reconfigurator state const NMAX_BLUEPRINTS: usize = 300; match reconfigurator_state_load( @@ -920,6 +982,10 @@ impl BundleCollection { mgs_client: &OnceCell>>, dir: &Utf8Path, ) -> anyhow::Result { + if !self.request.include_sled_cubby_info() { + return Ok(CollectionStepOutput::None); + } + let Some(mgs_client) = &**self.get_or_initialize_mgs_client(mgs_client).await else { @@ -945,6 +1011,10 @@ impl BundleCollection { mgs_client: &OnceCell>>, dir: &Utf8Path, ) -> anyhow::Result { + if !self.request.include_sp_dumps() { + return Ok(CollectionStepOutput::None); + } + let Some(mgs_client) = 
&**self.get_or_initialize_mgs_client(mgs_client).await else { @@ -984,6 +1054,10 @@ impl BundleCollection { sp: SpIdentifier, dir: &Utf8Path, ) -> anyhow::Result { + if !self.request.include_sp_dumps() { + return Ok(CollectionStepOutput::None); + } + save_sp_dumps(mgs_client, sp, dir) .await .with_context(|| format!("SP {} {}", sp.type_, sp.slot))?; @@ -1098,6 +1172,10 @@ impl BundleCollection { &self, all_sleds: &OnceCell>>>, ) -> anyhow::Result { + if !self.request.include_host_info() { + return Ok(CollectionStepOutput::None); + } + let Some(all_sleds) = self.get_or_initialize_all_sleds(all_sleds).await.as_deref() else { @@ -1107,6 +1185,10 @@ impl BundleCollection { let mut extra_steps: Vec<(&'static str, CollectionStepFn)> = vec![]; for sled in all_sleds { + if !self.request.include_sled(sled.id()) { + continue; + } + extra_steps.push(( "sled data", Box::new({ @@ -1135,6 +1217,12 @@ impl BundleCollection { sled: &nexus_db_model::Sled, dir: &Utf8Path, ) -> anyhow::Result { + if !self.request.include_host_info() + || !self.request.include_sled(sled.id()) + { + return Ok(CollectionStepOutput::None); + } + let log = &self.log; info!(&log, "Collecting bundle info from sled"; "sled" => %sled.id()); let sled_path = dir @@ -1146,10 +1234,6 @@ impl BundleCollection { tokio::fs::write(sled_path.join("sled.txt"), format!("{sled:?}")) .await?; - if self.request.skip_sled_info { - return Ok(CollectionStepOutput::None); - } - let Ok(sled_client) = nexus_networking::sled_client( &self.datastore, &self.opctx, @@ -2266,7 +2350,7 @@ mod test { let request = BundleRequest { // NOTE: The support bundle querying interface isn't supported on // the simulated sled agent (yet?) so we're skipping this step. - skip_sled_info: true, + sled_selection: HashSet::new(), ..Default::default() }; let report = collector @@ -2340,9 +2424,10 @@ mod test { // We're going to use a really small chunk size here to force the bundle // to get split up. 
let request = BundleRequest { - skip_sled_info: true, transfer_chunk_size: NonZeroU64::new(16).unwrap(), + sled_selection: HashSet::new(), ereport_query: None, + ..Default::default() }; let report = collector @@ -2430,8 +2515,10 @@ mod test { ); // Each time we call "collect_bundle", we collect a SINGLE bundle. - let request = - BundleRequest { skip_sled_info: true, ..Default::default() }; + let request = BundleRequest { + sled_selection: HashSet::new(), + ..Default::default() + }; let report = collector .collect_bundle(&opctx, &request) .await @@ -2579,8 +2666,10 @@ mod test { false, nexus.id(), ); - let request = - BundleRequest { skip_sled_info: true, ..Default::default() }; + let request = BundleRequest { + sled_selection: HashSet::new(), + ..Default::default() + }; let report = collector .collect_bundle(&opctx, &request) .await @@ -2726,8 +2815,10 @@ mod test { false, nexus.id(), ); - let request = - BundleRequest { skip_sled_info: true, ..Default::default() }; + let request = BundleRequest { + sled_selection: HashSet::new(), + ..Default::default() + }; let report = collector .collect_bundle(&opctx, &request) .await @@ -2811,8 +2902,10 @@ mod test { false, nexus.id(), ); - let request = - BundleRequest { skip_sled_info: true, ..Default::default() }; + let request = BundleRequest { + sled_selection: HashSet::new(), + ..Default::default() + }; let report = collector .collect_bundle(&opctx, &request) .await @@ -2897,8 +2990,10 @@ mod test { ); // Collect the bundle - let request = - BundleRequest { skip_sled_info: true, ..Default::default() }; + let request = BundleRequest { + sled_selection: HashSet::new(), + ..Default::default() + }; let report = collector .collect_bundle(&opctx, &request) .await From 0cb9a6a303da78f39eedd1d4fd230ca264bb4371 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 1 Dec 2025 16:08:36 -0800 Subject: [PATCH 3/8] docs --- .../src/app/background/tasks/support_bundle_collector.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git 
a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 9692db353b4..9a1cc86f909 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -90,13 +90,22 @@ fn authz_support_bundle_from_id(id: SupportBundleUuid) -> authz::SupportBundle { // Categories should be additive. #[derive(Debug, Clone, Hash, Eq, PartialEq)] enum BundleDataCategory { + // Collects reconfigurator state (some of the latest blueprints, + // information about the target blueprint). Reconfigurator, + // Collects info from sled agents, running a handful of + // diagnostic commands (e.g., zoneadm, dladm, etc). HostInfo, + // Collects sled serial numbers, cubby numbers, and UUIDs. SledCubbyInfo, + // Saves task dumps from SPs. SpDumps, } // The set of sleds to include +// +// Multiple values of this enum are joined together into a HashSet. +// Therefore "SledSelection::All" overrides specific sleds. #[derive(Debug, Clone, Hash, Eq, PartialEq)] enum SledSelection { All, From 015f2e971641b59692839fcc3ae03f4406316c7e Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 2 Dec 2025 11:00:20 -0800 Subject: [PATCH 4/8] meh --- nexus/src/app/background/tasks/support_bundle_collector.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 9a1cc86f909..0adc94f37e1 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -1095,7 +1095,7 @@ impl BundleCollection { // Perform the work of collecting the support bundle into a temporary directory // - // "dir" is a directory where data can be stored. + // "dir" is an output directory where data can be stored. 
// // If a partial bundle can be collected, it should be returned as // an Ok(SupportBundleCollectionReport). Any failures from this function From 92415e172d4436e920533d548d1b1fa1bea92db1 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 2 Dec 2025 11:24:46 -0800 Subject: [PATCH 5/8] Improve the support bundle report on a step-by-step basis --- dev-tools/omdb/src/bin/omdb/nexus.rs | 15 ++ .../tasks/support_bundle_collector.rs | 180 ++++++++++++------ .../integration_tests/support_bundles.rs | 43 ++--- nexus/types/src/internal_api/background.rs | 31 +++ 4 files changed, 186 insertions(+), 83 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index 6a842a1d7e7..d8d6bab1bc7 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -98,6 +98,7 @@ use std::fs::OpenOptions; use std::os::unix::fs::PermissionsExt; use std::str::FromStr; use std::sync::Arc; +use std::time::Duration; use support_bundle_viewer::LocalFileAccess; use support_bundle_viewer::SupportBundleAccessor; use tabled::Tabled; @@ -2612,6 +2613,7 @@ fn print_task_support_bundle_collector(details: &serde_json::Value) { listed_in_service_sleds, listed_sps, activated_in_db_ok, + mut steps, ereports, }) = collection_report { @@ -2623,6 +2625,19 @@ fn print_task_support_bundle_collector(details: &serde_json::Value) { println!( " Bundle was able to list service processors: {listed_sps}" ); + + steps.sort_unstable_by_key(|s| s.start); + for step in steps { + let duration = (step.end - step.start) + .to_std() + .unwrap_or(Duration::from_millis(0)); + println!( + " Step {} ({}ms): {}", + step.name, + duration.as_millis(), + step.status + ); + } println!( " Bundle was activated in the database: {activated_in_db_ok}" ); diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 0adc94f37e1..2b63ac6e93b 100644 --- 
a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -6,6 +6,7 @@ use crate::app::background::BackgroundTask; use anyhow::Context; +use anyhow::bail; use base64::Engine; use camino::Utf8DirEntry; use camino::Utf8Path; @@ -13,6 +14,8 @@ use camino::Utf8PathBuf; use camino_tempfile::Utf8TempDir; use camino_tempfile::tempdir_in; use camino_tempfile::tempfile_in; +use chrono::DateTime; +use chrono::Utc; use futures::FutureExt; use futures::StreamExt; use futures::future::BoxFuture; @@ -38,6 +41,8 @@ use nexus_types::fm::Ereport; use nexus_types::identity::Asset; use nexus_types::internal_api::background::SupportBundleCleanupReport; use nexus_types::internal_api::background::SupportBundleCollectionReport; +use nexus_types::internal_api::background::SupportBundleCollectionStep; +use nexus_types::internal_api::background::SupportBundleCollectionStepStatus; use nexus_types::internal_api::background::SupportBundleEreportStatus; use omicron_common::api::external::DataPageParams; use omicron_common::api::external::Error; @@ -563,47 +568,115 @@ type CollectionStepFn = Box< + Send, >; -enum CollectionStepOutput { - Ereports(SupportBundleEreportStatus), - SavingSpDumps { listed_sps: bool }, - // NOTE: The distinction between this and "Spawn" is pretty artificial - - // it's just to preserve a part of the report which says "we tried to - // list in-service sleds". - // - // If we changed the collection report, this could easily be combined - // with the "Spawn" variant. 
- SpawnSleds { extra_steps: Vec<(&'static str, CollectionStepFn)> }, - Spawn { extra_steps: Vec<(&'static str, CollectionStepFn)> }, - None, +struct CollectionStep { + name: String, + step_fn: CollectionStepFn, +} + +impl CollectionStep { + fn new(name: impl Into, step_fn: CollectionStepFn) -> Self { + Self { name: name.into(), step_fn } + } + + async fn run( + self, + collection: &Arc, + output: &Utf8Path, + ) -> CompletedCollectionStep { + let start = Utc::now(); + + let output = (self.step_fn)(collection, output) + .await + .inspect_err(|err| { + warn!( + collection.log, + "Step failed"; + "name" => &self.name, + InlineErrorChain::new(err.as_ref()), + ); + }) + .unwrap_or_else(|err| CollectionStepOutput::Failed(err)); + + let end = Utc::now(); + + CompletedCollectionStep { name: self.name, start, end, output } + } +} + +struct CompletedCollectionStep { + name: String, + start: DateTime, + end: DateTime, + output: CollectionStepOutput, } -impl CollectionStepOutput { +impl CompletedCollectionStep { // Updates the collection report based on the output of a collection step, // and possibly extends the set of all steps to be executed. 
fn process( self, report: &mut SupportBundleCollectionReport, - steps: &mut Vec<(&'static str, CollectionStepFn)>, + steps: &mut Vec, ) { - match self { + use SupportBundleCollectionStepStatus as Status; + + let status = match self.output { + CollectionStepOutput::Skipped => Status::Skipped, + CollectionStepOutput::Failed(err) => { + Status::Failed(err.to_string()) + } CollectionStepOutput::Ereports(status) => { report.ereports = Some(status); + Status::Ok } CollectionStepOutput::SavingSpDumps { listed_sps } => { report.listed_sps = listed_sps; + Status::Ok } CollectionStepOutput::SpawnSleds { extra_steps } => { report.listed_in_service_sleds = true; steps.extend(extra_steps); + Status::Ok } CollectionStepOutput::Spawn { extra_steps } => { steps.extend(extra_steps); + Status::Ok } - CollectionStepOutput::None => (), - } + CollectionStepOutput::None => Status::Ok, + }; + + // Add information about this completed step to the bundle report. + let step = SupportBundleCollectionStep { + name: self.name, + start: self.start, + end: self.end, + status, + }; + report.steps.push(step); } } +enum CollectionStepOutput { + // The step was intentionally not executed + Skipped, + // The step encountered a fatal error and could not complete. + // + // It may have still saved a partial set of data to the bundle. + Failed(anyhow::Error), + Ereports(SupportBundleEreportStatus), + SavingSpDumps { listed_sps: bool }, + // NOTE: The distinction between this and "Spawn" is pretty artificial - + // it's just to preserve a part of the report which says "we tried to + // list in-service sleds". + // + // If we changed the collection report, this could easily be combined + // with the "Spawn" variant. + SpawnSleds { extra_steps: Vec }, + Spawn { extra_steps: Vec }, + // The step completed with nothing to report, and no follow-up steps + None, +} + impl BundleCollection { // Collect the bundle within Nexus, and store it on a target sled. 
async fn collect_bundle_and_store_on_sled( @@ -856,7 +929,7 @@ impl BundleCollection { async fn run_collect_bundle_steps( self: &Arc, output: &Utf8TempDir, - mut steps: Vec<(&'static str, CollectionStepFn)>, + mut steps: Vec, ) -> SupportBundleCollectionReport { let mut report = SupportBundleCollectionReport::new(self.bundle.id.into()); @@ -867,34 +940,25 @@ impl BundleCollection { loop { // Process all the currently-planned steps - while let Some((step_name, step)) = steps.pop() { + while let Some(step) = steps.pop() { let previous_result = tasks.spawn({ let collection = self.clone(); let dir = output.path().to_path_buf(); async move { - debug!(collection.log, "Running step"; "name" => &step_name); - step(&collection, dir.as_path()).await.inspect_err(|err| { - warn!( - collection.log, - "Step failed"; - "name" => &step_name, - InlineErrorChain::new(err.as_ref()), - ); - }) + debug!(collection.log, "Running step"; "name" => &step.name); + step.run(&collection, dir.as_path()).await } }).await; - if let Some(Ok(output)) = previous_result { + if let Some(output) = previous_result { output.process(&mut report, &mut steps); }; } // If we've run out of tasks to spawn, join any of the previously // spawned tasks, if any exist. - if let Some(previous_result) = tasks.join_next().await { - if let Ok(output) = previous_result { - output.process(&mut report, &mut steps); - }; + if let Some(output) = tasks.join_next().await { + output.process(&mut report, &mut steps); // As soon as any task completes, see if we can spawn more work // immediately. 
This ensures that the ParallelTaskSet is @@ -926,7 +990,7 @@ impl BundleCollection { dir: &Utf8Path, ) -> anyhow::Result { if !self.request.include_reconfigurator_data() { - return Ok(CollectionStepOutput::None); + return Ok(CollectionStepOutput::Skipped); } // Collect reconfigurator state @@ -1011,17 +1075,13 @@ impl BundleCollection { dir: &Utf8Path, ) -> anyhow::Result { if !self.request.include_sled_cubby_info() { - return Ok(CollectionStepOutput::None); + return Ok(CollectionStepOutput::Skipped); } let Some(mgs_client) = &**self.get_or_initialize_mgs_client(mgs_client).await else { - warn!( - self.log, - "No MGS client, skipping sled cubby info collection" - ); - return Ok(CollectionStepOutput::None); + bail!("Could not initialize MGS client"); }; let nexus_sleds = self .get_or_initialize_all_sleds(all_sleds) @@ -1040,14 +1100,13 @@ impl BundleCollection { dir: &Utf8Path, ) -> anyhow::Result { if !self.request.include_sp_dumps() { - return Ok(CollectionStepOutput::None); + return Ok(CollectionStepOutput::Skipped); } let Some(mgs_client) = &**self.get_or_initialize_mgs_client(mgs_client).await else { - warn!(self.log, "No MGS client, skipping SP task dump collection"); - return Ok(CollectionStepOutput::None); + bail!("Could not initialize MGS client"); }; let sp_dumps_dir = dir.join("sp_task_dumps"); @@ -1055,9 +1114,9 @@ impl BundleCollection { format!("Failed to create SP task dump directory {sp_dumps_dir}") })?; - let mut extra_steps: Vec<(&'static str, CollectionStepFn)> = vec![]; + let mut extra_steps: Vec = vec![]; for sp in get_available_sps(&mgs_client).await? 
{ - extra_steps.push(( + extra_steps.push(CollectionStep::new( "SP dump", Box::new({ let mgs_client = mgs_client.clone(); @@ -1083,7 +1142,7 @@ impl BundleCollection { dir: &Utf8Path, ) -> anyhow::Result { if !self.request.include_sp_dumps() { - return Ok(CollectionStepOutput::None); + return Ok(CollectionStepOutput::Skipped); } save_sp_dumps(mgs_client, sp, dir).await.with_context(|| { @@ -1124,26 +1183,26 @@ impl BundleCollection { // Shared, lazy, fallible initialization for MGS client let mgs_client: OnceCell>> = OnceCell::new(); - let steps: Vec<(&str, CollectionStepFn)> = vec![ - ( + let steps: Vec = vec![ + CollectionStep::new( "bundle id", Box::new(|collection, dir| { collection.collect_bundle_id(dir).boxed() }), ), - ( + CollectionStep::new( "reconfigurator state", Box::new(|collection, dir| { collection.collect_reconfigurator_state(dir).boxed() }), ), - ( + CollectionStep::new( "ereports", Box::new(|collection, dir| { collection.collect_ereports(dir).boxed() }), ), - ( + CollectionStep::new( "sled cubby info", Box::new({ let all_sleds = all_sleds.clone(); @@ -1162,7 +1221,7 @@ impl BundleCollection { } }), ), - ( + CollectionStep::new( "spawn steps to query all SP dumps", Box::new({ let mgs_client = mgs_client.clone(); @@ -1176,7 +1235,7 @@ impl BundleCollection { } }), ), - ( + CollectionStep::new( "spawn steps to query all sleds", Box::new({ let all_sleds = all_sleds.clone(); @@ -1198,23 +1257,22 @@ impl BundleCollection { all_sleds: &OnceCell>>>, ) -> anyhow::Result { if !self.request.include_host_info() { - return Ok(CollectionStepOutput::None); + return Ok(CollectionStepOutput::Skipped); } let Some(all_sleds) = self.get_or_initialize_all_sleds(all_sleds).await.as_deref() else { - warn!(self.log, "Could not read list of sleds"); - return Ok(CollectionStepOutput::None); + bail!("Could not read list of sleds"); }; - let mut extra_steps: Vec<(&'static str, CollectionStepFn)> = vec![]; + let mut extra_steps: Vec = vec![]; for sled in all_sleds { if 
!self.request.include_sled(sled.id()) { continue; } - extra_steps.push(( + extra_steps.push(CollectionStep::new( "sled data", Box::new({ let sled = sled.clone(); @@ -1245,7 +1303,7 @@ impl BundleCollection { if !self.request.include_host_info() || !self.request.include_sled(sled.id()) { - return Ok(CollectionStepOutput::None); + return Ok(CollectionStepOutput::Skipped); } let log = &self.log; @@ -1272,7 +1330,7 @@ impl BundleCollection { "Could not contact sled", ) .await?; - return Ok(CollectionStepOutput::None); + bail!("Could not contact sled"); }; // NB: As new sled-diagnostic commands are added they should @@ -1386,7 +1444,7 @@ impl BundleCollection { ) -> anyhow::Result { let Some(ref ereport_filters) = self.request.ereport_query else { debug!(self.log, "Support bundle: ereports not requested"); - return Ok(CollectionStepOutput::None); + return Ok(CollectionStepOutput::Skipped); }; let ereports_dir = dir.join("ereports"); let mut status = SupportBundleEreportStatus::default(); diff --git a/nexus/tests/integration_tests/support_bundles.rs b/nexus/tests/integration_tests/support_bundles.rs index 716bc228ca9..75bb4bfa64d 100644 --- a/nexus/tests/integration_tests/support_bundles.rs +++ b/nexus/tests/integration_tests/support_bundles.rs @@ -486,18 +486,18 @@ async fn test_support_bundle_lifecycle(cptestctx: &ControlPlaneTestContext) { output.cleanup_report, Some(SupportBundleCleanupReport { ..Default::default() }) ); + + let report = output.collection_report.as_ref().expect("Missing report"); + assert_eq!(report.bundle, bundle.id); + assert!(report.listed_in_service_sleds); + assert!(report.listed_sps); + assert!(report.activated_in_db_ok); assert_eq!( - output.collection_report, - Some(SupportBundleCollectionReport { - bundle: bundle.id, - listed_in_service_sleds: true, - listed_sps: true, - activated_in_db_ok: true, - ereports: Some(SupportBundleEreportStatus { - n_collected: 0, - n_found: 0, - errors: Vec::new() - }) + report.ereports, + 
Some(SupportBundleEreportStatus { + n_collected: 0, + n_found: 0, + errors: Vec::new() }) ); let bundle = bundle_get(&client, bundle.id).await.unwrap(); @@ -588,18 +588,17 @@ async fn test_support_bundle_range_requests( // Finish collection, activate the bundle. let output = activate_bundle_collection_background_task(&cptestctx).await; assert_eq!(output.collection_err, None); + let report = output.collection_report.as_ref().expect("Missing report"); + assert_eq!(report.bundle, bundle.id); + assert!(report.listed_in_service_sleds); + assert!(report.listed_sps); + assert!(report.activated_in_db_ok); assert_eq!( - output.collection_report, - Some(SupportBundleCollectionReport { - bundle: bundle.id, - listed_in_service_sleds: true, - listed_sps: true, - activated_in_db_ok: true, - ereports: Some(SupportBundleEreportStatus { - n_collected: 0, - n_found: 0, - errors: Vec::new() - }) + report.ereports, + Some(SupportBundleEreportStatus { + n_collected: 0, + n_found: 0, + errors: Vec::new() }) ); let bundle = bundle_get(&client, bundle.id).await.unwrap(); diff --git a/nexus/types/src/internal_api/background.rs b/nexus/types/src/internal_api/background.rs index 08d343182e6..42264d0411b 100644 --- a/nexus/types/src/internal_api/background.rs +++ b/nexus/types/src/internal_api/background.rs @@ -281,11 +281,41 @@ pub struct SupportBundleCollectionReport { /// True iff the bundle was successfully made 'active' in the database. pub activated_in_db_ok: bool, + /// All steps taken, alongside their timing information, when collecting the + /// bundle. + pub steps: Vec, + /// Status of ereport collection, or `None` if no ereports were requested /// for this support bundle. 
pub ereports: Option, } +#[derive(Debug, Deserialize, Serialize, PartialEq, Eq)] +pub struct SupportBundleCollectionStep { + pub name: String, + pub start: DateTime, + pub end: DateTime, + pub status: SupportBundleCollectionStepStatus, +} + +#[derive(Debug, Deserialize, Serialize, PartialEq, Eq)] +pub enum SupportBundleCollectionStepStatus { + Ok, + Skipped, + Failed(String), +} + +impl std::fmt::Display for SupportBundleCollectionStepStatus { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + use SupportBundleCollectionStepStatus::*; + match self { + Ok => write!(f, "ok"), + Skipped => write!(f, "skipped"), + Failed(why) => write!(f, "failed: {why}"), + } + } +} + #[derive(Debug, Default, Deserialize, Serialize, PartialEq, Eq)] pub struct SupportBundleEreportStatus { /// The total number of ereports found that match the requested filters. @@ -309,6 +339,7 @@ impl SupportBundleCollectionReport { listed_in_service_sleds: false, listed_sps: false, activated_in_db_ok: false, + steps: vec![], ereports: None, } } From 4d59114117248febe5ca8a1aa8981fa9f0d7b289 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 2 Dec 2025 17:11:36 -0800 Subject: [PATCH 6/8] Better table output, step labels --- dev-tools/omdb/src/bin/omdb/nexus.rs | 36 +++++++++++++------ .../tasks/support_bundle_collector.rs | 4 +-- .../integration_tests/support_bundles.rs | 22 ++++++++++++ 3 files changed, 50 insertions(+), 12 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index d8d6bab1bc7..9833a7bf141 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -2626,17 +2626,33 @@ fn print_task_support_bundle_collector(details: &serde_json::Value) { " Bundle was able to list service processors: {listed_sps}" ); + #[derive(Tabled)] + #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] + struct StepRow { + step_name: String, + start_time: String, + duration: String, + status: String, + } + 
steps.sort_unstable_by_key(|s| s.start); - for step in steps { - let duration = (step.end - step.start) - .to_std() - .unwrap_or(Duration::from_millis(0)); - println!( - " Step {} ({}ms): {}", - step.name, - duration.as_millis(), - step.status - ); + let rows: Vec = steps + .into_iter() + .map(|step| { + let duration = (step.end - step.start) + .to_std() + .unwrap_or(Duration::from_millis(0)); + StepRow { + step_name: step.name, + start_time: step.start.to_rfc3339(), + duration: format!("{:.3}s", duration.as_secs_f64()), + status: step.status.to_string(), + } + }) + .collect(); + + if !rows.is_empty() { + println!("\n{}", tabled::Table::new(rows)); } println!( " Bundle was activated in the database: {activated_in_db_ok}" diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 2b63ac6e93b..41babc3838b 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -1117,7 +1117,7 @@ impl BundleCollection { let mut extra_steps: Vec = vec![]; for sp in get_available_sps(&mgs_client).await? 
{ extra_steps.push(CollectionStep::new( - "SP dump", + format!("SP dump for {:?}", sp), Box::new({ let mgs_client = mgs_client.clone(); move |collection, dir| { @@ -1273,7 +1273,7 @@ impl BundleCollection { } extra_steps.push(CollectionStep::new( - "sled data", + format!("sled data for sled {}", sled.id()), Box::new({ let sled = sled.clone(); move |collection, dir| { diff --git a/nexus/tests/integration_tests/support_bundles.rs b/nexus/tests/integration_tests/support_bundles.rs index 75bb4bfa64d..ade2cbdb2c9 100644 --- a/nexus/tests/integration_tests/support_bundles.rs +++ b/nexus/tests/integration_tests/support_bundles.rs @@ -500,6 +500,17 @@ async fn test_support_bundle_lifecycle(cptestctx: &ControlPlaneTestContext) { errors: Vec::new() }) ); + + // Verify that steps were recorded with reasonable timing data + assert!(!report.steps.is_empty(), "Should have recorded some steps"); + for step in &report.steps { + assert!( + step.end >= step.start, + "Step '{}' end time should be >= start time", + step.name + ); + } + let bundle = bundle_get(&client, bundle.id).await.unwrap(); assert_eq!(bundle.state, SupportBundleState::Active); @@ -601,6 +612,17 @@ async fn test_support_bundle_range_requests( errors: Vec::new() }) ); + + // Verify that steps were recorded with reasonable timing data + assert!(!report.steps.is_empty(), "Should have recorded some steps"); + for step in &report.steps { + assert!( + step.end >= step.start, + "Step '{}' end time should be >= start time", + step.name + ); + } + let bundle = bundle_get(&client, bundle.id).await.unwrap(); assert_eq!(bundle.state, SupportBundleState::Active); From e69f392cae63f102cf5ff40752db914eaeda1666 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 3 Dec 2025 12:58:08 -0800 Subject: [PATCH 7/8] [support bundle] Simplify report, relying on new 'steps' infrastructure --- dev-tools/omdb/src/bin/omdb/nexus.rs | 8 -- .../tasks/support_bundle_collector.rs | 94 ++++++++++++------- .../integration_tests/support_bundles.rs | 
29 +++++- nexus/types/src/internal_api/background.rs | 21 +++-- 4 files changed, 97 insertions(+), 55 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index 9833a7bf141..b17751821a3 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -2610,8 +2610,6 @@ fn print_task_support_bundle_collector(details: &serde_json::Value) { if let Some(SupportBundleCollectionReport { bundle, - listed_in_service_sleds, - listed_sps, activated_in_db_ok, mut steps, ereports, @@ -2619,12 +2617,6 @@ fn print_task_support_bundle_collector(details: &serde_json::Value) { { println!(" Support Bundle Collection Report:"); println!(" Bundle ID: {bundle}"); - println!( - " Bundle was able to list in-service sleds: {listed_in_service_sleds}" - ); - println!( - " Bundle was able to list service processors: {listed_sps}" - ); #[derive(Tabled)] #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 41babc3838b..5eb9b6f6370 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -629,15 +629,6 @@ impl CompletedCollectionStep { report.ereports = Some(status); Status::Ok } - CollectionStepOutput::SavingSpDumps { listed_sps } => { - report.listed_sps = listed_sps; - Status::Ok - } - CollectionStepOutput::SpawnSleds { extra_steps } => { - report.listed_in_service_sleds = true; - steps.extend(extra_steps); - Status::Ok - } CollectionStepOutput::Spawn { extra_steps } => { steps.extend(extra_steps); Status::Ok @@ -664,14 +655,7 @@ enum CollectionStepOutput { // It may have still saved a partial set of data to the bundle. 
Failed(anyhow::Error), Ereports(SupportBundleEreportStatus), - SavingSpDumps { listed_sps: bool }, - // NOTE: The distinction between this and "Spawn" is pretty artificial - - // it's just to preserve a part of the report which says "we tried to - // list in-service sleds". - // - // If we changed the collection report, this could easily be combined - // with the "Spawn" variant. - SpawnSleds { extra_steps: Vec }, + // The step spawned additional steps to execute Spawn { extra_steps: Vec }, // The step completed with nothing to report, and no follow-up steps None, @@ -1149,7 +1133,7 @@ impl BundleCollection { format!("failed to save SP dump from: {} {}", sp.type_, sp.slot) })?; - Ok(CollectionStepOutput::SavingSpDumps { listed_sps: true }) + Ok(CollectionStepOutput::None) } // Perform the work of collecting the support bundle into a temporary directory @@ -1185,25 +1169,25 @@ impl BundleCollection { let steps: Vec = vec![ CollectionStep::new( - "bundle id", + SupportBundleCollectionStep::STEP_BUNDLE_ID, Box::new(|collection, dir| { collection.collect_bundle_id(dir).boxed() }), ), CollectionStep::new( - "reconfigurator state", + SupportBundleCollectionStep::STEP_RECONFIGURATOR_STATE, Box::new(|collection, dir| { collection.collect_reconfigurator_state(dir).boxed() }), ), CollectionStep::new( - "ereports", + SupportBundleCollectionStep::STEP_EREPORTS, Box::new(|collection, dir| { collection.collect_ereports(dir).boxed() }), ), CollectionStep::new( - "sled cubby info", + SupportBundleCollectionStep::STEP_SLED_CUBBY_INFO, Box::new({ let all_sleds = all_sleds.clone(); let mgs_client = mgs_client.clone(); @@ -1222,7 +1206,7 @@ impl BundleCollection { }), ), CollectionStep::new( - "spawn steps to query all SP dumps", + SupportBundleCollectionStep::STEP_SPAWN_SP_DUMPS, Box::new({ let mgs_client = mgs_client.clone(); move |collection, dir| { @@ -1236,7 +1220,7 @@ impl BundleCollection { }), ), CollectionStep::new( - "spawn steps to query all sleds", + 
SupportBundleCollectionStep::STEP_SPAWN_SLEDS, Box::new({ let all_sleds = all_sleds.clone(); move |collection, _| { @@ -1286,7 +1270,7 @@ impl BundleCollection { )); } - return Ok(CollectionStepOutput::SpawnSleds { extra_steps }); + return Ok(CollectionStepOutput::Spawn { extra_steps }); } // Collect data from a sled, storing it into a directory that will @@ -2425,8 +2409,16 @@ mod test { .expect("Collection should have succeeded under test") .expect("Collecting the bundle should have generated a report"); assert_eq!(report.bundle, bundle.id.into()); - assert!(report.listed_in_service_sleds); - assert!(report.listed_sps); + // Verify that we spawned steps to query sleds and SPs + let step_names: Vec<_> = + report.steps.iter().map(|s| s.name.as_str()).collect(); + assert!( + step_names.contains(&SupportBundleCollectionStep::STEP_SPAWN_SLEDS) + ); + assert!( + step_names + .contains(&SupportBundleCollectionStep::STEP_SPAWN_SP_DUMPS) + ); assert!(report.activated_in_db_ok); assert_eq!( report.ereports, @@ -2502,8 +2494,16 @@ mod test { .expect("Collection should have succeeded under test") .expect("Collecting the bundle should have generated a report"); assert_eq!(report.bundle, bundle.id.into()); - assert!(report.listed_in_service_sleds); - assert!(report.listed_sps); + // Verify that we spawned steps to query sleds and SPs + let step_names: Vec<_> = + report.steps.iter().map(|s| s.name.as_str()).collect(); + assert!( + step_names.contains(&SupportBundleCollectionStep::STEP_SPAWN_SLEDS) + ); + assert!( + step_names + .contains(&SupportBundleCollectionStep::STEP_SPAWN_SP_DUMPS) + ); assert!(report.activated_in_db_ok); let observed_bundle = datastore @@ -2591,8 +2591,16 @@ mod test { .expect("Collection should have succeeded under test") .expect("Collecting the bundle should have generated a report"); assert_eq!(report.bundle, bundle1.id.into()); - assert!(report.listed_in_service_sleds); - assert!(report.listed_sps); + // Verify that we spawned steps to query sleds 
and SPs + let step_names: Vec<_> = + report.steps.iter().map(|s| s.name.as_str()).collect(); + assert!( + step_names.contains(&SupportBundleCollectionStep::STEP_SPAWN_SLEDS) + ); + assert!( + step_names + .contains(&SupportBundleCollectionStep::STEP_SPAWN_SP_DUMPS) + ); assert!(report.activated_in_db_ok); // This is observable by checking the state of bundle1 and bundle2: @@ -2614,8 +2622,16 @@ mod test { .expect("Collection should have succeeded under test") .expect("Collecting the bundle should have generated a report"); assert_eq!(report.bundle, bundle2.id.into()); - assert!(report.listed_in_service_sleds); - assert!(report.listed_sps); + // Verify that we spawned steps to query sleds and SPs + let step_names: Vec<_> = + report.steps.iter().map(|s| s.name.as_str()).collect(); + assert!( + step_names.contains(&SupportBundleCollectionStep::STEP_SPAWN_SLEDS) + ); + assert!( + step_names + .contains(&SupportBundleCollectionStep::STEP_SPAWN_SP_DUMPS) + ); assert!(report.activated_in_db_ok); // After another collection request, we'll see that both bundles have @@ -2742,8 +2758,16 @@ mod test { .expect("Collection should have succeeded under test") .expect("Collecting the bundle should have generated a report"); assert_eq!(report.bundle, bundle.id.into()); - assert!(report.listed_in_service_sleds); - assert!(report.listed_sps); + // Verify that we spawned steps to query sleds and SPs + let step_names: Vec<_> = + report.steps.iter().map(|s| s.name.as_str()).collect(); + assert!( + step_names.contains(&SupportBundleCollectionStep::STEP_SPAWN_SLEDS) + ); + assert!( + step_names + .contains(&SupportBundleCollectionStep::STEP_SPAWN_SP_DUMPS) + ); assert!(report.activated_in_db_ok); // Cancel the bundle after collection has completed diff --git a/nexus/tests/integration_tests/support_bundles.rs b/nexus/tests/integration_tests/support_bundles.rs index ade2cbdb2c9..80ec8af191f 100644 --- a/nexus/tests/integration_tests/support_bundles.rs +++ 
b/nexus/tests/integration_tests/support_bundles.rs @@ -19,6 +19,7 @@ use nexus_types::external_api::shared::SupportBundleInfo; use nexus_types::external_api::shared::SupportBundleState; use nexus_types::internal_api::background::SupportBundleCleanupReport; use nexus_types::internal_api::background::SupportBundleCollectionReport; +use nexus_types::internal_api::background::SupportBundleCollectionStep; use nexus_types::internal_api::background::SupportBundleEreportStatus; use omicron_uuid_kinds::SupportBundleUuid; use serde::Deserialize; @@ -489,8 +490,6 @@ async fn test_support_bundle_lifecycle(cptestctx: &ControlPlaneTestContext) { let report = output.collection_report.as_ref().expect("Missing report"); assert_eq!(report.bundle, bundle.id); - assert!(report.listed_in_service_sleds); - assert!(report.listed_sps); assert!(report.activated_in_db_ok); assert_eq!( report.ereports, @@ -511,6 +510,18 @@ async fn test_support_bundle_lifecycle(cptestctx: &ControlPlaneTestContext) { ); } + // Verify that we successfully spawned steps to query sleds and SPs + let step_names: Vec<_> = + report.steps.iter().map(|s| s.name.as_str()).collect(); + assert!( + step_names.contains(&SupportBundleCollectionStep::STEP_SPAWN_SLEDS), + "Should have attempted to list in-service sleds" + ); + assert!( + step_names.contains(&SupportBundleCollectionStep::STEP_SPAWN_SP_DUMPS), + "Should have attempted to list service processors" + ); + let bundle = bundle_get(&client, bundle.id).await.unwrap(); assert_eq!(bundle.state, SupportBundleState::Active); @@ -601,8 +612,6 @@ async fn test_support_bundle_range_requests( assert_eq!(output.collection_err, None); let report = output.collection_report.as_ref().expect("Missing report"); assert_eq!(report.bundle, bundle.id); - assert!(report.listed_in_service_sleds); - assert!(report.listed_sps); assert!(report.activated_in_db_ok); assert_eq!( report.ereports, @@ -623,6 +632,18 @@ async fn test_support_bundle_range_requests( ); } + // Verify that we 
successfully spawned steps to query sleds and SPs + let step_names: Vec<_> = + report.steps.iter().map(|s| s.name.as_str()).collect(); + assert!( + step_names.contains(&SupportBundleCollectionStep::STEP_SPAWN_SLEDS), + "Should have attempted to list in-service sleds" + ); + assert!( + step_names.contains(&SupportBundleCollectionStep::STEP_SPAWN_SP_DUMPS), + "Should have attempted to list service processors" + ); + let bundle = bundle_get(&client, bundle.id).await.unwrap(); assert_eq!(bundle.state, SupportBundleState::Active); diff --git a/nexus/types/src/internal_api/background.rs b/nexus/types/src/internal_api/background.rs index 42264d0411b..dfe008198f9 100644 --- a/nexus/types/src/internal_api/background.rs +++ b/nexus/types/src/internal_api/background.rs @@ -272,12 +272,6 @@ pub struct SupportBundleCleanupReport { pub struct SupportBundleCollectionReport { pub bundle: SupportBundleUuid, - /// True iff we could list in-service sleds - pub listed_in_service_sleds: bool, - - /// True iff we could list the service processors. - pub listed_sps: bool, - /// True iff the bundle was successfully made 'active' in the database. pub activated_in_db_ok: bool, @@ -298,6 +292,19 @@ pub struct SupportBundleCollectionStep { pub status: SupportBundleCollectionStepStatus, } +impl SupportBundleCollectionStep { + /// Step name constants for the main collection steps. + /// + /// These are used both when creating steps and when validating in tests. 
+ pub const STEP_BUNDLE_ID: &'static str = "bundle id"; + pub const STEP_RECONFIGURATOR_STATE: &'static str = "reconfigurator state"; + pub const STEP_EREPORTS: &'static str = "ereports"; + pub const STEP_SLED_CUBBY_INFO: &'static str = "sled cubby info"; + pub const STEP_SPAWN_SP_DUMPS: &'static str = + "spawn steps to query all SP dumps"; + pub const STEP_SPAWN_SLEDS: &'static str = "spawn steps to query all sleds"; +} + #[derive(Debug, Deserialize, Serialize, PartialEq, Eq)] pub enum SupportBundleCollectionStepStatus { Ok, @@ -336,8 +343,6 @@ impl SupportBundleCollectionReport { pub fn new(bundle: SupportBundleUuid) -> Self { Self { bundle, - listed_in_service_sleds: false, - listed_sps: false, activated_in_db_ok: false, steps: vec![], ereports: None, From 1886293ab4d50633bea50058e848bdf70ba1f64c Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 3 Dec 2025 13:25:00 -0800 Subject: [PATCH 8/8] improve comment --- nexus/src/app/background/tasks/support_bundle_collector.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 0adc94f37e1..34aa31862c4 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -124,10 +124,10 @@ struct BundleRequest { // The set of data to be included within this bundle. data_selection: HashSet, - // The set of sets to be included within this bundle. + // The set of sleds to be included within this bundle. // // NOTE: This selection is only considered if "data_selection" requests - // data from specific sleds. + // data from sleds. sled_selection: HashSet, // The set of ereports to be included within this bundle.