diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index 6a842a1d7e7..23185f4bcef 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -72,6 +72,7 @@ use nexus_types::internal_api::background::SitrepGcStatus; use nexus_types::internal_api::background::SitrepLoadStatus; use nexus_types::internal_api::background::SupportBundleCleanupReport; use nexus_types::internal_api::background::SupportBundleCollectionReport; +use nexus_types::internal_api::background::SupportBundleCollectionStepStatus; use nexus_types::internal_api::background::SupportBundleEreportStatus; use nexus_types::internal_api::background::TufArtifactReplicationCounters; use nexus_types::internal_api::background::TufArtifactReplicationRequest; @@ -98,6 +99,7 @@ use std::fs::OpenOptions; use std::os::unix::fs::PermissionsExt; use std::str::FromStr; use std::sync::Arc; +use std::time::Duration; use support_bundle_viewer::LocalFileAccess; use support_bundle_viewer::SupportBundleAccessor; use tabled::Tabled; @@ -2612,6 +2614,7 @@ fn print_task_support_bundle_collector(details: &serde_json::Value) { listed_in_service_sleds, listed_sps, activated_in_db_ok, + mut steps, ereports, }) = collection_report { @@ -2623,6 +2626,35 @@ fn print_task_support_bundle_collector(details: &serde_json::Value) { println!( " Bundle was able to list service processors: {listed_sps}" ); + + #[derive(Tabled)] + #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] + struct StepRow { + step_name: String, + start_time: DateTime, + duration: String, + status: SupportBundleCollectionStepStatus, + } + + steps.sort_unstable_by_key(|s| s.start); + let rows: Vec = steps + .into_iter() + .map(|step| { + let duration = (step.end - step.start) + .to_std() + .unwrap_or(Duration::from_millis(0)); + StepRow { + step_name: step.name, + start_time: step.start, + duration: format!("{:.3}s", duration.as_secs_f64()), + status: step.status, + } + }) + .collect(); + + if !rows.is_empty() { + println!("\n{}", tabled::Table::new(rows)); + } println!( " Bundle was activated in the database: {activated_in_db_ok}" ); diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index aeae2867b78..bbad16aa165 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -6,6 +6,7 @@ use crate::app::background::BackgroundTask; use anyhow::Context; +use anyhow::bail; use base64::Engine; use camino::Utf8DirEntry; use camino::Utf8Path; @@ -13,6 +14,8 @@ use camino::Utf8PathBuf; use camino_tempfile::Utf8TempDir; use camino_tempfile::tempdir_in; use camino_tempfile::tempfile_in; +use chrono::DateTime; +use chrono::Utc; use futures::FutureExt; use futures::StreamExt; use futures::future::BoxFuture; @@ -38,6 +41,8 @@ use nexus_types::fm::Ereport; use nexus_types::identity::Asset; use nexus_types::internal_api::background::SupportBundleCleanupReport; use nexus_types::internal_api::background::SupportBundleCollectionReport; +use nexus_types::internal_api::background::SupportBundleCollectionStep; +use nexus_types::internal_api::background::SupportBundleCollectionStepStatus; use nexus_types::internal_api::background::SupportBundleEreportStatus; use omicron_common::api::external::DataPageParams; use omicron_common::api::external::Error; @@ -646,47 +651,115 @@ type CollectionStepFn = Box< + Send, >; -enum CollectionStepOutput { - Ereports(SupportBundleEreportStatus), - SavingSpDumps { listed_sps: bool }, - // NOTE: The distinction between this and "Spawn" is pretty artificial - - // it's just to preserve a part of the report which says "we tried to - // list in-service sleds". - // - // If we changed the collection report, this could easily be combined - // with the "Spawn" variant. - SpawnSleds { extra_steps: Vec<(&'static str, CollectionStepFn)> }, - Spawn { extra_steps: Vec<(&'static str, CollectionStepFn)> }, - None, +struct CollectionStep { + name: String, + step_fn: CollectionStepFn, +} + +impl CollectionStep { + fn new(name: impl Into, step_fn: CollectionStepFn) -> Self { + Self { name: name.into(), step_fn } + } + + async fn run( + self, + collection: &Arc, + output: &Utf8Path, + ) -> CompletedCollectionStep { + let start = Utc::now(); + + let output = (self.step_fn)(collection, output) + .await + .inspect_err(|err| { + warn!( + collection.log, + "Step failed"; + "step" => &self.name, + InlineErrorChain::new(err.as_ref()), + ); + }) + .unwrap_or_else(|err| CollectionStepOutput::Failed(err)); + + let end = Utc::now(); + + CompletedCollectionStep { name: self.name, start, end, output } + } } -impl CollectionStepOutput { +struct CompletedCollectionStep { + name: String, + start: DateTime, + end: DateTime, + output: CollectionStepOutput, +} + +impl CompletedCollectionStep { // Updates the collection report based on the output of a collection step, // and possibly extends the set of all steps to be executed. fn process( self, report: &mut SupportBundleCollectionReport, - steps: &mut Vec<(&'static str, CollectionStepFn)>, + steps: &mut Vec, ) { - match self { + use SupportBundleCollectionStepStatus as Status; + + let status = match self.output { + CollectionStepOutput::Skipped => Status::Skipped, + CollectionStepOutput::Failed(err) => { + Status::Failed(err.to_string()) + } CollectionStepOutput::Ereports(status) => { report.ereports = Some(status); + Status::Ok } CollectionStepOutput::SavingSpDumps { listed_sps } => { report.listed_sps = listed_sps; + Status::Ok } CollectionStepOutput::SpawnSleds { extra_steps } => { report.listed_in_service_sleds = true; steps.extend(extra_steps); + Status::Ok } CollectionStepOutput::Spawn { extra_steps } => { steps.extend(extra_steps); + Status::Ok } - CollectionStepOutput::None => (), - } + CollectionStepOutput::None => Status::Ok, + }; + + // Add information about this completed step the bundle report. + let step = SupportBundleCollectionStep { + name: self.name, + start: self.start, + end: self.end, + status, + }; + report.steps.push(step); } } +enum CollectionStepOutput { + // The step was not executed intentionally + Skipped, + // The step encountered a fatal error and could not complete. + // + // It may have still saved a partial set of data to the bundle. + Failed(anyhow::Error), + Ereports(SupportBundleEreportStatus), + SavingSpDumps { listed_sps: bool }, + // NOTE: The distinction between this and "Spawn" is pretty artificial - + // it's just to preserve a part of the report which says "we tried to + // list in-service sleds". + // + // If we changed the collection report, this could easily be combined + // with the "Spawn" variant. + SpawnSleds { extra_steps: Vec }, + Spawn { extra_steps: Vec }, + // The step completed with nothing to report, and no follow-up steps + None, +} + impl BundleCollection { // Collect the bundle within Nexus, and store it on a target sled. async fn collect_bundle_and_store_on_sled( @@ -939,7 +1012,7 @@ impl BundleCollection { async fn run_collect_bundle_steps( self: &Arc, output: &Utf8TempDir, - mut steps: Vec<(&'static str, CollectionStepFn)>, + mut steps: Vec, ) -> SupportBundleCollectionReport { let mut report = SupportBundleCollectionReport::new(self.bundle.id.into()); @@ -950,34 +1023,25 @@ impl BundleCollection { loop { // Process all the currently-planned steps - while let Some((step_name, step)) = steps.pop() { + while let Some(step) = steps.pop() { let previous_result = tasks.spawn({ let collection = self.clone(); let dir = output.path().to_path_buf(); async move { - debug!(collection.log, "Running step"; "name" => &step_name); - step(&collection, dir.as_path()).await.inspect_err(|err| { - warn!( - collection.log, - "Step failed"; - "name" => &step_name, - InlineErrorChain::new(err.as_ref()), - ); - }) + debug!(collection.log, "Running step"; "step" => &step.name); + step.run(&collection, dir.as_path()).await } }).await; - if let Some(Ok(output)) = previous_result { + if let Some(output) = previous_result { output.process(&mut report, &mut steps); }; } // If we've run out of tasks to spawn, join any of the previously // spawned tasks, if any exist. - if let Some(previous_result) = tasks.join_next().await { - if let Ok(output) = previous_result { - output.process(&mut report, &mut steps); - }; + if let Some(output) = tasks.join_next().await { + output.process(&mut report, &mut steps); // As soon as any task completes, see if we can spawn more work // immediately. This ensures that the ParallelTaskSet is @@ -1009,7 +1073,7 @@ impl BundleCollection { dir: &Utf8Path, ) -> anyhow::Result { if !self.request.include_reconfigurator_data() { - return Ok(CollectionStepOutput::None); + return Ok(CollectionStepOutput::Skipped); } // Collect reconfigurator state @@ -1094,17 +1158,13 @@ impl BundleCollection { dir: &Utf8Path, ) -> anyhow::Result { if !self.request.include_sled_cubby_info() { - return Ok(CollectionStepOutput::None); + return Ok(CollectionStepOutput::Skipped); } let Some(mgs_client) = &**self.get_or_initialize_mgs_client(mgs_client).await else { - warn!( - self.log, - "No MGS client, skipping sled cubby info collection" - ); - return Ok(CollectionStepOutput::None); + bail!("Could not initialize MGS client"); }; let nexus_sleds = self .get_or_initialize_all_sleds(all_sleds) @@ -1123,14 +1183,13 @@ impl BundleCollection { dir: &Utf8Path, ) -> anyhow::Result { if !self.request.include_sp_dumps() { - return Ok(CollectionStepOutput::None); + return Ok(CollectionStepOutput::Skipped); } let Some(mgs_client) = &**self.get_or_initialize_mgs_client(mgs_client).await else { - warn!(self.log, "No MGS client, skipping SP task dump collection"); - return Ok(CollectionStepOutput::None); + bail!("Could not initialize MGS client"); }; let sp_dumps_dir = dir.join("sp_task_dumps"); @@ -1138,10 +1197,10 @@ impl BundleCollection { format!("Failed to create SP task dump directory {sp_dumps_dir}") })?; - let mut extra_steps: Vec<(&'static str, CollectionStepFn)> = vec![]; + let mut extra_steps: Vec = vec![]; for sp in get_available_sps(&mgs_client).await? { - extra_steps.push(( - "SP dump", + extra_steps.push(CollectionStep::new( + format!("SP dump for {:?}", sp), Box::new({ let mgs_client = mgs_client.clone(); move |collection, dir| { @@ -1166,7 +1225,7 @@ impl BundleCollection { dir: &Utf8Path, ) -> anyhow::Result { if !self.request.include_sp_dumps() { - return Ok(CollectionStepOutput::None); + return Ok(CollectionStepOutput::Skipped); } save_sp_dumps(mgs_client, sp, dir).await.with_context(|| { @@ -1207,26 +1266,26 @@ impl BundleCollection { // Shared, lazy, fallible initialization for MGS client let mgs_client: OnceCell>> = OnceCell::new(); - let steps: Vec<(&str, CollectionStepFn)> = vec![ - ( + let steps: Vec = vec![ + CollectionStep::new( "bundle id", Box::new(|collection, dir| { collection.collect_bundle_id(dir).boxed() }), ), - ( + CollectionStep::new( "reconfigurator state", Box::new(|collection, dir| { collection.collect_reconfigurator_state(dir).boxed() }), ), - ( + CollectionStep::new( "ereports", Box::new(|collection, dir| { collection.collect_ereports(dir).boxed() }), ), - ( + CollectionStep::new( "sled cubby info", Box::new({ let all_sleds = all_sleds.clone(); @@ -1245,7 +1304,7 @@ impl BundleCollection { } }), ), - ( + CollectionStep::new( "spawn steps to query all SP dumps", Box::new({ let mgs_client = mgs_client.clone(); @@ -1259,7 +1318,7 @@ impl BundleCollection { } }), ), - ( + CollectionStep::new( "spawn steps to query all sleds", Box::new({ let all_sleds = all_sleds.clone(); @@ -1281,24 +1340,23 @@ impl BundleCollection { all_sleds: &OnceCell>>>, ) -> anyhow::Result { if !self.request.include_host_info() { - return Ok(CollectionStepOutput::None); + return Ok(CollectionStepOutput::Skipped); } let Some(all_sleds) = self.get_or_initialize_all_sleds(all_sleds).await.as_deref() else { - warn!(self.log, "Could not read list of sleds"); - return Ok(CollectionStepOutput::None); + bail!("Could not read list of sleds"); }; - let mut extra_steps: Vec<(&'static str, CollectionStepFn)> = vec![]; + let mut extra_steps: Vec = vec![]; for sled in all_sleds { if !self.request.include_sled_host_info(sled.id()) { continue; } - extra_steps.push(( - "sled data", + extra_steps.push(CollectionStep::new( + format!("sled data for sled {}", sled.id()), Box::new({ let sled = sled.clone(); move |collection, dir| { @@ -1326,7 +1384,7 @@ impl BundleCollection { dir: &Utf8Path, ) -> anyhow::Result { if !self.request.include_sled_host_info(sled.id()) { - return Ok(CollectionStepOutput::None); + return Ok(CollectionStepOutput::Skipped); } let log = &self.log; @@ -1340,20 +1398,25 @@ impl BundleCollection { tokio::fs::write(sled_path.join("sled.txt"), format!("{sled:?}")) .await?; - let Ok(sled_client) = nexus_networking::sled_client( + let sled_client = match nexus_networking::sled_client( &self.datastore, &self.opctx, sled.id(), log, ) .await - else { - tokio::fs::write( - sled_path.join("error.txt"), - "Could not contact sled", - ) - .await?; - return Ok(CollectionStepOutput::None); + { + Ok(client) => client, + Err(err) => { + tokio::fs::write( + sled_path.join("error.txt"), + "Could not contact sled", + ) + .await.with_context(|| { + format!("Failed to save 'error.txt' to bundle when recording error: {err}") + })?; + bail!("Could not contact sled: {err}"); + } }; // NB: As new sled-diagnostic commands are added they should @@ -1467,7 +1530,7 @@ impl BundleCollection { ) -> anyhow::Result { let Some(ereport_filters) = self.request.get_ereport_filters() else { debug!(self.log, "Support bundle: ereports not requested"); - return Ok(CollectionStepOutput::None); + return Ok(CollectionStepOutput::Skipped); }; let ereports_dir = dir.join("ereports"); let mut status = SupportBundleEreportStatus::default(); diff --git a/nexus/tests/integration_tests/support_bundles.rs b/nexus/tests/integration_tests/support_bundles.rs index 716bc228ca9..ade2cbdb2c9 100644 --- a/nexus/tests/integration_tests/support_bundles.rs +++ b/nexus/tests/integration_tests/support_bundles.rs @@ -486,20 +486,31 @@ async fn test_support_bundle_lifecycle(cptestctx: &ControlPlaneTestContext) { output.cleanup_report, Some(SupportBundleCleanupReport { ..Default::default() }) ); + + let report = output.collection_report.as_ref().expect("Missing report"); + assert_eq!(report.bundle, bundle.id); + assert!(report.listed_in_service_sleds); + assert!(report.listed_sps); + assert!(report.activated_in_db_ok); assert_eq!( - output.collection_report, - Some(SupportBundleCollectionReport { - bundle: bundle.id, - listed_in_service_sleds: true, - listed_sps: true, - activated_in_db_ok: true, - ereports: Some(SupportBundleEreportStatus { - n_collected: 0, - n_found: 0, - errors: Vec::new() - }) + report.ereports, + Some(SupportBundleEreportStatus { + n_collected: 0, + n_found: 0, + errors: Vec::new() }) ); + + // Verify that steps were recorded with reasonable timing data + assert!(!report.steps.is_empty(), "Should have recorded some steps"); + for step in &report.steps { + assert!( + step.end >= step.start, + "Step '{}' end time should be >= start time", + step.name + ); + } + let bundle = bundle_get(&client, bundle.id).await.unwrap(); assert_eq!(bundle.state, SupportBundleState::Active); @@ -588,20 +599,30 @@ async fn test_support_bundle_range_requests( // Finish collection, activate the bundle. let output = activate_bundle_collection_background_task(&cptestctx).await; assert_eq!(output.collection_err, None); + let report = output.collection_report.as_ref().expect("Missing report"); + assert_eq!(report.bundle, bundle.id); + assert!(report.listed_in_service_sleds); + assert!(report.listed_sps); + assert!(report.activated_in_db_ok); assert_eq!( - output.collection_report, - Some(SupportBundleCollectionReport { - bundle: bundle.id, - listed_in_service_sleds: true, - listed_sps: true, - activated_in_db_ok: true, - ereports: Some(SupportBundleEreportStatus { - n_collected: 0, - n_found: 0, - errors: Vec::new() - }) + report.ereports, + Some(SupportBundleEreportStatus { + n_collected: 0, + n_found: 0, + errors: Vec::new() }) ); + + // Verify that steps were recorded with reasonable timing data + assert!(!report.steps.is_empty(), "Should have recorded some steps"); + for step in &report.steps { + assert!( + step.end >= step.start, + "Step '{}' end time should be >= start time", + step.name + ); + } + let bundle = bundle_get(&client, bundle.id).await.unwrap(); assert_eq!(bundle.state, SupportBundleState::Active); diff --git a/nexus/types/src/internal_api/background.rs b/nexus/types/src/internal_api/background.rs index 08d343182e6..42264d0411b 100644 --- a/nexus/types/src/internal_api/background.rs +++ b/nexus/types/src/internal_api/background.rs @@ -281,11 +281,41 @@ pub struct SupportBundleCollectionReport { /// True iff the bundle was successfully made 'active' in the database. pub activated_in_db_ok: bool, + /// All steps taken, alongside their timing information, when collecting the + /// bundle. + pub steps: Vec, + /// Status of ereport collection, or `None` if no ereports were requested /// for this support bundle. pub ereports: Option, } +#[derive(Debug, Deserialize, Serialize, PartialEq, Eq)] +pub struct SupportBundleCollectionStep { + pub name: String, + pub start: DateTime, + pub end: DateTime, + pub status: SupportBundleCollectionStepStatus, +} + +#[derive(Debug, Deserialize, Serialize, PartialEq, Eq)] +pub enum SupportBundleCollectionStepStatus { + Ok, + Skipped, + Failed(String), +} + +impl std::fmt::Display for SupportBundleCollectionStepStatus { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + use SupportBundleCollectionStepStatus::*; + match self { + Ok => write!(f, "ok"), + Skipped => write!(f, "skipped"), + Failed(why) => write!(f, "failed: {why}"), + } + } +} + #[derive(Debug, Default, Deserialize, Serialize, PartialEq, Eq)] pub struct SupportBundleEreportStatus { /// The total number of ereports found that match the requested filters. @@ -309,6 +339,7 @@ impl SupportBundleCollectionReport { listed_in_service_sleds: false, listed_sps: false, activated_in_db_ok: false, + steps: vec![], ereports: None, } }