diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index be8d77012f..0ed987f0e4 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock}; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: Version = Version::new(201, 0, 0); +pub const SCHEMA_VERSION: Version = Version::new(202, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -28,6 +28,7 @@ static KNOWN_VERSIONS: LazyLock> = LazyLock::new(|| { // | leaving the first copy as an example for the next person. // v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(202, "lookup-bundle-by-state"), KnownVersion::new(201, "scim-client-bearer-token"), KnownVersion::new(200, "dual-stack-network-interfaces"), KnownVersion::new(199, "multicast-pool-support"), diff --git a/nexus/db-queries/src/db/datastore/support_bundle.rs b/nexus/db-queries/src/db/datastore/support_bundle.rs index 70894aa401..a00a0a128d 100644 --- a/nexus/db-queries/src/db/datastore/support_bundle.rs +++ b/nexus/db-queries/src/db/datastore/support_bundle.rs @@ -191,23 +191,34 @@ impl DataStore { .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } - /// Lists one page of support bundles in a particular state, assigned to - /// a particular Nexus. + /// Lists one page of support bundles in a particular state. + /// + /// If `nexus_id` is not None, also filters for bundles assigned to a + /// particular Nexus. Otherwise, lists all bundles in the requested + /// states. pub async fn support_bundle_list_assigned_to_nexus( &self, opctx: &OpContext, pagparams: &DataPageParams<'_, Uuid>, - nexus_id: OmicronZoneUuid, + nexus_id: Option, states: Vec, ) -> ListResultVec { opctx.authorize(authz::Action::Read, &authz::FLEET).await?; use nexus_db_schema::schema::support_bundle::dsl; let conn = self.pool_connection_authorized(opctx).await?; - paginated(dsl::support_bundle, dsl::id, pagparams) - .filter(dsl::assigned_nexus.eq(nexus_id.into_untyped_uuid())) + let query = paginated(dsl::support_bundle, dsl::id, pagparams) .filter(dsl::state.eq_any(states)) - .order(dsl::time_created.asc()) + .order(dsl::time_created.asc()); + + let query = match nexus_id { + Some(id) => { + query.filter(dsl::assigned_nexus.eq(id.into_untyped_uuid())) + } + None => query, + }; + + query .select(SupportBundle::as_select()) .load_async(&*conn) .await @@ -324,8 +335,9 @@ impl DataStore { .execute_async(conn) .await?; - // Find all bundles on nexuses that no longer exist. + // Find all collecting bundles on nexuses that no longer exist. let bundles_with_bad_nexuses = dsl::support_bundle + .filter(dsl::state.eq(SupportBundleState::Collecting)) .filter(dsl::assigned_nexus.eq_any(invalid_nexus_zones)) .select(SupportBundle::as_select()) .load_async(conn) @@ -704,7 +716,7 @@ mod test { .support_bundle_list_assigned_to_nexus( &opctx, &pagparams, - nexus_a, + Some(nexus_a), vec![SupportBundleState::Collecting] ) .await @@ -732,7 +744,7 @@ mod test { .support_bundle_list_assigned_to_nexus( &opctx, &pagparams, - nexus_a, + Some(nexus_a), vec![SupportBundleState::Collecting] ) .await @@ -747,7 +759,7 @@ mod test { .support_bundle_list_assigned_to_nexus( &opctx, &pagparams, - nexus_b, + Some(nexus_b), vec![SupportBundleState::Collecting] ) .await @@ -776,7 +788,7 @@ mod test { .support_bundle_list_assigned_to_nexus( &opctx, &pagparams, - nexus_a, + Some(nexus_a), vec![SupportBundleState::Collecting] ) .await @@ -793,7 +805,7 @@ mod test { .support_bundle_list_assigned_to_nexus( &opctx, &pagparams, - nexus_a, + Some(nexus_a), vec![ SupportBundleState::Active, SupportBundleState::Collecting @@ -1435,6 +1447,7 @@ mod test { }; bp_insert_and_make_target(&opctx, &datastore, &bp2).await; + // The bundle still appears active let report = datastore .support_bundle_fail_expunged(&opctx, &bp2, nexus_ids[1]) .await @@ -1442,8 +1455,8 @@ mod test { assert_eq!( SupportBundleExpungementReport { - bundles_failing_missing_nexus: 1, - bundles_reassigned: 1, + bundles_failing_missing_nexus: 0, + bundles_reassigned: 0, ..Default::default() }, report @@ -1453,13 +1466,7 @@ mod test { .support_bundle_get(&opctx, bundle.id.into()) .await .expect("Should be able to get bundle we just failed"); - assert_eq!(SupportBundleState::Failing, observed_bundle.state); - assert!( - observed_bundle - .reason_for_failure - .unwrap() - .contains(FAILURE_REASON_NO_NEXUS) - ); + assert_eq!(SupportBundleState::Active, observed_bundle.state); db.terminate().await; logctx.cleanup_successful(); diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 8dc13e7ab4..8deaec47dd 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -209,24 +209,29 @@ impl SupportBundleCollector { SupportBundleState::Destroying => { // Destroying is a terminal state; no one should be able to // change this state from underneath us. - self.datastore.support_bundle_delete( - opctx, - &authz_bundle, - ).await.map_err(|err| { - warn!( - &opctx.log, - "SupportBundleCollector: Could not delete 'destroying' bundle"; - "err" => %err - ); - anyhow::anyhow!("Could not delete 'destroying' bundle: {:#}", err) - })?; - - return Ok( - DatabaseBundleCleanupResult::DestroyingBundleRemoved, - ); + match self + .datastore + .support_bundle_delete(opctx, &authz_bundle) + .await + { + Ok(_) | Err(Error::NotFound { .. }) => { + return Ok(DatabaseBundleCleanupResult::DestroyingBundleRemoved); + } + Err(err) => { + warn!( + &opctx.log, + "SupportBundleCollector: Could not delete 'destroying' bundle"; + "err" => %err + ); + anyhow::bail!( + "Could not delete 'destroying' bundle: {:#}", + err + ); + } + } } SupportBundleState::Failing => { - if let Err(err) = self + match self .datastore .support_bundle_update( &opctx, @@ -235,21 +240,32 @@ impl SupportBundleCollector { ) .await { - if matches!(err, Error::InvalidRequest { .. }) { + Ok(()) => { + return Ok( + DatabaseBundleCleanupResult::FailingBundleUpdated, + ); + } + Err(Error::InvalidRequest { message }) => { // It's possible that the bundle is marked "destroying" by a // user request, concurrently with our operation. // - // In this case, we log that this happened, but do nothing. - // The next iteration of this background task should treat - // this as the "Destroying" case, and delete the bundle. + // It's also possible that another concurrent Nexus + // successfully performed this "Failing" -> "Failed" + // transition. + // + // In these cases, we log that this happened, but do + // nothing. The next iteration of this background task + // should treat this as the "Destroying" case, and + // delete the bundle. info!( &opctx.log, "SupportBundleCollector: Concurrent state change failing bundle"; "bundle" => %bundle.id, - "err" => ?err, + "err_message" => ?message, ); return Ok(DatabaseBundleCleanupResult::BadState); - } else { + } + Err(err) => { warn!( &opctx.log, "Could not delete 'failing' bundle"; @@ -261,8 +277,6 @@ impl SupportBundleCollector { ); } } - - return Ok(DatabaseBundleCleanupResult::FailingBundleUpdated); } other => { // We should be filtering to only see "Destroying" and @@ -278,8 +292,8 @@ impl SupportBundleCollector { } } - // Monitors all bundles that are "destroying" or "failing" and assigned to - // this Nexus, and attempts to clear their storage from Sled Agents. + // Monitors all bundles that are "destroying" or "failing" and attempts to + // clear their storage from Sled Agents. async fn cleanup_destroyed_bundles( &self, opctx: &OpContext, @@ -290,7 +304,7 @@ impl SupportBundleCollector { .support_bundle_list_assigned_to_nexus( opctx, &pagparams, - self.nexus_id, + None, vec![ SupportBundleState::Destroying, SupportBundleState::Failing, @@ -390,7 +404,7 @@ impl SupportBundleCollector { .support_bundle_list_assigned_to_nexus( opctx, &pagparams, - self.nexus_id, + Some(self.nexus_id), vec![SupportBundleState::Collecting], ) .await; diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index ca22be8fd9..9f853e941c 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -2865,8 +2865,7 @@ CREATE TABLE IF NOT EXISTS omicron.public.support_bundle ( zpool_id UUID NOT NULL, dataset_id UUID NOT NULL, - -- The Nexus which is in charge of collecting the support bundle, - -- and later managing its storage. + -- The Nexus which is in charge of collecting the support bundle. assigned_nexus UUID, user_comment TEXT @@ -2880,6 +2879,10 @@ CREATE UNIQUE INDEX IF NOT EXISTS one_bundle_per_dataset ON omicron.public.suppo dataset_id ); +CREATE INDEX IF NOT EXISTS lookup_bundle_by_state ON omicron.public.support_bundle ( + state +) WHERE state = 'failing' OR state = 'destroying'; + CREATE INDEX IF NOT EXISTS lookup_bundle_by_nexus ON omicron.public.support_bundle ( assigned_nexus ); @@ -6854,7 +6857,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '201.0.0', NULL) + (TRUE, NOW(), NOW(), '202.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/schema/crdb/lookup-bundle-by-state/up01.sql b/schema/crdb/lookup-bundle-by-state/up01.sql new file mode 100644 index 0000000000..a0d50b476b --- /dev/null +++ b/schema/crdb/lookup-bundle-by-state/up01.sql @@ -0,0 +1,4 @@ +CREATE INDEX IF NOT EXISTS lookup_bundle_by_state ON omicron.public.support_bundle ( + state +) WHERE state = 'failing' OR state = 'destroying'; +