Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion nexus/db-model/src/schema_versions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock};
///
/// This must be updated when you change the database schema. Refer to
/// schema/crdb/README.adoc in the root of this repository for details.
pub const SCHEMA_VERSION: Version = Version::new(201, 0, 0);
pub const SCHEMA_VERSION: Version = Version::new(202, 0, 0);

/// List of all past database schema versions, in *reverse* order
///
Expand All @@ -28,6 +28,7 @@ static KNOWN_VERSIONS: LazyLock<Vec<KnownVersion>> = LazyLock::new(|| {
// | leaving the first copy as an example for the next person.
// v
// KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"),
KnownVersion::new(202, "lookup-bundle-by-state"),
KnownVersion::new(201, "scim-client-bearer-token"),
KnownVersion::new(200, "dual-stack-network-interfaces"),
KnownVersion::new(199, "multicast-pool-support"),
Expand Down
49 changes: 28 additions & 21 deletions nexus/db-queries/src/db/datastore/support_bundle.rs
Original file line number Diff line number Diff line change
Expand Up @@ -191,23 +191,34 @@ impl DataStore {
.map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
}

/// Lists one page of support bundles in a particular state, assigned to
/// a particular Nexus.
/// Lists one page of support bundles in a particular state.
///
/// If `nexus_id` is not None, also filters for bundles assigned to a
/// particular Nexus. Otherwise, lists all bundles in the requested
/// states.
pub async fn support_bundle_list_assigned_to_nexus(
&self,
opctx: &OpContext,
pagparams: &DataPageParams<'_, Uuid>,
nexus_id: OmicronZoneUuid,
nexus_id: Option<OmicronZoneUuid>,
states: Vec<SupportBundleState>,
) -> ListResultVec<SupportBundle> {
opctx.authorize(authz::Action::Read, &authz::FLEET).await?;
use nexus_db_schema::schema::support_bundle::dsl;

let conn = self.pool_connection_authorized(opctx).await?;
paginated(dsl::support_bundle, dsl::id, pagparams)
.filter(dsl::assigned_nexus.eq(nexus_id.into_untyped_uuid()))
let query = paginated(dsl::support_bundle, dsl::id, pagparams)
.filter(dsl::state.eq_any(states))
.order(dsl::time_created.asc())
.order(dsl::time_created.asc());

let query = match nexus_id {
Some(id) => {
query.filter(dsl::assigned_nexus.eq(id.into_untyped_uuid()))
}
None => query,
};

query
.select(SupportBundle::as_select())
.load_async(&*conn)
.await
Expand Down Expand Up @@ -324,8 +335,9 @@ impl DataStore {
.execute_async(conn)
.await?;

// Find all bundles on nexuses that no longer exist.
// Find all collecting bundles on nexuses that no longer exist.
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"In-progress" bundles get marked failed if their "owning Nexus" dies.

Otherwise: there is nothing to mark failed, when a Nexus gets expunged.

let bundles_with_bad_nexuses = dsl::support_bundle
.filter(dsl::state.eq(SupportBundleState::Collecting))
.filter(dsl::assigned_nexus.eq_any(invalid_nexus_zones))
.select(SupportBundle::as_select())
.load_async(conn)
Expand Down Expand Up @@ -704,7 +716,7 @@ mod test {
.support_bundle_list_assigned_to_nexus(
&opctx,
&pagparams,
nexus_a,
Some(nexus_a),
vec![SupportBundleState::Collecting]
)
.await
Expand Down Expand Up @@ -732,7 +744,7 @@ mod test {
.support_bundle_list_assigned_to_nexus(
&opctx,
&pagparams,
nexus_a,
Some(nexus_a),
vec![SupportBundleState::Collecting]
)
.await
Expand All @@ -747,7 +759,7 @@ mod test {
.support_bundle_list_assigned_to_nexus(
&opctx,
&pagparams,
nexus_b,
Some(nexus_b),
vec![SupportBundleState::Collecting]
)
.await
Expand Down Expand Up @@ -776,7 +788,7 @@ mod test {
.support_bundle_list_assigned_to_nexus(
&opctx,
&pagparams,
nexus_a,
Some(nexus_a),
vec![SupportBundleState::Collecting]
)
.await
Expand All @@ -793,7 +805,7 @@ mod test {
.support_bundle_list_assigned_to_nexus(
&opctx,
&pagparams,
nexus_a,
Some(nexus_a),
vec![
SupportBundleState::Active,
SupportBundleState::Collecting
Expand Down Expand Up @@ -1435,15 +1447,16 @@ mod test {
};
bp_insert_and_make_target(&opctx, &datastore, &bp2).await;

// The bundle still appears active
let report = datastore
.support_bundle_fail_expunged(&opctx, &bp2, nexus_ids[1])
.await
.expect("Should have been able to mark bundle state as destroying");

assert_eq!(
SupportBundleExpungementReport {
bundles_failing_missing_nexus: 1,
bundles_reassigned: 1,
bundles_failing_missing_nexus: 0,
bundles_reassigned: 0,
..Default::default()
},
report
Expand All @@ -1453,13 +1466,7 @@ mod test {
.support_bundle_get(&opctx, bundle.id.into())
.await
.expect("Should be able to get bundle we just failed");
assert_eq!(SupportBundleState::Failing, observed_bundle.state);
assert!(
observed_bundle
.reason_for_failure
.unwrap()
.contains(FAILURE_REASON_NO_NEXUS)
);
assert_eq!(SupportBundleState::Active, observed_bundle.state);

db.terminate().await;
logctx.cleanup_successful();
Expand Down
70 changes: 42 additions & 28 deletions nexus/src/app/background/tasks/support_bundle_collector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -209,24 +209,29 @@ impl SupportBundleCollector {
SupportBundleState::Destroying => {
// Destroying is a terminal state; no one should be able to
// change this state from underneath us.
self.datastore.support_bundle_delete(
opctx,
&authz_bundle,
).await.map_err(|err| {
warn!(
&opctx.log,
"SupportBundleCollector: Could not delete 'destroying' bundle";
"err" => %err
);
anyhow::anyhow!("Could not delete 'destroying' bundle: {:#}", err)
})?;

return Ok(
DatabaseBundleCleanupResult::DestroyingBundleRemoved,
);
match self
.datastore
.support_bundle_delete(opctx, &authz_bundle)
.await
{
Ok(_) | Err(Error::NotFound { .. }) => {
return Ok(DatabaseBundleCleanupResult::DestroyingBundleRemoved);
}
Err(err) => {
warn!(
&opctx.log,
"SupportBundleCollector: Could not delete 'destroying' bundle";
"err" => %err
);
anyhow::bail!(
"Could not delete 'destroying' bundle: {:#}",
err
);
}
}
}
SupportBundleState::Failing => {
if let Err(err) = self
match self
.datastore
.support_bundle_update(
&opctx,
Expand All @@ -235,21 +240,32 @@ impl SupportBundleCollector {
)
.await
{
if matches!(err, Error::InvalidRequest { .. }) {
Ok(()) => {
return Ok(
DatabaseBundleCleanupResult::FailingBundleUpdated,
);
}
Err(Error::InvalidRequest { message }) => {
// It's possible that the bundle is marked "destroying" by a
// user request, concurrently with our operation.
//
// In this case, we log that this happened, but do nothing.
// The next iteration of this background task should treat
// this as the "Destroying" case, and delete the bundle.
// It's also possible that another concurrent Nexus
// successfully performed this "Failing" -> "Failed"
// transition.
//
// In these cases, we log that this happened, but do
// nothing. The next iteration of this background task
// should treat this as the "Destroying" case, and
// delete the bundle.
info!(
&opctx.log,
"SupportBundleCollector: Concurrent state change failing bundle";
"bundle" => %bundle.id,
"err" => ?err,
"err_message" => ?message,
);
return Ok(DatabaseBundleCleanupResult::BadState);
} else {
}
Err(err) => {
warn!(
&opctx.log,
"Could not delete 'failing' bundle";
Expand All @@ -261,8 +277,6 @@ impl SupportBundleCollector {
);
}
}

return Ok(DatabaseBundleCleanupResult::FailingBundleUpdated);
}
other => {
// We should be filtering to only see "Destroying" and
Expand All @@ -278,8 +292,8 @@ impl SupportBundleCollector {
}
}

// Monitors all bundles that are "destroying" or "failing" and assigned to
// this Nexus, and attempts to clear their storage from Sled Agents.
// Monitors all bundles that are "destroying" or "failing" and attempts to
// clear their storage from Sled Agents.
async fn cleanup_destroyed_bundles(
&self,
opctx: &OpContext,
Expand All @@ -290,7 +304,7 @@ impl SupportBundleCollector {
.support_bundle_list_assigned_to_nexus(
opctx,
&pagparams,
self.nexus_id,
None,
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Old: Each nexus queries their own set of bundles to see what should be destroyed.
New: All nexuses query any bundles in these states, and try to clean them up concurrently.

This does mean bundle deletion may happen concurrently, from multiple distinct Nexuses, since there is no real meaning of "ownership" after collection has completed.

I think this should be safe - the process of deleting a bundle involves:

  1. Delete from the sled where it's stored
  2. Delete from the database, updating the record

Both of which should be independently safe from concurrent Nexuses

vec![
SupportBundleState::Destroying,
SupportBundleState::Failing,
Expand Down Expand Up @@ -390,7 +404,7 @@ impl SupportBundleCollector {
.support_bundle_list_assigned_to_nexus(
opctx,
&pagparams,
self.nexus_id,
Some(self.nexus_id),
vec![SupportBundleState::Collecting],
)
.await;
Expand Down
9 changes: 6 additions & 3 deletions schema/crdb/dbinit.sql
Original file line number Diff line number Diff line change
Expand Up @@ -2865,8 +2865,7 @@ CREATE TABLE IF NOT EXISTS omicron.public.support_bundle (
zpool_id UUID NOT NULL,
dataset_id UUID NOT NULL,

-- The Nexus which is in charge of collecting the support bundle,
-- and later managing its storage.
-- The Nexus which is in charge of collecting the support bundle.
assigned_nexus UUID,

user_comment TEXT
Expand All @@ -2880,6 +2879,10 @@ CREATE UNIQUE INDEX IF NOT EXISTS one_bundle_per_dataset ON omicron.public.suppo
dataset_id
);

CREATE INDEX IF NOT EXISTS lookup_bundle_by_state ON omicron.public.support_bundle (
state
) WHERE state = 'failing' OR state = 'destroying';

CREATE INDEX IF NOT EXISTS lookup_bundle_by_nexus ON omicron.public.support_bundle (
assigned_nexus
);
Expand Down Expand Up @@ -6854,7 +6857,7 @@ INSERT INTO omicron.public.db_metadata (
version,
target_version
) VALUES
(TRUE, NOW(), NOW(), '201.0.0', NULL)
(TRUE, NOW(), NOW(), '202.0.0', NULL)
ON CONFLICT DO NOTHING;

COMMIT;
4 changes: 4 additions & 0 deletions schema/crdb/lookup-bundle-by-state/up01.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
CREATE INDEX IF NOT EXISTS lookup_bundle_by_state ON omicron.public.support_bundle (
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We're indexing by state, not by nexus ID, with this new PR

state
) WHERE state = 'failing' OR state = 'destroying';

Loading