-
Notifications
You must be signed in to change notification settings - Fork 59
[support bundles] Don't fail already-collected bundles #9267
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -209,24 +209,29 @@ impl SupportBundleCollector { | |
| SupportBundleState::Destroying => { | ||
| // Destroying is a terminal state; no one should be able to | ||
| // change this state from underneath us. | ||
| self.datastore.support_bundle_delete( | ||
| opctx, | ||
| &authz_bundle, | ||
| ).await.map_err(|err| { | ||
| warn!( | ||
| &opctx.log, | ||
| "SupportBundleCollector: Could not delete 'destroying' bundle"; | ||
| "err" => %err | ||
| ); | ||
| anyhow::anyhow!("Could not delete 'destroying' bundle: {:#}", err) | ||
| })?; | ||
|
|
||
| return Ok( | ||
| DatabaseBundleCleanupResult::DestroyingBundleRemoved, | ||
| ); | ||
| match self | ||
| .datastore | ||
| .support_bundle_delete(opctx, &authz_bundle) | ||
| .await | ||
| { | ||
| Ok(_) | Err(Error::NotFound { .. }) => { | ||
| return Ok(DatabaseBundleCleanupResult::DestroyingBundleRemoved); | ||
| } | ||
| Err(err) => { | ||
| warn!( | ||
| &opctx.log, | ||
| "SupportBundleCollector: Could not delete 'destroying' bundle"; | ||
| "err" => %err | ||
| ); | ||
| anyhow::bail!( | ||
| "Could not delete 'destroying' bundle: {:#}", | ||
| err | ||
| ); | ||
| } | ||
| } | ||
| } | ||
| SupportBundleState::Failing => { | ||
| if let Err(err) = self | ||
| match self | ||
| .datastore | ||
| .support_bundle_update( | ||
| &opctx, | ||
|
|
@@ -235,21 +240,32 @@ impl SupportBundleCollector { | |
| ) | ||
| .await | ||
| { | ||
| if matches!(err, Error::InvalidRequest { .. }) { | ||
| Ok(()) => { | ||
| return Ok( | ||
| DatabaseBundleCleanupResult::FailingBundleUpdated, | ||
| ); | ||
| } | ||
| Err(Error::InvalidRequest { message }) => { | ||
| // It's possible that the bundle is marked "destroying" by a | ||
| // user request, concurrently with our operation. | ||
| // | ||
| // In this case, we log that this happened, but do nothing. | ||
| // The next iteration of this background task should treat | ||
| // this as the "Destroying" case, and delete the bundle. | ||
| // It's also possible that another concurrent Nexus | ||
| // successfully performed this "Failing" -> "Failed" | ||
| // transition. | ||
| // | ||
| // In these cases, we log that this happened, but do | ||
| // nothing. The next iteration of this background task | ||
| // should treat this as the "Destroying" case, and | ||
| // delete the bundle. | ||
| info!( | ||
| &opctx.log, | ||
| "SupportBundleCollector: Concurrent state change failing bundle"; | ||
| "bundle" => %bundle.id, | ||
| "err" => ?err, | ||
| "err_message" => ?message, | ||
| ); | ||
| return Ok(DatabaseBundleCleanupResult::BadState); | ||
| } else { | ||
| } | ||
| Err(err) => { | ||
| warn!( | ||
| &opctx.log, | ||
| "Could not delete 'failing' bundle"; | ||
|
|
@@ -261,8 +277,6 @@ impl SupportBundleCollector { | |
| ); | ||
| } | ||
| } | ||
|
|
||
| return Ok(DatabaseBundleCleanupResult::FailingBundleUpdated); | ||
| } | ||
| other => { | ||
| // We should be filtering to only see "Destroying" and | ||
|
|
@@ -278,8 +292,8 @@ impl SupportBundleCollector { | |
| } | ||
| } | ||
|
|
||
| // Monitors all bundles that are "destroying" or "failing" and assigned to | ||
| // this Nexus, and attempts to clear their storage from Sled Agents. | ||
| // Monitors all bundles that are "destroying" or "failing" and attempts to | ||
| // clear their storage from Sled Agents. | ||
| async fn cleanup_destroyed_bundles( | ||
| &self, | ||
| opctx: &OpContext, | ||
|
|
@@ -290,7 +304,7 @@ impl SupportBundleCollector { | |
| .support_bundle_list_assigned_to_nexus( | ||
| opctx, | ||
| &pagparams, | ||
| self.nexus_id, | ||
| None, | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Old: Each Nexus queries its own set of bundles to see what should be destroyed.
New: Every Nexus queries all bundles in the "destroying" or "failing" states, regardless of which Nexus they were assigned to.
This does mean bundle deletion may happen concurrently, from multiple distinct Nexuses, since there is no real meaning of "ownership" after collection has completed. I think this should be safe - the process of deleting a bundle involves:
1. Clearing the bundle's storage from Sled Agents
2. Deleting the bundle's database record
Both of which should be independently safe from concurrent Nexuses. |
||
| vec![ | ||
| SupportBundleState::Destroying, | ||
| SupportBundleState::Failing, | ||
|
|
@@ -390,7 +404,7 @@ impl SupportBundleCollector { | |
| .support_bundle_list_assigned_to_nexus( | ||
| opctx, | ||
| &pagparams, | ||
| self.nexus_id, | ||
| Some(self.nexus_id), | ||
| vec![SupportBundleState::Collecting], | ||
| ) | ||
| .await; | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,4 @@ | ||
| CREATE INDEX IF NOT EXISTS lookup_bundle_by_state ON omicron.public.support_bundle ( | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We're indexing by state, not by Nexus ID, with this new PR. |
||
| state | ||
| ) WHERE state = 'failing' OR state = 'destroying'; | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
"In-progress" bundles get marked as failed if their "owning Nexus" dies.
For bundles that are not in progress (i.e., already collected), there is nothing to mark as failed when a Nexus gets expunged.