From 10f1e207d67fd21f190a5d6ad6995ebe8b360add Mon Sep 17 00:00:00 2001
From: Benjamin Naecker
Date: Wed, 30 Mar 2022 17:54:55 +0000
Subject: [PATCH] Removes the instance database record when provision fails

In the case that instance provision fails in the final saga node, the
actual booting of the Propolis zone, there was previously a no-op undo
action. This leaves the instance record in the database, perpetually in
a "starting" state. It can't be moved out of that state, because that
requires a full instance-ensure request to the sled-agent, which tries
that last action again, which fails, and ...

This adds an actual undo action, which sets the instance state in the
database to failed, and then deletes it. That state change is needed
because we can't delete instances in the "starting" state. State changes
are normally only made in response to the sled agent observing a state
change in the actual instance, but doing so is valid in this case since
there _is_ no such instance.
---
 nexus/src/sagas.rs | 42 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 41 insertions(+), 1 deletion(-)

diff --git a/nexus/src/sagas.rs b/nexus/src/sagas.rs
index 6cb886f9b4c..c0b77843239 100644
--- a/nexus/src/sagas.rs
+++ b/nexus/src/sagas.rs
@@ -22,6 +22,7 @@ use crucible_agent_client::{
 };
 use futures::StreamExt;
 use lazy_static::lazy_static;
+use omicron_common::api::external;
 use omicron_common::api::external::Error;
 use omicron_common::api::external::Generation;
 use omicron_common::api::external::IdentityMetadataCreateParams;
@@ -244,7 +245,7 @@ pub fn saga_instance_create() -> SagaTemplate<SagaInstanceCreate> {
     template_builder.append(
         "instance_ensure",
         "InstanceEnsure",
-        new_action_noop_undo(sic_instance_ensure),
+        ActionFunc::new_action(sic_instance_ensure, sic_instance_ensure_undo),
     );
 
     template_builder.build()
@@ -792,6 +793,45 @@ async fn sic_instance_ensure(
     Ok(())
 }
 
+async fn sic_instance_ensure_undo(
+    sagactx: ActionContext<SagaInstanceCreate>,
+) -> Result<(), anyhow::Error> {
+    let osagactx = sagactx.user_data();
+    let params = sagactx.saga_params();
+    let instance_name = sagactx.lookup::<db::model::Name>("instance_name")?;
+    let opctx = OpContext::for_saga_action(&sagactx, &params.serialized_authn);
+
+    let authz_project = osagactx
+        .datastore()
+        .project_lookup_by_id(params.project_id)
+        .await
+        .map_err(ActionError::action_failed)?;
+
+    let (authz_instance, instance) = osagactx
+        .datastore()
+        .instance_fetch(&opctx, &authz_project, &instance_name)
+        .await
+        .map_err(ActionError::action_failed)?;
+
+    let new_state = db::model::InstanceRuntimeState {
+        state: db::model::InstanceState::new(external::InstanceState::Failed),
+        ..instance.runtime_state
+    };
+
+    osagactx
+        .datastore()
+        .instance_update_runtime(&authz_instance.id(), &new_state)
+        .await
+        .map_err(ActionError::action_failed)?;
+
+    osagactx
+        .datastore()
+        .project_delete_instance(&opctx, &authz_instance)
+        .await
+        .map_err(ActionError::action_failed)?;
+    Ok(())
+}
+
 // "Migrate Instance" saga template
 #[derive(Debug, Deserialize, Serialize)]
 pub struct ParamsInstanceMigrate {