From 003322f52a51cf6dcafc7c95019278f3d16e586b Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 24 May 2024 12:41:23 -0700 Subject: [PATCH] tear apart most of `cpapi_instances_put` --- clients/nexus-client/src/lib.rs | 6 +- clients/sled-agent-client/src/lib.rs | 6 +- common/src/api/internal/nexus.rs | 3 - nexus/src/app/instance.rs | 311 ++++++++++++++------------- openapi/nexus-internal.json | 50 ----- sled-agent/src/common/instance.rs | 1 - sled-agent/src/sim/collection.rs | 13 +- sled-agent/src/sim/instance.rs | 2 +- sled-agent/src/sim/sled_agent.rs | 6 +- 9 files changed, 163 insertions(+), 235 deletions(-) diff --git a/clients/nexus-client/src/lib.rs b/clients/nexus-client/src/lib.rs index bcdd3971c03..6365825d33b 100644 --- a/clients/nexus-client/src/lib.rs +++ b/clients/nexus-client/src/lib.rs @@ -132,11 +132,7 @@ impl From fn from( s: omicron_common::api::internal::nexus::SledInstanceState, ) -> Self { - Self { - instance_state: s.instance_state.into(), - propolis_id: s.propolis_id, - vmm_state: s.vmm_state.into(), - } + Self { propolis_id: s.propolis_id, vmm_state: s.vmm_state.into() } } } diff --git a/clients/sled-agent-client/src/lib.rs b/clients/sled-agent-client/src/lib.rs index 24bb2a6df84..d88388ced09 100644 --- a/clients/sled-agent-client/src/lib.rs +++ b/clients/sled-agent-client/src/lib.rs @@ -313,11 +313,7 @@ impl From for omicron_common::api::internal::nexus::SledInstanceState { fn from(s: types::SledInstanceState) -> Self { - Self { - instance_state: s.instance_state.into(), - propolis_id: s.propolis_id, - vmm_state: s.vmm_state.into(), - } + Self { propolis_id: s.propolis_id, vmm_state: s.vmm_state.into() } } } diff --git a/common/src/api/internal/nexus.rs b/common/src/api/internal/nexus.rs index de611262bfc..140b098aa67 100644 --- a/common/src/api/internal/nexus.rs +++ b/common/src/api/internal/nexus.rs @@ -75,9 +75,6 @@ pub struct VmmRuntimeState { /// specific VMM and the instance it incarnates. #[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] pub struct SledInstanceState { - /// The sled's conception of the state of the instance. - pub instance_state: InstanceRuntimeState, - /// The ID of the VMM whose state is being reported. pub propolis_id: Uuid, diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index ed5c5674ac3..4ff51ff5462 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -1989,9 +1989,8 @@ pub(crate) async fn notify_instance_updated( ) -> Result, Error> { let propolis_id = new_runtime_state.propolis_id; - info!(log, "received new runtime state from sled agent"; + info!(log, "received new VMM runtime state from sled agent"; "instance_id" => %instance_id, - "instance_state" => ?new_runtime_state.instance_state, "propolis_id" => %propolis_id, "vmm_state" => ?new_runtime_state.vmm_state); @@ -2002,158 +2001,164 @@ pub(crate) async fn notify_instance_updated( .fetch() .await?; - // Update OPTE and Dendrite if the instance's active sled assignment - // changed or a migration was retired. If these actions fail, sled agent - // is expected to retry this update. - // - // This configuration must be updated before updating any state in CRDB - // so that, if the instance was migrating or has shut down, it will not - // appear to be able to migrate or start again until the appropriate - // networking state has been written. Without this interlock, another - // thread or another Nexus can race with this routine to write - // conflicting configuration. 
- // - // In the future, this should be replaced by a call to trigger a - // networking state update RPW. - super::instance_network::ensure_updated_instance_network_config( - datastore, - log, - resolver, - opctx, - opctx_alloc, - &authz_instance, - db_instance.runtime(), - &new_runtime_state.instance_state, - v2p_notification_tx.clone(), - ) - .await?; - - // If the supplied instance state indicates that the instance no longer - // has an active VMM, attempt to delete the virtual provisioning record, - // and the assignment of the Propolis metric producer to an oximeter - // collector. - // - // As with updating networking state, this must be done before - // committing the new runtime state to the database: once the DB is - // written, a new start saga can arrive and start the instance, which - // will try to create its own virtual provisioning charges, which will - // race with this operation. - if new_runtime_state.instance_state.propolis_id.is_none() { - datastore - .virtual_provisioning_collection_delete_instance( - opctx, - *instance_id, - db_instance.project_id, - i64::from(db_instance.ncpus.0 .0), - db_instance.memory, - (&new_runtime_state.instance_state.gen).into(), - ) - .await?; - - // TODO-correctness: The `notify_instance_updated` method can run - // concurrently with itself in some situations, such as where a - // sled-agent attempts to update Nexus about a stopped instance; - // that times out; and it makes another request to a different - // Nexus. The call to `unassign_producer` is racy in those - // situations, and we may end with instances with no metrics. - // - // This unfortunate case should be handled as part of - // instance-lifecycle improvements, notably using a reliable - // persistent workflow to correctly update the oximete assignment as - // an instance's state changes. - // - // Tracked in https://github.com/oxidecomputer/omicron/issues/3742. - super::oximeter::unassign_producer(datastore, log, opctx, instance_id) - .await?; - } - - // Write the new instance and VMM states back to CRDB. This needs to be - // done before trying to clean up the VMM, since the datastore will only - // allow a VMM to be marked as deleted if it is already in a terminal - // state. - let result = datastore - .instance_and_vmm_update_runtime( - instance_id, - &db::model::InstanceRuntimeState::from( - new_runtime_state.instance_state.clone(), - ), - &propolis_id, - &db::model::VmmRuntimeState::from( - new_runtime_state.vmm_state.clone(), - ), - ) - .await; - - // If the VMM is now in a terminal state, make sure its resources get - // cleaned up. - // - // For idempotency, only check to see if the update was successfully - // processed and ignore whether the VMM record was actually updated. - // This is required to handle the case where this routine is called - // once, writes the terminal VMM state, fails before all per-VMM - // resources are released, returns a retriable error, and is retried: - // the per-VMM resources still need to be cleaned up, but the DB update - // will return Ok(_, false) because the database was already updated. - // - // Unlike the pre-update cases, it is legal to do this cleanup *after* - // committing state to the database, because a terminated VMM cannot be - // reused (restarting or migrating its former instance will use new VMM - // IDs). 
- if result.is_ok() { - let propolis_terminated = matches!( - new_runtime_state.vmm_state.state, - InstanceState::Destroyed | InstanceState::Failed - ); - - if propolis_terminated { - info!(log, "vmm is terminated, cleaning up resources"; - "instance_id" => %instance_id, - "propolis_id" => %propolis_id); - - datastore.sled_reservation_delete(opctx, propolis_id).await?; - - if !datastore.vmm_mark_deleted(opctx, &propolis_id).await? { - warn!(log, "failed to mark vmm record as deleted"; - "instance_id" => %instance_id, - "propolis_id" => %propolis_id, - "vmm_state" => ?new_runtime_state.vmm_state); - } - } - } - - match result { - Ok((instance_updated, vmm_updated)) => { - info!(log, "instance and vmm updated by sled agent"; - "instance_id" => %instance_id, - "propolis_id" => %propolis_id, - "instance_updated" => instance_updated, - "vmm_updated" => vmm_updated); - Ok(Some(InstanceUpdated { instance_updated, vmm_updated })) - } - - // The update command should swallow object-not-found errors and - // return them back as failures to update, so this error case is - // unexpected. There's no work to do if this occurs, however. - Err(Error::ObjectNotFound { .. }) => { - error!(log, "instance/vmm update unexpectedly returned \ - an object not found error"; - "instance_id" => %instance_id, - "propolis_id" => %propolis_id); - Ok(None) - } + let updated = datastore + .vmm_update_runtime(&propolis_id, &new_runtime_state.vmm_state) + .await?; - // If the datastore is unavailable, propagate that to the caller. - // TODO-robustness Really this should be any _transient_ error. How - // can we distinguish? Maybe datastore should emit something - // different from Error with an Into. - Err(error) => { - warn!(log, "failed to update instance from sled agent"; - "instance_id" => %instance_id, - "propolis_id" => %propolis_id, - "error" => ?error); - Err(error) - } - } + // // Update OPTE and Dendrite if the instance's active sled assignment + // // changed or a migration was retired. If these actions fail, sled agent + // // is expected to retry this update. + // // + // // This configuration must be updated before updating any state in CRDB + // // so that, if the instance was migrating or has shut down, it will not + // // appear to be able to migrate or start again until the appropriate + // // networking state has been written. Without this interlock, another + // // thread or another Nexus can race with this routine to write + // // conflicting configuration. + // // + // // In the future, this should be replaced by a call to trigger a + // // networking state update RPW. + // super::instance_network::ensure_updated_instance_network_config( + // datastore, + // log, + // resolver, + // opctx, + // opctx_alloc, + // &authz_instance, + // db_instance.runtime(), + // &new_runtime_state.instance_state, + // v2p_notification_tx.clone(), + // ) + // .await?; + + // // If the supplied instance state indicates that the instance no longer + // // has an active VMM, attempt to delete the virtual provisioning record, + // // and the assignment of the Propolis metric producer to an oximeter + // // collector. + // // + // // As with updating networking state, this must be done before + // // committing the new runtime state to the database: once the DB is + // // written, a new start saga can arrive and start the instance, which + // // will try to create its own virtual provisioning charges, which will + // // race with this operation. 
+ // if new_runtime_state.instance_state.propolis_id.is_none() { + // datastore + // .virtual_provisioning_collection_delete_instance( + // opctx, + // *instance_id, + // db_instance.project_id, + // i64::from(db_instance.ncpus.0 .0), + // db_instance.memory, + // (&new_runtime_state.instance_state.gen).into(), + // ) + // .await?; + + // // TODO-correctness: The `notify_instance_updated` method can run + // // concurrently with itself in some situations, such as where a + // // sled-agent attempts to update Nexus about a stopped instance; + // // that times out; and it makes another request to a different + // // Nexus. The call to `unassign_producer` is racy in those + // // situations, and we may end with instances with no metrics. + // // + // // This unfortunate case should be handled as part of + // // instance-lifecycle improvements, notably using a reliable + // // persistent workflow to correctly update the oximete assignment as + // // an instance's state changes. + // // + // // Tracked in https://github.com/oxidecomputer/omicron/issues/3742. + // super::oximeter::unassign_producer(datastore, log, opctx, instance_id) + // .await?; + // } + + // // Write the new instance and VMM states back to CRDB. This needs to be + // // done before trying to clean up the VMM, since the datastore will only + // // allow a VMM to be marked as deleted if it is already in a terminal + // // state. + // let result = datastore + // .instance_and_vmm_update_runtime( + // instance_id, + // &db::model::InstanceRuntimeState::from( + // new_runtime_state.instance_state.clone(), + // ), + // &propolis_id, + // &db::model::VmmRuntimeState::from( + // new_runtime_state.vmm_state.clone(), + // ), + // ) + // .await; + + // // If the VMM is now in a terminal state, make sure its resources get + // // cleaned up. + // // + // // For idempotency, only check to see if the update was successfully + // // processed and ignore whether the VMM record was actually updated. + // // This is required to handle the case where this routine is called + // // once, writes the terminal VMM state, fails before all per-VMM + // // resources are released, returns a retriable error, and is retried: + // // the per-VMM resources still need to be cleaned up, but the DB update + // // will return Ok(_, false) because the database was already updated. + // // + // // Unlike the pre-update cases, it is legal to do this cleanup *after* + // // committing state to the database, because a terminated VMM cannot be + // // reused (restarting or migrating its former instance will use new VMM + // // IDs). + // if result.is_ok() { + // let propolis_terminated = matches!( + // new_runtime_state.vmm_state.state, + // InstanceState::Destroyed | InstanceState::Failed + // ); + + // if propolis_terminated { + // info!(log, "vmm is terminated, cleaning up resources"; + // "instance_id" => %instance_id, + // "propolis_id" => %propolis_id); + + // datastore.sled_reservation_delete(opctx, propolis_id).await?; + + // if !datastore.vmm_mark_deleted(opctx, &propolis_id).await? 
{ + // warn!(log, "failed to mark vmm record as deleted"; + // "instance_id" => %instance_id, + // "propolis_id" => %propolis_id, + // "vmm_state" => ?new_runtime_state.vmm_state); + // } + // } + // } + + // match result { + // Ok((instance_updated, vmm_updated)) => { + // info!(log, "instance and vmm updated by sled agent"; + // "instance_id" => %instance_id, + // "propolis_id" => %propolis_id, + // "instance_updated" => instance_updated, + // "vmm_updated" => vmm_updated); + // Ok(Some(InstanceUpdated { instance_updated, vmm_updated })) + // } + + // // The update command should swallow object-not-found errors and + // // return them back as failures to update, so this error case is + // // unexpected. There's no work to do if this occurs, however. + // Err(Error::ObjectNotFound { .. }) => { + // error!(log, "instance/vmm update unexpectedly returned \ + // an object not found error"; + // "instance_id" => %instance_id, + // "propolis_id" => %propolis_id); + // Ok(None) + // } + + // // If the datastore is unavailable, propagate that to the caller. + // // TODO-robustness Really this should be any _transient_ error. How + // // can we distinguish? Maybe datastore should emit something + // // different from Error with an Into. + // Err(error) => { + // warn!(log, "failed to update instance from sled agent"; + // "instance_id" => %instance_id, + // "propolis_id" => %propolis_id, + // "error" => ?error); + // Err(error) + // } + + // } + Ok(Some(InstanceUpdated { vmm_updated: updated, instance_updated: false })) } #[cfg(test)] diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index ad109a18fa6..97beb7372aa 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -3049,47 +3049,6 @@ } ] }, - "InstanceRuntimeState": { - "description": "The dynamic runtime properties of an instance: its current VMM ID (if any), migration information (if any), and the instance state to report if there is no active VMM.", - "type": "object", - "properties": { - "dst_propolis_id": { - "nullable": true, - "description": "If a migration is active, the ID of the target VMM.", - "type": "string", - "format": "uuid" - }, - "gen": { - "description": "Generation number for this state.", - "allOf": [ - { - "$ref": "#/components/schemas/Generation" - } - ] - }, - "migration_id": { - "nullable": true, - "description": "If a migration is active, the ID of that migration.", - "type": "string", - "format": "uuid" - }, - "propolis_id": { - "nullable": true, - "description": "The instance's currently active VMM ID.", - "type": "string", - "format": "uuid" - }, - "time_updated": { - "description": "Timestamp for this information.", - "type": "string", - "format": "date-time" - } - }, - "required": [ - "gen", - "time_updated" - ] - }, "InstanceState": { "description": "Running state of an Instance (primarily: booted or stopped)\n\nThis typically reflects whether it's starting, running, stopping, or stopped, but also includes states related to the Instance's lifecycle", "oneOf": [ @@ -4581,14 +4540,6 @@ "description": "A wrapper type containing a sled's total knowledge of the state of a specific VMM and the instance it incarnates.", "type": "object", "properties": { - "instance_state": { - "description": "The sled's conception of the state of the instance.", - "allOf": [ - { - "$ref": "#/components/schemas/InstanceRuntimeState" - } - ] - }, "propolis_id": { "description": "The ID of the VMM whose state is being reported.", "type": "string", @@ -4604,7 +4555,6 @@ } }, "required": [ - 
"instance_state", "propolis_id", "vmm_state" ] diff --git a/sled-agent/src/common/instance.rs b/sled-agent/src/common/instance.rs index d7ee8982e08..707a553cfe7 100644 --- a/sled-agent/src/common/instance.rs +++ b/sled-agent/src/common/instance.rs @@ -226,7 +226,6 @@ impl InstanceStates { /// use the `instance` or `vmm` accessors instead. pub fn sled_instance_state(&self) -> SledInstanceState { SledInstanceState { - instance_state: self.instance.clone(), vmm_state: self.vmm.clone(), propolis_id: self.propolis_id, } diff --git a/sled-agent/src/sim/collection.rs b/sled-agent/src/sim/collection.rs index bbc3e440aba..39700870049 100644 --- a/sled-agent/src/sim/collection.rs +++ b/sled-agent/src/sim/collection.rs @@ -447,11 +447,7 @@ mod test { time_updated: Utc::now(), }; - let state = SledInstanceState { - instance_state: instance_vmm, - vmm_state, - propolis_id, - }; + let state = SledInstanceState { vmm_state, propolis_id }; SimObject::new_simulated_auto(&state, logctx.log.new(o!())) } @@ -500,14 +496,8 @@ mod test { assert!(dropped.is_none()); assert!(instance.object.desired().is_none()); let rnext = instance.object.current(); - assert!(rnext.instance_state.gen > rprev.instance_state.gen); assert!(rnext.vmm_state.gen > rprev.vmm_state.gen); - assert!( - rnext.instance_state.time_updated - >= rprev.instance_state.time_updated - ); assert!(rnext.vmm_state.time_updated >= rprev.vmm_state.time_updated); - assert!(rnext.instance_state.propolis_id.is_none()); assert_eq!(rnext.vmm_state.state, InstanceState::Destroyed); assert!(rx.try_next().is_err()); @@ -631,7 +621,6 @@ mod test { assert!(rnext.vmm_state.time_updated >= rprev.vmm_state.time_updated); assert_eq!(rprev.vmm_state.state, InstanceState::Stopping); assert_eq!(rnext.vmm_state.state, InstanceState::Destroyed); - assert!(rnext.instance_state.gen > rprev.instance_state.gen); logctx.cleanup_successful(); } diff --git a/sled-agent/src/sim/instance.rs b/sled-agent/src/sim/instance.rs index 8b00adce60d..38ba10489c5 100644 --- a/sled-agent/src/sim/instance.rs +++ b/sled-agent/src/sim/instance.rs @@ -428,7 +428,7 @@ impl Simulatable for SimInstance { SimInstance { inner: Arc::new(Mutex::new(SimInstanceInner { state: InstanceStates::new( - current.instance_state, + todo!(), current.vmm_state, current.propolis_id, ), diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index d9308bf769c..16a3f10938b 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -344,11 +344,7 @@ impl SledAgent { .instances .sim_ensure( &instance_id, - SledInstanceState { - instance_state: instance_runtime, - vmm_state: vmm_runtime, - propolis_id, - }, + SledInstanceState { vmm_state: vmm_runtime, propolis_id }, None, ) .await?;