From 003322f52a51cf6dcafc7c95019278f3d16e586b Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 24 May 2024 12:41:23 -0700 Subject: [PATCH] tear apart most of `cpapi_instances_put` --- clients/nexus-client/src/lib.rs | 6 +- clients/sled-agent-client/src/lib.rs | 6 +- common/src/api/internal/nexus.rs | 3 - nexus/src/app/instance.rs | 311 ++++++++++++++------------- openapi/nexus-internal.json | 50 ----- sled-agent/src/common/instance.rs | 1 - sled-agent/src/sim/collection.rs | 13 +- sled-agent/src/sim/instance.rs | 2 +- sled-agent/src/sim/sled_agent.rs | 6 +- 9 files changed, 163 insertions(+), 235 deletions(-) diff --git a/clients/nexus-client/src/lib.rs b/clients/nexus-client/src/lib.rs index bcdd3971c03..6365825d33b 100644 --- a/clients/nexus-client/src/lib.rs +++ b/clients/nexus-client/src/lib.rs @@ -132,11 +132,7 @@ impl From fn from( s: omicron_common::api::internal::nexus::SledInstanceState, ) -> Self { - Self { - instance_state: s.instance_state.into(), - propolis_id: s.propolis_id, - vmm_state: s.vmm_state.into(), - } + Self { propolis_id: s.propolis_id, vmm_state: s.vmm_state.into() } } } diff --git a/clients/sled-agent-client/src/lib.rs b/clients/sled-agent-client/src/lib.rs index 24bb2a6df84..d88388ced09 100644 --- a/clients/sled-agent-client/src/lib.rs +++ b/clients/sled-agent-client/src/lib.rs @@ -313,11 +313,7 @@ impl From for omicron_common::api::internal::nexus::SledInstanceState { fn from(s: types::SledInstanceState) -> Self { - Self { - instance_state: s.instance_state.into(), - propolis_id: s.propolis_id, - vmm_state: s.vmm_state.into(), - } + Self { propolis_id: s.propolis_id, vmm_state: s.vmm_state.into() } } } diff --git a/common/src/api/internal/nexus.rs b/common/src/api/internal/nexus.rs index de611262bfc..140b098aa67 100644 --- a/common/src/api/internal/nexus.rs +++ b/common/src/api/internal/nexus.rs @@ -75,9 +75,6 @@ pub struct VmmRuntimeState { /// specific VMM and the instance it incarnates. #[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] pub struct SledInstanceState { - /// The sled's conception of the state of the instance. - pub instance_state: InstanceRuntimeState, - /// The ID of the VMM whose state is being reported. pub propolis_id: Uuid, diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index ed5c5674ac3..4ff51ff5462 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -1989,9 +1989,8 @@ pub(crate) async fn notify_instance_updated( ) -> Result, Error> { let propolis_id = new_runtime_state.propolis_id; - info!(log, "received new runtime state from sled agent"; + info!(log, "received new VMM runtime state from sled agent"; "instance_id" => %instance_id, - "instance_state" => ?new_runtime_state.instance_state, "propolis_id" => %propolis_id, "vmm_state" => ?new_runtime_state.vmm_state); @@ -2002,158 +2001,164 @@ pub(crate) async fn notify_instance_updated( .fetch() .await?; - // Update OPTE and Dendrite if the instance's active sled assignment - // changed or a migration was retired. If these actions fail, sled agent - // is expected to retry this update. - // - // This configuration must be updated before updating any state in CRDB - // so that, if the instance was migrating or has shut down, it will not - // appear to be able to migrate or start again until the appropriate - // networking state has been written. Without this interlock, another - // thread or another Nexus can race with this routine to write - // conflicting configuration. 
- // - // In the future, this should be replaced by a call to trigger a - // networking state update RPW. - super::instance_network::ensure_updated_instance_network_config( - datastore, - log, - resolver, - opctx, - opctx_alloc, - &authz_instance, - db_instance.runtime(), - &new_runtime_state.instance_state, - v2p_notification_tx.clone(), - ) - .await?; - - // If the supplied instance state indicates that the instance no longer - // has an active VMM, attempt to delete the virtual provisioning record, - // and the assignment of the Propolis metric producer to an oximeter - // collector. - // - // As with updating networking state, this must be done before - // committing the new runtime state to the database: once the DB is - // written, a new start saga can arrive and start the instance, which - // will try to create its own virtual provisioning charges, which will - // race with this operation. - if new_runtime_state.instance_state.propolis_id.is_none() { - datastore - .virtual_provisioning_collection_delete_instance( - opctx, - *instance_id, - db_instance.project_id, - i64::from(db_instance.ncpus.0 .0), - db_instance.memory, - (&new_runtime_state.instance_state.gen).into(), - ) - .await?; - - // TODO-correctness: The `notify_instance_updated` method can run - // concurrently with itself in some situations, such as where a - // sled-agent attempts to update Nexus about a stopped instance; - // that times out; and it makes another request to a different - // Nexus. The call to `unassign_producer` is racy in those - // situations, and we may end with instances with no metrics. - // - // This unfortunate case should be handled as part of - // instance-lifecycle improvements, notably using a reliable - // persistent workflow to correctly update the oximete assignment as - // an instance's state changes. - // - // Tracked in https://github.com/oxidecomputer/omicron/issues/3742. - super::oximeter::unassign_producer(datastore, log, opctx, instance_id) - .await?; - } - - // Write the new instance and VMM states back to CRDB. This needs to be - // done before trying to clean up the VMM, since the datastore will only - // allow a VMM to be marked as deleted if it is already in a terminal - // state. - let result = datastore - .instance_and_vmm_update_runtime( - instance_id, - &db::model::InstanceRuntimeState::from( - new_runtime_state.instance_state.clone(), - ), - &propolis_id, - &db::model::VmmRuntimeState::from( - new_runtime_state.vmm_state.clone(), - ), - ) - .await; - - // If the VMM is now in a terminal state, make sure its resources get - // cleaned up. - // - // For idempotency, only check to see if the update was successfully - // processed and ignore whether the VMM record was actually updated. - // This is required to handle the case where this routine is called - // once, writes the terminal VMM state, fails before all per-VMM - // resources are released, returns a retriable error, and is retried: - // the per-VMM resources still need to be cleaned up, but the DB update - // will return Ok(_, false) because the database was already updated. - // - // Unlike the pre-update cases, it is legal to do this cleanup *after* - // committing state to the database, because a terminated VMM cannot be - // reused (restarting or migrating its former instance will use new VMM - // IDs). 
- if result.is_ok() { - let propolis_terminated = matches!( - new_runtime_state.vmm_state.state, - InstanceState::Destroyed | InstanceState::Failed - ); - - if propolis_terminated { - info!(log, "vmm is terminated, cleaning up resources"; - "instance_id" => %instance_id, - "propolis_id" => %propolis_id); - - datastore.sled_reservation_delete(opctx, propolis_id).await?; - - if !datastore.vmm_mark_deleted(opctx, &propolis_id).await? { - warn!(log, "failed to mark vmm record as deleted"; - "instance_id" => %instance_id, - "propolis_id" => %propolis_id, - "vmm_state" => ?new_runtime_state.vmm_state); - } - } - } - - match result { - Ok((instance_updated, vmm_updated)) => { - info!(log, "instance and vmm updated by sled agent"; - "instance_id" => %instance_id, - "propolis_id" => %propolis_id, - "instance_updated" => instance_updated, - "vmm_updated" => vmm_updated); - Ok(Some(InstanceUpdated { instance_updated, vmm_updated })) - } - - // The update command should swallow object-not-found errors and - // return them back as failures to update, so this error case is - // unexpected. There's no work to do if this occurs, however. - Err(Error::ObjectNotFound { .. }) => { - error!(log, "instance/vmm update unexpectedly returned \ - an object not found error"; - "instance_id" => %instance_id, - "propolis_id" => %propolis_id); - Ok(None) - } + let updated = datastore + .vmm_update_runtime(&propolis_id, &new_runtime_state.vmm_state) + .await?; - // If the datastore is unavailable, propagate that to the caller. - // TODO-robustness Really this should be any _transient_ error. How - // can we distinguish? Maybe datastore should emit something - // different from Error with an Into. - Err(error) => { - warn!(log, "failed to update instance from sled agent"; - "instance_id" => %instance_id, - "propolis_id" => %propolis_id, - "error" => ?error); - Err(error) - } - } + // // Update OPTE and Dendrite if the instance's active sled assignment + // // changed or a migration was retired. If these actions fail, sled agent + // // is expected to retry this update. + // // + // // This configuration must be updated before updating any state in CRDB + // // so that, if the instance was migrating or has shut down, it will not + // // appear to be able to migrate or start again until the appropriate + // // networking state has been written. Without this interlock, another + // // thread or another Nexus can race with this routine to write + // // conflicting configuration. + // // + // // In the future, this should be replaced by a call to trigger a + // // networking state update RPW. + // super::instance_network::ensure_updated_instance_network_config( + // datastore, + // log, + // resolver, + // opctx, + // opctx_alloc, + // &authz_instance, + // db_instance.runtime(), + // &new_runtime_state.instance_state, + // v2p_notification_tx.clone(), + // ) + // .await?; + + // // If the supplied instance state indicates that the instance no longer + // // has an active VMM, attempt to delete the virtual provisioning record, + // // and the assignment of the Propolis metric producer to an oximeter + // // collector. + // // + // // As with updating networking state, this must be done before + // // committing the new runtime state to the database: once the DB is + // // written, a new start saga can arrive and start the instance, which + // // will try to create its own virtual provisioning charges, which will + // // race with this operation. 
+ // if new_runtime_state.instance_state.propolis_id.is_none() { + // datastore + // .virtual_provisioning_collection_delete_instance( + // opctx, + // *instance_id, + // db_instance.project_id, + // i64::from(db_instance.ncpus.0 .0), + // db_instance.memory, + // (&new_runtime_state.instance_state.gen).into(), + // ) + // .await?; + + // // TODO-correctness: The `notify_instance_updated` method can run + // // concurrently with itself in some situations, such as where a + // // sled-agent attempts to update Nexus about a stopped instance; + // // that times out; and it makes another request to a different + // // Nexus. The call to `unassign_producer` is racy in those + // // situations, and we may end with instances with no metrics. + // // + // // This unfortunate case should be handled as part of + // // instance-lifecycle improvements, notably using a reliable + // // persistent workflow to correctly update the oximete assignment as + // // an instance's state changes. + // // + // // Tracked in https://github.com/oxidecomputer/omicron/issues/3742. + // super::oximeter::unassign_producer(datastore, log, opctx, instance_id) + // .await?; + // } + + // // Write the new instance and VMM states back to CRDB. This needs to be + // // done before trying to clean up the VMM, since the datastore will only + // // allow a VMM to be marked as deleted if it is already in a terminal + // // state. + // let result = datastore + // .instance_and_vmm_update_runtime( + // instance_id, + // &db::model::InstanceRuntimeState::from( + // new_runtime_state.instance_state.clone(), + // ), + // &propolis_id, + // &db::model::VmmRuntimeState::from( + // new_runtime_state.vmm_state.clone(), + // ), + // ) + // .await; + + // // If the VMM is now in a terminal state, make sure its resources get + // // cleaned up. + // // + // // For idempotency, only check to see if the update was successfully + // // processed and ignore whether the VMM record was actually updated. + // // This is required to handle the case where this routine is called + // // once, writes the terminal VMM state, fails before all per-VMM + // // resources are released, returns a retriable error, and is retried: + // // the per-VMM resources still need to be cleaned up, but the DB update + // // will return Ok(_, false) because the database was already updated. + // // + // // Unlike the pre-update cases, it is legal to do this cleanup *after* + // // committing state to the database, because a terminated VMM cannot be + // // reused (restarting or migrating its former instance will use new VMM + // // IDs). + // if result.is_ok() { + // let propolis_terminated = matches!( + // new_runtime_state.vmm_state.state, + // InstanceState::Destroyed | InstanceState::Failed + // ); + + // if propolis_terminated { + // info!(log, "vmm is terminated, cleaning up resources"; + // "instance_id" => %instance_id, + // "propolis_id" => %propolis_id); + + // datastore.sled_reservation_delete(opctx, propolis_id).await?; + + // if !datastore.vmm_mark_deleted(opctx, &propolis_id).await? 
{ + // warn!(log, "failed to mark vmm record as deleted"; + // "instance_id" => %instance_id, + // "propolis_id" => %propolis_id, + // "vmm_state" => ?new_runtime_state.vmm_state); + // } + // } + // } + + // match result { + // Ok((instance_updated, vmm_updated)) => { + // info!(log, "instance and vmm updated by sled agent"; + // "instance_id" => %instance_id, + // "propolis_id" => %propolis_id, + // "instance_updated" => instance_updated, + // "vmm_updated" => vmm_updated); + // Ok(Some(InstanceUpdated { instance_updated, vmm_updated })) + // } + + // // The update command should swallow object-not-found errors and + // // return them back as failures to update, so this error case is + // // unexpected. There's no work to do if this occurs, however. + // Err(Error::ObjectNotFound { .. }) => { + // error!(log, "instance/vmm update unexpectedly returned \ + // an object not found error"; + // "instance_id" => %instance_id, + // "propolis_id" => %propolis_id); + // Ok(None) + // } + + // // If the datastore is unavailable, propagate that to the caller. + // // TODO-robustness Really this should be any _transient_ error. How + // // can we distinguish? Maybe datastore should emit something + // // different from Error with an Into. + // Err(error) => { + // warn!(log, "failed to update instance from sled agent"; + // "instance_id" => %instance_id, + // "propolis_id" => %propolis_id, + // "error" => ?error); + // Err(error) + // } + + // } + Ok(Some(InstanceUpdated { vmm_updated: updated, instance_updated: false })) } #[cfg(test)] diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index ad109a18fa6..97beb7372aa 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -3049,47 +3049,6 @@ } ] }, - "InstanceRuntimeState": { - "description": "The dynamic runtime properties of an instance: its current VMM ID (if any), migration information (if any), and the instance state to report if there is no active VMM.", - "type": "object", - "properties": { - "dst_propolis_id": { - "nullable": true, - "description": "If a migration is active, the ID of the target VMM.", - "type": "string", - "format": "uuid" - }, - "gen": { - "description": "Generation number for this state.", - "allOf": [ - { - "$ref": "#/components/schemas/Generation" - } - ] - }, - "migration_id": { - "nullable": true, - "description": "If a migration is active, the ID of that migration.", - "type": "string", - "format": "uuid" - }, - "propolis_id": { - "nullable": true, - "description": "The instance's currently active VMM ID.", - "type": "string", - "format": "uuid" - }, - "time_updated": { - "description": "Timestamp for this information.", - "type": "string", - "format": "date-time" - } - }, - "required": [ - "gen", - "time_updated" - ] - }, "InstanceState": { "description": "Running state of an Instance (primarily: booted or stopped)\n\nThis typically reflects whether it's starting, running, stopping, or stopped, but also includes states related to the Instance's lifecycle", "oneOf": [ @@ -4581,14 +4540,6 @@ "description": "A wrapper type containing a sled's total knowledge of the state of a specific VMM and the instance it incarnates.", "type": "object", "properties": { - "instance_state": { - "description": "The sled's conception of the state of the instance.", - "allOf": [ - { - "$ref": "#/components/schemas/InstanceRuntimeState" - } - ] - }, "propolis_id": { "description": "The ID of the VMM whose state is being reported.", "type": "string", @@ -4604,7 +4555,6 @@ } }, "required": [ - 
"instance_state", "propolis_id", "vmm_state" ] diff --git a/sled-agent/src/common/instance.rs b/sled-agent/src/common/instance.rs index d7ee8982e08..707a553cfe7 100644 --- a/sled-agent/src/common/instance.rs +++ b/sled-agent/src/common/instance.rs @@ -226,7 +226,6 @@ impl InstanceStates { /// use the `instance` or `vmm` accessors instead. pub fn sled_instance_state(&self) -> SledInstanceState { SledInstanceState { - instance_state: self.instance.clone(), vmm_state: self.vmm.clone(), propolis_id: self.propolis_id, } diff --git a/sled-agent/src/sim/collection.rs b/sled-agent/src/sim/collection.rs index bbc3e440aba..39700870049 100644 --- a/sled-agent/src/sim/collection.rs +++ b/sled-agent/src/sim/collection.rs @@ -447,11 +447,7 @@ mod test { time_updated: Utc::now(), }; - let state = SledInstanceState { - instance_state: instance_vmm, - vmm_state, - propolis_id, - }; + let state = SledInstanceState { vmm_state, propolis_id }; SimObject::new_simulated_auto(&state, logctx.log.new(o!())) } @@ -500,14 +496,8 @@ mod test { assert!(dropped.is_none()); assert!(instance.object.desired().is_none()); let rnext = instance.object.current(); - assert!(rnext.instance_state.gen > rprev.instance_state.gen); assert!(rnext.vmm_state.gen > rprev.vmm_state.gen); - assert!( - rnext.instance_state.time_updated - >= rprev.instance_state.time_updated - ); assert!(rnext.vmm_state.time_updated >= rprev.vmm_state.time_updated); - assert!(rnext.instance_state.propolis_id.is_none()); assert_eq!(rnext.vmm_state.state, InstanceState::Destroyed); assert!(rx.try_next().is_err()); @@ -631,7 +621,6 @@ mod test { assert!(rnext.vmm_state.time_updated >= rprev.vmm_state.time_updated); assert_eq!(rprev.vmm_state.state, InstanceState::Stopping); assert_eq!(rnext.vmm_state.state, InstanceState::Destroyed); - assert!(rnext.instance_state.gen > rprev.instance_state.gen); logctx.cleanup_successful(); } diff --git a/sled-agent/src/sim/instance.rs b/sled-agent/src/sim/instance.rs index 8b00adce60d..38ba10489c5 100644 --- a/sled-agent/src/sim/instance.rs +++ b/sled-agent/src/sim/instance.rs @@ -428,7 +428,7 @@ impl Simulatable for SimInstance { SimInstance { inner: Arc::new(Mutex::new(SimInstanceInner { state: InstanceStates::new( - current.instance_state, + todo!(), current.vmm_state, current.propolis_id, ), diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index d9308bf769c..16a3f10938b 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -344,11 +344,7 @@ impl SledAgent { .instances .sim_ensure( &instance_id, - SledInstanceState { - instance_state: instance_runtime, - vmm_state: vmm_runtime, - propolis_id, - }, + SledInstanceState { vmm_state: vmm_runtime, propolis_id }, None, ) .await?;