oxidecomputer · gjcolombo · Apr 14, 2023 · Apr 14, 2023 · Apr 14, 2023 · Apr 14, 2023
diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs
@@ -163,6 +163,36 @@ impl DataStore {
         Ok(db_instance)
     }
 
+    /// Fetches information about a deleted instance. This can be used to
+    /// query the properties an instance had at the time it was deleted, which
+    /// can be useful when cleaning up a deleted instance.
+    pub async fn instance_fetch_deleted(
+        &self,
+        opctx: &OpContext,
+        authz_instance: &authz::Instance,
+    ) -> LookupResult<Instance> {
+        opctx.authorize(authz::Action::Read, authz_instance).await?;
+
+        use db::schema::instance::dsl;
+        let instance = dsl::instance
+            .filter(dsl::id.eq(authz_instance.id()))
+            .filter(dsl::time_deleted.is_not_null())
+            .select(Instance::as_select())
+            .get_result_async(self.pool_authorized(opctx).await?)
+            .await
+            .map_err(|e| {
+                public_error_from_diesel_pool(
+                    e,
+                    ErrorHandler::NotFoundByLookup(
+                        ResourceType::Instance,
+                        LookupType::ById(authz_instance.id()),
+                    ),
+                )
+            })?;
+
+        Ok(instance)
+    }
+
     // TODO-design It's tempting to return the updated state of the Instance
     // here because it's convenient for consumers and by using a RETURNING
     // clause, we could ensure that the "update" and "fetch" are atomic.

diff --git a/nexus/src/app/sagas/instance_create.rs b/nexus/src/app/sagas/instance_create.rs
@@ -599,7 +599,10 @@ async fn sic_alloc_server(
     let rss_ram = params.create_params.memory;
     let reservoir_ram = omicron_common::api::external::ByteCount::from(0);
 
-    let instance_id = sagactx.lookup::<Uuid>("instance_id")?;
+    // Use the instance's Propolis ID as its resource key, since each unique
+    // Propolis consumes its own resources, and an instance can have multiple
+    // Propolises during a live migration.
+    let propolis_id = sagactx.lookup::<Uuid>("propolis_id")?;
     let resources = db::model::Resources::new(
         hardware_threads.into(),
         rss_ram.into(),
@@ -609,7 +612,7 @@ async fn sic_alloc_server(
     let resource = osagactx
         .nexus()
         .reserve_on_random_sled(
-            instance_id,
+            propolis_id,
             db::model::SledResourceKind::Instance,
             resources,
         )
@@ -622,9 +625,9 @@ async fn sic_alloc_server_undo(
     sagactx: NexusActionContext,
 ) -> Result<(), anyhow::Error> {
     let osagactx = sagactx.user_data();
-    let instance_id = sagactx.lookup::<Uuid>("instance_id")?;
+    let propolis_id = sagactx.lookup::<Uuid>("propolis_id")?;
 
-    osagactx.nexus().delete_sled_reservation(instance_id).await?;
+    osagactx.nexus().delete_sled_reservation(propolis_id).await?;
     Ok(())
 }
 

diff --git a/nexus/src/app/sagas/instance_delete.rs b/nexus/src/app/sagas/instance_delete.rs
@@ -292,9 +292,27 @@ async fn sid_account_sled_resources(
         &params.serialized_authn,
     );
 
+    // Fetch the previously-deleted instance record to get its Propolis ID. It
+    // is safe to fetch the ID at this point because the instance is already
+    // deleted and so cannot change anymore.
+    //
+    // TODO(#2315): This prevents the garbage collection of soft-deleted
+    // instance records. A better method is to remove a Propolis's reservation
+    // once an instance no longer refers to it (e.g. when it has stopped or
+    // been removed from the instance's migration information) and then make
+    // this saga check that the instance has no active Propolises before it is
+    // deleted. This logic should be part of the logic needed to stop an
+    // instance and release its Propolis reservation; when that is added this
+    // step can be removed.
+    let instance = osagactx
+        .datastore()
+        .instance_fetch_deleted(&opctx, &params.authz_instance)
+        .await
+        .map_err(ActionError::action_failed)?;
+
     osagactx
         .datastore()
-        .sled_reservation_delete(&opctx, params.instance.id())
+        .sled_reservation_delete(&opctx, instance.runtime().propolis_id)
         .await
         .map_err(ActionError::action_failed)?;
     Ok(())