Skip to content

Commit

Permalink
Use Propolis ID as VM sled resource key (#2840)
Browse files Browse the repository at this point in the history
Using instance IDs to reserve VMM resources on sleds is not quite
flexible enough, because a single instance can have multiple Propolis
VMMs. Use the Propolis ID as the key for these instead.

Add a function to allow the deletion saga to fetch a previously-deleted
instance record so that it can obtain this ID at the correct time.
Simply returning the deleted record from the delete-record step is
insufficient because the record- deleting step needs to be idempotent
and, if it runs more than once, may not find any record to delete and
return.

Tested: cargo test. Will also do some ad hoc VM creations/deletions on a
test cluster before merging.

Fixes #2839.

---------

Co-authored-by: Sean Klein <sean@oxide.computer>
  • Loading branch information
gjcolombo and smklein authored Apr 14, 2023
1 parent 2882a12 commit 8d11bf1
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 5 deletions.
30 changes: 30 additions & 0 deletions nexus/db-queries/src/db/datastore/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,36 @@ impl DataStore {
Ok(db_instance)
}

/// Fetches information about a deleted instance. This can be used to
/// query the properties an instance had at the time it was deleted, which
/// can be useful when cleaning up a deleted instance.
pub async fn instance_fetch_deleted(
&self,
opctx: &OpContext,
authz_instance: &authz::Instance,
) -> LookupResult<Instance> {
opctx.authorize(authz::Action::Read, authz_instance).await?;

use db::schema::instance::dsl;
let instance = dsl::instance
.filter(dsl::id.eq(authz_instance.id()))
.filter(dsl::time_deleted.is_not_null())
.select(Instance::as_select())
.get_result_async(self.pool_authorized(opctx).await?)
.await
.map_err(|e| {
public_error_from_diesel_pool(
e,
ErrorHandler::NotFoundByLookup(
ResourceType::Instance,
LookupType::ById(authz_instance.id()),
),
)
})?;

Ok(instance)
}

// TODO-design It's tempting to return the updated state of the Instance
// here because it's convenient for consumers and by using a RETURNING
// clause, we could ensure that the "update" and "fetch" are atomic.
Expand Down
11 changes: 7 additions & 4 deletions nexus/src/app/sagas/instance_create.rs
Original file line number Diff line number Diff line change
Expand Up @@ -599,7 +599,10 @@ async fn sic_alloc_server(
let rss_ram = params.create_params.memory;
let reservoir_ram = omicron_common::api::external::ByteCount::from(0);

let instance_id = sagactx.lookup::<Uuid>("instance_id")?;
// Use the instance's Propolis ID as its resource key, since each unique
// Propolis consumes its own resources, and an instance can have multiple
// Propolises during a live migration.
let propolis_id = sagactx.lookup::<Uuid>("propolis_id")?;
let resources = db::model::Resources::new(
hardware_threads.into(),
rss_ram.into(),
Expand All @@ -609,7 +612,7 @@ async fn sic_alloc_server(
let resource = osagactx
.nexus()
.reserve_on_random_sled(
instance_id,
propolis_id,
db::model::SledResourceKind::Instance,
resources,
)
Expand All @@ -622,9 +625,9 @@ async fn sic_alloc_server_undo(
sagactx: NexusActionContext,
) -> Result<(), anyhow::Error> {
let osagactx = sagactx.user_data();
let instance_id = sagactx.lookup::<Uuid>("instance_id")?;
let propolis_id = sagactx.lookup::<Uuid>("propolis_id")?;

osagactx.nexus().delete_sled_reservation(instance_id).await?;
osagactx.nexus().delete_sled_reservation(propolis_id).await?;
Ok(())
}

Expand Down
20 changes: 19 additions & 1 deletion nexus/src/app/sagas/instance_delete.rs
Original file line number Diff line number Diff line change
Expand Up @@ -292,9 +292,27 @@ async fn sid_account_sled_resources(
&params.serialized_authn,
);

// Fetch the previously-deleted instance record to get its Propolis ID. It
// is safe to fetch the ID at this point because the instance is already
// deleted and so cannot change anymore.
//
// TODO(#2315): This prevents the garbage collection of soft-deleted
// instance records. A better method is to remove a Propolis's reservation
// once an instance no longer refers to it (e.g. when it has stopped or
// been removed from the instance's migration information) and then make
// this saga check that the instance has no active Propolises before it is
// deleted. This logic should be part of the logic needed to stop an
// instance and release its Propolis reservation; when that is added this
// step can be removed.
let instance = osagactx
.datastore()
.instance_fetch_deleted(&opctx, &params.authz_instance)
.await
.map_err(ActionError::action_failed)?;

osagactx
.datastore()
.sled_reservation_delete(&opctx, params.instance.id())
.sled_reservation_delete(&opctx, instance.runtime().propolis_id)
.await
.map_err(ActionError::action_failed)?;
Ok(())
Expand Down

0 comments on commit 8d11bf1

Please sign in to comment.