Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into iliana/crdb-cluster-version-v2
Browse files Browse the repository at this point in the history
  • Loading branch information
iliana committed May 25, 2024
2 parents 7df2d80 + 4436dc1 commit 0871b51
Show file tree
Hide file tree
Showing 14 changed files with 665 additions and 8 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/hakari.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ jobs:
with:
toolchain: stable
- name: Install cargo-hakari
uses: taiki-e/install-action@0fc560009ad92371154ca652dcf2620d19331eee # v2
uses: taiki-e/install-action@7491b900536dd0dae2e47ce7c17f140e46328dc4 # v2
with:
tool: cargo-hakari
- name: Check workspace-hack Cargo.toml is up-to-date
Expand Down
53 changes: 53 additions & 0 deletions dev-tools/omdb/src/bin/omdb/nexus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -996,6 +996,59 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) {
eprintln!(" unexpected return value from task: {:?}", val)
}
};
} else if name == "abandoned_vmm_reaper" {
#[derive(Deserialize)]
struct TaskSuccess {
/// total number of abandoned VMMs found
found: usize,

/// number of abandoned VMM records that were deleted
vmms_deleted: usize,

/// number of abandoned VMM records that were already deleted when
/// we tried to delete them.
vmms_already_deleted: usize,

/// sled resource reservations that were released
sled_reservations_deleted: usize,

/// number of errors that occurred during the activation
error_count: usize,

/// the last error that occurred during execution.
error: Option<String>,
}
match serde_json::from_value::<TaskSuccess>(details.clone()) {
Err(error) => eprintln!(
"warning: failed to interpret task details: {:?}: {:?}",
error, details
),
Ok(TaskSuccess {
found,
vmms_deleted,
vmms_already_deleted,
sled_reservations_deleted,
error_count,
error,
}) => {
if let Some(error) = error {
println!(" task did not complete successfully!");
println!(" total errors: {error_count}");
println!(" most recent error: {error}");
}

println!(" total abandoned VMMs found: {found}");
println!(" VMM records deleted: {vmms_deleted}");
println!(
" VMM records already deleted by another Nexus: {}",
vmms_already_deleted,
);
println!(
" sled resource reservations deleted: {}",
sled_reservations_deleted,
);
}
};
} else {
println!(
"warning: unknown background task: {:?} \
Expand Down
15 changes: 15 additions & 0 deletions dev-tools/omdb/tests/env.out
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@ EXECUTING COMMAND: omdb ["nexus", "--nexus-internal-url", "http://127.0.0.1:REDA
termination: Exited(0)
---------------------------------------------
stdout:
task: "abandoned_vmm_reaper"
deletes sled reservations for VMMs that have been abandoned by their
instances


task: "bfd_manager"
Manages bidirectional fowarding detection (BFD) configuration on rack
switches
Expand Down Expand Up @@ -140,6 +145,11 @@ EXECUTING COMMAND: omdb ["nexus", "background-tasks", "doc"]
termination: Exited(0)
---------------------------------------------
stdout:
task: "abandoned_vmm_reaper"
deletes sled reservations for VMMs that have been abandoned by their
instances


task: "bfd_manager"
Manages bidirectional fowarding detection (BFD) configuration on rack
switches
Expand Down Expand Up @@ -242,6 +252,11 @@ EXECUTING COMMAND: omdb ["--dns-server", "[::1]:REDACTED_PORT", "nexus", "backgr
termination: Exited(0)
---------------------------------------------
stdout:
task: "abandoned_vmm_reaper"
deletes sled reservations for VMMs that have been abandoned by their
instances


task: "bfd_manager"
Manages bidirectional fowarding detection (BFD) configuration on rack
switches
Expand Down
15 changes: 15 additions & 0 deletions dev-tools/omdb/tests/successes.out
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,11 @@ EXECUTING COMMAND: omdb ["nexus", "background-tasks", "doc"]
termination: Exited(0)
---------------------------------------------
stdout:
task: "abandoned_vmm_reaper"
deletes sled reservations for VMMs that have been abandoned by their
instances


task: "bfd_manager"
Manages bidirectional fowarding detection (BFD) configuration on rack
switches
Expand Down Expand Up @@ -380,6 +385,16 @@ task: "blueprint_executor"
started at <REDACTED TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
last completion reported error: no blueprint

task: "abandoned_vmm_reaper"
configured period: every 1m
currently executing: no
last completed activation: <REDACTED ITERATIONS>, triggered by an explicit signal
started at <REDACTED TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
total abandoned VMMs found: 0
VMM records deleted: 0
VMM records already deleted by another Nexus: 0
sled resource reservations deleted: 0

task: "bfd_manager"
configured period: every 30s
currently executing: no
Expand Down
15 changes: 15 additions & 0 deletions nexus-config/src/nexus_config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -381,6 +381,8 @@ pub struct BackgroundTaskConfig {
pub service_firewall_propagation: ServiceFirewallPropagationConfig,
/// configuration for v2p mapping propagation task
pub v2p_mapping_propagation: V2PMappingPropagationConfig,
/// configuration for abandoned VMM reaper task
pub abandoned_vmm_reaper: AbandonedVmmReaperConfig,
}

#[serde_as]
Expand Down Expand Up @@ -549,6 +551,14 @@ pub struct V2PMappingPropagationConfig {
pub period_secs: Duration,
}

#[serde_as]
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct AbandonedVmmReaperConfig {
/// period (in seconds) for periodic activations of this background task
#[serde_as(as = "DurationSeconds<u64>")]
pub period_secs: Duration,
}

/// Configuration for a nexus server
#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)]
pub struct PackageConfig {
Expand Down Expand Up @@ -788,6 +798,7 @@ mod test {
instance_watcher.period_secs = 30
service_firewall_propagation.period_secs = 300
v2p_mapping_propagation.period_secs = 30
abandoned_vmm_reaper.period_secs = 60
[default_region_allocation_strategy]
type = "random"
seed = 0
Expand Down Expand Up @@ -926,6 +937,9 @@ mod test {
v2p_mapping_propagation: V2PMappingPropagationConfig {
period_secs: Duration::from_secs(30)
},
abandoned_vmm_reaper: AbandonedVmmReaperConfig {
period_secs: Duration::from_secs(60),
}
},
default_region_allocation_strategy:
crate::nexus_config::RegionAllocationStrategy::Random {
Expand Down Expand Up @@ -995,6 +1009,7 @@ mod test {
instance_watcher.period_secs = 30
service_firewall_propagation.period_secs = 300
v2p_mapping_propagation.period_secs = 30
abandoned_vmm_reaper.period_secs = 60
[default_region_allocation_strategy]
type = "random"
"##,
Expand Down
77 changes: 72 additions & 5 deletions nexus/db-queries/src/db/datastore/vmm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,21 @@ use crate::authz;
use crate::context::OpContext;
use crate::db::error::public_error_from_diesel;
use crate::db::error::ErrorHandler;
use crate::db::model::InstanceState as DbInstanceState;
use crate::db::model::Vmm;
use crate::db::model::VmmRuntimeState;
use crate::db::pagination::paginated;
use crate::db::schema::vmm::dsl;
use crate::db::update_and_check::UpdateAndCheck;
use crate::db::update_and_check::UpdateStatus;
use async_bb8_diesel::AsyncRunQueryDsl;
use chrono::Utc;
use diesel::prelude::*;
use omicron_common::api::external::CreateResult;
use omicron_common::api::external::DataPageParams;
use omicron_common::api::external::Error;
use omicron_common::api::external::InstanceState as ApiInstanceState;
use omicron_common::api::external::ListResultVec;
use omicron_common::api::external::LookupResult;
use omicron_common::api::external::LookupType;
use omicron_common::api::external::ResourceType;
Expand Down Expand Up @@ -50,9 +55,6 @@ impl DataStore {
opctx: &OpContext,
vmm_id: &Uuid,
) -> UpdateResult<bool> {
use crate::db::model::InstanceState as DbInstanceState;
use omicron_common::api::external::InstanceState as ApiInstanceState;

let valid_states = vec![
DbInstanceState::new(ApiInstanceState::Destroyed),
DbInstanceState::new(ApiInstanceState::Failed),
Expand All @@ -61,9 +63,15 @@ impl DataStore {
let updated = diesel::update(dsl::vmm)
.filter(dsl::id.eq(*vmm_id))
.filter(dsl::state.eq_any(valid_states))
.filter(dsl::time_deleted.is_null())
.set(dsl::time_deleted.eq(Utc::now()))
.execute_async(&*self.pool_connection_authorized(opctx).await?)
.check_if_exists::<Vmm>(*vmm_id)
.execute_and_check(&*self.pool_connection_authorized(opctx).await?)
.await
.map(|r| match r.status {
UpdateStatus::Updated => true,
UpdateStatus::NotUpdatedButExists => false,
})
.map_err(|e| {
public_error_from_diesel(
e,
Expand All @@ -74,7 +82,7 @@ impl DataStore {
)
})?;

Ok(updated != 0)
Ok(updated)
}

pub async fn vmm_fetch(
Expand Down Expand Up @@ -164,4 +172,63 @@ impl DataStore {

Ok(vmm)
}

/// Lists VMMs which have been abandoned by their instances after a
/// migration and are in need of cleanup.
///
/// A VMM is considered "abandoned" if (and only if):
///
/// - It is in the `Destroyed` state.
/// - It is not currently running an instance, and it is also not the
/// migration target of any instance (i.e. it is not pointed to by
/// any instance record's `active_propolis_id` and `target_propolis_id`
/// fields).
/// - It has not been deleted yet.
pub async fn vmm_list_abandoned(
&self,
opctx: &OpContext,
pagparams: &DataPageParams<'_, Uuid>,
) -> ListResultVec<Vmm> {
use crate::db::schema::instance::dsl as instance_dsl;
let destroyed = DbInstanceState::new(ApiInstanceState::Destroyed);
paginated(dsl::vmm, dsl::id, pagparams)
// In order to be considered "abandoned", a VMM must be:
// - in the `Destroyed` state
.filter(dsl::state.eq(destroyed))
// - not deleted yet
.filter(dsl::time_deleted.is_null())
// - not pointed to by any instance's `active_propolis_id` or
// `target_propolis_id`.
//
.left_join(
// Left join with the `instance` table on the VMM's instance ID, so
// that we can check if the instance pointed to by this VMM (if
// any exists) has this VMM pointed to by its
// `active_propolis_id` or `target_propolis_id` fields.
instance_dsl::instance
.on(instance_dsl::id.eq(dsl::instance_id)),
)
.filter(
dsl::id
.nullable()
.ne(instance_dsl::active_propolis_id)
// In SQL, *all* comparisons with NULL are `false`, even `!=
// NULL`, so we have to explicitly check for nulls here, or
// else VMMs whose instances have no `active_propolis_id`
// will not be considered abandoned (incorrectly).
.or(instance_dsl::active_propolis_id.is_null()),
)
.filter(
dsl::id
.nullable()
.ne(instance_dsl::target_propolis_id)
// As above, we must add this clause because SQL nulls have
// the most irritating behavior possible.
.or(instance_dsl::target_propolis_id.is_null()),
)
.select(Vmm::as_select())
.load_async(&*self.pool_connection_authorized(opctx).await?)
.await
.map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
}
}
1 change: 1 addition & 0 deletions nexus/examples/config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ region_replacement.period_secs = 30
instance_watcher.period_secs = 30
service_firewall_propagation.period_secs = 300
v2p_mapping_propagation.period_secs = 30
abandoned_vmm_reaper.period_secs = 60

[default_region_allocation_strategy]
# allocate region on 3 random distinct zpools, on 3 random distinct sleds.
Expand Down
Loading

0 comments on commit 0871b51

Please sign in to comment.