From e6b489adb65af2555de6de74e3c32d1247387eef Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Wed, 26 Mar 2025 22:14:34 +0000 Subject: [PATCH 01/12] Account for Crucible Agent reservation overhead When creating a region's dataset, the Crucible Agent will include a reservation 25% larger than the region's size to account for on-disk overhead (storing encryption contexts and other metadata). Nexus does not take this overhead into account when computing `size_used` for crucible_dataset rows, or when allocating regions. This leads to the scenario where Nexus thinks there's enough room for a region but the Agent will fail to create the dataset due to not having enough space for the reservation to succeed. Fix this: add a reservation factor column to the Region model, and account for this when performing region allocation and when computing the `size_used` column for crucible datasets. This commit also adds an upgrader that will set all currently allocated Region's reservation factor to 1.25, and recompute all the `size_used` values for all non-deleted crucible datasets. This may lead to `size_used` being greater than the pool's total_size - a follow up commit will add an omdb command to identify these cases, and identify candidate regions to request replacement for in order to remedy this. The `regions_hard_delete` function now uses this upgrader's CTE to set `size_used` for all crucible datasets at once, instead of in a for loop during an interactive transaction. --- nexus/db-model/src/region.rs | 23 ++ nexus/db-model/src/schema.rs | 2 + nexus/db-model/src/schema_versions.rs | 3 +- nexus/db-queries/src/db/datastore/region.rs | 119 +++-------- nexus/db-queries/src/db/datastore/volume.rs | 2 + .../src/db/queries/region_allocation.rs | 26 ++- .../output/region_allocate_distinct_sleds.sql | 44 ++-- .../output/region_allocate_random_sleds.sql | 44 ++-- ..._allocate_with_snapshot_distinct_sleds.sql | 44 ++-- ...on_allocate_with_snapshot_random_sleds.sql | 44 ++-- .../execution/src/omicron_physical_disks.rs | 1 + .../tasks/decommissioned_disk_cleaner.rs | 1 + .../tasks/region_replacement_driver.rs | 6 + nexus/src/app/sagas/disk_create.rs | 2 +- .../app/sagas/region_replacement_finish.rs | 1 + .../src/app/sagas/region_replacement_start.rs | 6 +- nexus/test-utils/src/resource_helpers.rs | 2 +- nexus/tests/integration_tests/disks.rs | 95 ++++----- nexus/tests/integration_tests/schema.rs | 197 ++++++++++++++++++ nexus/tests/integration_tests/snapshots.rs | 10 +- .../up01.sql | 2 + .../up02.sql | 2 + .../up03.sql | 23 ++ schema/crdb/dbinit.sql | 6 +- 24 files changed, 480 insertions(+), 225 deletions(-) create mode 100644 schema/crdb/crucible-agent-reservation-overhead/up01.sql create mode 100644 schema/crdb/crucible-agent-reservation-overhead/up02.sql create mode 100644 schema/crdb/crucible-agent-reservation-overhead/up03.sql diff --git a/nexus/db-model/src/region.rs b/nexus/db-model/src/region.rs index c39301b1b83..deb05f1e742 100644 --- a/nexus/db-model/src/region.rs +++ b/nexus/db-model/src/region.rs @@ -55,9 +55,16 @@ pub struct Region { // Shared read-only regions require a "deleting" flag to avoid a // use-after-free scenario deleting: bool, + + // The Agent will reserve space for Downstairs overhead when creating the + // corresponding ZFS dataset. Nexus has to account for that: store that + // reservation factor here as it may change in the future, and it can be + // used during Crucible related accounting. 
+ reservation_factor: f64, } impl Region { + #[allow(clippy::too_many_arguments)] pub fn new( dataset_id: DatasetUuid, volume_id: VolumeUuid, @@ -66,6 +73,7 @@ impl Region { extent_count: u64, port: u16, read_only: bool, + reservation_factor: f64, ) -> Self { Self { identity: RegionIdentity::new(Uuid::new_v4()), @@ -77,6 +85,7 @@ impl Region { port: Some(port.into()), read_only, deleting: false, + reservation_factor, } } @@ -112,4 +121,18 @@ impl Region { pub fn deleting(&self) -> bool { self.deleting } + + /// The size of the Region without accounting for any overhead + pub fn requested_size(&self) -> u64 { + self.block_size().to_bytes() + * self.blocks_per_extent() + * self.extent_count() + } + + /// The size the Crucible agent would have reserved during ZFS creation, + /// which is some factor higher than the requested region size to account + /// for on-disk overhead. + pub fn reserved_size(&self) -> u64 { + (self.requested_size() as f64 * self.reservation_factor).round() as u64 + } } diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index feec853383f..6e6645ccd58 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -1125,6 +1125,8 @@ table! { read_only -> Bool, deleting -> Bool, + + reservation_factor -> Float8, } } diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index 1a149fb3767..81a8a9a91c4 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock}; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: Version = Version::new(132, 0, 0); +pub const SCHEMA_VERSION: Version = Version::new(133, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -28,6 +28,7 @@ static KNOWN_VERSIONS: LazyLock> = LazyLock::new(|| { // | leaving the first copy as an example for the next person. 
// v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(133, "crucible-agent-reservation-overhead"), KnownVersion::new(132, "bp-omicron-zone-filesystem-pool-not-null"), KnownVersion::new(131, "tuf-generation"), KnownVersion::new(130, "bp-sled-agent-generation"), diff --git a/nexus/db-queries/src/db/datastore/region.rs b/nexus/db-queries/src/db/datastore/region.rs index 2e27ed6a17d..73049082184 100644 --- a/nexus/db-queries/src/db/datastore/region.rs +++ b/nexus/db-queries/src/db/datastore/region.rs @@ -23,8 +23,8 @@ use crate::db::pagination::paginated; use crate::db::queries::region_allocation::RegionParameters; use crate::db::update_and_check::UpdateAndCheck; use crate::db::update_and_check::UpdateStatus; -use crate::transaction_retry::OptionalError; use async_bb8_diesel::AsyncRunQueryDsl; +use diesel::dsl::sql_query; use diesel::prelude::*; use nexus_config::RegionAllocationStrategy; use nexus_types::external_api::params; @@ -301,68 +301,26 @@ impl DataStore { return Ok(()); } - #[derive(Debug, thiserror::Error)] - enum RegionDeleteError { - #[error("Numeric error: {0}")] - NumericError(String), - } - let err = OptionalError::new(); let conn = self.pool_connection_unauthorized().await?; self.transaction_retry_wrapper("regions_hard_delete") .transaction(&conn, |conn| { - let err = err.clone(); let region_ids = region_ids.clone(); async move { - use db::schema::crucible_dataset::dsl as dataset_dsl; - use db::schema::region::dsl as region_dsl; + use db::schema::region::dsl as dsl; - // Remove the regions, collecting datasets they're from. - let datasets = diesel::delete(region_dsl::region) - .filter(region_dsl::id.eq_any(region_ids)) - .returning(region_dsl::dataset_id) - .get_results_async::(&conn).await?; + // Remove the regions + diesel::delete(dsl::region) + .filter(dsl::id.eq_any(region_ids)) + .execute_async(&conn) + .await?; // Update datasets to which the regions belonged. 
- for dataset in datasets { - let dataset_total_occupied_size: Option< - diesel::pg::data_types::PgNumeric, - > = region_dsl::region - .filter(region_dsl::dataset_id.eq(dataset)) - .select(diesel::dsl::sum( - region_dsl::block_size - * region_dsl::blocks_per_extent - * region_dsl::extent_count, - )) - .nullable() - .get_result_async(&conn).await?; - - let dataset_total_occupied_size: i64 = if let Some( - dataset_total_occupied_size, - ) = - dataset_total_occupied_size - { - let dataset_total_occupied_size: db::model::ByteCount = - dataset_total_occupied_size.try_into().map_err( - |e: anyhow::Error| { - err.bail(RegionDeleteError::NumericError( - e.to_string(), - )) - }, - )?; - - dataset_total_occupied_size.into() - } else { - 0 - }; - - diesel::update(dataset_dsl::crucible_dataset) - .filter(dataset_dsl::id.eq(dataset)) - .set( - dataset_dsl::size_used - .eq(dataset_total_occupied_size), - ) - .execute_async(&conn).await?; - } + // XXX put this file somewhere else + sql_query(include_str!( + "../../../../../schema/crdb/crucible-agent-reservation-overhead/up03.sql" + )) + .execute_async(&conn) + .await?; // Whenever a region is hard-deleted, validate invariants // for all volumes @@ -373,52 +331,25 @@ impl DataStore { } }) .await - .map_err(|e| { - if let Some(err) = err.take() { - match err { - RegionDeleteError::NumericError(err) => { - return Error::internal_error( - &format!("Transaction error: {}", err) - ); - } - } - } - public_error_from_diesel(e, ErrorHandler::Server) - }) + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } - /// Return the total occupied size for a dataset - pub async fn regions_total_occupied_size( + /// Return the total reserved size for all the regions allocated to a + /// dataset + pub async fn regions_total_reserved_size( &self, dataset_id: DatasetUuid, ) -> Result { - use db::schema::region::dsl as region_dsl; - - let total_occupied_size: Option = - region_dsl::region - .filter(region_dsl::dataset_id.eq(to_db_typed_uuid(dataset_id))) - .select(diesel::dsl::sum( - region_dsl::block_size - * region_dsl::blocks_per_extent - * region_dsl::extent_count, - )) - .nullable() - .get_result_async(&*self.pool_connection_unauthorized().await?) - .await - .map_err(|e| { - public_error_from_diesel(e, ErrorHandler::Server) - })?; + use db::schema::region::dsl; - if let Some(total_occupied_size) = total_occupied_size { - let total_occupied_size: db::model::ByteCount = - total_occupied_size.try_into().map_err( - |e: anyhow::Error| Error::internal_error(&e.to_string()), - )?; + let dataset_regions: Vec = dsl::region + .filter(dsl::dataset_id.eq(to_db_typed_uuid(dataset_id))) + .select(Region::as_select()) + .load_async(&*self.pool_connection_unauthorized().await?) 
+ .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; - Ok(total_occupied_size.to_bytes()) - } else { - Ok(0) - } + Ok(dataset_regions.iter().map(|r| r.reserved_size()).sum()) } /// Find read/write regions on expunged disks diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index 012b70559c5..d68264c0e23 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -4715,6 +4715,7 @@ mod tests { region.extent_count(), 111, false, // read-write + 1.0, ); use nexus_db_model::schema::region::dsl; @@ -4952,6 +4953,7 @@ mod tests { region.extent_count(), 111, true, // read-only + 1.0, ); use nexus_db_model::schema::region::dsl; diff --git a/nexus/db-queries/src/db/queries/region_allocation.rs b/nexus/db-queries/src/db/queries/region_allocation.rs index 2a385ade957..d538d3a114c 100644 --- a/nexus/db-queries/src/db/queries/region_allocation.rs +++ b/nexus/db-queries/src/db/queries/region_allocation.rs @@ -117,8 +117,14 @@ pub fn allocation_query( let seed = seed.to_le_bytes().to_vec(); - let size_delta = + // The Crucible Agent's current reservation factor + const RESERVATION_FACTOR: f64 = 1.25; + + let requested_size: u64 = params.block_size * params.blocks_per_extent * params.extent_count; + let size_delta: u64 = + (requested_size as f64 * RESERVATION_FACTOR).round() as u64; + let redundancy: i64 = i64::try_from(redundancy).unwrap(); let mut builder = QueryBuilder::new(); @@ -272,7 +278,8 @@ pub fn allocation_query( ").param().sql(" AS extent_count, NULL AS port, ").param().sql(" AS read_only, - FALSE as deleting + FALSE as deleting, + ").param().sql(" AS reservation_factor FROM shuffled_candidate_datasets") // Only select the *additional* number of candidate regions for the required // redundancy level @@ -286,6 +293,7 @@ pub fn allocation_query( .bind::(params.blocks_per_extent as i64) .bind::(params.extent_count as i64) .bind::(params.read_only) + .bind::(RESERVATION_FACTOR) .bind::(redundancy) // A subquery which summarizes the changes we intend to make, showing: @@ -298,7 +306,17 @@ pub fn allocation_query( SELECT candidate_regions.dataset_id AS id, crucible_dataset.pool_id AS pool_id, - ((candidate_regions.block_size * candidate_regions.blocks_per_extent) * candidate_regions.extent_count) AS size_used_delta + CAST( + CAST( + candidate_regions.block_size * + candidate_regions.blocks_per_extent * + candidate_regions.extent_count + AS FLOAT + ) + * candidate_regions.reservation_factor + AS INT + ) + AS size_used_delta FROM (candidate_regions INNER JOIN crucible_dataset ON (crucible_dataset.id = candidate_regions.dataset_id)) ),") @@ -385,7 +403,7 @@ pub fn allocation_query( .sql(" inserted_regions AS ( INSERT INTO region - (id, time_created, time_modified, dataset_id, volume_id, block_size, blocks_per_extent, extent_count, port, read_only, deleting) + (id, time_created, time_modified, dataset_id, volume_id, block_size, blocks_per_extent, extent_count, port, read_only, deleting, reservation_factor) SELECT ").sql(AllColumnsOfRegion::with_prefix("candidate_regions")).sql(" FROM candidate_regions WHERE diff --git a/nexus/db-queries/tests/output/region_allocate_distinct_sleds.sql b/nexus/db-queries/tests/output/region_allocate_distinct_sleds.sql index 81741bf8322..faa29b5bb1b 100644 --- a/nexus/db-queries/tests/output/region_allocate_distinct_sleds.sql +++ b/nexus/db-queries/tests/output/region_allocate_distinct_sleds.sql @@ -12,7 +12,8 @@ WITH region.extent_count, 
region.port, region.read_only, - region.deleting + region.deleting, + region.reservation_factor FROM region WHERE @@ -113,20 +114,28 @@ WITH $10 AS extent_count, NULL AS port, $11 AS read_only, - false AS deleting + false AS deleting, + $12 AS reservation_factor FROM shuffled_candidate_datasets LIMIT - $12 - (SELECT count(*) FROM old_regions) + $13 - (SELECT count(*) FROM old_regions) ), proposed_dataset_changes AS ( SELECT candidate_regions.dataset_id AS id, crucible_dataset.pool_id AS pool_id, - candidate_regions.block_size - * candidate_regions.blocks_per_extent - * candidate_regions.extent_count + CAST( + CAST( + candidate_regions.block_size + * candidate_regions.blocks_per_extent + * candidate_regions.extent_count + AS FLOAT8 + ) + * candidate_regions.reservation_factor + AS INT8 + ) AS size_used_delta FROM candidate_regions @@ -137,7 +146,7 @@ WITH SELECT ( ( - (SELECT count(*) FROM old_regions LIMIT 1) < $13 + (SELECT count(*) FROM old_regions LIMIT 1) < $14 AND CAST( IF( ( @@ -147,7 +156,7 @@ WITH + (SELECT count(*) FROM existing_zpools LIMIT 1) ) ) - >= $14 + >= $15 ), 'TRUE', 'Not enough space' @@ -164,7 +173,7 @@ WITH + (SELECT count(*) FROM old_regions LIMIT 1) ) ) - >= $15 + >= $16 ), 'TRUE', 'Not enough datasets' @@ -203,7 +212,7 @@ WITH 1 ) ) - >= $16 + >= $17 ), 'TRUE', 'Not enough unique zpools selected' @@ -228,7 +237,8 @@ WITH extent_count, port, read_only, - deleting + deleting, + reservation_factor ) SELECT candidate_regions.id, @@ -241,7 +251,8 @@ WITH candidate_regions.extent_count, candidate_regions.port, candidate_regions.read_only, - candidate_regions.deleting + candidate_regions.deleting, + candidate_regions.reservation_factor FROM candidate_regions WHERE @@ -257,7 +268,8 @@ WITH region.extent_count, region.port, region.read_only, - region.deleting + region.deleting, + region.reservation_factor ), updated_datasets AS ( @@ -311,7 +323,8 @@ WITH old_regions.extent_count, old_regions.port, old_regions.read_only, - old_regions.deleting + old_regions.deleting, + old_regions.reservation_factor FROM old_regions INNER JOIN crucible_dataset ON old_regions.dataset_id = crucible_dataset.id ) @@ -337,7 +350,8 @@ UNION inserted_regions.extent_count, inserted_regions.port, inserted_regions.read_only, - inserted_regions.deleting + inserted_regions.deleting, + inserted_regions.reservation_factor FROM inserted_regions INNER JOIN updated_datasets ON inserted_regions.dataset_id = updated_datasets.id diff --git a/nexus/db-queries/tests/output/region_allocate_random_sleds.sql b/nexus/db-queries/tests/output/region_allocate_random_sleds.sql index dcfa9736a9b..ac7b83610a9 100644 --- a/nexus/db-queries/tests/output/region_allocate_random_sleds.sql +++ b/nexus/db-queries/tests/output/region_allocate_random_sleds.sql @@ -12,7 +12,8 @@ WITH region.extent_count, region.port, region.read_only, - region.deleting + region.deleting, + region.reservation_factor FROM region WHERE @@ -101,20 +102,28 @@ WITH $9 AS extent_count, NULL AS port, $10 AS read_only, - false AS deleting + false AS deleting, + $11 AS reservation_factor FROM shuffled_candidate_datasets LIMIT - $11 - (SELECT count(*) FROM old_regions) + $12 - (SELECT count(*) FROM old_regions) ), proposed_dataset_changes AS ( SELECT candidate_regions.dataset_id AS id, crucible_dataset.pool_id AS pool_id, - candidate_regions.block_size - * candidate_regions.blocks_per_extent - * candidate_regions.extent_count + CAST( + CAST( + candidate_regions.block_size + * candidate_regions.blocks_per_extent + * candidate_regions.extent_count + AS FLOAT8 + 
) + * candidate_regions.reservation_factor + AS INT8 + ) AS size_used_delta FROM candidate_regions @@ -125,7 +134,7 @@ WITH SELECT ( ( - (SELECT count(*) FROM old_regions LIMIT 1) < $12 + (SELECT count(*) FROM old_regions LIMIT 1) < $13 AND CAST( IF( ( @@ -135,7 +144,7 @@ WITH + (SELECT count(*) FROM existing_zpools LIMIT 1) ) ) - >= $13 + >= $14 ), 'TRUE', 'Not enough space' @@ -152,7 +161,7 @@ WITH + (SELECT count(*) FROM old_regions LIMIT 1) ) ) - >= $14 + >= $15 ), 'TRUE', 'Not enough datasets' @@ -191,7 +200,7 @@ WITH 1 ) ) - >= $15 + >= $16 ), 'TRUE', 'Not enough unique zpools selected' @@ -216,7 +225,8 @@ WITH extent_count, port, read_only, - deleting + deleting, + reservation_factor ) SELECT candidate_regions.id, @@ -229,7 +239,8 @@ WITH candidate_regions.extent_count, candidate_regions.port, candidate_regions.read_only, - candidate_regions.deleting + candidate_regions.deleting, + candidate_regions.reservation_factor FROM candidate_regions WHERE @@ -245,7 +256,8 @@ WITH region.extent_count, region.port, region.read_only, - region.deleting + region.deleting, + region.reservation_factor ), updated_datasets AS ( @@ -299,7 +311,8 @@ WITH old_regions.extent_count, old_regions.port, old_regions.read_only, - old_regions.deleting + old_regions.deleting, + old_regions.reservation_factor FROM old_regions INNER JOIN crucible_dataset ON old_regions.dataset_id = crucible_dataset.id ) @@ -325,7 +338,8 @@ UNION inserted_regions.extent_count, inserted_regions.port, inserted_regions.read_only, - inserted_regions.deleting + inserted_regions.deleting, + inserted_regions.reservation_factor FROM inserted_regions INNER JOIN updated_datasets ON inserted_regions.dataset_id = updated_datasets.id diff --git a/nexus/db-queries/tests/output/region_allocate_with_snapshot_distinct_sleds.sql b/nexus/db-queries/tests/output/region_allocate_with_snapshot_distinct_sleds.sql index 856e1ec3f41..1364d8fca6a 100644 --- a/nexus/db-queries/tests/output/region_allocate_with_snapshot_distinct_sleds.sql +++ b/nexus/db-queries/tests/output/region_allocate_with_snapshot_distinct_sleds.sql @@ -12,7 +12,8 @@ WITH region.extent_count, region.port, region.read_only, - region.deleting + region.deleting, + region.reservation_factor FROM region WHERE @@ -125,20 +126,28 @@ WITH $11 AS extent_count, NULL AS port, $12 AS read_only, - false AS deleting + false AS deleting, + $13 AS reservation_factor FROM shuffled_candidate_datasets LIMIT - $13 - (SELECT count(*) FROM old_regions) + $14 - (SELECT count(*) FROM old_regions) ), proposed_dataset_changes AS ( SELECT candidate_regions.dataset_id AS id, crucible_dataset.pool_id AS pool_id, - candidate_regions.block_size - * candidate_regions.blocks_per_extent - * candidate_regions.extent_count + CAST( + CAST( + candidate_regions.block_size + * candidate_regions.blocks_per_extent + * candidate_regions.extent_count + AS FLOAT8 + ) + * candidate_regions.reservation_factor + AS INT8 + ) AS size_used_delta FROM candidate_regions @@ -149,7 +158,7 @@ WITH SELECT ( ( - (SELECT count(*) FROM old_regions LIMIT 1) < $14 + (SELECT count(*) FROM old_regions LIMIT 1) < $15 AND CAST( IF( ( @@ -159,7 +168,7 @@ WITH + (SELECT count(*) FROM existing_zpools LIMIT 1) ) ) - >= $15 + >= $16 ), 'TRUE', 'Not enough space' @@ -176,7 +185,7 @@ WITH + (SELECT count(*) FROM old_regions LIMIT 1) ) ) - >= $16 + >= $17 ), 'TRUE', 'Not enough datasets' @@ -215,7 +224,7 @@ WITH 1 ) ) - >= $17 + >= $18 ), 'TRUE', 'Not enough unique zpools selected' @@ -240,7 +249,8 @@ WITH extent_count, port, read_only, - deleting + deleting, 
+ reservation_factor ) SELECT candidate_regions.id, @@ -253,7 +263,8 @@ WITH candidate_regions.extent_count, candidate_regions.port, candidate_regions.read_only, - candidate_regions.deleting + candidate_regions.deleting, + candidate_regions.reservation_factor FROM candidate_regions WHERE @@ -269,7 +280,8 @@ WITH region.extent_count, region.port, region.read_only, - region.deleting + region.deleting, + region.reservation_factor ), updated_datasets AS ( @@ -323,7 +335,8 @@ WITH old_regions.extent_count, old_regions.port, old_regions.read_only, - old_regions.deleting + old_regions.deleting, + old_regions.reservation_factor FROM old_regions INNER JOIN crucible_dataset ON old_regions.dataset_id = crucible_dataset.id ) @@ -349,7 +362,8 @@ UNION inserted_regions.extent_count, inserted_regions.port, inserted_regions.read_only, - inserted_regions.deleting + inserted_regions.deleting, + inserted_regions.reservation_factor FROM inserted_regions INNER JOIN updated_datasets ON inserted_regions.dataset_id = updated_datasets.id diff --git a/nexus/db-queries/tests/output/region_allocate_with_snapshot_random_sleds.sql b/nexus/db-queries/tests/output/region_allocate_with_snapshot_random_sleds.sql index 60637b53778..9d4e390809c 100644 --- a/nexus/db-queries/tests/output/region_allocate_with_snapshot_random_sleds.sql +++ b/nexus/db-queries/tests/output/region_allocate_with_snapshot_random_sleds.sql @@ -12,7 +12,8 @@ WITH region.extent_count, region.port, region.read_only, - region.deleting + region.deleting, + region.reservation_factor FROM region WHERE @@ -113,20 +114,28 @@ WITH $10 AS extent_count, NULL AS port, $11 AS read_only, - false AS deleting + false AS deleting, + $12 AS reservation_factor FROM shuffled_candidate_datasets LIMIT - $12 - (SELECT count(*) FROM old_regions) + $13 - (SELECT count(*) FROM old_regions) ), proposed_dataset_changes AS ( SELECT candidate_regions.dataset_id AS id, crucible_dataset.pool_id AS pool_id, - candidate_regions.block_size - * candidate_regions.blocks_per_extent - * candidate_regions.extent_count + CAST( + CAST( + candidate_regions.block_size + * candidate_regions.blocks_per_extent + * candidate_regions.extent_count + AS FLOAT8 + ) + * candidate_regions.reservation_factor + AS INT8 + ) AS size_used_delta FROM candidate_regions @@ -137,7 +146,7 @@ WITH SELECT ( ( - (SELECT count(*) FROM old_regions LIMIT 1) < $13 + (SELECT count(*) FROM old_regions LIMIT 1) < $14 AND CAST( IF( ( @@ -147,7 +156,7 @@ WITH + (SELECT count(*) FROM existing_zpools LIMIT 1) ) ) - >= $14 + >= $15 ), 'TRUE', 'Not enough space' @@ -164,7 +173,7 @@ WITH + (SELECT count(*) FROM old_regions LIMIT 1) ) ) - >= $15 + >= $16 ), 'TRUE', 'Not enough datasets' @@ -203,7 +212,7 @@ WITH 1 ) ) - >= $16 + >= $17 ), 'TRUE', 'Not enough unique zpools selected' @@ -228,7 +237,8 @@ WITH extent_count, port, read_only, - deleting + deleting, + reservation_factor ) SELECT candidate_regions.id, @@ -241,7 +251,8 @@ WITH candidate_regions.extent_count, candidate_regions.port, candidate_regions.read_only, - candidate_regions.deleting + candidate_regions.deleting, + candidate_regions.reservation_factor FROM candidate_regions WHERE @@ -257,7 +268,8 @@ WITH region.extent_count, region.port, region.read_only, - region.deleting + region.deleting, + region.reservation_factor ), updated_datasets AS ( @@ -311,7 +323,8 @@ WITH old_regions.extent_count, old_regions.port, old_regions.read_only, - old_regions.deleting + old_regions.deleting, + old_regions.reservation_factor FROM old_regions INNER JOIN crucible_dataset ON 
old_regions.dataset_id = crucible_dataset.id ) @@ -337,7 +350,8 @@ UNION inserted_regions.extent_count, inserted_regions.port, inserted_regions.read_only, - inserted_regions.deleting + inserted_regions.deleting, + inserted_regions.reservation_factor FROM inserted_regions INNER JOIN updated_datasets ON inserted_regions.dataset_id = updated_datasets.id diff --git a/nexus/reconfigurator/execution/src/omicron_physical_disks.rs b/nexus/reconfigurator/execution/src/omicron_physical_disks.rs index 941ad553392..cc97f1b7cf6 100644 --- a/nexus/reconfigurator/execution/src/omicron_physical_disks.rs +++ b/nexus/reconfigurator/execution/src/omicron_physical_disks.rs @@ -164,6 +164,7 @@ mod test { 10, 1, false, + 1.0, ) }; let conn = datastore.pool_connection_for_tests().await.unwrap(); diff --git a/nexus/src/app/background/tasks/decommissioned_disk_cleaner.rs b/nexus/src/app/background/tasks/decommissioned_disk_cleaner.rs index b362f467b11..20ff9326dc9 100644 --- a/nexus/src/app/background/tasks/decommissioned_disk_cleaner.rs +++ b/nexus/src/app/background/tasks/decommissioned_disk_cleaner.rs @@ -256,6 +256,7 @@ mod tests { 10, 1, false, + 1.0, ) }; let region_id = region.id(); diff --git a/nexus/src/app/background/tasks/region_replacement_driver.rs b/nexus/src/app/background/tasks/region_replacement_driver.rs index 6586d214b18..3ff7c1c9868 100644 --- a/nexus/src/app/background/tasks/region_replacement_driver.rs +++ b/nexus/src/app/background/tasks/region_replacement_driver.rs @@ -362,6 +362,7 @@ mod test { 10, 27015, false, + 1.0, ) }; @@ -376,6 +377,7 @@ mod test { 10, 27016, false, + 1.0, ) }; @@ -474,6 +476,7 @@ mod test { 10, 27015, false, + 1.0, ) }; @@ -488,6 +491,7 @@ mod test { 10, 27016, false, + 1.0, ) }; @@ -630,6 +634,7 @@ mod test { 10, 27015, false, + 1.0, ) }; @@ -644,6 +649,7 @@ mod test { 10, 27016, false, + 1.0, ) }; diff --git a/nexus/src/app/sagas/disk_create.rs b/nexus/src/app/sagas/disk_create.rs index fb9aa623dd3..b3a08ff722a 100644 --- a/nexus/src/app/sagas/disk_create.rs +++ b/nexus/src/app/sagas/disk_create.rs @@ -1014,7 +1014,7 @@ pub(crate) mod test { for zpool in test.zpools() { for dataset in &zpool.datasets { if datastore - .regions_total_occupied_size(dataset.id) + .regions_total_reserved_size(dataset.id) .await .unwrap() != 0 diff --git a/nexus/src/app/sagas/region_replacement_finish.rs b/nexus/src/app/sagas/region_replacement_finish.rs index 47d88159428..65bfe3114fe 100644 --- a/nexus/src/app/sagas/region_replacement_finish.rs +++ b/nexus/src/app/sagas/region_replacement_finish.rs @@ -253,6 +253,7 @@ pub(crate) mod test { 10, 12345, false, + 1.0, ) }; diff --git a/nexus/src/app/sagas/region_replacement_start.rs b/nexus/src/app/sagas/region_replacement_start.rs index f38f2ad5f33..7b17c8cd85d 100644 --- a/nexus/src/app/sagas/region_replacement_start.rs +++ b/nexus/src/app/sagas/region_replacement_start.rs @@ -926,6 +926,7 @@ pub(crate) mod test { 10, 1001, false, + 1.0, ), Region::new( datasets[1].id(), @@ -935,6 +936,7 @@ pub(crate) mod test { 10, 1002, false, + 1.0, ), Region::new( datasets[2].id(), @@ -944,6 +946,7 @@ pub(crate) mod test { 10, 1003, false, + 1.0, ), Region::new( datasets[3].id(), @@ -953,6 +956,7 @@ pub(crate) mod test { 10, 1004, false, + 1.0, ), ]; @@ -1027,7 +1031,7 @@ pub(crate) mod test { for zpool in test.zpools() { for dataset in &zpool.datasets { if datastore - .regions_total_occupied_size(dataset.id) + .regions_total_reserved_size(dataset.id) .await .unwrap() != 0 diff --git a/nexus/test-utils/src/resource_helpers.rs 
b/nexus/test-utils/src/resource_helpers.rs index bc98b93b8f9..80258e5b507 100644 --- a/nexus/test-utils/src/resource_helpers.rs +++ b/nexus/test-utils/src/resource_helpers.rs @@ -1184,7 +1184,7 @@ pub struct DiskTest<'a, N: NexusServer> { } impl<'a, N: NexusServer> DiskTest<'a, N> { - pub const DEFAULT_ZPOOL_SIZE_GIB: u32 = 10; + pub const DEFAULT_ZPOOL_SIZE_GIB: u32 = 16; pub const DEFAULT_ZPOOL_COUNT: u32 = 3; /// Creates a new [DiskTest] with default configuration. diff --git a/nexus/tests/integration_tests/disks.rs b/nexus/tests/integration_tests/disks.rs index 1951a17bf91..29e3d7cd865 100644 --- a/nexus/tests/integration_tests/disks.rs +++ b/nexus/tests/integration_tests/disks.rs @@ -998,13 +998,13 @@ async fn test_disk_backed_by_multiple_region_sets( ) { let client = &cptestctx.external_client; - // Create three zpools, all 10 gibibytes, each with one dataset + // Create three zpools, each with one dataset let mut test = DiskTest::new(&cptestctx).await; - // Assert default is still 10 GiB - assert_eq!(10, DiskTest::DEFAULT_ZPOOL_SIZE_GIB); + // Assert default is still 16 GiB + assert_eq!(16, DiskTest::DEFAULT_ZPOOL_SIZE_GIB); - // Create another three zpools, all 10 gibibytes, each with one dataset + // Create another three zpools, each with one dataset test.add_zpool_with_dataset(cptestctx.first_sled_id()).await; test.add_zpool_with_dataset(cptestctx.first_sled_id()).await; test.add_zpool_with_dataset(cptestctx.first_sled_id()).await; @@ -1044,10 +1044,10 @@ async fn test_disk_too_big(cptestctx: &ControlPlaneTestContext) { DiskTest::new(&cptestctx).await; create_project_and_pool(client).await; - // Assert default is still 10 GiB - assert_eq!(10, DiskTest::DEFAULT_ZPOOL_SIZE_GIB); + // Assert default is still 16 GiB + assert_eq!(16, DiskTest::DEFAULT_ZPOOL_SIZE_GIB); - // Ask for a 300 gibibyte disk (but only 10 is available) + // Ask for a 300 gibibyte disk (but only 16 is available) let disk_size = ByteCount::from_gibibytes_u32(300); let disks_url = get_disks_url(); let new_disk = params::DiskCreate { @@ -1546,11 +1546,11 @@ async fn test_disk_size_accounting(cptestctx: &ControlPlaneTestContext) { let nexus = &cptestctx.server.server_context().nexus; let datastore = nexus.datastore(); - // Create three 10 GiB zpools, each with one dataset. + // Create three zpools, each with one dataset. 
let test = DiskTest::new(&cptestctx).await; - // Assert default is still 10 GiB - assert_eq!(10, DiskTest::DEFAULT_ZPOOL_SIZE_GIB); + // Assert default is still 16 GiB + assert_eq!(16, DiskTest::DEFAULT_ZPOOL_SIZE_GIB); create_project_and_pool(client).await; @@ -1559,11 +1559,20 @@ async fn test_disk_size_accounting(cptestctx: &ControlPlaneTestContext) { for dataset in &zpool.datasets { assert_eq!( datastore - .regions_total_occupied_size(dataset.id) + .regions_total_reserved_size(dataset.id) .await .unwrap(), 0 ); + + assert_eq!( + datastore + .crucible_dataset_get(dataset.id) + .await + .unwrap() + .size_used, + 0, + ); } } @@ -1594,21 +1603,22 @@ async fn test_disk_size_accounting(cptestctx: &ControlPlaneTestContext) { // Total occupied size is 7 GiB * 3 (each Crucible disk requires three // regions to make a region set for an Upstairs, one region per dataset) + // plus reservation overhead for zpool in test.zpools() { for dataset in &zpool.datasets { assert_eq!( datastore - .regions_total_occupied_size(dataset.id) + .regions_total_reserved_size(dataset.id) .await .unwrap(), - ByteCount::from_gibibytes_u32(7).to_bytes(), + ByteCount::from_mebibytes_u32(8960).to_bytes(), ); } } - // Ask for a 4 gibibyte disk, this should fail because there isn't space + // Ask for a 6 gibibyte disk, this should fail because there isn't space // available. - let disk_size = ByteCount::from_gibibytes_u32(4); + let disk_size = ByteCount::from_gibibytes_u32(6); let disk_two = params::DiskCreate { identity: IdentityMetadataCreateParams { name: "disk-two".parse().unwrap(), @@ -1628,17 +1638,17 @@ async fn test_disk_size_accounting(cptestctx: &ControlPlaneTestContext) { .authn_as(AuthnMode::PrivilegedUser) .execute() .await - .expect("unexpected success creating 4 GiB disk"); + .expect("unexpected success creating 6 GiB disk"); - // Total occupied size is still 7 GiB * 3 + // Total occupied size is still 7 GiB * 3 (plus overhead) for zpool in test.zpools() { for dataset in &zpool.datasets { assert_eq!( datastore - .regions_total_occupied_size(dataset.id) + .regions_total_reserved_size(dataset.id) .await .unwrap(), - ByteCount::from_gibibytes_u32(7).to_bytes(), + ByteCount::from_mebibytes_u32(8960).to_bytes(), ); } } @@ -1659,7 +1669,7 @@ async fn test_disk_size_accounting(cptestctx: &ControlPlaneTestContext) { for dataset in &zpool.datasets { assert_eq!( datastore - .regions_total_occupied_size(dataset.id) + .regions_total_reserved_size(dataset.id) .await .unwrap(), 0, @@ -1690,15 +1700,15 @@ async fn test_disk_size_accounting(cptestctx: &ControlPlaneTestContext) { .await .expect("unexpected failure creating 10 GiB disk"); - // Total occupied size should be 10 GiB * 3 + // Total occupied size should be 10 GiB * 3 plus overhead for zpool in test.zpools() { for dataset in &zpool.datasets { assert_eq!( datastore - .regions_total_occupied_size(dataset.id) + .regions_total_reserved_size(dataset.id) .await .unwrap(), - ByteCount::from_gibibytes_u32(10).to_bytes(), + ByteCount::from_mebibytes_u32(12800).to_bytes(), ); } } @@ -1711,16 +1721,13 @@ async fn test_multiple_disks_multiple_zpools( ) { let client = &cptestctx.external_client; - // Create six 10 GB zpools, each with one dataset + // Create six zpools, each with one dataset let _test = DiskTestBuilder::new(&cptestctx) .on_specific_sled(cptestctx.first_sled_id()) .with_zpool_count(6) .build() .await; - // Assert default is still 10 GiB - assert_eq!(10, DiskTest::DEFAULT_ZPOOL_SIZE_GIB); - create_project_and_pool(client).await; // Ask for a 10 gibibyte disk, this 
should succeed @@ -2086,12 +2093,9 @@ async fn test_single_region_allocate(cptestctx: &ControlPlaneTestContext) { let opctx = OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); - // Create three 10 GiB zpools, each with one dataset. + // Create three zpools, each with one dataset. let disk_test = DiskTest::new(&cptestctx).await; - // Assert default is still 10 GiB - assert_eq!(10, DiskTest::DEFAULT_ZPOOL_SIZE_GIB); - // Allocate a single 1 GB region let volume_id = VolumeUuid::new_v4(); @@ -2133,11 +2137,11 @@ async fn test_single_region_allocate(cptestctx: &ControlPlaneTestContext) { for zpool in disk_test.zpools() { for dataset in &zpool.datasets { let total_size = datastore - .regions_total_occupied_size(dataset.id) + .regions_total_reserved_size(dataset.id) .await .unwrap(); - if total_size == 1073741824 { + if total_size == allocated_region.reserved_size() { number_of_matching_regions += 1; } else if total_size == 0 { // ok, unallocated @@ -2160,16 +2164,13 @@ async fn test_region_allocation_strategy_random_is_idempotent( let opctx = OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); - // Create four 10 GiB zpools, each with one dataset. + // Create four zpools, each with one dataset. let _test = DiskTestBuilder::new(&cptestctx) .on_specific_sled(cptestctx.first_sled_id()) .with_zpool_count(4) .build() .await; - // Assert default is still 10 GiB - assert_eq!(10, DiskTest::DEFAULT_ZPOOL_SIZE_GIB); - // Create a disk let client = &cptestctx.external_client; let _project_id = create_project_and_pool(client).await; @@ -2230,16 +2231,13 @@ async fn test_region_allocation_strategy_random_is_idempotent_arbitrary( let opctx = OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); - // Create four 10 GiB zpools, each with one dataset. + // Create four zpools, each with one dataset. let _test = DiskTestBuilder::new(&cptestctx) .on_specific_sled(cptestctx.first_sled_id()) .with_zpool_count(4) .build() .await; - // Assert default is still 10 GiB - assert_eq!(10, DiskTest::DEFAULT_ZPOOL_SIZE_GIB); - // Call region allocation in isolation let volume_id = VolumeUuid::new_v4(); @@ -2293,7 +2291,7 @@ async fn test_single_region_allocate_for_replace( let opctx = OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); - // Create four 10 GiB zpools, each with one dataset. + // Create four zpools, each with one dataset. // // We add one more then the "three" default to meet `region_allocate`'s // redundancy requirements. @@ -2303,9 +2301,6 @@ async fn test_single_region_allocate_for_replace( .build() .await; - // Assert default is still 10 GiB - assert_eq!(10, DiskTest::DEFAULT_ZPOOL_SIZE_GIB); - // Create a disk let client = &cptestctx.external_client; let _project_id = create_project_and_pool(client).await; @@ -2387,12 +2382,9 @@ async fn test_single_region_allocate_for_replace_not_enough_zpools( let opctx = OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); - // Create three 10 GiB zpools, each with one dataset. + // Create three zpools, each with one dataset. 
let _disk_test = DiskTest::new(&cptestctx).await; - // Assert default is still 10 GiB - assert_eq!(10, DiskTest::DEFAULT_ZPOOL_SIZE_GIB); - // Create a disk let client = &cptestctx.external_client; let _project_id = create_project_and_pool(client).await; @@ -2558,12 +2550,9 @@ async fn test_disk_expunge(cptestctx: &ControlPlaneTestContext) { let opctx = OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); - // Create three 10 GiB zpools, each with one dataset. + // Create three zpools, each with one dataset. let _disk_test = DiskTest::new(&cptestctx).await; - // Assert default is still 10 GiB - assert_eq!(10, DiskTest::DEFAULT_ZPOOL_SIZE_GIB); - // Create a disk let client = &cptestctx.external_client; let _project_id = create_project_and_pool(client).await; diff --git a/nexus/tests/integration_tests/schema.rs b/nexus/tests/integration_tests/schema.rs index 47c99b0a232..a1848676e60 100644 --- a/nexus/tests/integration_tests/schema.rs +++ b/nexus/tests/integration_tests/schema.rs @@ -1753,6 +1753,199 @@ fn after_125_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> { }) } +fn before_133_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> { + Box::pin(async { + // To test the size_used upgrader, create a few crucible datasets and + // regions + + // First, a crucible dataset with no regions + let dataset_id: Uuid = + "00000001-0000-0000-0000-000000000000".parse().unwrap(); + let pool_id = Uuid::new_v4(); + let size_used = 0; + + ctx.client + .batch_execute(&format!( + "INSERT INTO crucible_dataset VALUES ( + '{dataset_id}', + now(), + now(), + null, + 1, + '{pool_id}', + '::1', + 10000, + {size_used} + )" + )) + .await + .expect("inserted"); + + // Then, a crucible dataset with 1 region + let dataset_id: Uuid = + "00000002-0000-0000-0000-000000000000".parse().unwrap(); + + let region_id = Uuid::new_v4(); + let pool_id = Uuid::new_v4(); + let volume_id = Uuid::new_v4(); + let block_size = 512; + let blocks_per_extent = 64; + let extent_count = 1000; + + ctx.client + .batch_execute(&format!( + "INSERT INTO region VALUES ( + '{region_id}', + now(), + now(), + '{dataset_id}', + '{volume_id}', + {block_size}, + {blocks_per_extent}, + {extent_count}, + 5000, + false, + false + )" + )) + .await + .expect("inserted"); + + let size_used = block_size * blocks_per_extent * extent_count; + + ctx.client + .batch_execute(&format!( + "INSERT INTO crucible_dataset VALUES ( + '{dataset_id}', + now(), + now(), + null, + 1, + '{pool_id}', + '::1', + 10000, + {size_used} + )" + )) + .await + .expect("inserted"); + + // Finally, a crucible dataset with 3 regions + let dataset_id: Uuid = + "00000003-0000-0000-0000-000000000000".parse().unwrap(); + let pool_id = Uuid::new_v4(); + + let block_size = 512; + let blocks_per_extent = 64; + let extent_count = 7000; + + for _ in 0..3 { + let region_id = Uuid::new_v4(); + let volume_id = Uuid::new_v4(); + + ctx.client + .batch_execute(&format!( + "INSERT INTO region VALUES ( + '{region_id}', + now(), + now(), + '{dataset_id}', + '{volume_id}', + {block_size}, + {blocks_per_extent}, + {extent_count}, + 5000, + false, + false + )" + )) + .await + .expect("inserted"); + } + + let size_used = 3 * block_size * blocks_per_extent * extent_count; + + ctx.client + .batch_execute(&format!( + "INSERT INTO crucible_dataset VALUES ( + '{dataset_id}', + now(), + now(), + null, + 1, + '{pool_id}', + '::1', + 10000, + {size_used} + )" + )) + .await + .expect("inserted"); + }) +} + +fn after_133_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, 
()> { + Box::pin(async { + // The first crucible dataset still has size_used = 0 + let rows = ctx + .client + .query( + "SELECT size_used FROM crucible_dataset WHERE + id = '00000001-0000-0000-0000-000000000000'", + &[], + ) + .await + .expect("select"); + + let records = process_rows(&rows); + assert_eq!(records.len(), 1); + assert_eq!( + records[0].values, + vec![ColumnValue::new("size_used", 0i64)], + ); + + // Note: the default crucible reservation factor is 1.25 + + // The second crucible dataset has + // size_used = 1.25 * (512 * 64 * 1000) + let rows = ctx + .client + .query( + "SELECT size_used FROM crucible_dataset WHERE + id = '00000002-0000-0000-0000-000000000000'", + &[], + ) + .await + .expect("select"); + + let records = process_rows(&rows); + assert_eq!(records.len(), 1); + assert_eq!( + records[0].values, + vec![ColumnValue::new("size_used", 40960000i64)], + ); + + // The third crucible dataset has + // size_used = 1.25 * (3 * 512 * 64 * 7000) + let rows = ctx + .client + .query( + "SELECT size_used FROM crucible_dataset WHERE + id = '00000003-0000-0000-0000-000000000000'", + &[], + ) + .await + .expect("select"); + + let records = process_rows(&rows); + assert_eq!(records.len(), 1); + assert_eq!( + records[0].values, + vec![ColumnValue::new("size_used", 860160000i64)], + ); + }) +} + // Lazily initializes all migration checks. The combination of Rust function // pointers and async makes defining a static table fairly painful, so we're // using lazy initialization instead. @@ -1801,6 +1994,10 @@ fn get_migration_checks() -> BTreeMap { Version::new(125, 0, 0), DataMigrationFns::new().before(before_125_0_0).after(after_125_0_0), ); + map.insert( + Version::new(133, 0, 0), + DataMigrationFns::new().before(before_133_0_0).after(after_133_0_0), + ); map } diff --git a/nexus/tests/integration_tests/snapshots.rs b/nexus/tests/integration_tests/snapshots.rs index ccad463f611..a4ac382b53a 100644 --- a/nexus/tests/integration_tests/snapshots.rs +++ b/nexus/tests/integration_tests/snapshots.rs @@ -1459,7 +1459,7 @@ async fn test_region_allocation_for_snapshot( let opctx = OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); - // Create four 10 GiB zpools, each with one dataset. + // Create four zpools, each with one dataset. // // We add one more than the "three" default to avoid failing // with "not enough storage". @@ -1470,9 +1470,6 @@ async fn test_region_allocation_for_snapshot( .build() .await; - // Assert default is still 10 GiB - assert_eq!(10, DiskTest::DEFAULT_ZPOOL_SIZE_GIB); - // Create a disk let client = &cptestctx.external_client; let _project_id = create_project_and_pool(client).await; @@ -1654,12 +1651,9 @@ async fn test_snapshot_expunge(cptestctx: &ControlPlaneTestContext) { let opctx = OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); - // Create three 10 GiB zpools, each with one dataset. + // Create three zpools, each with one dataset. 
let _disk_test = DiskTest::new(&cptestctx).await; - // Assert default is still 10 GiB - assert_eq!(10, DiskTest::DEFAULT_ZPOOL_SIZE_GIB); - // Create a disk, then a snapshot of that disk let client = &cptestctx.external_client; let _project_id = create_project_and_pool(client).await; diff --git a/schema/crdb/crucible-agent-reservation-overhead/up01.sql b/schema/crdb/crucible-agent-reservation-overhead/up01.sql new file mode 100644 index 00000000000..5fbc44f7a26 --- /dev/null +++ b/schema/crdb/crucible-agent-reservation-overhead/up01.sql @@ -0,0 +1,2 @@ +ALTER TABLE omicron.public.region + ADD COLUMN IF NOT EXISTS reservation_factor FLOAT NOT NULL DEFAULT 1.25; diff --git a/schema/crdb/crucible-agent-reservation-overhead/up02.sql b/schema/crdb/crucible-agent-reservation-overhead/up02.sql new file mode 100644 index 00000000000..a86a7cd4e00 --- /dev/null +++ b/schema/crdb/crucible-agent-reservation-overhead/up02.sql @@ -0,0 +1,2 @@ +ALTER TABLE omicron.public.region + ALTER COLUMN reservation_factor DROP DEFAULT; diff --git a/schema/crdb/crucible-agent-reservation-overhead/up03.sql b/schema/crdb/crucible-agent-reservation-overhead/up03.sql new file mode 100644 index 00000000000..fb52776a5fe --- /dev/null +++ b/schema/crdb/crucible-agent-reservation-overhead/up03.sql @@ -0,0 +1,23 @@ +WITH size_used_with_reservation AS ( + SELECT + crucible_dataset.id AS crucible_dataset_id, + SUM( + CASE + WHEN block_size IS NULL THEN 0 + ELSE + CAST( + CAST(block_size * blocks_per_extent * extent_count AS FLOAT) + * reservation_factor + AS INT + ) + END + ) AS reserved_size + FROM crucible_dataset + LEFT JOIN region ON crucible_dataset.id = region.dataset_id + WHERE crucible_dataset.time_deleted IS NULL + GROUP BY crucible_dataset.id +) +UPDATE crucible_dataset +SET size_used = size_used_with_reservation.reserved_size +FROM size_used_with_reservation +WHERE crucible_dataset.id = size_used_with_reservation.crucible_dataset_id; diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index fcfbf461d0a..9eaeaf872e2 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -639,7 +639,9 @@ CREATE TABLE IF NOT EXISTS omicron.public.region ( read_only BOOL NOT NULL, - deleting BOOL NOT NULL + deleting BOOL NOT NULL, + + reservation_factor FLOAT NOT NULL ); /* @@ -5001,7 +5003,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '132.0.0', NULL) + (TRUE, NOW(), NOW(), '133.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; From 95696ef00636f16345ed388457407ffbc76e81bf Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Fri, 28 Mar 2025 20:34:26 +0000 Subject: [PATCH 02/12] avoid floating point math reservation right now is 25%, which means that the requested size of a region can be divided by 4. avoid floating point math where possible. change the reservation percentage stored with the region to an enum, where the only value is 25%. this limits what can be done with manual database edits, and restricts what the Region::reserved_size function has to guard against. it'd be nice if Region::new was a test-only function but the crate doesn't have the same idea of a integration test feature. 
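
as a rough illustration of the integer-only math (example numbers, not
tied to any one test): for a 7 GiB region,

    requested = 7 GiB                = 7516192768 bytes
    overhead  = requested / 4        = 1879048192 bytes
    reserved  = requested + overhead = 9395240960 bytes (8.75 GiB, or 8960 MiB)

which lines up with the 8960 MiB per-dataset size_used that the disk
accounting tests now expect for a 7 GiB disk.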
--- nexus/db-model/src/region.rs | 38 ++- nexus/db-model/src/schema.rs | 2 +- nexus/db-queries/src/db/datastore/region.rs | 4 +- nexus/db-queries/src/db/datastore/volume.rs | 2 - .../src/db/queries/region_allocation.rs | 227 +++++++++++++++--- .../output/region_allocate_distinct_sleds.sql | 34 +-- .../output/region_allocate_random_sleds.sql | 34 +-- ..._allocate_with_snapshot_distinct_sleds.sql | 34 +-- ...on_allocate_with_snapshot_random_sleds.sql | 34 +-- .../execution/src/omicron_physical_disks.rs | 1 - .../tasks/decommissioned_disk_cleaner.rs | 1 - .../tasks/region_replacement_driver.rs | 6 - .../app/sagas/region_replacement_finish.rs | 1 - .../src/app/sagas/region_replacement_start.rs | 4 - .../up01.sql | 5 +- .../up02.sql | 2 +- .../up03.sql | 25 +- .../up04.sql | 23 ++ schema/crdb/dbinit.sql | 6 +- 19 files changed, 313 insertions(+), 170 deletions(-) create mode 100644 schema/crdb/crucible-agent-reservation-overhead/up04.sql diff --git a/nexus/db-model/src/region.rs b/nexus/db-model/src/region.rs index deb05f1e742..d00b0d3e174 100644 --- a/nexus/db-model/src/region.rs +++ b/nexus/db-model/src/region.rs @@ -4,6 +4,7 @@ use super::ByteCount; use crate::SqlU16; +use crate::impl_enum_type; use crate::schema::region; use crate::typed_uuid::DbTypedUuid; use db_macros::Asset; @@ -15,6 +16,19 @@ use omicron_uuid_kinds::VolumeUuid; use serde::{Deserialize, Serialize}; use uuid::Uuid; +impl_enum_type!( + #[derive(SqlType, Debug, QueryId)] + #[diesel(postgres_type(name = "region_reservation_percent", schema = "public"))] + pub struct RegionReservationPercentEnum; + + #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq)] + #[diesel(sql_type = RegionReservationPercentEnum)] + pub enum RegionReservationPercent; + + // Enum values + TwentyFive => b"25" +); + /// Database representation of a Region. /// /// A region represents a portion of a Crucible Downstairs dataset @@ -58,13 +72,13 @@ pub struct Region { // The Agent will reserve space for Downstairs overhead when creating the // corresponding ZFS dataset. Nexus has to account for that: store that - // reservation factor here as it may change in the future, and it can be - // used during Crucible related accounting. - reservation_factor: f64, + // reservation percent here as it may change in the future, and it can be + // used during Crucible related accounting. This is stored as an enum to + // restrict the values to what the Crucible Agent uses. + reservation_percent: RegionReservationPercent, } impl Region { - #[allow(clippy::too_many_arguments)] pub fn new( dataset_id: DatasetUuid, volume_id: VolumeUuid, @@ -73,7 +87,6 @@ impl Region { extent_count: u64, port: u16, read_only: bool, - reservation_factor: f64, ) -> Self { Self { identity: RegionIdentity::new(Uuid::new_v4()), @@ -85,7 +98,10 @@ impl Region { port: Some(port.into()), read_only, deleting: false, - reservation_factor, + // When the Crucible agent's reservation percentage changes, this + // function should accept that as argument. Until then, it can only + // ever be 25%. + reservation_percent: RegionReservationPercent::TwentyFive, } } @@ -122,7 +138,9 @@ impl Region { self.deleting } - /// The size of the Region without accounting for any overhead + /// The size of the Region without accounting for any overhead. The + /// `allocation_query` function should have validated that this won't + /// overflow. 
pub fn requested_size(&self) -> u64 { self.block_size().to_bytes() * self.blocks_per_extent() @@ -133,6 +151,10 @@ impl Region { /// which is some factor higher than the requested region size to account /// for on-disk overhead. pub fn reserved_size(&self) -> u64 { - (self.requested_size() as f64 * self.reservation_factor).round() as u64 + let overhead = match &self.reservation_percent { + RegionReservationPercent::TwentyFive => self.requested_size() / 4, + }; + + self.requested_size() + overhead } } diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index 6e6645ccd58..ed05dfa6361 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -1126,7 +1126,7 @@ table! { deleting -> Bool, - reservation_factor -> Float8, + reservation_percent -> crate::RegionReservationPercentEnum, } } diff --git a/nexus/db-queries/src/db/datastore/region.rs b/nexus/db-queries/src/db/datastore/region.rs index 73049082184..d037c1cbecb 100644 --- a/nexus/db-queries/src/db/datastore/region.rs +++ b/nexus/db-queries/src/db/datastore/region.rs @@ -269,7 +269,7 @@ impl DataStore { }, allocation_strategy, num_regions_required, - ); + )?; let conn = self.pool_connection_authorized(&opctx).await?; @@ -317,7 +317,7 @@ impl DataStore { // Update datasets to which the regions belonged. // XXX put this file somewhere else sql_query(include_str!( - "../../../../../schema/crdb/crucible-agent-reservation-overhead/up03.sql" + "../../../../../schema/crdb/crucible-agent-reservation-overhead/up04.sql" )) .execute_async(&conn) .await?; diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index d68264c0e23..012b70559c5 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -4715,7 +4715,6 @@ mod tests { region.extent_count(), 111, false, // read-write - 1.0, ); use nexus_db_model::schema::region::dsl; @@ -4953,7 +4952,6 @@ mod tests { region.extent_count(), 111, true, // read-only - 1.0, ); use nexus_db_model::schema::region::dsl; diff --git a/nexus/db-queries/src/db/queries/region_allocation.rs b/nexus/db-queries/src/db/queries/region_allocation.rs index d538d3a114c..4240990f92e 100644 --- a/nexus/db-queries/src/db/queries/region_allocation.rs +++ b/nexus/db-queries/src/db/queries/region_allocation.rs @@ -82,6 +82,78 @@ pub struct RegionParameters { pub read_only: bool, } +type AllocationQuery = + TypedSqlQuery<(SelectableSql, SelectableSql)>; + +impl std::fmt::Debug for AllocationQuery { + fn fmt( + &self, + f: &mut std::fmt::Formatter<'_>, + ) -> Result<(), std::fmt::Error> { + f.write_str("AllocationQuery") + } +} + +/// Currently the largest region that can be allocated matches the largest disk +/// that can be requested, but separate this constant so that when +/// MAX_DISK_SIZE_BYTES is increased the region allocation query will still use +/// this as a maximum size. 
+pub const MAX_REGION_SIZE_BYTES: u64 = 1098437885952; // 1023 * (1 << 30); + +#[derive(Debug)] +pub enum AllocationQueryError { + /// Region size multiplication overflowed u64 + RegionSizeOverflow, + + /// Requested region size larger than maximum + RequestedRegionOverMaxSize { request: u64, maximum: u64 }, + + /// Requested size not divisible by reservation factor + RequestedRegionNotDivisibleByFactor { request: i64, factor: i64 }, + + /// Adding the overhead to the requested size overflowed + RequestedRegionOverheadOverflow { request: i64, overhead: i64 }, +} + +impl From for external::Error { + fn from(e: AllocationQueryError) -> external::Error { + match e { + AllocationQueryError::RegionSizeOverflow => { + external::Error::invalid_value( + "region allocation", + "region size overflowed u64", + ) + } + + AllocationQueryError::RequestedRegionOverMaxSize { + request, + maximum, + } => external::Error::invalid_value( + "region allocation", + format!("region size {request} over maximum {maximum}"), + ), + + AllocationQueryError::RequestedRegionNotDivisibleByFactor { + request, + factor, + } => external::Error::invalid_value( + "region allocation", + format!("region size {request} not divisible by {factor}"), + ), + + AllocationQueryError::RequestedRegionOverheadOverflow { + request, + overhead, + } => external::Error::invalid_value( + "region allocation", + format!( + "adding {overhead} to region size {request} overflowed" + ), + ), + } + } +} + /// For a given volume, idempotently allocate enough regions (according to some /// allocation strategy) to meet some redundancy level. This should only be used /// for the region set that is in the top level of the Volume (not the deeper @@ -93,7 +165,7 @@ pub fn allocation_query( params: RegionParameters, allocation_strategy: &RegionAllocationStrategy, redundancy: usize, -) -> TypedSqlQuery<(SelectableSql, SelectableSql)> { +) -> Result { let (seed, distinct_sleds) = { let (input_seed, distinct_sleds) = match allocation_strategy { RegionAllocationStrategy::Random { seed } => (seed, false), @@ -117,16 +189,56 @@ pub fn allocation_query( let seed = seed.to_le_bytes().to_vec(); - // The Crucible Agent's current reservation factor - const RESERVATION_FACTOR: f64 = 1.25; + // Ensure that the multiplication doesn't overflow. + let requested_size: u64 = params + .block_size + .checked_mul(params.blocks_per_extent) + .ok_or(AllocationQueryError::RegionSizeOverflow)? + .checked_mul(params.extent_count) + .ok_or(AllocationQueryError::RegionSizeOverflow)?; + + if requested_size > MAX_REGION_SIZE_BYTES { + return Err(AllocationQueryError::RequestedRegionOverMaxSize { + request: requested_size, + maximum: MAX_REGION_SIZE_BYTES, + }); + } + + // After the above check, unconditionally cast from u64 to i64. The value is + // low enough that this shouldn't truncate. + let requested_size: i64 = requested_size.try_into().unwrap(); + + // The Crucible Agent's current reservation factor is 25%, so add that here. + // Check first that the requested region size is divisible by this. This + // should basically never fail because all block sizes are divisible by 4. 
+ if requested_size % 4 != 0 { + return Err( + AllocationQueryError::RequestedRegionNotDivisibleByFactor { + request: requested_size, + factor: 4, + }, + ); + } - let requested_size: u64 = - params.block_size * params.blocks_per_extent * params.extent_count; - let size_delta: u64 = - (requested_size as f64 * RESERVATION_FACTOR).round() as u64; + let overhead: i64 = requested_size.checked_div(4).ok_or( + AllocationQueryError::RequestedRegionNotDivisibleByFactor { + request: requested_size, + factor: 4, + }, + )?; + + let size_delta: i64 = requested_size.checked_add(overhead).ok_or( + AllocationQueryError::RequestedRegionOverheadOverflow { + request: requested_size, + overhead, + }, + )?; let redundancy: i64 = i64::try_from(redundancy).unwrap(); + let reservation_percent = + crate::db::model::RegionReservationPercent::TwentyFive; + let mut builder = QueryBuilder::new(); builder.sql( @@ -223,7 +335,7 @@ pub fn allocation_query( AND physical_disk.disk_state = 'active' AND NOT(zpool.id = ANY(SELECT existing_zpools.pool_id FROM existing_zpools)) " - ).bind::(size_delta as i64); + ).bind::(size_delta); if distinct_sleds { builder @@ -279,7 +391,7 @@ pub fn allocation_query( NULL AS port, ").param().sql(" AS read_only, FALSE as deleting, - ").param().sql(" AS reservation_factor + ").param().sql(" AS reservation_percent FROM shuffled_candidate_datasets") // Only select the *additional* number of candidate regions for the required // redundancy level @@ -293,7 +405,7 @@ pub fn allocation_query( .bind::(params.blocks_per_extent as i64) .bind::(params.extent_count as i64) .bind::(params.read_only) - .bind::(RESERVATION_FACTOR) + .bind::(reservation_percent) .bind::(redundancy) // A subquery which summarizes the changes we intend to make, showing: @@ -306,19 +418,10 @@ pub fn allocation_query( SELECT candidate_regions.dataset_id AS id, crucible_dataset.pool_id AS pool_id, - CAST( - CAST( - candidate_regions.block_size * - candidate_regions.blocks_per_extent * - candidate_regions.extent_count - AS FLOAT - ) - * candidate_regions.reservation_factor - AS INT - ) - AS size_used_delta + ").param().sql(" AS size_used_delta FROM (candidate_regions INNER JOIN crucible_dataset ON (crucible_dataset.id = candidate_regions.dataset_id)) ),") + .bind::(size_delta) // Confirms whether or not the insertion and updates should // occur. 
@@ -403,7 +506,7 @@ pub fn allocation_query( .sql(" inserted_regions AS ( INSERT INTO region - (id, time_created, time_modified, dataset_id, volume_id, block_size, blocks_per_extent, extent_count, port, read_only, deleting, reservation_factor) + (id, time_created, time_modified, dataset_id, volume_id, block_size, blocks_per_extent, extent_count, port, read_only, deleting, reservation_percent) SELECT ").sql(AllColumnsOfRegion::with_prefix("candidate_regions")).sql(" FROM candidate_regions WHERE @@ -436,7 +539,7 @@ UNION )" ); - builder.query() + Ok(builder.query()) } #[cfg(test)] @@ -476,7 +579,8 @@ mod test { seed: Some(1), }, REGION_REDUNDANCY_THRESHOLD, - ); + ) + .unwrap(); expectorate_query_contents( ®ion_allocate, @@ -492,7 +596,8 @@ mod test { params, &RegionAllocationStrategy::Random { seed: Some(1) }, REGION_REDUNDANCY_THRESHOLD, - ); + ) + .unwrap(); expectorate_query_contents( ®ion_allocate, "tests/output/region_allocate_random_sleds.sql", @@ -513,7 +618,8 @@ mod test { seed: Some(1), }, REGION_REDUNDANCY_THRESHOLD, - ); + ) + .unwrap(); expectorate_query_contents( ®ion_allocate, "tests/output/region_allocate_with_snapshot_distinct_sleds.sql", @@ -528,7 +634,8 @@ mod test { params, &RegionAllocationStrategy::Random { seed: Some(1) }, REGION_REDUNDANCY_THRESHOLD, - ); + ) + .unwrap(); expectorate_query_contents( ®ion_allocate, "tests/output/region_allocate_with_snapshot_random_sleds.sql", @@ -561,7 +668,8 @@ mod test { params, &RegionAllocationStrategy::RandomWithDistinctSleds { seed: None }, REGION_REDUNDANCY_THRESHOLD, - ); + ) + .unwrap(); let _ = region_allocate .explain_async(&conn) .await @@ -575,7 +683,8 @@ mod test { params, &RegionAllocationStrategy::Random { seed: None }, REGION_REDUNDANCY_THRESHOLD, - ); + ) + .unwrap(); let _ = region_allocate .explain_async(&conn) .await @@ -584,4 +693,64 @@ mod test { db.terminate().await; logctx.cleanup_successful(); } + + #[tokio::test] + async fn allocation_query_region_size_overflow() { + let volume_id = VolumeUuid::nil(); + let snapshot_id = None; + + let params = RegionParameters { + block_size: 512, + blocks_per_extent: 4294967296, + extent_count: 8388609, // should cause an overflow! 
+ read_only: false, + }; + + let Err(e) = allocation_query( + volume_id, + snapshot_id, + params, + &RegionAllocationStrategy::RandomWithDistinctSleds { + seed: Some(1), + }, + REGION_REDUNDANCY_THRESHOLD, + ) else { + panic!("expected error"); + }; + + assert!(matches!(e, AllocationQueryError::RegionSizeOverflow)); + } + + #[tokio::test] + async fn allocation_query_region_size_too_large() { + let volume_id = VolumeUuid::nil(); + let snapshot_id = None; + + let params = RegionParameters { + block_size: 512, + blocks_per_extent: 8388608, // 2^32 / 512 + extent_count: 256, // 255 would be ok, 256 is too large + read_only: false, + }; + + let Err(e) = allocation_query( + volume_id, + snapshot_id, + params, + &RegionAllocationStrategy::RandomWithDistinctSleds { + seed: Some(1), + }, + REGION_REDUNDANCY_THRESHOLD, + ) else { + panic!("expected error!"); + }; + + assert!(matches!( + e, + AllocationQueryError::RequestedRegionOverMaxSize { + request: 1099511627776u64, + maximum: MAX_REGION_SIZE_BYTES, + } + )); + } } diff --git a/nexus/db-queries/tests/output/region_allocate_distinct_sleds.sql b/nexus/db-queries/tests/output/region_allocate_distinct_sleds.sql index faa29b5bb1b..51f56f89ac6 100644 --- a/nexus/db-queries/tests/output/region_allocate_distinct_sleds.sql +++ b/nexus/db-queries/tests/output/region_allocate_distinct_sleds.sql @@ -13,7 +13,7 @@ WITH region.port, region.read_only, region.deleting, - region.reservation_factor + region.reservation_percent FROM region WHERE @@ -115,7 +115,7 @@ WITH NULL AS port, $11 AS read_only, false AS deleting, - $12 AS reservation_factor + $12 AS reservation_percent FROM shuffled_candidate_datasets LIMIT @@ -126,17 +126,7 @@ WITH SELECT candidate_regions.dataset_id AS id, crucible_dataset.pool_id AS pool_id, - CAST( - CAST( - candidate_regions.block_size - * candidate_regions.blocks_per_extent - * candidate_regions.extent_count - AS FLOAT8 - ) - * candidate_regions.reservation_factor - AS INT8 - ) - AS size_used_delta + $14 AS size_used_delta FROM candidate_regions INNER JOIN crucible_dataset ON crucible_dataset.id = candidate_regions.dataset_id @@ -146,7 +136,7 @@ WITH SELECT ( ( - (SELECT count(*) FROM old_regions LIMIT 1) < $14 + (SELECT count(*) FROM old_regions LIMIT 1) < $15 AND CAST( IF( ( @@ -156,7 +146,7 @@ WITH + (SELECT count(*) FROM existing_zpools LIMIT 1) ) ) - >= $15 + >= $16 ), 'TRUE', 'Not enough space' @@ -173,7 +163,7 @@ WITH + (SELECT count(*) FROM old_regions LIMIT 1) ) ) - >= $16 + >= $17 ), 'TRUE', 'Not enough datasets' @@ -212,7 +202,7 @@ WITH 1 ) ) - >= $17 + >= $18 ), 'TRUE', 'Not enough unique zpools selected' @@ -238,7 +228,7 @@ WITH port, read_only, deleting, - reservation_factor + reservation_percent ) SELECT candidate_regions.id, @@ -252,7 +242,7 @@ WITH candidate_regions.port, candidate_regions.read_only, candidate_regions.deleting, - candidate_regions.reservation_factor + candidate_regions.reservation_percent FROM candidate_regions WHERE @@ -269,7 +259,7 @@ WITH region.port, region.read_only, region.deleting, - region.reservation_factor + region.reservation_percent ), updated_datasets AS ( @@ -324,7 +314,7 @@ WITH old_regions.port, old_regions.read_only, old_regions.deleting, - old_regions.reservation_factor + old_regions.reservation_percent FROM old_regions INNER JOIN crucible_dataset ON old_regions.dataset_id = crucible_dataset.id ) @@ -351,7 +341,7 @@ UNION inserted_regions.port, inserted_regions.read_only, inserted_regions.deleting, - inserted_regions.reservation_factor + inserted_regions.reservation_percent FROM 
inserted_regions INNER JOIN updated_datasets ON inserted_regions.dataset_id = updated_datasets.id diff --git a/nexus/db-queries/tests/output/region_allocate_random_sleds.sql b/nexus/db-queries/tests/output/region_allocate_random_sleds.sql index ac7b83610a9..7e1ac153a3f 100644 --- a/nexus/db-queries/tests/output/region_allocate_random_sleds.sql +++ b/nexus/db-queries/tests/output/region_allocate_random_sleds.sql @@ -13,7 +13,7 @@ WITH region.port, region.read_only, region.deleting, - region.reservation_factor + region.reservation_percent FROM region WHERE @@ -103,7 +103,7 @@ WITH NULL AS port, $10 AS read_only, false AS deleting, - $11 AS reservation_factor + $11 AS reservation_percent FROM shuffled_candidate_datasets LIMIT @@ -114,17 +114,7 @@ WITH SELECT candidate_regions.dataset_id AS id, crucible_dataset.pool_id AS pool_id, - CAST( - CAST( - candidate_regions.block_size - * candidate_regions.blocks_per_extent - * candidate_regions.extent_count - AS FLOAT8 - ) - * candidate_regions.reservation_factor - AS INT8 - ) - AS size_used_delta + $13 AS size_used_delta FROM candidate_regions INNER JOIN crucible_dataset ON crucible_dataset.id = candidate_regions.dataset_id @@ -134,7 +124,7 @@ WITH SELECT ( ( - (SELECT count(*) FROM old_regions LIMIT 1) < $13 + (SELECT count(*) FROM old_regions LIMIT 1) < $14 AND CAST( IF( ( @@ -144,7 +134,7 @@ WITH + (SELECT count(*) FROM existing_zpools LIMIT 1) ) ) - >= $14 + >= $15 ), 'TRUE', 'Not enough space' @@ -161,7 +151,7 @@ WITH + (SELECT count(*) FROM old_regions LIMIT 1) ) ) - >= $15 + >= $16 ), 'TRUE', 'Not enough datasets' @@ -200,7 +190,7 @@ WITH 1 ) ) - >= $16 + >= $17 ), 'TRUE', 'Not enough unique zpools selected' @@ -226,7 +216,7 @@ WITH port, read_only, deleting, - reservation_factor + reservation_percent ) SELECT candidate_regions.id, @@ -240,7 +230,7 @@ WITH candidate_regions.port, candidate_regions.read_only, candidate_regions.deleting, - candidate_regions.reservation_factor + candidate_regions.reservation_percent FROM candidate_regions WHERE @@ -257,7 +247,7 @@ WITH region.port, region.read_only, region.deleting, - region.reservation_factor + region.reservation_percent ), updated_datasets AS ( @@ -312,7 +302,7 @@ WITH old_regions.port, old_regions.read_only, old_regions.deleting, - old_regions.reservation_factor + old_regions.reservation_percent FROM old_regions INNER JOIN crucible_dataset ON old_regions.dataset_id = crucible_dataset.id ) @@ -339,7 +329,7 @@ UNION inserted_regions.port, inserted_regions.read_only, inserted_regions.deleting, - inserted_regions.reservation_factor + inserted_regions.reservation_percent FROM inserted_regions INNER JOIN updated_datasets ON inserted_regions.dataset_id = updated_datasets.id diff --git a/nexus/db-queries/tests/output/region_allocate_with_snapshot_distinct_sleds.sql b/nexus/db-queries/tests/output/region_allocate_with_snapshot_distinct_sleds.sql index 1364d8fca6a..03b2b6cb708 100644 --- a/nexus/db-queries/tests/output/region_allocate_with_snapshot_distinct_sleds.sql +++ b/nexus/db-queries/tests/output/region_allocate_with_snapshot_distinct_sleds.sql @@ -13,7 +13,7 @@ WITH region.port, region.read_only, region.deleting, - region.reservation_factor + region.reservation_percent FROM region WHERE @@ -127,7 +127,7 @@ WITH NULL AS port, $12 AS read_only, false AS deleting, - $13 AS reservation_factor + $13 AS reservation_percent FROM shuffled_candidate_datasets LIMIT @@ -138,17 +138,7 @@ WITH SELECT candidate_regions.dataset_id AS id, crucible_dataset.pool_id AS pool_id, - CAST( - CAST( - 
candidate_regions.block_size - * candidate_regions.blocks_per_extent - * candidate_regions.extent_count - AS FLOAT8 - ) - * candidate_regions.reservation_factor - AS INT8 - ) - AS size_used_delta + $15 AS size_used_delta FROM candidate_regions INNER JOIN crucible_dataset ON crucible_dataset.id = candidate_regions.dataset_id @@ -158,7 +148,7 @@ WITH SELECT ( ( - (SELECT count(*) FROM old_regions LIMIT 1) < $15 + (SELECT count(*) FROM old_regions LIMIT 1) < $16 AND CAST( IF( ( @@ -168,7 +158,7 @@ WITH + (SELECT count(*) FROM existing_zpools LIMIT 1) ) ) - >= $16 + >= $17 ), 'TRUE', 'Not enough space' @@ -185,7 +175,7 @@ WITH + (SELECT count(*) FROM old_regions LIMIT 1) ) ) - >= $17 + >= $18 ), 'TRUE', 'Not enough datasets' @@ -224,7 +214,7 @@ WITH 1 ) ) - >= $18 + >= $19 ), 'TRUE', 'Not enough unique zpools selected' @@ -250,7 +240,7 @@ WITH port, read_only, deleting, - reservation_factor + reservation_percent ) SELECT candidate_regions.id, @@ -264,7 +254,7 @@ WITH candidate_regions.port, candidate_regions.read_only, candidate_regions.deleting, - candidate_regions.reservation_factor + candidate_regions.reservation_percent FROM candidate_regions WHERE @@ -281,7 +271,7 @@ WITH region.port, region.read_only, region.deleting, - region.reservation_factor + region.reservation_percent ), updated_datasets AS ( @@ -336,7 +326,7 @@ WITH old_regions.port, old_regions.read_only, old_regions.deleting, - old_regions.reservation_factor + old_regions.reservation_percent FROM old_regions INNER JOIN crucible_dataset ON old_regions.dataset_id = crucible_dataset.id ) @@ -363,7 +353,7 @@ UNION inserted_regions.port, inserted_regions.read_only, inserted_regions.deleting, - inserted_regions.reservation_factor + inserted_regions.reservation_percent FROM inserted_regions INNER JOIN updated_datasets ON inserted_regions.dataset_id = updated_datasets.id diff --git a/nexus/db-queries/tests/output/region_allocate_with_snapshot_random_sleds.sql b/nexus/db-queries/tests/output/region_allocate_with_snapshot_random_sleds.sql index 9d4e390809c..b71578e0509 100644 --- a/nexus/db-queries/tests/output/region_allocate_with_snapshot_random_sleds.sql +++ b/nexus/db-queries/tests/output/region_allocate_with_snapshot_random_sleds.sql @@ -13,7 +13,7 @@ WITH region.port, region.read_only, region.deleting, - region.reservation_factor + region.reservation_percent FROM region WHERE @@ -115,7 +115,7 @@ WITH NULL AS port, $11 AS read_only, false AS deleting, - $12 AS reservation_factor + $12 AS reservation_percent FROM shuffled_candidate_datasets LIMIT @@ -126,17 +126,7 @@ WITH SELECT candidate_regions.dataset_id AS id, crucible_dataset.pool_id AS pool_id, - CAST( - CAST( - candidate_regions.block_size - * candidate_regions.blocks_per_extent - * candidate_regions.extent_count - AS FLOAT8 - ) - * candidate_regions.reservation_factor - AS INT8 - ) - AS size_used_delta + $14 AS size_used_delta FROM candidate_regions INNER JOIN crucible_dataset ON crucible_dataset.id = candidate_regions.dataset_id @@ -146,7 +136,7 @@ WITH SELECT ( ( - (SELECT count(*) FROM old_regions LIMIT 1) < $14 + (SELECT count(*) FROM old_regions LIMIT 1) < $15 AND CAST( IF( ( @@ -156,7 +146,7 @@ WITH + (SELECT count(*) FROM existing_zpools LIMIT 1) ) ) - >= $15 + >= $16 ), 'TRUE', 'Not enough space' @@ -173,7 +163,7 @@ WITH + (SELECT count(*) FROM old_regions LIMIT 1) ) ) - >= $16 + >= $17 ), 'TRUE', 'Not enough datasets' @@ -212,7 +202,7 @@ WITH 1 ) ) - >= $17 + >= $18 ), 'TRUE', 'Not enough unique zpools selected' @@ -238,7 +228,7 @@ WITH port, read_only, deleting, - 
reservation_factor + reservation_percent ) SELECT candidate_regions.id, @@ -252,7 +242,7 @@ WITH candidate_regions.port, candidate_regions.read_only, candidate_regions.deleting, - candidate_regions.reservation_factor + candidate_regions.reservation_percent FROM candidate_regions WHERE @@ -269,7 +259,7 @@ WITH region.port, region.read_only, region.deleting, - region.reservation_factor + region.reservation_percent ), updated_datasets AS ( @@ -324,7 +314,7 @@ WITH old_regions.port, old_regions.read_only, old_regions.deleting, - old_regions.reservation_factor + old_regions.reservation_percent FROM old_regions INNER JOIN crucible_dataset ON old_regions.dataset_id = crucible_dataset.id ) @@ -351,7 +341,7 @@ UNION inserted_regions.port, inserted_regions.read_only, inserted_regions.deleting, - inserted_regions.reservation_factor + inserted_regions.reservation_percent FROM inserted_regions INNER JOIN updated_datasets ON inserted_regions.dataset_id = updated_datasets.id diff --git a/nexus/reconfigurator/execution/src/omicron_physical_disks.rs b/nexus/reconfigurator/execution/src/omicron_physical_disks.rs index cc97f1b7cf6..941ad553392 100644 --- a/nexus/reconfigurator/execution/src/omicron_physical_disks.rs +++ b/nexus/reconfigurator/execution/src/omicron_physical_disks.rs @@ -164,7 +164,6 @@ mod test { 10, 1, false, - 1.0, ) }; let conn = datastore.pool_connection_for_tests().await.unwrap(); diff --git a/nexus/src/app/background/tasks/decommissioned_disk_cleaner.rs b/nexus/src/app/background/tasks/decommissioned_disk_cleaner.rs index 20ff9326dc9..b362f467b11 100644 --- a/nexus/src/app/background/tasks/decommissioned_disk_cleaner.rs +++ b/nexus/src/app/background/tasks/decommissioned_disk_cleaner.rs @@ -256,7 +256,6 @@ mod tests { 10, 1, false, - 1.0, ) }; let region_id = region.id(); diff --git a/nexus/src/app/background/tasks/region_replacement_driver.rs b/nexus/src/app/background/tasks/region_replacement_driver.rs index 3ff7c1c9868..6586d214b18 100644 --- a/nexus/src/app/background/tasks/region_replacement_driver.rs +++ b/nexus/src/app/background/tasks/region_replacement_driver.rs @@ -362,7 +362,6 @@ mod test { 10, 27015, false, - 1.0, ) }; @@ -377,7 +376,6 @@ mod test { 10, 27016, false, - 1.0, ) }; @@ -476,7 +474,6 @@ mod test { 10, 27015, false, - 1.0, ) }; @@ -491,7 +488,6 @@ mod test { 10, 27016, false, - 1.0, ) }; @@ -634,7 +630,6 @@ mod test { 10, 27015, false, - 1.0, ) }; @@ -649,7 +644,6 @@ mod test { 10, 27016, false, - 1.0, ) }; diff --git a/nexus/src/app/sagas/region_replacement_finish.rs b/nexus/src/app/sagas/region_replacement_finish.rs index 65bfe3114fe..47d88159428 100644 --- a/nexus/src/app/sagas/region_replacement_finish.rs +++ b/nexus/src/app/sagas/region_replacement_finish.rs @@ -253,7 +253,6 @@ pub(crate) mod test { 10, 12345, false, - 1.0, ) }; diff --git a/nexus/src/app/sagas/region_replacement_start.rs b/nexus/src/app/sagas/region_replacement_start.rs index 7b17c8cd85d..b1fe3c5a27b 100644 --- a/nexus/src/app/sagas/region_replacement_start.rs +++ b/nexus/src/app/sagas/region_replacement_start.rs @@ -926,7 +926,6 @@ pub(crate) mod test { 10, 1001, false, - 1.0, ), Region::new( datasets[1].id(), @@ -936,7 +935,6 @@ pub(crate) mod test { 10, 1002, false, - 1.0, ), Region::new( datasets[2].id(), @@ -946,7 +944,6 @@ pub(crate) mod test { 10, 1003, false, - 1.0, ), Region::new( datasets[3].id(), @@ -956,7 +953,6 @@ pub(crate) mod test { 10, 1004, false, - 1.0, ), ]; diff --git a/schema/crdb/crucible-agent-reservation-overhead/up01.sql 
b/schema/crdb/crucible-agent-reservation-overhead/up01.sql index 5fbc44f7a26..4bc50409f8e 100644 --- a/schema/crdb/crucible-agent-reservation-overhead/up01.sql +++ b/schema/crdb/crucible-agent-reservation-overhead/up01.sql @@ -1,2 +1,3 @@ -ALTER TABLE omicron.public.region - ADD COLUMN IF NOT EXISTS reservation_factor FLOAT NOT NULL DEFAULT 1.25; +CREATE TYPE IF NOT EXISTS omicron.public.region_reservation_percent AS ENUM ( + '25' +); diff --git a/schema/crdb/crucible-agent-reservation-overhead/up02.sql b/schema/crdb/crucible-agent-reservation-overhead/up02.sql index a86a7cd4e00..d498edf0525 100644 --- a/schema/crdb/crucible-agent-reservation-overhead/up02.sql +++ b/schema/crdb/crucible-agent-reservation-overhead/up02.sql @@ -1,2 +1,2 @@ ALTER TABLE omicron.public.region - ALTER COLUMN reservation_factor DROP DEFAULT; + ADD COLUMN IF NOT EXISTS reservation_percent omicron.public.region_reservation_percent NOT NULL DEFAULT '25'; diff --git a/schema/crdb/crucible-agent-reservation-overhead/up03.sql b/schema/crdb/crucible-agent-reservation-overhead/up03.sql index fb52776a5fe..c7c13a6dbed 100644 --- a/schema/crdb/crucible-agent-reservation-overhead/up03.sql +++ b/schema/crdb/crucible-agent-reservation-overhead/up03.sql @@ -1,23 +1,2 @@ -WITH size_used_with_reservation AS ( - SELECT - crucible_dataset.id AS crucible_dataset_id, - SUM( - CASE - WHEN block_size IS NULL THEN 0 - ELSE - CAST( - CAST(block_size * blocks_per_extent * extent_count AS FLOAT) - * reservation_factor - AS INT - ) - END - ) AS reserved_size - FROM crucible_dataset - LEFT JOIN region ON crucible_dataset.id = region.dataset_id - WHERE crucible_dataset.time_deleted IS NULL - GROUP BY crucible_dataset.id -) -UPDATE crucible_dataset -SET size_used = size_used_with_reservation.reserved_size -FROM size_used_with_reservation -WHERE crucible_dataset.id = size_used_with_reservation.crucible_dataset_id; +ALTER TABLE omicron.public.region + ALTER COLUMN reservation_percent DROP DEFAULT; diff --git a/schema/crdb/crucible-agent-reservation-overhead/up04.sql b/schema/crdb/crucible-agent-reservation-overhead/up04.sql new file mode 100644 index 00000000000..cfd0ff7d0d1 --- /dev/null +++ b/schema/crdb/crucible-agent-reservation-overhead/up04.sql @@ -0,0 +1,23 @@ +WITH size_used_with_reservation AS ( + SELECT + crucible_dataset.id AS crucible_dataset_id, + SUM( + CASE + WHEN block_size IS NULL THEN 0 + ELSE + CASE + WHEN reservation_percent = '25' THEN + (block_size * blocks_per_extent * extent_count) / 4 + + (block_size * blocks_per_extent * extent_count) + END + END + ) AS reserved_size + FROM crucible_dataset + LEFT JOIN region ON crucible_dataset.id = region.dataset_id + WHERE crucible_dataset.time_deleted IS NULL + GROUP BY crucible_dataset.id +) +UPDATE crucible_dataset +SET size_used = size_used_with_reservation.reserved_size +FROM size_used_with_reservation +WHERE crucible_dataset.id = size_used_with_reservation.crucible_dataset_id; diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 9eaeaf872e2..0d1846627a6 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -615,6 +615,10 @@ CREATE INDEX IF NOT EXISTS lookup_crucible_dataset_by_zpool ON CREATE INDEX IF NOT EXISTS lookup_crucible_dataset_by_ip ON omicron.public.crucible_dataset (ip); +CREATE TYPE IF NOT EXISTS omicron.public.region_reservation_percent AS ENUM ( + '25' +); + /* * A region of space allocated to Crucible Downstairs, within a dataset. 
*/ @@ -641,7 +645,7 @@ CREATE TABLE IF NOT EXISTS omicron.public.region ( deleting BOOL NOT NULL, - reservation_factor FLOAT NOT NULL + reservation_percent omicron.public.region_reservation_percent NOT NULL ); /* From 4834bd9dd77caa1d2ff84625a55e6c5b1ee22a90 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Fri, 28 Mar 2025 20:43:13 +0000 Subject: [PATCH 03/12] embed size_used update CTE it will have to change independent of the schema update anyway --- nexus/db-queries/src/db/datastore/region.rs | 33 +++++++++++++++++---- 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/region.rs b/nexus/db-queries/src/db/datastore/region.rs index d037c1cbecb..f05fa549841 100644 --- a/nexus/db-queries/src/db/datastore/region.rs +++ b/nexus/db-queries/src/db/datastore/region.rs @@ -315,12 +315,33 @@ impl DataStore { .await?; // Update datasets to which the regions belonged. - // XXX put this file somewhere else - sql_query(include_str!( - "../../../../../schema/crdb/crucible-agent-reservation-overhead/up04.sql" - )) - .execute_async(&conn) - .await?; + sql_query(r#" +WITH size_used_with_reservation AS ( + SELECT + crucible_dataset.id AS crucible_dataset_id, + SUM( + CASE + WHEN block_size IS NULL THEN 0 + ELSE + CASE + WHEN reservation_percent = '25' THEN + (block_size * blocks_per_extent * extent_count) / 4 + + (block_size * blocks_per_extent * extent_count) + END + END + ) AS reserved_size + FROM crucible_dataset + LEFT JOIN region ON crucible_dataset.id = region.dataset_id + WHERE crucible_dataset.time_deleted IS NULL + GROUP BY crucible_dataset.id +) +UPDATE crucible_dataset +SET size_used = size_used_with_reservation.reserved_size +FROM size_used_with_reservation +WHERE crucible_dataset.id = size_used_with_reservation.crucible_dataset_id"# + ) + .execute_async(&conn) + .await?; // Whenever a region is hard-deleted, validate invariants // for all volumes From f0313050183ffe556887af065828889ffbd893aa Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Fri, 28 Mar 2025 20:46:09 +0000 Subject: [PATCH 04/12] fmt --- nexus/db-queries/src/db/datastore/region.rs | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/region.rs b/nexus/db-queries/src/db/datastore/region.rs index f05fa549841..01ddaf35ea8 100644 --- a/nexus/db-queries/src/db/datastore/region.rs +++ b/nexus/db-queries/src/db/datastore/region.rs @@ -306,7 +306,7 @@ impl DataStore { .transaction(&conn, |conn| { let region_ids = region_ids.clone(); async move { - use db::schema::region::dsl as dsl; + use db::schema::region::dsl; // Remove the regions diesel::delete(dsl::region) @@ -315,7 +315,8 @@ impl DataStore { .await?; // Update datasets to which the regions belonged. 
- sql_query(r#" + sql_query( + r#" WITH size_used_with_reservation AS ( SELECT crucible_dataset.id AS crucible_dataset_id, @@ -338,10 +339,10 @@ WITH size_used_with_reservation AS ( UPDATE crucible_dataset SET size_used = size_used_with_reservation.reserved_size FROM size_used_with_reservation -WHERE crucible_dataset.id = size_used_with_reservation.crucible_dataset_id"# - ) - .execute_async(&conn) - .await?; +WHERE crucible_dataset.id = size_used_with_reservation.crucible_dataset_id"#, + ) + .execute_async(&conn) + .await?; // Whenever a region is hard-deleted, validate invariants // for all volumes From 13f2a0797a93d8667bb11e40ee25fba26b569f5c Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Fri, 28 Mar 2025 20:48:51 +0000 Subject: [PATCH 05/12] more docs is always better --- schema/crdb/dbinit.sql | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 0d1846627a6..03dc9363ca1 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -598,6 +598,9 @@ CREATE TABLE IF NOT EXISTS omicron.public.crucible_dataset ( * Reconfigurator rendezvous process, this field is set to 0. Reconfigurator * otherwise ignores this field. It's updated by Nexus as region allocations * and deletions are performed using this dataset. + * + * Note that this is the size *reserved* by the Crucible agent for regions, + * not the actual region size. */ size_used INT NOT NULL ); @@ -645,6 +648,12 @@ CREATE TABLE IF NOT EXISTS omicron.public.region ( deleting BOOL NOT NULL, + /* + * The Crucible Agent will reserve space for a region with overhead for + * on-disk metadata that the downstairs needs to store. Record here the + * overhead associated with a specific region as this may change or be + * configurable in the future. 
+ */ reservation_percent omicron.public.region_reservation_percent NOT NULL ); From f7dbdf8c281532e6b155319e0aa1ac0ad042d14b Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Tue, 1 Apr 2025 16:50:47 +0000 Subject: [PATCH 06/12] fix after merge --- nexus/db-model/src/region.rs | 6 +----- nexus/db-queries/src/db/queries/region_allocation.rs | 2 +- nexus/db-schema/src/enums.rs | 1 + nexus/db-schema/src/schema.rs | 2 +- 4 files changed, 4 insertions(+), 7 deletions(-) diff --git a/nexus/db-model/src/region.rs b/nexus/db-model/src/region.rs index 2b8f9f30985..821b7182c18 100644 --- a/nexus/db-model/src/region.rs +++ b/nexus/db-model/src/region.rs @@ -5,7 +5,6 @@ use super::ByteCount; use crate::SqlU16; use crate::impl_enum_type; -use crate::schema::region; use crate::typed_uuid::DbTypedUuid; use db_macros::Asset; use nexus_db_schema::schema::region; @@ -18,12 +17,9 @@ use serde::{Deserialize, Serialize}; use uuid::Uuid; impl_enum_type!( - #[derive(SqlType, Debug, QueryId)] - #[diesel(postgres_type(name = "region_reservation_percent", schema = "public"))] - pub struct RegionReservationPercentEnum; + RegionReservationPercentEnum: #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq)] - #[diesel(sql_type = RegionReservationPercentEnum)] pub enum RegionReservationPercent; // Enum values diff --git a/nexus/db-queries/src/db/queries/region_allocation.rs b/nexus/db-queries/src/db/queries/region_allocation.rs index a6abb4801e3..674188a51eb 100644 --- a/nexus/db-queries/src/db/queries/region_allocation.rs +++ b/nexus/db-queries/src/db/queries/region_allocation.rs @@ -405,7 +405,7 @@ pub fn allocation_query( .bind::(params.blocks_per_extent as i64) .bind::(params.extent_count as i64) .bind::(params.read_only) - .bind::(reservation_percent) + .bind::(reservation_percent) .bind::(redundancy) // A subquery which summarizes the changes we intend to make, showing: diff --git a/nexus/db-schema/src/enums.rs b/nexus/db-schema/src/enums.rs index 31e70dfd2e2..c529ef824fb 100644 --- a/nexus/db-schema/src/enums.rs +++ b/nexus/db-schema/src/enums.rs @@ -66,6 +66,7 @@ define_enums! { ReadOnlyTargetReplacementTypeEnum => "read_only_target_replacement_type", RegionReplacementStateEnum => "region_replacement_state", RegionReplacementStepTypeEnum => "region_replacement_step_type", + RegionReservationPercentEnum => "region_reservation_percent", RegionSnapshotReplacementStateEnum => "region_snapshot_replacement_state", RegionSnapshotReplacementStepStateEnum => "region_snapshot_replacement_step_state", RotImageErrorEnum => "rot_image_error", diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index 56e6150f077..737b7d5665b 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -1128,7 +1128,7 @@ table! { deleting -> Bool, - reservation_percent -> crate::RegionReservationPercentEnum, + reservation_percent -> crate::enums::RegionReservationPercentEnum, } } From 84c9d7e622df72cbde7b6ebf77889ed2caea0eb3 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Wed, 2 Apr 2025 15:50:11 +0000 Subject: [PATCH 07/12] better comment for size_used --- schema/crdb/dbinit.sql | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 25df09b9cf9..b71842b65ef 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -599,8 +599,10 @@ CREATE TABLE IF NOT EXISTS omicron.public.crucible_dataset ( * otherwise ignores this field. 
It's updated by Nexus as region allocations
 * and deletions are performed using this dataset.
 *
- * Note that this is the size *reserved* by the Crucible agent for regions,
- * not the actual region size.
+ * Note that the value in this column is _not_ the sum of requested region
+ * sizes, but the sum of the size *reserved* by the Crucible agent for the
+ * dataset that contains the regions (which is larger than the actual
+ * region size).
  */
 size_used INT NOT NULL
 );

From dd6ae8104a55159656204e30f579e11a781b3d9f Mon Sep 17 00:00:00 2001
From: James MacMahon
Date: Thu, 3 Apr 2025 18:31:36 +0000
Subject: [PATCH 08/12] impl not necessary

---
 nexus/db-queries/src/db/queries/region_allocation.rs | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/nexus/db-queries/src/db/queries/region_allocation.rs b/nexus/db-queries/src/db/queries/region_allocation.rs
index 674188a51eb..d111632a0c2 100644
--- a/nexus/db-queries/src/db/queries/region_allocation.rs
+++ b/nexus/db-queries/src/db/queries/region_allocation.rs
@@ -85,15 +85,6 @@ pub struct RegionParameters {
 
 type AllocationQuery = TypedSqlQuery<(SelectableSql, SelectableSql)>;
 
-impl std::fmt::Debug for AllocationQuery {
-    fn fmt(
-        &self,
-        f: &mut std::fmt::Formatter<'_>,
-    ) -> Result<(), std::fmt::Error> {
-        f.write_str("AllocationQuery")
-    }
-}
-
 /// Currently the largest region that can be allocated matches the largest disk
 /// that can be requested, but separate this constant so that when
 /// MAX_DISK_SIZE_BYTES is increased the region allocation query will still use

From 1e478881e6930a04cbcba9ce0c6bc533dae14cf2 Mon Sep 17 00:00:00 2001
From: James MacMahon
Date: Thu, 3 Apr 2025 18:32:04 +0000
Subject: [PATCH 09/12] use match to make adding variants a compile error

---
 .../src/db/queries/region_allocation.rs       | 60 ++++++++++---------
 1 file changed, 32 insertions(+), 28 deletions(-)

diff --git a/nexus/db-queries/src/db/queries/region_allocation.rs b/nexus/db-queries/src/db/queries/region_allocation.rs
index d111632a0c2..c0aa0f65169 100644
--- a/nexus/db-queries/src/db/queries/region_allocation.rs
+++ b/nexus/db-queries/src/db/queries/region_allocation.rs
@@ -5,7 +5,7 @@
 //! Implementation of queries for provisioning regions.
 
 use crate::db::column_walker::AllColumnsOf;
-use crate::db::model::{CrucibleDataset, Region};
+use crate::db::model::{CrucibleDataset, Region, RegionReservationPercent};
 use crate::db::raw_query_builder::{QueryBuilder, TypedSqlQuery};
 use crate::db::true_or_cast_error::matches_sentinel;
 use const_format::concatcp;
@@ -13,6 +13,7 @@
 use diesel::pg::Pg;
 use diesel::result::Error as DieselError;
 use diesel::sql_types;
 use nexus_config::RegionAllocationStrategy;
+use nexus_db_schema::enums::RegionReservationPercentEnum;
 use nexus_db_schema::schema;
 use omicron_common::api::external;
 use omicron_uuid_kinds::GenericUuid;
@@ -199,37 +200,40 @@ pub fn allocation_query(
     // low enough that this shouldn't truncate.
     let requested_size: i64 = requested_size.try_into().unwrap();
 
-    // The Crucible Agent's current reservation factor is 25%, so add that here.
-    // Check first that the requested region size is divisible by this. This
-    // should basically never fail because all block sizes are divisible by 4.
- if requested_size % 4 != 0 { - return Err( - AllocationQueryError::RequestedRegionNotDivisibleByFactor { - request: requested_size, - factor: 4, - }, - ); - } + let reservation_percent = RegionReservationPercent::TwentyFive; + + let size_delta: i64 = match reservation_percent { + RegionReservationPercent::TwentyFive => { + // Check first that the requested region size is divisible by this. + // This should basically never fail because all block sizes are + // divisible by 4. + if requested_size % 4 != 0 { + return Err( + AllocationQueryError::RequestedRegionNotDivisibleByFactor { + request: requested_size, + factor: 4, + }, + ); + } - let overhead: i64 = requested_size.checked_div(4).ok_or( - AllocationQueryError::RequestedRegionNotDivisibleByFactor { - request: requested_size, - factor: 4, - }, - )?; + let overhead: i64 = requested_size.checked_div(4).ok_or( + AllocationQueryError::RequestedRegionNotDivisibleByFactor { + request: requested_size, + factor: 4, + }, + )?; - let size_delta: i64 = requested_size.checked_add(overhead).ok_or( - AllocationQueryError::RequestedRegionOverheadOverflow { - request: requested_size, - overhead, - }, - )?; + requested_size.checked_add(overhead).ok_or( + AllocationQueryError::RequestedRegionOverheadOverflow { + request: requested_size, + overhead, + }, + )? + } + }; let redundancy: i64 = i64::try_from(redundancy).unwrap(); - let reservation_percent = - crate::db::model::RegionReservationPercent::TwentyFive; - let mut builder = QueryBuilder::new(); builder.sql( @@ -396,7 +400,7 @@ pub fn allocation_query( .bind::(params.blocks_per_extent as i64) .bind::(params.extent_count as i64) .bind::(params.read_only) - .bind::(reservation_percent) + .bind::(reservation_percent) .bind::(redundancy) // A subquery which summarizes the changes we intend to make, showing: From be6599b3ddb766d2916747c7f3b656c2e2403647 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Thu, 3 Apr 2025 18:32:38 +0000 Subject: [PATCH 10/12] async not required --- nexus/db-queries/src/db/queries/region_allocation.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/nexus/db-queries/src/db/queries/region_allocation.rs b/nexus/db-queries/src/db/queries/region_allocation.rs index c0aa0f65169..d8805439615 100644 --- a/nexus/db-queries/src/db/queries/region_allocation.rs +++ b/nexus/db-queries/src/db/queries/region_allocation.rs @@ -689,8 +689,8 @@ mod test { logctx.cleanup_successful(); } - #[tokio::test] - async fn allocation_query_region_size_overflow() { + #[test] + fn allocation_query_region_size_overflow() { let volume_id = VolumeUuid::nil(); let snapshot_id = None; @@ -716,8 +716,8 @@ mod test { assert!(matches!(e, AllocationQueryError::RegionSizeOverflow)); } - #[tokio::test] - async fn allocation_query_region_size_too_large() { + #[test] + fn allocation_query_region_size_too_large() { let volume_id = VolumeUuid::nil(); let snapshot_id = None; From f8b9fa8579ccbdf20361af28a20c6a4591b2be41 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Thu, 3 Apr 2025 18:36:14 +0000 Subject: [PATCH 11/12] fmt --- nexus/db-queries/src/db/queries/region_allocation.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/nexus/db-queries/src/db/queries/region_allocation.rs b/nexus/db-queries/src/db/queries/region_allocation.rs index d8805439615..5db13335ef8 100644 --- a/nexus/db-queries/src/db/queries/region_allocation.rs +++ b/nexus/db-queries/src/db/queries/region_allocation.rs @@ -50,7 +50,11 @@ pub fn from_diesel(e: DieselError) -> external::Error { 
NOT_ENOUGH_ZPOOL_SPACE_SENTINEL => { return external::Error::insufficient_capacity( external_message, - "Not enough zpool space to allocate disks. There may not be enough disks with space for the requested region. You may also see this if your rack is in a degraded state, or you're running the default multi-rack topology configuration in a 1-sled development environment.", + "Not enough zpool space to allocate disks. There may not \ + be enough disks with space for the requested region. You \ + may also see this if your rack is in a degraded state, or \ + you're running the default multi-rack topology \ + configuration in a 1-sled development environment.", ); } NOT_ENOUGH_UNIQUE_ZPOOLS_SENTINEL => { From 7d9fed61dda7a4fac4a253ed11f599a0fd0a11f9 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Thu, 3 Apr 2025 19:03:15 +0000 Subject: [PATCH 12/12] no unwrap, return err instead --- .../src/db/queries/region_allocation.rs | 24 ++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/nexus/db-queries/src/db/queries/region_allocation.rs b/nexus/db-queries/src/db/queries/region_allocation.rs index 5db13335ef8..268af89b58e 100644 --- a/nexus/db-queries/src/db/queries/region_allocation.rs +++ b/nexus/db-queries/src/db/queries/region_allocation.rs @@ -109,6 +109,9 @@ pub enum AllocationQueryError { /// Adding the overhead to the requested size overflowed RequestedRegionOverheadOverflow { request: i64, overhead: i64 }, + + /// Converting from u64 to i64 truncated + RequestedRegionSizeTruncated { request: u64, e: String }, } impl From for external::Error { @@ -146,6 +149,13 @@ impl From for external::Error { "adding {overhead} to region size {request} overflowed" ), ), + + AllocationQueryError::RequestedRegionSizeTruncated { + request, + e, + } => external::Error::internal_error(&format!( + "converting {request} to i64 failed! {e}" + )), } } } @@ -200,9 +210,17 @@ pub fn allocation_query( }); } - // After the above check, unconditionally cast from u64 to i64. The value is - // low enough that this shouldn't truncate. - let requested_size: i64 = requested_size.try_into().unwrap(); + // After the above check, cast from u64 to i64. The value is low enough + // (after the check above) that try_into should always return Ok. + let requested_size: i64 = match requested_size.try_into() { + Ok(v) => v, + Err(e) => { + return Err(AllocationQueryError::RequestedRegionSizeTruncated { + request: requested_size, + e: e.to_string(), + }); + } + }; let reservation_percent = RegionReservationPercent::TwentyFive;
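
Taken together, the arithmetic these patches implement reduces to: compute the requested region size with checked multiplication, reject anything over MAX_REGION_SIZE_BYTES, then add the Crucible Agent's 25% reservation overhead as requested_size / 4 (exact, because every supported block size is a multiple of 4). The sketch below restates that calculation in isolation, for illustration only: SizeError, reserved_size_delta, and the main example are hypothetical names, not part of the Nexus code, which lives in nexus/db-queries/src/db/queries/region_allocation.rs.

// Illustrative sketch only -- names below are hypothetical, not Nexus APIs.
const MAX_REGION_SIZE_BYTES: u64 = 1023 * (1 << 30); // mirrors the patch: 1023 GiB

#[derive(Debug, PartialEq)]
enum SizeError {
    Overflow,
    OverMaxSize { request: u64, maximum: u64 },
    NotDivisibleByFour { request: i64 },
}

/// The delta applied to a dataset's `size_used` for one region: the requested
/// size plus the Crucible Agent's 25% reservation overhead.
fn reserved_size_delta(
    block_size: u64,
    blocks_per_extent: u64,
    extent_count: u64,
) -> Result<i64, SizeError> {
    let requested = block_size
        .checked_mul(blocks_per_extent)
        .and_then(|v| v.checked_mul(extent_count))
        .ok_or(SizeError::Overflow)?;

    if requested > MAX_REGION_SIZE_BYTES {
        return Err(SizeError::OverMaxSize {
            request: requested,
            maximum: MAX_REGION_SIZE_BYTES,
        });
    }

    // 1023 GiB fits in an i64, so this conversion cannot fail after the check.
    let requested = i64::try_from(requested).map_err(|_| SizeError::Overflow)?;

    // Every supported block size is a multiple of 4, so the product is too,
    // and the 25% overhead is an exact integer division.
    if requested % 4 != 0 {
        return Err(SizeError::NotDivisibleByFour { request: requested });
    }

    // Cannot overflow: 1.25 * 1023 GiB is far below i64::MAX.
    Ok(requested + requested / 4)
}

fn main() {
    // A 1 GiB region (512-byte blocks, 128 MiB extents, 8 extents) reserves
    // 1.25 GiB worth of size_used on the dataset's zpool.
    let delta = reserved_size_delta(512, 262_144, 8).unwrap();
    assert_eq!(delta, 1_342_177_280);
    println!("size_used delta: {delta} bytes");
}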