From 7d3ad9f37ed6b4907f16ded577c5d96adffa6dca Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Thu, 10 Apr 2025 20:21:06 +0000 Subject: [PATCH 1/3] Reduce regions_hard_delete contention Recently, regions_hard_delete changed to use a CTE to update the size_used column of the crucible_dataset table. Unfortunately this had the side effect of causing a massive amount of contention: the CTE would 1) delete rows from the regions table 2) read from the regions table during the CTE 3) update the size_used column for _all_ crucible_dataset rows This would almost certainly cause each invocation of the CTE to contend with each other, as seen when doing disk deletes in parallel. This commit changes the CTE to: 1) delete rows from the regions table, returning the affected datasets 2) read from the regions table during the CTE 3) update the size_used column for affected crucible_dataset rows only. This was tested by using terraform to create and tear down 90 disks, with a parallelism setting of 10, 20, and 30. Before this change, this would not work as Nexus would inevitably return 500s. Fixes #7952 --- nexus/db-queries/src/db/datastore/region.rs | 56 +++++------------ nexus/db-queries/src/db/queries/mod.rs | 1 + .../src/db/queries/regions_hard_delete.rs | 60 +++++++++++++++++++ 3 files changed, 77 insertions(+), 40 deletions(-) create mode 100644 nexus/db-queries/src/db/queries/regions_hard_delete.rs diff --git a/nexus/db-queries/src/db/datastore/region.rs b/nexus/db-queries/src/db/datastore/region.rs index 1596588ff8c..776eff5190f 100644 --- a/nexus/db-queries/src/db/datastore/region.rs +++ b/nexus/db-queries/src/db/datastore/region.rs @@ -20,11 +20,11 @@ use crate::db::model::SqlU16; use crate::db::model::to_db_typed_uuid; use crate::db::pagination::Paginator; use crate::db::pagination::paginated; -use crate::db::queries::region_allocation::RegionParameters; +use crate::db::queries::region_allocation; +use crate::db::queries::regions_hard_delete; use crate::db::update_and_check::UpdateAndCheck; use crate::db::update_and_check::UpdateStatus; use async_bb8_diesel::AsyncRunQueryDsl; -use diesel::dsl::sql_query; use diesel::prelude::*; use nexus_config::RegionAllocationStrategy; use nexus_types::external_api::params; @@ -258,10 +258,10 @@ impl DataStore { } => (block_size, blocks_per_extent, extent_count), }; - let query = crate::db::queries::region_allocation::allocation_query( + let query = region_allocation::allocation_query( volume_id, maybe_snapshot_id, - RegionParameters { + region_allocation::RegionParameters { block_size, blocks_per_extent, extent_count, @@ -273,10 +273,10 @@ impl DataStore { let conn = self.pool_connection_authorized(&opctx).await?; - let dataset_and_regions: Vec<(CrucibleDataset, Region)> = - query.get_results_async(&*conn).await.map_err(|e| { - crate::db::queries::region_allocation::from_diesel(e) - })?; + let dataset_and_regions: Vec<(CrucibleDataset, Region)> = query + .get_results_async(&*conn) + .await + .map_err(|e| region_allocation::from_diesel(e))?; info!( self.log, @@ -302,47 +302,23 @@ impl DataStore { } let conn = self.pool_connection_unauthorized().await?; + self.transaction_retry_wrapper("regions_hard_delete") .transaction(&conn, |conn| { let region_ids = region_ids.clone(); + async move { use nexus_db_schema::schema::region::dsl; - // Remove the regions - diesel::delete(dsl::region) + let dataset_ids: Vec = diesel::delete(dsl::region) .filter(dsl::id.eq_any(region_ids)) - .execute_async(&conn) + .returning(dsl::dataset_id) + .get_results_async(&conn) .await?; - // Update datasets to which the regions belonged. - sql_query( - r#" -WITH size_used_with_reservation AS ( - SELECT - crucible_dataset.id AS crucible_dataset_id, - SUM( - CASE - WHEN block_size IS NULL THEN 0 - ELSE - CASE - WHEN reservation_percent = '25' THEN - (block_size * blocks_per_extent * extent_count) / 4 + - (block_size * blocks_per_extent * extent_count) - END - END - ) AS reserved_size - FROM crucible_dataset - LEFT JOIN region ON crucible_dataset.id = region.dataset_id - WHERE crucible_dataset.time_deleted IS NULL - GROUP BY crucible_dataset.id -) -UPDATE crucible_dataset -SET size_used = size_used_with_reservation.reserved_size -FROM size_used_with_reservation -WHERE crucible_dataset.id = size_used_with_reservation.crucible_dataset_id"#, - ) - .execute_async(&conn) - .await?; + let query = + regions_hard_delete::dataset_update_query(dataset_ids); + query.execute_async(&conn).await?; // Whenever a region is hard-deleted, validate invariants // for all volumes diff --git a/nexus/db-queries/src/db/queries/mod.rs b/nexus/db-queries/src/db/queries/mod.rs index 8d55c3e21f5..78e4dc55955 100644 --- a/nexus/db-queries/src/db/queries/mod.rs +++ b/nexus/db-queries/src/db/queries/mod.rs @@ -13,6 +13,7 @@ mod next_item; pub mod network_interface; pub mod oximeter; pub mod region_allocation; +pub mod regions_hard_delete; pub mod sled_reservation; pub mod virtual_provisioning_collection_update; pub mod vpc; diff --git a/nexus/db-queries/src/db/queries/regions_hard_delete.rs b/nexus/db-queries/src/db/queries/regions_hard_delete.rs new file mode 100644 index 00000000000..f9326569de1 --- /dev/null +++ b/nexus/db-queries/src/db/queries/regions_hard_delete.rs @@ -0,0 +1,60 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Implementation of query to update crucible_dataset size_used after +//! hard-deleting regions + +use crate::db::datastore::RunnableQueryNoReturn; +use crate::db::raw_query_builder::QueryBuilder; +use diesel::sql_types; +use uuid::Uuid; + +/// Update the affected Crucible dataset rows after hard-deleting regions +pub fn dataset_update_query( + dataset_ids: Vec, +) -> impl RunnableQueryNoReturn { + let mut builder = QueryBuilder::new(); + + builder.sql( + "WITH + size_used_with_reservation AS ( + SELECT + crucible_dataset.id AS crucible_dataset_id, + SUM( + CASE + WHEN block_size IS NULL THEN 0 + ELSE + CASE + WHEN reservation_percent = '25' THEN + (block_size * blocks_per_extent * extent_count) / 4 + + (block_size * blocks_per_extent * extent_count) + END + END + ) AS reserved_size + FROM crucible_dataset + LEFT JOIN region ON crucible_dataset.id = region.dataset_id + WHERE + crucible_dataset.time_deleted IS NULL AND + crucible_dataset.id IN (", + ); + + for (idx, dataset_id) in dataset_ids.into_iter().enumerate() { + if idx != 0 { + builder.sql(","); + } + builder.param().bind::(dataset_id); + } + + builder.sql( + ") + GROUP BY crucible_dataset.id + ) + UPDATE crucible_dataset + SET size_used = size_used_with_reservation.reserved_size + FROM size_used_with_reservation + WHERE crucible_dataset.id = size_used_with_reservation.crucible_dataset_id", + ); + + builder.query::<()>() +} From 55226037adbb5e2ad5806d026812a744baaab733 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Fri, 11 Apr 2025 01:08:50 +0000 Subject: [PATCH 2/3] expectorate and explain tests --- .../src/db/queries/regions_hard_delete.rs | 46 +++++++++++++++++++ .../tests/output/dataset_update_query.sql | 31 +++++++++++++ 2 files changed, 77 insertions(+) create mode 100644 nexus/db-queries/tests/output/dataset_update_query.sql diff --git a/nexus/db-queries/src/db/queries/regions_hard_delete.rs b/nexus/db-queries/src/db/queries/regions_hard_delete.rs index f9326569de1..d6001f1277e 100644 --- a/nexus/db-queries/src/db/queries/regions_hard_delete.rs +++ b/nexus/db-queries/src/db/queries/regions_hard_delete.rs @@ -58,3 +58,49 @@ pub fn dataset_update_query( builder.query::<()>() } + +#[cfg(test)] +mod test { + use super::*; + use crate::db::explain::ExplainableAsync; + use crate::db::pub_test_utils::TestDatabase; + use crate::db::raw_query_builder::expectorate_query_contents; + use omicron_test_utils::dev; + use uuid::Uuid; + + // This test is a bit of a "change detector", but it's here to help with + // debugging too. If you change this query, it can be useful to see exactly + // how the output SQL has been altered. + #[tokio::test] + async fn expectorate_query() { + let query = + dataset_update_query(vec![Uuid::nil(), Uuid::nil(), Uuid::nil()]); + + expectorate_query_contents( + &query, + "tests/output/dataset_update_query.sql", + ) + .await; + } + + // Explain the possible forms of the SQL query to ensure that it + // creates a valid SQL string. + #[tokio::test] + async fn explainable() { + let logctx = dev::test_setup_log("explainable"); + let db = TestDatabase::new_with_pool(&logctx.log).await; + let pool = db.pool(); + let conn = pool.claim().await.unwrap(); + + let query = + dataset_update_query(vec![Uuid::nil(), Uuid::nil(), Uuid::nil()]); + + let _ = query + .explain_async(&conn) + .await + .expect("Failed to explain query - is it valid SQL?"); + + db.terminate().await; + logctx.cleanup_successful(); + } +} diff --git a/nexus/db-queries/tests/output/dataset_update_query.sql b/nexus/db-queries/tests/output/dataset_update_query.sql new file mode 100644 index 00000000000..cee5a665e65 --- /dev/null +++ b/nexus/db-queries/tests/output/dataset_update_query.sql @@ -0,0 +1,31 @@ +WITH + size_used_with_reservation + AS ( + SELECT + crucible_dataset.id AS crucible_dataset_id, + sum( + CASE + WHEN block_size IS NULL THEN 0 + ELSE CASE + WHEN reservation_percent = '25' + THEN block_size * blocks_per_extent * extent_count / 4 + + block_size * blocks_per_extent * extent_count + END + END + ) + AS reserved_size + FROM + crucible_dataset LEFT JOIN region ON crucible_dataset.id = region.dataset_id + WHERE + crucible_dataset.time_deleted IS NULL AND crucible_dataset.id IN ($1, $2, $3) + GROUP BY + crucible_dataset.id + ) +UPDATE + crucible_dataset +SET + size_used = size_used_with_reservation.reserved_size +FROM + size_used_with_reservation +WHERE + crucible_dataset.id = size_used_with_reservation.crucible_dataset_id From fc373582522baac18c5fcc077721713217614899 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Fri, 11 Apr 2025 18:14:37 +0000 Subject: [PATCH 3/3] bind to array, not individual --- nexus/db-queries/src/db/queries/regions_hard_delete.rs | 9 ++------- nexus/db-queries/tests/output/dataset_update_query.sql | 2 +- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/nexus/db-queries/src/db/queries/regions_hard_delete.rs b/nexus/db-queries/src/db/queries/regions_hard_delete.rs index d6001f1277e..51c0666eb96 100644 --- a/nexus/db-queries/src/db/queries/regions_hard_delete.rs +++ b/nexus/db-queries/src/db/queries/regions_hard_delete.rs @@ -36,15 +36,10 @@ pub fn dataset_update_query( LEFT JOIN region ON crucible_dataset.id = region.dataset_id WHERE crucible_dataset.time_deleted IS NULL AND - crucible_dataset.id IN (", + crucible_dataset.id = ANY (", ); - for (idx, dataset_id) in dataset_ids.into_iter().enumerate() { - if idx != 0 { - builder.sql(","); - } - builder.param().bind::(dataset_id); - } + builder.param().bind::, _>(dataset_ids); builder.sql( ") diff --git a/nexus/db-queries/tests/output/dataset_update_query.sql b/nexus/db-queries/tests/output/dataset_update_query.sql index cee5a665e65..8cbcd662467 100644 --- a/nexus/db-queries/tests/output/dataset_update_query.sql +++ b/nexus/db-queries/tests/output/dataset_update_query.sql @@ -17,7 +17,7 @@ WITH FROM crucible_dataset LEFT JOIN region ON crucible_dataset.id = region.dataset_id WHERE - crucible_dataset.time_deleted IS NULL AND crucible_dataset.id IN ($1, $2, $3) + crucible_dataset.time_deleted IS NULL AND crucible_dataset.id = ANY ($1) GROUP BY crucible_dataset.id )