From de596b3b1f84f0d4ba56d6c4b7b66319333e39a0 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 8 Mar 2024 19:05:38 +0000 Subject: [PATCH] throttling: exclude throttled time from basebackup (fixup of #6953) PR #6953 only excluded throttled time from the handle_pagerequests (aka smgr metrics). This PR implements the deduction for `basebackup ` queries. The other page_service methods either don't use Timeline::get or they aren't used in production. --- libs/metrics/src/lib.rs | 1 - libs/metrics/src/metric_vec_duration.rs | 23 --------- pageserver/src/metrics.rs | 63 +++++++++++++++++++++++-- pageserver/src/page_service.rs | 50 ++++++++++---------- 4 files changed, 83 insertions(+), 54 deletions(-) delete mode 100644 libs/metrics/src/metric_vec_duration.rs diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index 744fc18e61df..22b0a18933f5 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -29,7 +29,6 @@ pub mod launch_timestamp; mod wrappers; pub use wrappers::{CountedReader, CountedWriter}; mod hll; -pub mod metric_vec_duration; pub use hll::{HyperLogLog, HyperLogLogVec}; #[cfg(target_os = "linux")] pub mod more_process_metrics; diff --git a/libs/metrics/src/metric_vec_duration.rs b/libs/metrics/src/metric_vec_duration.rs deleted file mode 100644 index e9a0a655706f..000000000000 --- a/libs/metrics/src/metric_vec_duration.rs +++ /dev/null @@ -1,23 +0,0 @@ -//! Helpers for observing duration on `HistogramVec` / `CounterVec` / `GaugeVec` / `MetricVec`. - -use std::{future::Future, time::Instant}; - -pub trait DurationResultObserver { - fn observe_result(&self, res: &Result, duration: std::time::Duration); -} - -pub async fn observe_async_block_duration_by_result< - T, - E, - F: Future>, - O: DurationResultObserver, ->( - observer: &O, - block: F, -) -> Result { - let start = Instant::now(); - let result = block.await; - let duration = start.elapsed(); - observer.observe_result(&result, duration); - result -} diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index ee62ee0367e8..27e754e9990a 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1,5 +1,4 @@ use enum_map::EnumMap; -use metrics::metric_vec_duration::DurationResultObserver; use metrics::{ register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec, register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, @@ -1283,11 +1282,65 @@ pub(crate) static BASEBACKUP_QUERY_TIME: Lazy = Lazy::new(| }) }); -impl DurationResultObserver for BasebackupQueryTime { - fn observe_result(&self, res: &Result, duration: std::time::Duration) { +pub(crate) struct BasebackupQueryTimeOngoingRecording<'a, 'c> { + parent: &'a BasebackupQueryTime, + ctx: &'c RequestContext, + start: std::time::Instant, +} + +impl BasebackupQueryTime { + pub(crate) fn start_recording<'c: 'a, 'a>( + &'a self, + ctx: &'c RequestContext, + ) -> BasebackupQueryTimeOngoingRecording<'_, '_> { + let start = Instant::now(); + match ctx.micros_spent_throttled.open() { + Ok(()) => (), + Err(error) => { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); + let mut rate_limit = LOGGED.lock().unwrap(); + rate_limit.call(|| { + warn!(error, "error opening micros_spent_throttled; this message is logged at a global rate limit"); + }); + } + } + BasebackupQueryTimeOngoingRecording { + parent: self, + ctx, + start, + } + } +} + +impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> { + pub(crate) fn observe(self, res: &Result) { + let elapsed = self.start.elapsed(); + let ex_throttled = self + .ctx + .micros_spent_throttled + .close_and_checked_sub_from(elapsed); + let ex_throttled = match ex_throttled { + Ok(ex_throttled) => ex_throttled, + Err(error) => { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); + let mut rate_limit = LOGGED.lock().unwrap(); + rate_limit.call(|| { + warn!(error, "error deducting time spent throttled; this message is logged at a global rate limit"); + }); + elapsed + } + }; let label_value = if res.is_ok() { "ok" } else { "error" }; - let metric = self.0.get_metric_with_label_values(&[label_value]).unwrap(); - metric.observe(duration.as_secs_f64()); + let metric = self + .parent + .0 + .get_metric_with_label_values(&[label_value]) + .unwrap(); + metric.observe(ex_throttled.as_secs_f64()); } } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index dacee41e6ecb..f3ceb7d3e6e5 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1199,7 +1199,7 @@ impl PageServerHandler { prev_lsn: Option, full_backup: bool, gzip: bool, - ctx: RequestContext, + ctx: &RequestContext, ) -> Result<(), QueryError> where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, @@ -1214,7 +1214,7 @@ impl PageServerHandler { if let Some(lsn) = lsn { // Backup was requested at a particular LSN. Wait for it to arrive. info!("waiting for {}", lsn); - timeline.wait_lsn(lsn, &ctx).await?; + timeline.wait_lsn(lsn, ctx).await?; timeline .check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn) .context("invalid basebackup lsn")?; @@ -1236,7 +1236,7 @@ impl PageServerHandler { lsn, prev_lsn, full_backup, - &ctx, + ctx, ) .await?; } else { @@ -1257,7 +1257,7 @@ impl PageServerHandler { lsn, prev_lsn, full_backup, - &ctx, + ctx, ) .await?; // shutdown the encoder to ensure the gzip footer is written @@ -1269,7 +1269,7 @@ impl PageServerHandler { lsn, prev_lsn, full_backup, - &ctx, + ctx, ) .await?; } @@ -1449,25 +1449,25 @@ where false }; - ::metrics::metric_vec_duration::observe_async_block_duration_by_result( - &*metrics::BASEBACKUP_QUERY_TIME, - async move { - self.handle_basebackup_request( - pgb, - tenant_id, - timeline_id, - lsn, - None, - false, - gzip, - ctx, - ) - .await?; - pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; - Result::<(), QueryError>::Ok(()) - }, - ) - .await?; + let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx); + let res = async { + self.handle_basebackup_request( + pgb, + tenant_id, + timeline_id, + lsn, + None, + false, + gzip, + &ctx, + ) + .await?; + pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + Result::<(), QueryError>::Ok(()) + } + .await; + metric_recording.observe(&res); + res?; } // return pair of prev_lsn and last_lsn else if query_string.starts_with("get_last_record_rlsn ") { @@ -1563,7 +1563,7 @@ where prev_lsn, true, false, - ctx, + &ctx, ) .await?; pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;