From de596b3b1f84f0d4ba56d6c4b7b66319333e39a0 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 8 Mar 2024 19:05:38 +0000
Subject: [PATCH] throttling: exclude throttled time from basebackup (fixup of
 #6953)

PR #6953 only excluded throttled time from the `handle_pagerequests`
metrics (aka smgr metrics).

This PR implements the deduction for `basebackup ` queries.

The other page_service methods either don't use Timeline::get
or they aren't used in production.
---
 libs/metrics/src/lib.rs                 |  1 -
 libs/metrics/src/metric_vec_duration.rs | 23 ---------
 pageserver/src/metrics.rs               | 63 +++++++++++++++++++++++--
 pageserver/src/page_service.rs          | 50 ++++++++++----------
 4 files changed, 83 insertions(+), 54 deletions(-)
 delete mode 100644 libs/metrics/src/metric_vec_duration.rs
diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs
index 744fc18e61df..22b0a18933f5 100644
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -29,7 +29,6 @@ pub mod launch_timestamp;
 mod wrappers;
 pub use wrappers::{CountedReader, CountedWriter};
 mod hll;
-pub mod metric_vec_duration;
 pub use hll::{HyperLogLog, HyperLogLogVec};
 #[cfg(target_os = "linux")]
 pub mod more_process_metrics;
diff --git a/libs/metrics/src/metric_vec_duration.rs b/libs/metrics/src/metric_vec_duration.rs
deleted file mode 100644
index e9a0a655706f..000000000000
--- a/libs/metrics/src/metric_vec_duration.rs
+++ /dev/null
@@ -1,23 +0,0 @@
-//! Helpers for observing duration on `HistogramVec` / `CounterVec` / `GaugeVec` / `MetricVec<T>`.
-
-use std::{future::Future, time::Instant};
-
-pub trait DurationResultObserver {
-    fn observe_result<T, E>(&self, res: &Result<T, E>, duration: std::time::Duration);
-}
-
-pub async fn observe_async_block_duration_by_result<
-    T,
-    E,
-    F: Future<Output = Result<T, E>>,
-    O: DurationResultObserver,
->(
-    observer: &O,
-    block: F,
-) -> Result<T, E> {
-    let start = Instant::now();
-    let result = block.await;
-    let duration = start.elapsed();
-    observer.observe_result(&result, duration);
-    result
-}
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index ee62ee0367e8..27e754e9990a 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1,5 +1,4 @@
 use enum_map::EnumMap;
-use metrics::metric_vec_duration::DurationResultObserver;
 use metrics::{
     register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
     register_int_counter, register_int_counter_pair_vec, register_int_counter_vec,
@@ -1283,11 +1282,83 @@ pub(crate) static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|
     })
 });
 
-impl DurationResultObserver for BasebackupQueryTime {
-    fn observe_result<T, E>(&self, res: &Result<T, E>, duration: std::time::Duration) {
+/// In-flight timing of one basebackup query.
+///
+/// Created by [`BasebackupQueryTime::start_recording`] and consumed by [`Self::observe`].
+/// No `Drop` impl is provided: a recording dropped without `observe` records nothing and
+/// never calls `close_and_checked_sub_from` on `ctx.micros_spent_throttled`.
+#[must_use = "call `observe` to record the basebackup query time"]
+pub(crate) struct BasebackupQueryTimeOngoingRecording<'a, 'c> {
+    /// Metric that `observe` records into.
+    parent: &'a BasebackupQueryTime,
+    /// Request context whose `micros_spent_throttled` is deducted from the elapsed time.
+    ctx: &'c RequestContext,
+    /// Taken in `start_recording`, before `micros_spent_throttled` is opened.
+    start: std::time::Instant,
+}
+
+impl BasebackupQueryTime {
+    /// Starts timing a basebackup query.
+    ///
+    /// Captures the start instant and opens `ctx.micros_spent_throttled`. A failure to open is
+    /// only logged, at a global rate limit; a recording is returned either way.
+    pub(crate) fn start_recording<'c: 'a, 'a>(
+        &'a self,
+        ctx: &'c RequestContext,
+    ) -> BasebackupQueryTimeOngoingRecording<'a, 'c> {
+        let start = Instant::now();
+        match ctx.micros_spent_throttled.open() {
+            Ok(()) => (),
+            Err(error) => {
+                use utils::rate_limit::RateLimit;
+                static LOGGED: Lazy<Mutex<RateLimit>> =
+                    Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
+                let mut rate_limit = LOGGED.lock().unwrap();
+                rate_limit.call(|| {
+                    warn!(error, "error opening micros_spent_throttled; this message is logged at a global rate limit");
+                });
+            }
+        }
+        BasebackupQueryTimeOngoingRecording {
+            parent: self,
+            ctx,
+            start,
+        }
+    }
+}
+
+impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> {
+    /// Finishes the recording and observes the elapsed time, excluding throttled time.
+    ///
+    /// The metric label is `"ok"` or `"error"` depending on `res`. If deducting the throttled
+    /// time fails, the error is logged (rate-limited) and the full elapsed time is observed.
+    pub(crate) fn observe<T, E>(self, res: &Result<T, E>) {
+        let elapsed = self.start.elapsed();
+        let ex_throttled = self
+            .ctx
+            .micros_spent_throttled
+            .close_and_checked_sub_from(elapsed);
+        let ex_throttled = match ex_throttled {
+            Ok(ex_throttled) => ex_throttled,
+            Err(error) => {
+                use utils::rate_limit::RateLimit;
+                static LOGGED: Lazy<Mutex<RateLimit>> =
+                    Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
+                let mut rate_limit = LOGGED.lock().unwrap();
+                rate_limit.call(|| {
+                    warn!(error, "error deducting time spent throttled; this message is logged at a global rate limit");
+                });
+                // Fall back to the undeducted elapsed time rather than dropping the sample.
+                elapsed
+            }
+        };
         let label_value = if res.is_ok() { "ok" } else { "error" };
-        let metric = self.0.get_metric_with_label_values(&[label_value]).unwrap();
-        metric.observe(duration.as_secs_f64());
+        let metric = self
+            .parent
+            .0
+            .get_metric_with_label_values(&[label_value])
+            .unwrap();
+        metric.observe(ex_throttled.as_secs_f64());
     }
 }
 
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index dacee41e6ecb..f3ceb7d3e6e5 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -1199,7 +1199,7 @@ impl PageServerHandler {
         prev_lsn: Option<Lsn>,
         full_backup: bool,
         gzip: bool,
-        ctx: RequestContext,
+        ctx: &RequestContext,
     ) -> Result<(), QueryError>
     where
         IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
@@ -1214,7 +1214,7 @@ impl PageServerHandler {
         if let Some(lsn) = lsn {
             // Backup was requested at a particular LSN. Wait for it to arrive.
             info!("waiting for {}", lsn);
-            timeline.wait_lsn(lsn, &ctx).await?;
+            timeline.wait_lsn(lsn, ctx).await?;
             timeline
                 .check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn)
                 .context("invalid basebackup lsn")?;
@@ -1236,7 +1236,7 @@ impl PageServerHandler {
                 lsn,
                 prev_lsn,
                 full_backup,
-                &ctx,
+                ctx,
             )
             .await?;
         } else {
@@ -1257,7 +1257,7 @@ impl PageServerHandler {
                     lsn,
                     prev_lsn,
                     full_backup,
-                    &ctx,
+                    ctx,
                 )
                 .await?;
                 // shutdown the encoder to ensure the gzip footer is written
@@ -1269,7 +1269,7 @@ impl PageServerHandler {
                     lsn,
                     prev_lsn,
                     full_backup,
-                    &ctx,
+                    ctx,
                 )
                 .await?;
             }
@@ -1449,25 +1449,25 @@ where
                 false
             };
 
-            ::metrics::metric_vec_duration::observe_async_block_duration_by_result(
-                &*metrics::BASEBACKUP_QUERY_TIME,
-                async move {
-                    self.handle_basebackup_request(
-                        pgb,
-                        tenant_id,
-                        timeline_id,
-                        lsn,
-                        None,
-                        false,
-                        gzip,
-                        ctx,
-                    )
-                    .await?;
-                    pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
-                    Result::<(), QueryError>::Ok(())
-                },
-            )
-            .await?;
+            let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx);
+            let res = async {
+                self.handle_basebackup_request(
+                    pgb,
+                    tenant_id,
+                    timeline_id,
+                    lsn,
+                    None,
+                    false,
+                    gzip,
+                    &ctx,
+                )
+                .await?;
+                pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
+                Result::<(), QueryError>::Ok(())
+            }
+            .await;
+            metric_recording.observe(&res);
+            res?;
         }
         // return pair of prev_lsn and last_lsn
         else if query_string.starts_with("get_last_record_rlsn ") {
@@ -1563,7 +1563,7 @@ where
                 prev_lsn,
                 true,
                 false,
-                ctx,
+                &ctx,
             )
             .await?;
             pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;