Skip to content

Commit

Permalink
WIP: fix: logical size limit is broken during PS restart
Browse files Browse the repository at this point in the history
fixes #5963

On top of #6000

Will ship this in a release after #600
  • Loading branch information
problame committed Dec 1, 2023
1 parent ef1848f commit 57a7b2e
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 51 deletions.
62 changes: 12 additions & 50 deletions pageserver/src/tenant/timeline.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1738,34 +1738,18 @@ impl Timeline {
) -> logical_size::CurrentLogicalSize {
let current_size = self.current_logical_size.current_size();
debug!("Current size: {current_size:?}");

match (current_size.accuracy(), priority) {
(logical_size::Accuracy::Exact, _) => (), // nothing to do
(logical_size::Accuracy::Approximate, GetLogicalSizePriority::Background) => {
// background task will eventually deliver an exact value, we're in no rush
}
(logical_size::Accuracy::Approximate, GetLogicalSizePriority::User) => {
// background task is not ready, but user is asking for it now;
// => make the background task skip the line
// (The alternative would be to calculate the size here, but,
// it can actually take a long time if the user has a lot of rels.
// And we'll inevitably need it again; so, let the background task do the work.)
match self
.current_logical_size
.cancel_wait_for_background_loop_concurrency_limit_semaphore
.get()
{
Some(cancel) => cancel.cancel(),
None => {
warn!("unexpected: priority_tx not set, logical size calculation will not be prioritized");
}
};
}
}

current_size
}

/// Waits for the exact logical size: if it's not already computed, it computes it _now_.
///
/// The value is cached in `current_logical_size.initial_logical_size`; `get_or_try_init`
/// only runs the supplied future when the cell is still empty.
///
/// NOTE(review): WIP sketch — `TimelineCancelled | CalculationError` is not a valid Rust type
/// (presumably a placeholder for an error enum), and the future returned by
/// `get_or_try_init` is not awaited here; confirm the intended shape before merging.
pub(crate) async fn get_current_logical_size_wait_exact(
self: &Arc<Self>,
) -> Result<logical_size::Exact, TimelineCancelled | CalculationError> {
self.current_logical_size.initial_logical_size.get_or_try_init(async {
// do calculation here (placeholder: the actual size computation is not written yet)
})
}

fn spawn_initial_logical_size_computation_task(self: &Arc<Self>, ctx: &RequestContext) {
let Some(initial_part_end) = self.current_logical_size.initial_part_end else {
// nothing to do for freshly created timelines;
Expand Down Expand Up @@ -1832,31 +1816,9 @@ impl Timeline {
&cancel,
);

use crate::metrics::initial_logical_size::StartCircumstances;
let (_maybe_permit, circumstances) = tokio::select! {
res = wait_for_permit => {
match res {
Ok(permit) => (Some(permit), StartCircumstances::AfterBackgroundTasksRateLimit),
Err(RateLimitError::Cancelled) => {
return Err(BackgroundCalculationError::Cancelled);
}
}
}
() = skip_concurrency_limiter.cancelled() => {
// Some action that is part of an end-user interaction requested logical size
// => break out of the rate limit
// TODO: ideally we'd not run on BackgroundRuntime but the requester's runtime;
// but then again what happens if they cancel; also, we should just be using
// one runtime across the entire process, so, let's leave this for now.
(None, StartCircumstances::SkippedConcurrencyLimiter)
}
};

let metrics_guard = if attempt == 1 {
crate::metrics::initial_logical_size::START_CALCULATION.first(circumstances)
} else {
crate::metrics::initial_logical_size::START_CALCULATION.retry(circumstances)
};
self.current_logical_size.initial_logical_size.get_or_init(async {
// do calculation here
});

match self_ref
.logical_size_calculation_task(
Expand Down
2 changes: 1 addition & 1 deletion pageserver/src/tenant/timeline/logical_size.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ pub(super) struct LogicalSize {
///
/// NOTE: size at a given LSN is constant, but after a restart we will calculate
/// the initial size at a different LSN.
pub initial_logical_size: OnceCell<(
pub initial_logical_size: tokio::sync::OnceCell<(
u64,
crate::metrics::initial_logical_size::FinishedCalculationGuard,
)>,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,10 @@ pub(super) async fn connection_manager_loop_step(

if let Some(new_candidate) = connection_manager_state.next_connection_candidate() {
info!("Switching to new connection candidate: {new_candidate:?}");
tokio::select! {
logical_size = connection_manager_state.timeline.get_current_logical_size_wait_exact().await,
_ = connection_manager.should_shutdown(),
}
connection_manager_state
.change_connection(new_candidate, ctx)
.await
Expand Down

0 comments on commit 57a7b2e

Please sign in to comment.