Skip to content

Commit

Permalink
further refine the wait_lsn alerts (is this really worth it? IDK)
Browse files Browse the repository at this point in the history
  • Loading branch information
problame committed Mar 28, 2024
1 parent 97c0485 commit c0d5cea
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 10 deletions.
24 changes: 21 additions & 3 deletions pageserver/src/page_service.rs
Original file line number Diff line number Diff line change
Expand Up @@ -876,7 +876,13 @@ impl PageServerHandler {
if lsn <= last_record_lsn {
lsn = last_record_lsn;
} else {
timeline.wait_lsn(lsn, ctx).await?;
timeline
.wait_lsn(
lsn,
crate::tenant::timeline::WaitLsnWaiter::PageService,
ctx,
)
.await?;
// Since we waited for 'lsn' to arrive, that is now the last
// record LSN. (Or close enough for our purposes; the
// last-record LSN can advance immediately after we return
Expand All @@ -888,7 +894,13 @@ impl PageServerHandler {
"invalid LSN(0) in request".into(),
));
}
timeline.wait_lsn(lsn, ctx).await?;
timeline
.wait_lsn(
lsn,
crate::tenant::timeline::WaitLsnWaiter::PageService,
ctx,
)
.await?;
}

if lsn < **latest_gc_cutoff_lsn {
Expand Down Expand Up @@ -1215,7 +1227,13 @@ impl PageServerHandler {
if let Some(lsn) = lsn {
// Backup was requested at a particular LSN. Wait for it to arrive.
info!("waiting for {}", lsn);
timeline.wait_lsn(lsn, ctx).await?;
timeline
.wait_lsn(
lsn,
crate::tenant::timeline::WaitLsnWaiter::PageService,
ctx,
)
.await?;
timeline
.check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn)
.context("invalid basebackup lsn")?;
Expand Down
2 changes: 1 addition & 1 deletion pageserver/src/tenant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1515,7 +1515,7 @@ impl Tenant {
// sizes etc. and that would get confused if the previous page versions
// are not in the repository yet.
ancestor_timeline
.wait_lsn(*lsn, ctx)
.wait_lsn(*lsn, timeline::WaitLsnWaiter::Tenant, ctx)
.await
.map_err(|e| match e {
e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => {
Expand Down
9 changes: 8 additions & 1 deletion pageserver/src/tenant/mgr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1649,7 +1649,14 @@ impl TenantManager {
fail::fail_point!("shard-split-lsn-wait", |_| Err(anyhow::anyhow!(
"failpoint"
)));
if let Err(e) = timeline.wait_lsn(*target_lsn, ctx).await {
if let Err(e) = timeline
.wait_lsn(
*target_lsn,
crate::tenant::timeline::WaitLsnWaiter::Tenant,
ctx,
)
.await
{
// Failure here might mean shutdown, in any case this part is an optimization
// and we shouldn't hold up the split operation.
tracing::warn!(
Expand Down
25 changes: 20 additions & 5 deletions pageserver/src/tenant/timeline.rs
Original file line number Diff line number Diff line change
Expand Up @@ -610,6 +610,12 @@ pub enum GetVectoredImpl {
Vectored,
}

pub(crate) enum WaitLsnWaiter<'a> {
Timeline(&'a Timeline),
Tenant,
PageService,
}

/// Public interface functions
impl Timeline {
/// Get the LSN where this branch was created
Expand Down Expand Up @@ -1058,6 +1064,7 @@ impl Timeline {
pub(crate) async fn wait_lsn(
&self,
lsn: Lsn,
who_is_waiting: WaitLsnWaiter<'_>,
ctx: &RequestContext, /* Prepare for use by cancellation */
) -> Result<(), WaitLsnError> {
if self.cancel.is_cancelled() {
Expand All @@ -1071,10 +1078,18 @@ impl Timeline {
TaskKind::WalReceiverManager
| TaskKind::WalReceiverConnectionHandler
| TaskKind::WalReceiverConnectionPoller => {
// This should never be called from the WAL receiver, because that could lead
// to a deadlock.
if let Err(current) = self.last_record_lsn.would_wait_for(lsn) {
panic!("walingest task is calling wait_lsn {lsn} but current is only {current}, would deadlock");
let is_myself = match who_is_waiting {
WaitLsnWaiter::Timeline(waiter) => Weak::ptr_eq(&waiter.myself, &self.myself),
WaitLsnWaiter::Tenant | WaitLsnWaiter::PageService => unreachable!("tenant or page_service context are not expected to have task kind {:?}", ctx.task_kind()),
};
if is_myself {
if let Err(current) = self.last_record_lsn.would_wait_for(lsn) {
// walingest is the only one that can advance last_record_lsn; it should make sure to never reach here
panic!("this timeline's walingest task is calling wait_lsn({lsn}) but we only have last_record_lsn={current}; would deadlock");
}
} else {
// if another timeline's is waiting for us, there's no deadlock risk because
// our walreceiver task can make progress independent of theirs
}
}
_ => {}
Expand Down Expand Up @@ -3039,7 +3054,7 @@ impl Timeline {
}
}
ancestor
.wait_lsn(self.ancestor_lsn, ctx)
.wait_lsn(self.ancestor_lsn, WaitLsnWaiter::Timeline(self), ctx)
.await
.map_err(|e| match e {
e @ WaitLsnError::Timeout(_) => GetReadyAncestorError::AncestorLsnTimeout(e),
Expand Down

0 comments on commit c0d5cea

Please sign in to comment.