Skip to content

Commit

Permalink
Don't allow two timeline_delete operations to run concurrently.
Browse files Browse the repository at this point in the history
If the timeline is already being deleted, return an error. We used to
notice the duplicate request and error out in
persist_index_part_with_deleted_flag(), but it's better to detect it
earlier. Add an explicit flag for the deletion.

Note: This doesn't do anything about the async cancellation problem
(github issue #3478): if the original HTTP request dropped, because
the client disconnected, the timeline deletion stops half-way through
the operation. That needs to be fixed, too, but that's a separate
story.
  • Loading branch information
hlinnaka committed May 23, 2023
1 parent 00f7fc3 commit 1673a92
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 5 deletions.
35 changes: 31 additions & 4 deletions pageserver/src/tenant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1396,7 +1396,11 @@ impl Tenant {

// Transition the timeline into TimelineState::Stopping.
// This should prevent new operations from starting.
let timeline = {
//
// Also grab the Timeline's delete_lock to prevent another deletion from starting.
let timeline;
let mut delete_lock_guard;
{
let mut timelines = self.timelines.lock().unwrap();

// Ensure that there are no child timelines **attached to that pageserver**,
Expand All @@ -1414,12 +1418,29 @@ impl Tenant {
Entry::Vacant(_) => return Err(DeleteTimelineError::NotFound),
};

let timeline = Arc::clone(timeline_entry.get());
timeline = Arc::clone(timeline_entry.get());

// Prevent two tasks from trying to delete the timeline at the same time.
//
// XXX: We should perhaps return an HTTP "202 Accepted" to signal that the caller
// needs to poll until the operation has finished. But for now, we return an
// error, because the control plane knows to retry errors.
delete_lock_guard = timeline.delete_lock.try_lock().map_err(|_| {
DeleteTimelineError::Other(anyhow::anyhow!(
"timeline deletion is already in progress"
))
})?;

// If the other task already finished the deletion, return success.
if *delete_lock_guard {
// already deleted by a concurrent task
return Ok(());
}

timeline.set_state(TimelineState::Stopping);

drop(timelines);
timeline
};
}

// Now that the Timeline is in Stopping state, request all the related tasks to
// shut down.
Expand Down Expand Up @@ -1465,6 +1486,10 @@ impl Tenant {
// If we (now, or already) marked it successfully as deleted, we can proceed
Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
// Bail out otherwise
//
// AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents
// two tasks from performing the deletion at the same time. The first task
// that starts deletion should run it to completion.
Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
| Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
return Err(DeleteTimelineError::Other(anyhow::anyhow!(e)));
Expand Down Expand Up @@ -1576,6 +1601,8 @@ impl Tenant {
}
drop(timelines);

*delete_lock_guard = true;

Ok(())
}

Expand Down
5 changes: 5 additions & 0 deletions pageserver/src/tenant/timeline.rs
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,10 @@ pub struct Timeline {

state: watch::Sender<TimelineState>,

/// Prevent two tasks from deleting the timeline at the same time. If held, the
/// timeline is being deleted. If 'true', the timeline has already been deleted.
pub delete_lock: tokio::sync::Mutex<bool>,

eviction_task_timeline_state: tokio::sync::Mutex<EvictionTaskTimelineState>,
}

Expand Down Expand Up @@ -1421,6 +1425,7 @@ impl Timeline {
eviction_task_timeline_state: tokio::sync::Mutex::new(
EvictionTaskTimelineState::default(),
),
delete_lock: tokio::sync::Mutex::new(false),
};
result.repartition_threshold = result.get_checkpoint_distance() / 10;
result
Expand Down
2 changes: 1 addition & 1 deletion test_runner/regress/test_timeline_delete.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,7 +371,7 @@ def first_call_hit_failpoint():

# make the second call and assert behavior
log.info("second call start")
error_msg_re = "another task is already setting the deleted_flag, started at"
error_msg_re = "timeline deletion is already in progress"
with pytest.raises(PageserverApiException, match=error_msg_re) as second_call_err:
ps_http.timeline_delete(env.initial_tenant, child_timeline_id)
assert second_call_err.value.status_code == 500
Expand Down

0 comments on commit 1673a92

Please sign in to comment.