From 1673a92bdb0a494bef6ff51e35031d66b814858f Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 24 May 2023 00:06:00 +0300 Subject: [PATCH] Don't allow two timeline_delete operations to run concurrently. If the timeline is already being deleted, return an error. We used to notice the duplicate request and error out in persist_index_part_with_deleted_flag(), but it's better to detect it earlier. Add an explicit flag for the deletion. Note: This doesn't do anything about the async cancellation problem (github issue #3478): if the original HTTP request dropped, because the client disconnected, the timeline deletion stops half-way through the operation. That needs to be fixed, too, but that's a separate story. --- pageserver/src/tenant.rs | 35 ++++++++++++++++++--- pageserver/src/tenant/timeline.rs | 5 +++ test_runner/regress/test_timeline_delete.py | 2 +- 3 files changed, 37 insertions(+), 5 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 8349e1993fd7..a05de1618a84 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1396,7 +1396,11 @@ impl Tenant { // Transition the timeline into TimelineState::Stopping. // This should prevent new operations from starting. - let timeline = { + // + // Also grab the Timeline's delete_lock to prevent another deletion from starting. + let timeline; + let mut delete_lock_guard; + { let mut timelines = self.timelines.lock().unwrap(); // Ensure that there are no child timelines **attached to that pageserver**, @@ -1414,12 +1418,29 @@ impl Tenant { Entry::Vacant(_) => return Err(DeleteTimelineError::NotFound), }; - let timeline = Arc::clone(timeline_entry.get()); + timeline = Arc::clone(timeline_entry.get()); + + // Prevent two tasks from trying to delete the timeline at the same time. + // + // XXX: We should perhaps return an HTTP "202 Accepted" to signal that the caller + // needs to poll until the operation has finished. But for now, we return an + // error, because the control plane knows to retry errors. + delete_lock_guard = timeline.delete_lock.try_lock().map_err(|_| { + DeleteTimelineError::Other(anyhow::anyhow!( + "timeline deletion is already in progress" + )) + })?; + + // If the other task already finished the deletion, return success. + if *delete_lock_guard { + // already deleted by a concurrent task + return Ok(()); + } + timeline.set_state(TimelineState::Stopping); drop(timelines); - timeline - }; + } // Now that the Timeline is in Stopping state, request all the related tasks to // shut down. @@ -1465,6 +1486,10 @@ impl Tenant { // If we (now, or already) marked it successfully as deleted, we can proceed Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (), // Bail out otherwise + // + // AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents + // two tasks from performing the deletion at the same time. The first task + // that starts deletion should run it to completion. Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_)) | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => { return Err(DeleteTimelineError::Other(anyhow::anyhow!(e))); @@ -1576,6 +1601,8 @@ impl Tenant { } drop(timelines); + *delete_lock_guard = true; + Ok(()) } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index c47f4444f533..a9471cf4aa11 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -226,6 +226,10 @@ pub struct Timeline { state: watch::Sender, + /// Prevent two tasks from deleting the timeline at the same time. If held, the + /// timeline is being deleted. If 'true', the timeline has already been deleted. + pub delete_lock: tokio::sync::Mutex, + eviction_task_timeline_state: tokio::sync::Mutex, } @@ -1421,6 +1425,7 @@ impl Timeline { eviction_task_timeline_state: tokio::sync::Mutex::new( EvictionTaskTimelineState::default(), ), + delete_lock: tokio::sync::Mutex::new(false), }; result.repartition_threshold = result.get_checkpoint_distance() / 10; result diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 7135b621cbed..99bf4002079f 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -371,7 +371,7 @@ def first_call_hit_failpoint(): # make the second call and assert behavior log.info("second call start") - error_msg_re = "another task is already setting the deleted_flag, started at" + error_msg_re = "timeline deletion is already in progress" with pytest.raises(PageserverApiException, match=error_msg_re) as second_call_err: ps_http.timeline_delete(env.initial_tenant, child_timeline_id) assert second_call_err.value.status_code == 500