diff --git a/jstests/serial_run/allow_partial_results_with_maxTimeMS.js b/jstests/serial_run/allow_partial_results_with_maxTimeMS.js
index 402039c2ee8b6..fe85ed8939727 100644
--- a/jstests/serial_run/allow_partial_results_with_maxTimeMS.js
+++ b/jstests/serial_run/allow_partial_results_with_maxTimeMS.js
@@ -155,9 +155,6 @@ function searchForAndAssertPartialResults(initialTimeoutMS, queryFunc) {
 
 // Try to get partial results, while nudging timeout value around the expected time.
 // This first case will try to get all the results in one big batch.
-// Start with half of the full runtime of the query.
-
-// Fetch one big batch of results.
 searchForAndAssertPartialResults(Math.round(fullQueryTimeoutMS), runBigBatchQuery);
 
 // Try to get partial results in a getMore.
diff --git a/jstests/sharding/allow_partial_results_with_maxTimeMS_failpoints.js b/jstests/serial_run/allow_partial_results_with_maxTimeMS_failpoints.js
similarity index 97%
rename from jstests/sharding/allow_partial_results_with_maxTimeMS_failpoints.js
rename to jstests/serial_run/allow_partial_results_with_maxTimeMS_failpoints.js
index c7adafdb52421..4f5286e135d50 100644
--- a/jstests/sharding/allow_partial_results_with_maxTimeMS_failpoints.js
+++ b/jstests/serial_run/allow_partial_results_with_maxTimeMS_failpoints.js
@@ -3,6 +3,9 @@
  * with 'maxTimeMS' and only a subset of the shards provide data before the timeout.
  * Uses three methods to simulate MaxTimeMSExpired: failpoints, MongoBridge, and $where + sleep.
  *
+ * This is in the serial_run suite because the tests using MongoBridge and $where + sleep depend on
+ * certain work being completed before a maxTimeMS timeout.
+ *
  * @tags: [
  *   requires_sharding,
  *   requires_replication,
@@ -280,8 +283,6 @@ const shard0SleepFailure = new FindWhereSleepController(st.shard0);
 const shard1SleepFailure = new FindWhereSleepController(st.shard1);
 const allShardsSleepFailure = new MultiController([shard0SleepFailure, shard1SleepFailure]);
 
-const allshardsMixedFailures = new MultiController([shard0NetworkFailure, shard1Failpoint]);
-
 // Due to the hack with sleepFailures below, this has to be the innermost parameterizing function.
 function withEachSingleShardFailure(callback) {
     callback(shard0Failpoint);
@@ -302,7 +303,6 @@ function withEachAllShardFailure(callback) {
     callback(allShardsFailpoint);
     callback(allshardsNetworkFailure);
     callback(allShardsSleepFailure);
-    callback(allshardsMixedFailures);
 }
 
 function getMoreShardTimeout(allowPartialResults, failureController, batchSize) {
@@ -375,7 +375,7 @@ function partialResultsTrueFirstBatch(failureController) {
     assert.eq(0, res.cursor.id);
     failureController.disable();
 }
-withEachSingleShardFailure(failure => partialResultsTrueFirstBatch(failure));
+withEachSingleShardFailure(partialResultsTrueFirstBatch);
 
 // With 'allowPartialResults: false', if one shard times out then return a timeout error.
 function partialResultsFalseOneFailure(failureController) {
@@ -383,7 +383,7 @@ function partialResultsFalseOneFailure(failureController) {
     assert.commandFailedWithCode(runQuery(false), ErrorCodes.MaxTimeMSExpired);
     failureController.disable();
 }
-withEachSingleShardFailure(failure => partialResultsFalseOneFailure(failure));
+withEachSingleShardFailure(partialResultsFalseOneFailure);
 
 // With 'allowPartialResults: false', if both shards time out then return a timeout error.
 function allowPartialResultsFalseAllFailed(failureController) {
@@ -391,7 +391,7 @@ function allowPartialResultsFalseAllFailed(failureController) {
     assert.commandFailedWithCode(runQuery(false), ErrorCodes.MaxTimeMSExpired);
     failureController.disable();
 }
-withEachAllShardFailure(failure => allowPartialResultsFalseAllFailed(failure));
+withEachAllShardFailure(allowPartialResultsFalseAllFailed);
 
 // With 'allowPartialResults: true', if both shards time out then return empty "partial" results.
 function allowPartialResultsTrueAllFailed(failureController) {
@@ -402,7 +402,7 @@ function allowPartialResultsTrueAllFailed(failureController) {
     assert.eq(res.cursor.firstBatch.length, 0);
     failureController.disable();
 }
-withEachAllShardFailure(failure => allowPartialResultsTrueAllFailed(failure));
+withEachAllShardFailure(allowPartialResultsTrueAllFailed);
 
 st.stop();
 }());
diff --git a/src/mongo/db/operation_context.cpp b/src/mongo/db/operation_context.cpp
index 24c92e19f6bc2..bbdb3b35c5f7e 100644
--- a/src/mongo/db/operation_context.cpp
+++ b/src/mongo/db/operation_context.cpp
@@ -220,10 +220,6 @@ bool opShouldFail(Client* client, const BSONObj& failPointInfo) {
 }  // namespace
 
 Status OperationContext::checkForInterruptNoAssert() noexcept {
-    if (_noReplStateChangeWhileIgnoringOtherInterrupts()) {
-        return Status::OK();
-    }
-
     // TODO: Remove the MONGO_likely(hasClientAndServiceContext) once all operation contexts are
     // constructed with clients.
     const auto hasClientAndServiceContext = getClient() && getServiceContext();
@@ -313,8 +309,7 @@ StatusWith<stdx::cv_status> OperationContext::waitForConditionOrInterruptNoAsser
     // maxTimeNeverTimeOut is set) then we assume that the incongruity is due to a clock mismatch
     // and return _timeoutError regardless. To prevent this behaviour, only consider the op's
     // deadline in the event that the maxTimeNeverTimeOut failpoint is not set.
-    bool opHasDeadline = (hasDeadline() && !_noReplStateChangeWhileIgnoringOtherInterrupts() &&
-                          !MONGO_unlikely(maxTimeNeverTimeOut.shouldFail()));
+    bool opHasDeadline = (hasDeadline() && !MONGO_unlikely(maxTimeNeverTimeOut.shouldFail()));
 
     if (opHasDeadline) {
         deadline = std::min(deadline, getDeadline());
diff --git a/src/mongo/db/operation_context.h b/src/mongo/db/operation_context.h
index 95e561646ccd6..ef4b4a29056fc 100644
--- a/src/mongo/db/operation_context.h
+++ b/src/mongo/db/operation_context.h
@@ -514,14 +514,6 @@ class OperationContext : public Interruptible, public Decorable<OperationContext> {
diff --git a/src/mongo/db/operation_context_test.cpp b/src/mongo/db/operation_context_test.cpp
--- a/src/mongo/db/operation_context_test.cpp
+++ b/src/mongo/db/operation_context_test.cpp
     ASSERT_FALSE(opCtx->getCancellationToken().isCanceled());
 }
 
-TEST_F(OperationDeadlineTests,
-       MaxTimeExpirationIgnoredWhenIgnoringInterruptsExceptReplStateChange) {
-    auto opCtx = client->makeOperationContext();
-    opCtx->setDeadlineByDate(mockClock->now(), ErrorCodes::MaxTimeMSExpired);
-    auto m = MONGO_MAKE_LATCH();
-    stdx::condition_variable cv;
-    stdx::unique_lock<Latch> lk(m);
-    ASSERT_FALSE(opCtx->getCancellationToken().isCanceled());
-    opCtx->setIgnoreInterruptsExceptForReplStateChange(true);
-    // Advance the clock so the MaxTimeMS is hit before the timeout.
-    mockClock->advance(Milliseconds(100));
-    ASSERT_FALSE(
-        opCtx->waitForConditionOrInterruptUntil(cv, lk, mockClock->now(), [] { return false; }));
-    ASSERT_FALSE(opCtx->getCancellationToken().isCanceled());
-}
-
-TEST_F(OperationDeadlineTests,
-       AlreadyExpiredMaxTimeIgnoredWhenIgnoringInterruptsExceptReplStateChange) {
-    auto opCtx = client->makeOperationContext();
-    opCtx->setDeadlineByDate(mockClock->now(), ErrorCodes::MaxTimeMSExpired);
-    auto m = MONGO_MAKE_LATCH();
-    stdx::condition_variable cv;
-    stdx::unique_lock<Latch> lk(m);
-    ASSERT_FALSE(opCtx->getCancellationToken().isCanceled());
-    ASSERT_THROWS_CODE(opCtx->waitForConditionOrInterruptUntil(
-                           cv, lk, mockClock->now() + Seconds(1), [] { return false; }),
-                       DBException,
-                       ErrorCodes::MaxTimeMSExpired);
-
-    ASSERT_EQ(ErrorCodes::MaxTimeMSExpired, opCtx->checkForInterruptNoAssert());
-    ASSERT_TRUE(opCtx->getCancellationToken().isCanceled());
-    opCtx->setIgnoreInterruptsExceptForReplStateChange(true);
-    ASSERT_OK(opCtx->checkForInterruptNoAssert());
-    // Advance the clock so the MaxTimeMS is hit before the timeout.
-    mockClock->advance(Milliseconds(100));
-    ASSERT_FALSE(
-        opCtx->waitForConditionOrInterruptUntil(cv, lk, mockClock->now(), [] { return false; }));
-    ASSERT_TRUE(opCtx->getCancellationToken().isCanceled());
-}
-
-TEST_F(OperationDeadlineTests,
-       MaxTimeRespectedAfterReplStateChangeWhenIgnoringInterruptsExceptReplStateChange) {
-    auto opCtx = client->makeOperationContext();
-    opCtx->setDeadlineByDate(mockClock->now(), ErrorCodes::MaxTimeMSExpired);
-    auto m = MONGO_MAKE_LATCH();
-    stdx::condition_variable cv;
-    stdx::unique_lock<Latch> lk(m);
-    ASSERT_FALSE(opCtx->getCancellationToken().isCanceled());
-    ASSERT_THROWS_CODE(opCtx->waitForConditionOrInterruptUntil(
-                           cv, lk, mockClock->now() + Seconds(1), [] { return false; }),
-                       DBException,
-                       ErrorCodes::MaxTimeMSExpired);
-
-    ASSERT_EQ(ErrorCodes::MaxTimeMSExpired, opCtx->checkForInterruptNoAssert());
-    ASSERT_TRUE(opCtx->getCancellationToken().isCanceled());
-    opCtx->setIgnoreInterruptsExceptForReplStateChange(true);
-    ASSERT_OK(opCtx->checkForInterruptNoAssert());
-    opCtx->markKilled(ErrorCodes::InterruptedDueToReplStateChange);
-    ASSERT_EQ(ErrorCodes::MaxTimeMSExpired, opCtx->checkForInterruptNoAssert());
-    // Advance the clock so the MaxTimeMS is hit before the timeout.
-    mockClock->advance(Milliseconds(100));
-    ASSERT_THROWS_CODE(
-        opCtx->waitForConditionOrInterruptUntil(cv, lk, mockClock->now(), [] { return false; }),
-        DBException,
-        ErrorCodes::MaxTimeMSExpired);
-    ASSERT_TRUE(opCtx->getCancellationToken().isCanceled());
-}
-
 class ThreadedOperationDeadlineTests : public OperationDeadlineTests {
 public:
     using CvPred = std::function<bool()>;
diff --git a/src/mongo/s/query/cluster_find.cpp b/src/mongo/s/query/cluster_find.cpp
index 68dfdf289de9d..c164432e2452f 100644
--- a/src/mongo/s/query/cluster_find.cpp
+++ b/src/mongo/s/query/cluster_find.cpp
@@ -314,7 +314,28 @@ CursorId runQueryWithoutRetrying(OperationContext* opCtx,
 
     try {
         // Establish the cursors with a consistent shardVersion across shards.
-        params.remotes = establishCursorsOnShards(shardIds);
+
+        // If we have maxTimeMS and allowPartialResults, then leave some spare time in the opCtx
+        // deadline so that we have time to return partial results before the opCtx is killed.
+        auto deadline = opCtx->getDeadline();
+        if (findCommand.getAllowPartialResults() && findCommand.getMaxTimeMS()) {
+            // Reserve 10% of the time budget (up to 100,000 microseconds max) for processing
+            // buffered partial results.
+            deadline -= Microseconds{std::min(1000 * (*findCommand.getMaxTimeMS()) / 10, 100000)};
+            LOGV2_DEBUG(
+                5746901,
+                0,
+                "Setting an earlier artificial deadline because the find allows partial results.",
+                "deadline"_attr = deadline);
+        }
+
+        // The call to establishCursorsOnShards has its own timeout mechanism that is controlled
+        // by the opCtx, so we don't expect runWithDeadline to throw a timeout at this level. We
+        // use runWithDeadline because it has the side effect of pushing a temporary
+        // (artificial) deadline onto the opCtx used by establishCursorsOnShards.
+        opCtx->runWithDeadline(deadline, ErrorCodes::MaxTimeMSExpired, [&]() -> void {
+            params.remotes = establishCursorsOnShards(shardIds);
+        });
     } catch (const DBException& ex) {
         if (ex.code() == ErrorCodes::CollectionUUIDMismatch &&
             !ex.extraInfo<CollectionUUIDMismatchInfo>()->actualCollection() &&
@@ -352,15 +373,10 @@ CursorId runQueryWithoutRetrying(OperationContext* opCtx,
 
     FindCommon::waitInFindBeforeMakingBatch(opCtx, query);
 
-    // If we're allowing partial results and we got MaxTimeMSExpired, then temporarily disable
-    // interrupts in the opCtx so that we can pull already-fetched data from ClusterClientCursor.
-    bool ignoringInterrupts = false;
     if (findCommand.getAllowPartialResults() &&
         opCtx->checkForInterruptNoAssert().code() == ErrorCodes::MaxTimeMSExpired) {
-        // MaxTimeMS is expired, but perhaps remotes not have expired their requests yet.
-        // Wait for all remote cursors to be exhausted so that we can safely disable interrupts
-        // in the opCtx. We want to be sure that later calls to ccc->next() do not block on
-        // more data.
+        // MaxTimeMS has expired in the router, but some remotes may still have outstanding requests.
+        // Wait for all remotes to expire their requests.
 
         // Maximum number of 1ms sleeps to wait for remote cursors to be exhausted.
         constexpr int kMaxAttempts = 10;
@@ -377,18 +393,6 @@ CursorId runQueryWithoutRetrying(OperationContext* opCtx,
             }
            stdx::this_thread::sleep_for(stdx::chrono::milliseconds(1));
         }
-
-        // The first MaxTimeMSExpired will have called opCtx->markKilled() so any later
-        // call to opCtx->checkForInterruptNoAssert() will return an error. We need to
-        // temporarily ignore this while we pull data from the ClusterClientCursor.
-        LOGV2_DEBUG(
-            5746901,
-            0,
-            "Attempting to return partial results because MaxTimeMS expired and the query set "
-            "AllowPartialResults. Temporarily disabling interrupts on the OperationContext "
-            "while partial results are pulled from the ClusterClientCursor.");
-        opCtx->setIgnoreInterruptsExceptForReplStateChange(true);
-        ignoringInterrupts = true;
     }
 
     auto cursorState = ClusterCursorManager::CursorState::NotExhausted;
@@ -398,20 +402,13 @@ CursorId runQueryWithoutRetrying(OperationContext* opCtx,
     // results come from the initial batches that were obtained when establishing cursors, but
     // ClusterClientCursor::next will fetch further results if necessary.
     while (!FindCommon::enoughForFirstBatch(findCommand, results->size())) {
-        auto nextWithStatus = ccc->next();
-        if (findCommand.getAllowPartialResults() &&
-            (nextWithStatus.getStatus() == ErrorCodes::MaxTimeMSExpired)) {
+        auto next = uassertStatusOK(ccc->next());
+        if (findCommand.getAllowPartialResults()) {
             if (ccc->remotesExhausted()) {
                 cursorState = ClusterCursorManager::CursorState::Exhausted;
-                break;
             }
-            // Continue because there may be results waiting from other remotes.
-            continue;
-        } else {
-            // all error statuses besides permissible remote timeouts should be returned to the user
-            uassertStatusOK(nextWithStatus);
         }
-        if (nextWithStatus.getValue().isEOF()) {
+        if (next.isEOF()) {
             // We reached end-of-stream. If the cursor is not tailable, then we mark it as
             // exhausted. If it is tailable, usually we keep it open (i.e. "NotExhausted") even
             // when we reach end-of-stream. However, if all the remote cursors are exhausted, there
@@ -422,7 +419,7 @@ CursorId runQueryWithoutRetrying(OperationContext* opCtx,
             break;
         }
 
-        auto nextObj = *(nextWithStatus.getValue().getResult());
+        auto nextObj = *next.getResult();
 
         // If adding this object will cause us to exceed the message size limit, then we stash it
         // for later.
@@ -437,19 +434,6 @@ CursorId runQueryWithoutRetrying(OperationContext* opCtx,
         results->push_back(std::move(nextObj));
     }
 
-    if (ignoringInterrupts) {
-        opCtx->setIgnoreInterruptsExceptForReplStateChange(false);
-        ignoringInterrupts = false;
-        LOGV2_DEBUG(5746902, 0, "Re-enabled interrupts on the OperationContext.");
-    }
-
-    // Surface any opCtx interrupts, except ignore MaxTimeMSExpired with allowPartialResults.
-    auto interruptStatus = opCtx->checkForInterruptNoAssert();
-    if (!(interruptStatus.code() == ErrorCodes::MaxTimeMSExpired &&
-          findCommand.getAllowPartialResults())) {
-        uassertStatusOK(interruptStatus);
-    }
-
     ccc->detachFromOperationContext();
 
     if (findCommand.getSingleBatch() && !ccc->isTailable()) {
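As a worked example of the deadline reservation introduced in cluster_find.cpp above: 10% of the maxTimeMS budget, capped at 100,000 microseconds, is subtracted from the operation deadline so the router keeps enough headroom to assemble already-buffered partial results instead of disabling interrupts after the fact. The sketch below is a standalone illustration, not MongoDB source; partialResultsReserve and the Microseconds alias are hypothetical names used only for this example.

// Standalone illustration (hypothetical helper, not MongoDB source): the reservation applied
// to the opCtx deadline when allowPartialResults and maxTimeMS are both set on a find.
#include <algorithm>
#include <chrono>
#include <cstdint>
#include <iostream>

using Microseconds = std::chrono::microseconds;

// Mirrors `std::min(1000 * maxTimeMS / 10, 100000)` from the diff: 10% of the budget,
// converted to microseconds and capped at 100,000us (100ms).
Microseconds partialResultsReserve(std::int64_t maxTimeMS) {
    return Microseconds{std::min<std::int64_t>(1000 * maxTimeMS / 10, 100000)};
}

int main() {
    // maxTimeMS = 500ms  -> reserves 50,000us (10% of the budget).
    // maxTimeMS = 5000ms -> reserves 100,000us (the cap applies).
    for (std::int64_t maxTimeMS : {500, 5000}) {
        std::cout << "maxTimeMS=" << maxTimeMS
                  << "ms, reserved=" << partialResultsReserve(maxTimeMS).count() << "us\n";
    }
    return 0;
}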