Skip to content

Commit

Permalink
SERVER-29937 Make sure liveness timeouts cannot be missed
Browse files (browse the repository at this point in the history)
  • Loading branch information
judahschvimer committed Sep 15, 2017
1 parent 84cb3ec commit f1bf0b3
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 11 deletions.
28 changes: 17 additions & 11 deletions src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
Expand Up @@ -688,18 +688,24 @@ void ReplicationCoordinatorImpl::_scheduleNextLivenessUpdate_inlock() {
}

auto nextTimeout = earliestDate + _rsConfig.getElectionTimeoutPeriod();
if (nextTimeout > _replExecutor->now()) {
LOG(3) << "scheduling next check at " << nextTimeout;
auto cbh = _scheduleWorkAt(nextTimeout,
stdx::bind(&ReplicationCoordinatorImpl::_handleLivenessTimeout,
this,
stdx::placeholders::_1));
if (!cbh) {
return;
}
_handleLivenessTimeoutCbh = cbh;
_earliestMemberId = earliestMemberId;
LOG(3) << "scheduling next check at " << nextTimeout;

// It is possible we will schedule the next timeout in the past.
// ThreadPoolTaskExecutor::scheduleWorkAt() schedules its work immediately if it's given a
// time <= now().
// If we missed the timeout, it means that on our last check the earliest live member was
// just barely fresh and it has become stale since then. We must schedule another liveness
// check to continue conducting liveness checks and be able to step down from primary if we
// lose contact with a majority of nodes.
auto cbh = _scheduleWorkAt(nextTimeout,
stdx::bind(&ReplicationCoordinatorImpl::_handleLivenessTimeout,
this,
stdx::placeholders::_1));
if (!cbh) {
return;
}
_handleLivenessTimeoutCbh = cbh;
_earliestMemberId = earliestMemberId;
}

void ReplicationCoordinatorImpl::_cancelAndRescheduleLivenessUpdate_inlock(int updatedMemberId) {
Expand Down
2 changes: 2 additions & 0 deletions src/mongo/executor/task_executor.h
Expand Up @@ -200,6 +200,8 @@ class TaskExecutor {
/**
* Schedules "work" to be run by the executor no sooner than "when".
*
* If "when" is <= now(), then it schedules the "work" to be run ASAP.
*
* Returns a handle for waiting on or canceling the callback, or
* ErrorCodes::ShutdownInProgress.
*
Expand Down
8 changes: 8 additions & 0 deletions src/mongo/executor/task_executor_test_common.cpp
Expand Up @@ -330,14 +330,22 @@ COMMON_EXECUTOR_TEST(ScheduleWorkAt) {
Status status1 = getDetectableErrorStatus();
Status status2 = getDetectableErrorStatus();
Status status3 = getDetectableErrorStatus();
Status status4 = getDetectableErrorStatus();

const Date_t now = net->now();
const TaskExecutor::CallbackHandle cb1 = unittest::assertGet(executor.scheduleWorkAt(
now + Milliseconds(100), stdx::bind(setStatus, stdx::placeholders::_1, &status1)));
const TaskExecutor::CallbackHandle cb4 = unittest::assertGet(executor.scheduleWorkAt(
now - Milliseconds(50), stdx::bind(setStatus, stdx::placeholders::_1, &status4)));
unittest::assertGet(executor.scheduleWorkAt(
now + Milliseconds(5000), stdx::bind(setStatus, stdx::placeholders::_1, &status3)));
const TaskExecutor::CallbackHandle cb2 = unittest::assertGet(executor.scheduleWorkAt(
now + Milliseconds(200),
stdx::bind(setStatusAndShutdown, stdx::placeholders::_1, &status2)));

executor.wait(cb4);
ASSERT_OK(status4);

const Date_t startTime = net->now();
net->enterNetwork();
net->runUntil(startTime + Milliseconds(200));
Expand Down

0 comments on commit f1bf0b3

Please sign in to comment.