From 673bb299696b4afc16c2e2ebaa9a35600ec2969c Mon Sep 17 00:00:00 2001 From: Siyuan Zhou Date: Thu, 23 Apr 2015 16:21:37 -0400 Subject: [PATCH] SERVER-5218 Batch oplog writes always wait for journal. --- .../db/commands/cleanup_orphaned_cmd.cpp | 2 +- .../write_commands/batch_executor.cpp | 34 +------------------ .../commands/write_commands/batch_executor.h | 2 -- src/mongo/db/range_deleter.cpp | 2 +- src/mongo/db/repl/initial_sync.h | 2 +- src/mongo/db/repl/oplog.cpp | 6 ---- src/mongo/db/repl/replica_set_config.cpp | 4 +-- .../db/repl/replication_coordinator_impl.cpp | 4 +-- .../replication_coordinator_impl_test.cpp | 10 +++--- src/mongo/db/repl/sync_tail.cpp | 21 +++++------- src/mongo/db/repl/sync_tail.h | 7 ++-- src/mongo/db/write_concern.cpp | 19 ++++++++++- src/mongo/db/write_concern_options.cpp | 2 ++ src/mongo/db/write_concern_options.h | 2 ++ src/mongo/s/d_migrate.cpp | 2 +- 15 files changed, 46 insertions(+), 73 deletions(-) diff --git a/src/mongo/db/commands/cleanup_orphaned_cmd.cpp b/src/mongo/db/commands/cleanup_orphaned_cmd.cpp index 89f57103958c0..a6ff2b90a6d12 100644 --- a/src/mongo/db/commands/cleanup_orphaned_cmd.cpp +++ b/src/mongo/db/commands/cleanup_orphaned_cmd.cpp @@ -53,7 +53,7 @@ namespace { using mongo::WriteConcernOptions; const int kDefaultWTimeoutMs = 60 * 1000; - const WriteConcernOptions DefaultWriteConcern("majority", + const WriteConcernOptions DefaultWriteConcern(WriteConcernOptions::kMajority, WriteConcernOptions::NONE, kDefaultWTimeoutMs); } diff --git a/src/mongo/db/commands/write_commands/batch_executor.cpp b/src/mongo/db/commands/write_commands/batch_executor.cpp index ae57fdbecbb21..1c12d27227033 100644 --- a/src/mongo/db/commands/write_commands/batch_executor.cpp +++ b/src/mongo/db/commands/write_commands/batch_executor.cpp @@ -735,32 +735,13 @@ namespace mongo { std::vector* upsertedIds, std::vector* errors ) { - WriteConcernOptions originalWC = _txn->getWriteConcern(); - - // We adjust the write concern attached to the OperationContext to not wait for - // journal. Later, the code will restore the write concern to wait for journal on - // the last write of the batch. - if (request.sizeWriteOps() > 1 - && originalWC.syncMode == WriteConcernOptions::JOURNAL) - { - WriteConcernOptions writeConcern = originalWC; - writeConcern.syncMode = WriteConcernOptions::NONE; - _txn->setWriteConcern(writeConcern); - } - if ( request.getBatchType() == BatchedCommandRequest::BatchType_Insert ) { - execInserts( request, originalWC, errors ); + execInserts( request, errors ); } else if ( request.getBatchType() == BatchedCommandRequest::BatchType_Update ) { for ( size_t i = 0; i < request.sizeWriteOps(); i++ ) { if ( i + 1 == request.sizeWriteOps() ) { - // For the last write in the batch, restore the write concern back to the - // original provided one; this may set WriteConcernOptions::JOURNAL back - // to true. - _txn->setWriteConcern(originalWC); - // Use the original write concern to possibly await the commit of this write, - // in order to flush the journal as requested. setupSynchronousCommit( _txn ); } @@ -787,12 +768,6 @@ namespace mongo { for ( size_t i = 0; i < request.sizeWriteOps(); i++ ) { if ( i + 1 == request.sizeWriteOps() ) { - // For the last write in the batch, restore the write concern back to the - // original provided one; this may set WriteConcernOptions::JOURNAL back - // to true. - _txn->setWriteConcern(originalWC); - // Use the original write concern to possibly await the commit of this write, - // in order to flush the journal as requested. setupSynchronousCommit( _txn ); } @@ -837,7 +812,6 @@ namespace mongo { } void WriteBatchExecutor::execInserts( const BatchedCommandRequest& request, - const WriteConcernOptions& originalWC, std::vector* errors ) { // Theory of operation: @@ -882,12 +856,6 @@ namespace mongo { ++state.currIndex) { if (state.currIndex + 1 == state.request->sizeWriteOps()) { - // For the last write in the batch, restore the write concern back to the - // original provided one; this may set WriteConcernOptions::JOURNAL back - // to true. - _txn->setWriteConcern(originalWC); - // Use the original write concern to possibly await the commit of this write, - // in order to flush the journal as requested. setupSynchronousCommit(_txn); } diff --git a/src/mongo/db/commands/write_commands/batch_executor.h b/src/mongo/db/commands/write_commands/batch_executor.h index e5d55a5b9fb46..6216ae65c899e 100644 --- a/src/mongo/db/commands/write_commands/batch_executor.h +++ b/src/mongo/db/commands/write_commands/batch_executor.h @@ -33,7 +33,6 @@ #include "mongo/base/disallow_copying.h" #include "mongo/db/ops/update_request.h" -#include "mongo/db/write_concern_options.h" #include "mongo/s/write_ops/batched_command_request.h" #include "mongo/s/write_ops/batched_command_response.h" #include "mongo/s/write_ops/batched_delete_document.h" @@ -95,7 +94,6 @@ namespace mongo { * times. */ void execInserts( const BatchedCommandRequest& request, - const WriteConcernOptions& originalWC, std::vector* errors ); /** diff --git a/src/mongo/db/range_deleter.cpp b/src/mongo/db/range_deleter.cpp index e754fb9fb0278..bde2e3ad36c04 100644 --- a/src/mongo/db/range_deleter.cpp +++ b/src/mongo/db/range_deleter.cpp @@ -269,7 +269,7 @@ namespace { const int kWTimeoutMillis = 60 * 60 * 1000; bool _waitForMajority(OperationContext* txn, std::string* errMsg) { - const WriteConcernOptions writeConcern("majority", + const WriteConcernOptions writeConcern(WriteConcernOptions::kMajority, WriteConcernOptions::NONE, kWTimeoutMillis); diff --git a/src/mongo/db/repl/initial_sync.h b/src/mongo/db/repl/initial_sync.h index 2cf41660d94aa..bf5800d4643b6 100644 --- a/src/mongo/db/repl/initial_sync.h +++ b/src/mongo/db/repl/initial_sync.h @@ -48,7 +48,7 @@ namespace repl { */ void oplogApplication(OperationContext* txn, const Timestamp& endOpTime); - // Initial sync will ignore all journal requirement flags and dones't await commit + // Initial sync will ignore all journal requirement flags and doesn't await commit // before updating last OpTime. virtual bool supportsAwaitingCommit() { return false; } }; diff --git a/src/mongo/db/repl/oplog.cpp b/src/mongo/db/repl/oplog.cpp index 7032cf1994d7f..15f51bee56d97 100644 --- a/src/mongo/db/repl/oplog.cpp +++ b/src/mongo/db/repl/oplog.cpp @@ -302,12 +302,6 @@ namespace { b.appendBool("fromMigrate", true); } - if (txn->getWriteConcern().shouldWaitForOtherNodes() - && txn->getWriteConcern().syncMode == WriteConcernOptions::JOURNAL) - { - b.appendBool("j", true); - } - if ( o2 ) { b.append("o2", *o2); } diff --git a/src/mongo/db/repl/replica_set_config.cpp b/src/mongo/db/repl/replica_set_config.cpp index 256ba259863ac..1e7c343db7e89 100644 --- a/src/mongo/db/repl/replica_set_config.cpp +++ b/src/mongo/db/repl/replica_set_config.cpp @@ -359,7 +359,7 @@ namespace { } } else { - if ("majority" != _defaultWriteConcern.wMode && + if (WriteConcernOptions::kMajority != _defaultWriteConcern.wMode && !findCustomWriteMode(_defaultWriteConcern.wMode).isOK()) { return Status(ErrorCodes::BadValue, str::stream() << "Default write concern requires undefined write mode " << @@ -377,7 +377,7 @@ namespace { Status ReplicaSetConfig::checkIfWriteConcernCanBeSatisfied( const WriteConcernOptions& writeConcern) const { - if (!writeConcern.wMode.empty() && writeConcern.wMode != "majority") { + if (!writeConcern.wMode.empty() && writeConcern.wMode != WriteConcernOptions::kMajority) { StatusWith tagPatternStatus = findCustomWriteMode(writeConcern.wMode); if (!tagPatternStatus.isOK()) { diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp index 34e8a3ebe265d..f8a4031317e18 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl.cpp @@ -859,7 +859,7 @@ namespace { if (!writeConcern.wMode.empty()) { StringData patternName; - if (writeConcern.wMode == "majority") { + if (writeConcern.wMode == WriteConcernOptions::kMajority) { patternName = ReplicaSetConfig::kMajorityWriteConcernModeName; } else { @@ -960,7 +960,7 @@ namespace { return StatusAndDuration(Status::OK(), Milliseconds(timer->millis())); } - if (replMode == modeMasterSlave && writeConcern.wMode == "majority") { + if (replMode == modeMasterSlave && writeConcern.wMode == WriteConcernOptions::kMajority) { // with master/slave, majority is equivalent to w=1 return StatusAndDuration(Status::OK(), Milliseconds(timer->millis())); } diff --git a/src/mongo/db/repl/replication_coordinator_impl_test.cpp b/src/mongo/db/repl/replication_coordinator_impl_test.cpp index d7fcffc6ed62c..6411cb8ea09b2 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_test.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_test.cpp @@ -432,7 +432,7 @@ namespace { writeConcern.wNumNodes = 0; - writeConcern.wMode = "majority"; + writeConcern.wMode = WriteConcernOptions::kMajority; // w:majority always works on master/slave ReplicationCoordinator::StatusAndDuration statusAndDur = getReplCoord()->awaitReplication( &txn, time, writeConcern); @@ -574,7 +574,7 @@ namespace { // Set up valid write concerns for the rest of the test WriteConcernOptions majorityWriteConcern; majorityWriteConcern.wTimeout = WriteConcernOptions::kNoWaiting; - majorityWriteConcern.wMode = "majority"; + majorityWriteConcern.wMode = WriteConcernOptions::kMajority; WriteConcernOptions multiDCWriteConcern; multiDCWriteConcern.wTimeout = WriteConcernOptions::kNoWaiting; @@ -1817,7 +1817,7 @@ namespace { // majority nodes waiting for time WriteConcernOptions writeConcern; writeConcern.wTimeout = WriteConcernOptions::kNoTimeout; - writeConcern.wMode = "majority"; + writeConcern.wMode = WriteConcernOptions::kMajority; ReplicationAwaiter awaiter(getReplCoord(), &txn); awaiter.setOpTime(time); @@ -1827,7 +1827,7 @@ namespace { // demonstrate that majority cannot currently be satisfied WriteConcernOptions writeConcern2; writeConcern2.wTimeout = WriteConcernOptions::kNoWaiting; - writeConcern2.wMode = "majority"; + writeConcern2.wMode = WriteConcernOptions::kMajority; ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, getReplCoord()->awaitReplication(&txn, time, writeConcern2).status); @@ -1884,7 +1884,7 @@ namespace { WriteConcernOptions majorityWriteConcern; majorityWriteConcern.wTimeout = WriteConcernOptions::kNoWaiting; - majorityWriteConcern.wMode = "majority"; + majorityWriteConcern.wMode = WriteConcernOptions::kMajority; ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, getReplCoord()->awaitReplication(&txn, time, majorityWriteConcern).status); diff --git a/src/mongo/db/repl/sync_tail.cpp b/src/mongo/db/repl/sync_tail.cpp index 5a997a32a093c..3c6dd40838eb6 100644 --- a/src/mongo/db/repl/sync_tail.cpp +++ b/src/mongo/db/repl/sync_tail.cpp @@ -53,6 +53,7 @@ #include "mongo/db/repl/minvalid.h" #include "mongo/db/repl/oplog.h" #include "mongo/db/repl/repl_client_info.h" +#include "mongo/db/repl/replica_set_config.h" #include "mongo/db/repl/replication_coordinator_global.h" #include "mongo/db/stats/timer_stats.h" #include "mongo/util/exit.h" @@ -276,9 +277,8 @@ namespace repl { } std::vector< std::vector > writerVectors(replWriterThreadCount); - bool mustAwaitCommit = false; - fillWriterVectors(ops, &writerVectors, &mustAwaitCommit); + fillWriterVectors(ops, &writerVectors); LOG(2) << "replication batch size is " << ops.size() << endl; // We must grab this because we're going to grab write locks later. // We hold this mutex the entire time we're writing; it doesn't matter @@ -302,12 +302,14 @@ namespace repl { return Timestamp(); } - if (supportsAwaitingCommit() && mustAwaitCommit) { + const bool mustAwaitCommit = replCoord->isV1ElectionProtocol() && supportsAwaitingCommit(); + if (mustAwaitCommit) { txn->recoveryUnit()->goingToAwaitCommit(); } + Timestamp lastOpTime = writeOpsToOplog(txn, ops); - // Wait for journal before setting last op time if any op in batch had j:true - if (supportsAwaitingCommit() && mustAwaitCommit) { + + if (mustAwaitCommit) { txn->recoveryUnit()->awaitCommit(); } ReplClientInfo::forClient(txn->getClient()).setLastOp(lastOpTime); @@ -320,8 +322,7 @@ namespace repl { } void SyncTail::fillWriterVectors(const std::deque& ops, - std::vector< std::vector >* writerVectors, - bool* mustAwaitCommit) { + std::vector< std::vector >* writerVectors) { for (std::deque::const_iterator it = ops.begin(); it != ops.end(); @@ -335,12 +336,6 @@ namespace repl { const char* opType = it->getField( "op" ).valuestrsafe(); - // Check if any entry needs journaling, and if so return the need - const bool foundJournal = it->getField("j").trueValue(); - if (foundJournal) { - *mustAwaitCommit = true; - } - if (getGlobalServiceContext()->getGlobalStorageEngine()->supportsDocLocking() && isCrudOpType(opType)) { BSONElement id; diff --git a/src/mongo/db/repl/sync_tail.h b/src/mongo/db/repl/sync_tail.h index b2bad4c48280a..8d0f79ced7c15 100644 --- a/src/mongo/db/repl/sync_tail.h +++ b/src/mongo/db/repl/sync_tail.h @@ -128,11 +128,8 @@ namespace repl { // Doles out all the work to the writer pool threads and waits for them to complete void applyOps(const std::vector< std::vector >& writerVectors); - // mustAwaitCommit is an out-parameter and indicates that at least one of the ops - // in 'ops' had j:true. - void fillWriterVectors(const std::deque& ops, - std::vector< std::vector >* writerVectors, - bool* mustAwaitCommit); + void fillWriterVectors(const std::deque& ops, + std::vector< std::vector >* writerVectors); void handleSlaveDelay(const BSONObj& op); // persistent pool of worker threads for writing ops to the databases diff --git a/src/mongo/db/write_concern.cpp b/src/mongo/db/write_concern.cpp index a16352b34fc8a..7bb8edd0fa2f5 100644 --- a/src/mongo/db/write_concern.cpp +++ b/src/mongo/db/write_concern.cpp @@ -62,6 +62,18 @@ namespace mongo { } } + namespace { + // The consensus protocol requires that w: majority implies j: true on all nodes. + void addJournalSyncForWMajority(WriteConcernOptions* writeConcern) { + if (repl::getGlobalReplicationCoordinator()->isV1ElectionProtocol() + && writeConcern->wMode == WriteConcernOptions::kMajority + && writeConcern->syncMode == WriteConcernOptions::NONE) + { + writeConcern->syncMode = WriteConcernOptions::JOURNAL; + } + } + } // namespace + StatusWith extractWriteConcern(const BSONObj& cmdObj) { // The default write concern if empty is w : 1 // Specifying w : 0 is/was allowed, but is interpreted identically to w : 1 @@ -70,6 +82,8 @@ namespace mongo { if (writeConcern.wNumNodes == 0 && writeConcern.wMode.empty()) { writeConcern.wNumNodes = 1; } + // Upgrade default write concern if necessary. + addJournalSyncForWMajority(&writeConcern); BSONElement writeConcernElement; Status wcStatus = bsonExtractTypedField(cmdObj, @@ -100,6 +114,9 @@ namespace mongo { return wcStatus; } + // Upgrade parsed write concern if necessary. + addJournalSyncForWMajority(&writeConcern); + return writeConcern; } @@ -130,7 +147,7 @@ namespace mongo { if ( replMode != repl::ReplicationCoordinator::modeReplSet && !writeConcern.wMode.empty() && - writeConcern.wMode != "majority" ) { + writeConcern.wMode != WriteConcernOptions::kMajority ) { return Status( ErrorCodes::BadValue, string( "cannot use non-majority 'w' mode " ) + writeConcern.wMode + " when a host is not a member of a replica set" ); diff --git a/src/mongo/db/write_concern_options.cpp b/src/mongo/db/write_concern_options.cpp index 50bf9cad04063..4b744fedd4586 100644 --- a/src/mongo/db/write_concern_options.cpp +++ b/src/mongo/db/write_concern_options.cpp @@ -39,6 +39,8 @@ namespace mongo { const BSONObj WriteConcernOptions::Acknowledged(BSON("w" << W_NORMAL)); const BSONObj WriteConcernOptions::Unacknowledged(BSON("w" << W_NONE)); + const char WriteConcernOptions::kMajority[] = "majority"; + static const BSONField mongosSecondaryThrottleField("_secondaryThrottle", true); static const BSONField secondaryThrottleField("secondaryThrottle", true); static const BSONField writeConcernField("writeConcern"); diff --git a/src/mongo/db/write_concern_options.h b/src/mongo/db/write_concern_options.h index 445b6fab1db34..6b3cee6d147de 100644 --- a/src/mongo/db/write_concern_options.h +++ b/src/mongo/db/write_concern_options.h @@ -46,6 +46,8 @@ namespace mongo { static const BSONObj Acknowledged; static const BSONObj Unacknowledged; + static const char kMajority[]; // = "majority" + WriteConcernOptions() { reset(); } WriteConcernOptions(int numNodes, diff --git a/src/mongo/s/d_migrate.cpp b/src/mongo/s/d_migrate.cpp index ae319e1006a50..481b03ecc79e6 100644 --- a/src/mongo/s/d_migrate.cpp +++ b/src/mongo/s/d_migrate.cpp @@ -2508,7 +2508,7 @@ namespace mongo { const WriteConcernOptions& writeConcern) { WriteConcernOptions majorityWriteConcern; majorityWriteConcern.wTimeout = -1; - majorityWriteConcern.wMode = "majority"; + majorityWriteConcern.wMode = WriteConcernOptions::kMajority; Status majorityStatus = repl::getGlobalReplicationCoordinator()->awaitReplication( txn, lastOpApplied, majorityWriteConcern).status;