From 59d341f677f355939c6f4e8e9934ea1de700c1f7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marcos=20Jos=C3=A9=20Grillo=20Ramirez?=
Date: Mon, 17 Jan 2022 17:29:25 +0000
Subject: [PATCH] SERVER-61760 Stop migrations in sharded collections before
 executing a collMod command

---
 .../ddl_ops_reported_on_current_op_command.js | 21 ++++++
 .../sharding/libs/resharding_test_fixture.js  | 14 +++-
 .../sharding/resharding_disallow_writes.js    |  5 ++
 src/mongo/db/s/collmod_coordinator.cpp        | 17 ++++-
 .../db/s/collmod_coordinator_document.idl     |  4 ++
 src/mongo/db/s/sharding_ddl_util.cpp          | 68 ++++++++++++-------
 src/mongo/db/s/sharding_ddl_util.h            | 16 ++++-
 7 files changed, 114 insertions(+), 31 deletions(-)

diff --git a/jstests/sharding/ddl_ops_reported_on_current_op_command.js b/jstests/sharding/ddl_ops_reported_on_current_op_command.js
index e717246ef0583..ae73960b7761d 100644
--- a/jstests/sharding/ddl_ops_reported_on_current_op_command.js
+++ b/jstests/sharding/ddl_ops_reported_on_current_op_command.js
@@ -116,6 +116,27 @@ if (jsTestOptions().useRandomBinVersionsWithinReplicaSet || jsTestOptions().shar
         assert(currOp[0].command.hasOwnProperty('allowMigrations'));
         assert.eq(true, currOp[0].command.allowMigrations);
     }
+
+    {
+        jsTestLog('Check collmod shows in current op');
+
+        let ddlOpThread = new Thread((mongosConnString, db, coll, nss) => {
+            let mongos = new Mongo(mongosConnString);
+            mongos.getCollection(nss).runCommand(
+                {createIndexes: coll, indexes: [{key: {c: 1}, name: "c_1"}]});
+            mongos.getDB(db).runCommand({collMod: coll, validator: {}});
+        }, st.s0.host, kDbName, kCollectionName, nss);
+
+        let currOp = getCurrentOpOfDDL(ddlOpThread, 'CollModCoordinator');
+
+        // There must be one operation running with the appropriate ns.
+        assert.eq(1, currOp.length);
+        assert.eq(nss, currOp[0].ns);
+        assert(currOp[0].hasOwnProperty('command'));
+        jsTestLog(tojson(currOp[0].command));
+        assert(currOp[0].command.hasOwnProperty('validator'));
+        assert.docEq({}, currOp[0].command.validator);
+    }
 }
 
 {
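Note on the hunk above: it leans on the file's existing getCurrentOpOfDDL helper. The snippet below is a rough sketch of the pattern that helper implements, not its actual code; in particular, matching the coordinator through the `desc` field of $currentOp is an assumption.

    // Run the DDL from a parallel thread, then poll $currentOp until the
    // named coordinator appears. `st` is the test's ShardingTest fixture.
    ddlOpThread.start();
    let currOp;
    assert.soon(() => {
        currOp = st.s0.getDB('admin')
                     .aggregate([
                         {$currentOp: {allUsers: true, localOps: false}},
                         {$match: {desc: 'CollModCoordinator'}}
                     ])
                     .toArray();
        return currOp.length > 0;
    });
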
diff --git a/jstests/sharding/libs/resharding_test_fixture.js b/jstests/sharding/libs/resharding_test_fixture.js
index d04edd9e5fea4..099917d3bb0b1 100644
--- a/jstests/sharding/libs/resharding_test_fixture.js
+++ b/jstests/sharding/libs/resharding_test_fixture.js
@@ -377,13 +377,18 @@
      * @param postDecisionPersistedFn - a function for evaluating additional assertions after
      * the decision has been persisted, but before the resharding operation finishes and returns
      * to the client.
+     *
+     * @param afterReshardingFn - a function that will be called after the resharding operation
+     * finishes but before checking the post-resharding state. By the time afterReshardingFn
+     * is called, the temporary resharding collection will either have been dropped or renamed.
      */
    withReshardingInBackground({newShardKeyPattern, newChunks},
                               duringReshardingFn = (tempNs) => {},
                               {
                                   expectedErrorCode = ErrorCodes.OK,
                                   postCheckConsistencyFn = (tempNs) => {},
-                                  postDecisionPersistedFn = () => {}
+                                  postDecisionPersistedFn = () => {},
+                                  afterReshardingFn = () => {}
                               } = {}) {
        this._startReshardingInBackgroundAndAllowCommandFailure({newShardKeyPattern, newChunks},
                                                                expectedErrorCode);
@@ -396,7 +401,8 @@
        this._callFunctionSafely(() => duringReshardingFn(this._tempNs));
        this._checkConsistencyAndPostState(expectedErrorCode,
                                           () => postCheckConsistencyFn(this._tempNs),
-                                          () => postDecisionPersistedFn());
+                                          () => postDecisionPersistedFn(),
+                                          () => afterReshardingFn());
    }

    /** @private */
@@ -500,7 +506,8 @@
    /** @private */
    _checkConsistencyAndPostState(expectedErrorCode,
                                  postCheckConsistencyFn = () => {},
-                                 postDecisionPersistedFn = () => {}) {
+                                 postDecisionPersistedFn = () => {},
+                                 afterReshardingFn = () => {}) {
        let performCorrectnessChecks = true;
        if (expectedErrorCode === ErrorCodes.OK) {
            this._callFunctionSafely(() => {
@@ -566,6 +573,7 @@
            expectedErrorCode: expectedErrorCode
        });

+        afterReshardingFn();
        this._checkPostState(expectedErrorCode);
    }

diff --git a/jstests/sharding/resharding_disallow_writes.js b/jstests/sharding/resharding_disallow_writes.js
index a403a58a99a73..b7a4aaa124d5b 100644
--- a/jstests/sharding/resharding_disallow_writes.js
+++ b/jstests/sharding/resharding_disallow_writes.js
@@ -71,6 +71,7 @@ reshardingTest.withReshardingInBackground(
        assert(ErrorCodes.isExceededTimeLimitError(res.code));

        jsTestLog("Attempting collMod");
+        assert.commandFailedWithCode(
            // The collMod is serialized with the resharding command, so we explicitly add a
            // timeout to the command so that it doesn't get blocked and time out the test.
            sourceCollection.runCommand("collMod", {maxTimeMS: 3000}),
@@ -83,6 +84,10 @@ reshardingTest.withReshardingInBackground(
            ErrorCodes.ReshardCollectionInProgress);

        jsTestLog("Completed operations");
+    },
+    afterReshardingFn: () => {
+        jsTestLog("Join possible ongoing collMod command");
+        assert.commandWorked(sourceCollection.runCommand("collMod"));
    }
});
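Taken together, the two test changes above use the new hook like this (a condensed sketch of resharding_disallow_writes.js; the fixture setup, the chunk layout in `chunks`, and the `sourceCollection` handle are elided assumptions from that test):

    reshardingTest.withReshardingInBackground(
        {newShardKeyPattern: {newKey: 1}, newChunks: chunks},
        () => {},
        {
            postCheckConsistencyFn: (tempNs) => {
                // Issue the conflicting collMod here; it is expected to fail
                // (or time out) because resharding is still in progress.
            },
            afterReshardingFn: () => {
                // Runs once resharding has committed or aborted, but before
                // the fixture's post-state checks: the place to drain a
                // collMod that may still be queued behind the resharding op.
                assert.commandWorked(sourceCollection.runCommand("collMod"));
            }
        });
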
diff --git a/src/mongo/db/s/collmod_coordinator.cpp b/src/mongo/db/s/collmod_coordinator.cpp
index 9c13e2756a50e..1b379b0d4cf5a 100644
--- a/src/mongo/db/s/collmod_coordinator.cpp
+++ b/src/mongo/db/s/collmod_coordinator.cpp
@@ -167,10 +167,15 @@ ExecutorFuture<void> CollModCoordinator::_runImpl(
                            << "Cannot update granularity of a sharded time-series collection.",
                        !hasTimeSeriesGranularityUpdate(_doc.getCollModRequest()));
            }
+            _doc.setCollUUID(
+                sharding_ddl_util::getCollectionUUID(opCtx, nss(), true /* allowViews */));

-            if (_recoveredFromDisk) {
+            sharding_ddl_util::stopMigrations(opCtx, nss(), _doc.getCollUUID());
+
+            if (!_firstExecution) {
                _performNoopRetryableWriteOnParticipants(opCtx, **executor);
            }
+
            _doc = _updateSession(opCtx, _doc);
            const OperationSessionInfo osi = getCurrentSession(_doc);

@@ -201,6 +206,7 @@
                        CommandHelpers::appendSimpleCommandStatus(builder, ok, errmsg);
                    }
                    _result = builder.obj();
+                    sharding_ddl_util::resumeMigrations(opCtx, nss(), _doc.getCollUUID());
                } else {
                    CollMod cmd(nss());
                    cmd.setCollModRequest(_doc.getCollModRequest());
@@ -228,6 +234,15 @@
                        "Error running collMod",
                        "namespace"_attr = nss(),
                        "error"_attr = redact(status));
+            // If the collection UUID is set, the error happened on a sharded collection, so we
+            // should resume migrations before surfacing the error.
+            if (_doc.getCollUUID()) {
+                auto opCtxHolder = cc().makeOperationContext();
+                auto* opCtx = opCtxHolder.get();
+                getForwardableOpMetadata().setOn(opCtx);
+
+                sharding_ddl_util::resumeMigrations(opCtx, nss(), _doc.getCollUUID());
+            }
            }
            return status;
        });
diff --git a/src/mongo/db/s/collmod_coordinator_document.idl b/src/mongo/db/s/collmod_coordinator_document.idl
index 8ff37dc630872..9c0aaacbf98b7 100644
--- a/src/mongo/db/s/collmod_coordinator_document.idl
+++ b/src/mongo/db/s/collmod_coordinator_document.idl
@@ -61,3 +61,7 @@ structs:
            collModRequest:
                type: CollModRequest
                description: "Initial collMod request."
+            collUUID:
+                type: uuid
+                description: "Collection UUID."
+                optional: true
diff --git a/src/mongo/db/s/sharding_ddl_util.cpp b/src/mongo/db/s/sharding_ddl_util.cpp
index 2d4dc68a02b36..4d6eb5e80163a 100644
--- a/src/mongo/db/s/sharding_ddl_util.cpp
+++ b/src/mongo/db/s/sharding_ddl_util.cpp
@@ -146,6 +146,35 @@ write_ops::UpdateCommandRequest buildNoopWriteRequestCommand() {
     return updateOp;
 }

+void setAllowMigrations(OperationContext* opCtx,
+                        const NamespaceString& nss,
+                        const boost::optional<UUID>& expectedCollectionUUID,
+                        bool allowMigrations) {
+    ConfigsvrSetAllowMigrations configsvrSetAllowMigrationsCmd(nss, allowMigrations);
+    configsvrSetAllowMigrationsCmd.setCollectionUUID(expectedCollectionUUID);
+
+    const auto swSetAllowMigrationsResult =
+        Grid::get(opCtx)->shardRegistry()->getConfigShard()->runCommandWithFixedRetryAttempts(
+            opCtx,
+            ReadPreferenceSetting{ReadPreference::PrimaryOnly},
+            NamespaceString::kAdminDb.toString(),
+            CommandHelpers::appendMajorityWriteConcern(configsvrSetAllowMigrationsCmd.toBSON({})),
+            Shard::RetryPolicy::kIdempotent  // Although ConfigsvrSetAllowMigrations is not really
+                                             // idempotent (because it will cause the collection
+                                             // version to be bumped), it is safe to be retried.
+        );
+
+    try {
+        uassertStatusOKWithContext(
+            Shard::CommandResponse::getEffectiveStatus(std::move(swSetAllowMigrationsResult)),
+            str::stream() << "Error setting allowMigrations to " << allowMigrations
+                          << " for collection " << nss.toString());
+    } catch (const ExceptionFor<ErrorCodes::NamespaceNotSharded>&) {
+        // Collection no longer exists
+    } catch (const ExceptionFor<ErrorCodes::ConflictingOperationInProgress>&) {
+        // Collection metadata was concurrently dropped
+    }
+}
+
 }  // namespace

 void linearizeCSRSReads(OperationContext* opCtx) {
@@ -404,34 +433,23 @@ boost::optional<CreateCollectionResponse> checkIfCollectionAlreadySharded(
 void stopMigrations(OperationContext* opCtx,
                     const NamespaceString& nss,
                     const boost::optional<UUID>& expectedCollectionUUID) {
-    ConfigsvrSetAllowMigrations configsvrSetAllowMigrationsCmd(nss, false /* allowMigrations */);
-    configsvrSetAllowMigrationsCmd.setCollectionUUID(expectedCollectionUUID);
-
-    const auto swSetAllowMigrationsResult =
-        Grid::get(opCtx)->shardRegistry()->getConfigShard()->runCommandWithFixedRetryAttempts(
-            opCtx,
-            ReadPreferenceSetting{ReadPreference::PrimaryOnly},
-            NamespaceString::kAdminDb.toString(),
-            CommandHelpers::appendMajorityWriteConcern(configsvrSetAllowMigrationsCmd.toBSON({})),
-            Shard::RetryPolicy::kIdempotent  // Although ConfigsvrSetAllowMigrations is not really
-                                             // idempotent (because it will cause the collection
-                                             // version to be bumped), it is safe to be retried.
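With the coordinator changes above, a collMod on a sharded collection now brackets its work with stopMigrations/resumeMigrations, and the error path resumes migrations only when the collection UUID was recorded. The effect is persisted through the config server and can be probed from the shell; the sketch below is illustrative only (the permitMigrations field name and the exact moveChunk failure mode are assumptions about how ConfigsvrSetAllowMigrations persists and enforces the flag):

    // While the collMod DDL is in flight, migrations for the namespace are
    // fenced off in the config server's collection metadata...
    const entry = st.s.getDB('config').collections.findOne({_id: 'db.coll'});
    jsTest.log(tojson(entry));  // expect something like permitMigrations: false
    // ...so a manual moveChunk attempted now should fail rather than race
    // with the collMod; once the coordinator finishes, migrations resume.
    assert.commandFailed(st.s.adminCommand(
        {moveChunk: 'db.coll', find: {x: 0}, to: st.shard1.shardName}));
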
-        );
+    setAllowMigrations(opCtx, nss, expectedCollectionUUID, false);
+}

-    try {
-        uassertStatusOKWithContext(
-            Shard::CommandResponse::getEffectiveStatus(std::move(swSetAllowMigrationsResult)),
-            str::stream() << "Error setting allowMigrations to false for collection "
-                          << nss.toString());
-    } catch (const ExceptionFor<ErrorCodes::NamespaceNotSharded>&) {
-        // Collection no longer exists
-    } catch (const ExceptionFor<ErrorCodes::ConflictingOperationInProgress>&) {
-        // Collection metadata was concurrently dropped
-    }
+void resumeMigrations(OperationContext* opCtx,
+                      const NamespaceString& nss,
+                      const boost::optional<UUID>& expectedCollectionUUID) {
+    setAllowMigrations(opCtx, nss, expectedCollectionUUID, true);
 }

-boost::optional<UUID> getCollectionUUID(OperationContext* opCtx, const NamespaceString& nss) {
-    AutoGetCollection autoColl(opCtx, nss, MODE_IS, AutoGetCollectionViewMode::kViewsForbidden);
+boost::optional<UUID> getCollectionUUID(OperationContext* opCtx,
+                                        const NamespaceString& nss,
+                                        bool allowViews) {
+    AutoGetCollection autoColl(opCtx,
+                               nss,
+                               MODE_IS,
+                               allowViews ? AutoGetCollectionViewMode::kViewsPermitted
+                                          : AutoGetCollectionViewMode::kViewsForbidden);
     return autoColl ? boost::make_optional(autoColl->uuid()) : boost::none;
 }

diff --git a/src/mongo/db/s/sharding_ddl_util.h b/src/mongo/db/s/sharding_ddl_util.h
index ec198cc7ad688..1cfba12be74c5 100644
--- a/src/mongo/db/s/sharding_ddl_util.h
+++ b/src/mongo/db/s/sharding_ddl_util.h
@@ -138,16 +138,28 @@ boost::optional<CreateCollectionResponse> checkIfCollectionAlreadySharded(
 /**
  * Stops ongoing migrations and prevents future ones from starting for the given nss.
  * If expectedCollectionUUID is set and doesn't match that of the collection, then this is a no-op.
+ * If expectedCollectionUUID is not set, no UUID check will be performed before stopping migrations.
  */
 void stopMigrations(OperationContext* opCtx,
                     const NamespaceString& nss,
                     const boost::optional<UUID>& expectedCollectionUUID);

+/**
+ * Resumes migrations and balancing rounds for the given nss.
+ * If expectedCollectionUUID is set and doesn't match that of the collection, then this is a no-op.
+ * If expectedCollectionUUID is not set, no UUID check will be performed before resuming migrations.
+ */
+void resumeMigrations(OperationContext* opCtx,
+                      const NamespaceString& nss,
+                      const boost::optional<UUID>& expectedCollectionUUID);
+
 /*
  * Returns the UUID of the collection (if it exists) using the catalog. It does not provide any locking
- *guarantees.
+ * guarantees after the call.
 **/
-boost::optional<UUID> getCollectionUUID(OperationContext* opCtx, const NamespaceString& nss);
+boost::optional<UUID> getCollectionUUID(OperationContext* opCtx,
+                                        const NamespaceString& nss,
+                                        bool allowViews = false);

 /*
  * Performs a noop retryable write on the given shards using the session and txNumber specified in