SERVER-7271 Do not exit if a transient config server error aborts a migration.
Alberto Lerner committed Apr 1, 2013
1 parent d2febea commit 356f8a7
Showing 5 changed files with 123 additions and 19 deletions.
49 changes: 47 additions & 2 deletions src/mongo/client/distlock.cpp
@@ -488,12 +488,57 @@ namespace mongo {
return true;
}


bool DistributedLock::isLockHeld( double timeout, string* errMsg ) {
ScopedDbConnection conn(_conn.toString(), timeout );

BSONObj lockObj;
try {
lockObj = conn->findOne( LocksType::ConfigNS,
BSON( LocksType::name(_name) ) ).getOwned();
}
catch ( DBException& e ) {
*errMsg = str::stream() << "error checking whether lock " << _name << " is held "
<< causedBy( e );
return false;
}
conn.done();

if ( lockObj.isEmpty() ) {
*errMsg = str::stream() << "could not assert if lock " << _name << " "
<< "was held because there was correspondant document in the "
<< "locks collection";
return false;
}

if ( lockObj[LocksType::state()].numberInt() < 2 ) {
*errMsg = str::stream() << "lock " << _name << " is not held because its current "
<< "state is " << lockObj[LocksType::state()].numberInt();
return false;
}

if ( lockObj[LocksType::process()].String() != _processId ) {
*errMsg = str::stream() << "lock " << _name << " is currently being held by "
<< " another process " << lockObj[LocksType::process()].String();
return false;
}

if ( distLockPinger.willUnlockOID( lockObj[LocksType::lockID()].OID() ) ) {
*errMsg = str::stream() << "lock " << _name << " is not held and is currently being "
<< "scheduled for lazy unlock by "
<< lockObj[LocksType::lockID()].OID();
return false;
}

return true;
}

// Semantics: if the lock cannot be acquired, this method returns false and the call
// may be retried. If the lock should not be tried again (some unexpected error),
// a LockException is thrown. If we are only trying to re-enter a currently held lock,
// 'reenter' should be true.
// Note: 'reenter' doesn't actually make this lock re-entrant in the normal sense, since
// it can still only be unlocked once; instead it is used to verify that the lock is
// already held.
bool DistributedLock::lock_try( const string& why , bool reenter, BSONObj * other ) {
bool DistributedLock::lock_try( const string& why , bool reenter, BSONObj * other, double timeout ) {

// TODO: Start pinging only when we actually get the lock?
// If we don't have a thread pinger, make sure we shouldn't have one
@@ -515,7 +560,7 @@ namespace mongo {
if ( other == NULL )
other = &dummyOther;

ScopedDbConnection conn(_conn.toString());
ScopedDbConnection conn(_conn.toString(), timeout );

BSONObjBuilder queryBuilder;
queryBuilder.append( LocksType::name() , _name );
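For context, a minimal caller sketch (hypothetical; not part of this commit) of the lock_try semantics described in the comment above: a plain false return means the lock is busy and the call may be retried, a LockException means it should not be, and the new timeout parameter bounds the wait on the config servers.

    // Hypothetical helper, not part of this commit: retry lock_try on a plain
    // "false" (lock busy), treat LockException as fatal, and bound config
    // server waits with the new timeout parameter.
    bool acquireWithRetry( DistributedLock& lock, const string& why, int maxAttempts ) {
        BSONObj other;
        for ( int attempt = 0; attempt < maxAttempts; attempt++ ) {
            try {
                if ( lock.lock_try( why, false /*reenter*/, &other, 30.0 /*timeout*/ ) )
                    return true;
            }
            catch ( LockException& e ) {
                // unexpected error: the lock should not be tried again
                warning() << "giving up on lock for " << why << causedBy( e ) << endl;
                return false;
            }
            sleepsecs( 1 ); // lock is busy elsewhere; back off before retrying
        }
        return false;
    }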
37 changes: 26 additions & 11 deletions src/mongo/client/distlock.h
@@ -123,7 +123,15 @@ namespace mongo {
* details if not
* @return true if it managed to grab the lock
*/
bool lock_try( const string& why , bool reenter = false, BSONObj * other = 0 );
bool lock_try( const string& why , bool reenter = false, BSONObj * other = 0, double timeout = 0.0 );

/**
* Returns true if we currently believe we hold this lock and we were able to
* confirm that with the config servers within 'timeout' seconds, if provided.
* Returns false if the lock is not held or if we failed to contact the config
* servers within the timeout.
*/
bool isLockHeld( double timeout, string* errMsg );

/**
* Releases a previously taken lock.
@@ -223,9 +231,9 @@ namespace mongo {
return *this;
}

dist_lock_try( DistributedLock * lock , const std::string& why )
dist_lock_try( DistributedLock * lock , const std::string& why, double timeout = 0.0 )
: _lock(lock), _why(why) {
_got = _lock->lock_try( why , false , &_other );
_got = _lock->lock_try( why , false , &_other, timeout );
}

~dist_lock_try() {
@@ -235,16 +243,23 @@ namespace mongo {
}
}

bool reestablish(){
return retry();
}
/**
* Returns false if the lock is known _not_ to be held; otherwise asks the
* underlying lock to issue an 'isLockHeld' call and returns whatever that
* call returns.
*/
bool isLockHeld( double timeout, string* errMsg) {
if ( !_lock ) {
*errMsg = "Lock is not currently set up";
return false;
}

bool retry() {
verify( _lock );
verify( _got );
verify( ! _other.isEmpty() );
if ( !_got ) {
*errMsg = str::stream() << "Lock " << _lock->_name << " is currently held by "
<< _other;
return false;
}

return _got = _lock->lock_try( _why , true, &_other );
return _lock->isLockHeld( timeout, errMsg );
}

bool got() const { return _got; }
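A hypothetical usage sketch of the RAII helper with the new pieces (the 'configLock' name is invented for illustration): acquire with a bounded config server wait, then re-confirm the lock is still held before entering a critical section, as the d_migrate.cpp hunk below does.

    // Hypothetical usage, not part of this commit.
    dist_lock_try dlk( &configLock, "example-operation", 30.0 /*timeout*/ );
    if ( !dlk.got() ) {
        return false; // another process holds the lock; retry later
    }
    string errMsg;
    if ( !dlk.isLockHeld( 30.0 /*timeout*/, &errMsg ) ) {
        warning() << "not entering critical section: " << errMsg << endl;
        return false; // configs unreachable or lock lost
    }
    // ... critical section; the lock is released when 'dlk' goes out of scope ...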
2 changes: 1 addition & 1 deletion src/mongo/client/syncclusterconnection.cpp
@@ -181,7 +181,7 @@ namespace mongo {
if ( lockType > 0 ) { // write $cmd
string errmsg;
if ( ! prepare( errmsg ) )
throw UserException( 13104 , (string)"SyncClusterConnection::findOne prepare failed: " + errmsg );
throw UserException( PrepareConfigsFailedCode , (string)"SyncClusterConnection::findOne prepare failed: " + errmsg );

vector<BSONObj> all;
for ( size_t i=0; i<_conns.size(); i++ ) {
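Replacing the magic number 13104 with the named PrepareConfigsFailedCode lets callers match on it. A hypothetical caller sketch (the 'conn' and 'query' names are invented):

    // Hypothetical caller, not part of this commit: distinguish "config servers
    // unreachable during prepare" from other failures by error code.
    try {
        BSONObj doc = conn->findOne( LocksType::ConfigNS, query );
    }
    catch ( DBException& e ) {
        if ( e.getCode() == PrepareConfigsFailedCode ) {
            // the command never reached any config server; safe to back out
        }
        else {
            throw; // some other failure; let it propagate
        }
    }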
46 changes: 44 additions & 2 deletions src/mongo/s/d_migrate.cpp
@@ -57,6 +57,7 @@
#include "mongo/s/d_logic.h"
#include "mongo/s/shard.h"
#include "mongo/s/type_chunk.h"
#include "mongo/util/assert_util.h"
#include "mongo/util/elapsed_tracker.h"
#include "mongo/util/processinfo.h"
#include "mongo/util/queue.h"
@@ -998,7 +999,7 @@ namespace mongo {
dist_lock_try dlk;

try{
dlk = dist_lock_try( &lockSetup , (string)"migrate-" + min.toString() );
dlk = dist_lock_try( &lockSetup , (string)"migrate-" + min.toString(), 30.0 /*timeout*/ );
}
catch( LockException& e ){
errmsg = str::stream() << "error locking distributed lock for migration " << "migrate-" << min.toString() << causedBy( e );
@@ -1208,6 +1209,22 @@ namespace mongo {
timing.done(4);

// 5.

// Before we get into the critical section of the migration, let's double check
// that the config servers are reachable and the lock is in place.
log() << "About to check if it is safe to enter critical section";

string lockHeldMsg;
bool lockHeld = dlk.isLockHeld( 30.0 /* timeout */, &lockHeldMsg );
if ( !lockHeld ) {
errmsg = str::stream() << "not entering migrate critical section because "
<< lockHeldMsg;
warning() << errmsg << endl;
return false;
}

log() << "About to enter migrate critical section";

{
// 5.a
// we're under the collection lock here, so no other migrate can change maxVersion or ShardChunkManager state
@@ -1378,6 +1395,7 @@ namespace mongo {
BSONObj cmd = cmdBuilder.obj();
LOG(7) << "moveChunk update: " << cmd << migrateLog;

int exceptionCode = OkCode;
bool ok = false;
BSONObj cmdResult;
try {
@@ -1388,12 +1406,36 @@
catch ( DBException& e ) {
warning() << e << migrateLog;
ok = false;
exceptionCode = e.getCode();
BSONObjBuilder b;
e.getInfo().append( b );
cmdResult = b.obj();
}

if ( ! ok ) {
if ( exceptionCode == PrepareConfigsFailedCode ) {

// In the process of issuing the migrate commit, the SyncClusterConnection
// checks that the config servers are reachable. If they are not, we are
// sure that the applyOps command was not sent to any of the configs, so we
// can safely back out of the migration here by resetting the shard
// version that we bumped up to in the donateChunk() call above.

log() << "About to acquire moveChunk global lock to reset shard version from "
<< "failed migration" << endl;

{
Lock::GlobalWrite lk;

// Revert the chunk manager back to the state before "forgetting"
// about the chunk.
shardingState.undoDonateChunk( ns , min , max , startingVersion );
}

log() << "Shard version successfully reset to clean up failed migration" << endl;
return false;

}
else if ( ! ok || exceptionCode != OkCode ) {

// this could be a connectivity blip: wait out a few seconds and check
// whether the commit request made it
8 changes: 5 additions & 3 deletions src/mongo/util/assert_util.h
@@ -28,9 +28,11 @@
namespace mongo {

enum CommonErrorCodes {
DatabaseDifferCaseCode = 13297 ,
SendStaleConfigCode = 13388 ,
RecvStaleConfigCode = 9996
OkCode = 0,
DatabaseDifferCaseCode = 13297 , // uassert( 13297 )
SendStaleConfigCode = 13388 , // uassert( 13388 )
RecvStaleConfigCode = 9996, // uassert( 9996 )
PrepareConfigsFailedCode = 13104 // uassert( 13104 )
};

class AssertionCount {
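The trailing // uassert( NNNN ) comments keep the numeric codes greppable. A hypothetical sketch of how a named code is raised (the 'prepared' flag and 'errmsg' string are invented):

    // Hypothetical sketch, not part of this commit: raise a named code with
    // uassert; callers can match it via DBException::getCode().
    uassert( PrepareConfigsFailedCode,
             "SyncClusterConnection::findOne prepare failed: " + errmsg,
             prepared );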
