Skip to content

Commit

Permalink
Slave should always switch to SLAVE for the most recently selected ma…
Browse files Browse the repository at this point in the history
…ster

When a slave get a masterIsAvailable message, it will try to go to SLAVE for the selected master. However if during its state change from TO_SLAVE to SLAVE, the slave get a new masterIsAvailable message from another master (the first master fails and another master is selected), the slave should reset the master id and try to switch to SLAVE for the newly selected master.

The state change from TO_SLAVE to SLAVE is done by a scheduled executor in a separate thread. So whenever we receive masterIsAvailable message, we should always make sure that the executor could see this master change.

This pr fix a rare bug where a slave fails to come online because it enters an infinite loop to switch to SLAVE for a wrong/stale master. The bug was seen if the messages came to a slave with the following order:
Got coordinator(1)       | SLAVE    -> SLAVE    | electedMasterId = 1
Got coordinator(2)       | SLAVE    -> PENDING  | electedMasterId = 2
Got masterIsAvailable(1) | PENDING  -> TO_SLAVE | availableMasterId = 1 and switchToSlave
Got masterIsAvailable(2) | TO_SLAVE -> TO_SLAVE | availableMasterId = 2
  • Loading branch information
Zhen committed Feb 24, 2015
1 parent f4320df commit 99469b3
Show file tree
Hide file tree
Showing 2 changed files with 114 additions and 5 deletions.
Expand Up @@ -316,17 +316,16 @@ private void switchToSlave() throws ExecutionException, InterruptedException
{
// Do this with a scheduler, so that if it fails, it can retry later with an exponential backoff with max
// wait time.
final URI masterUri = availableMasterId;
/*
* This is purely defensive and should never trigger. There was a race where the switch to slave task would
* start after this instance was elected master and the task would constantly try to change as slave
* for itself, never cancelling. This now should not be possible, since we cancel the task and wait for it
* to complete, all in a single thread executor. However, this is a check worth doing because if this
* condition slips through via some other code path it can cause trouble.
*/
if ( getServerId( masterUri ).equals( getServerId( me ) ) )
if ( getServerId( availableMasterId ).equals( getServerId( me ) ) )
{
msgLog.error( "I (" + me + ") tried to switch to slave for myself as master (" + masterUri + ")" );
msgLog.error( "I (" + me + ") tried to switch to slave for myself as master (" + availableMasterId + ")" );
return;
}
final AtomicLong wait = new AtomicLong();
Expand All @@ -346,7 +345,9 @@ public void run()
haCommunicationLife.shutdown();
haCommunicationLife = new LifeSupport();

URI resultingSlaveHaURI = switchToSlave.switchToSlave( haCommunicationLife, me, masterUri, cancellationHandle );
// it is important for availableMasterId to be re-read on every attempt so that
// slave switching would not result in an infinite loop with wrong/stale availableMasterId
URI resultingSlaveHaURI = switchToSlave.switchToSlave( haCommunicationLife, me, availableMasterId, cancellationHandle );
if ( resultingSlaveHaURI == null )
{
/*
Expand Down
Expand Up @@ -20,8 +20,11 @@
package org.neo4j.kernel.ha.cluster;

import static org.mockito.Matchers.any;
import static org.mockito.Matchers.anyLong;
import static org.mockito.Matchers.anyString;
import static org.mockito.Matchers.eq;
import static org.mockito.Mockito.doReturn;
import static org.mockito.Mockito.doThrow;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.times;
import static org.mockito.Mockito.verify;
Expand All @@ -30,22 +33,30 @@
import static org.mockito.Mockito.reset;
import static org.mockito.Mockito.inOrder;
import static org.mockito.Mockito.doAnswer;

import static org.neo4j.kernel.ha.cluster.HighAvailabilityMemberState.PENDING;
import static org.neo4j.kernel.ha.cluster.HighAvailabilityMemberState.TO_SLAVE;

import java.net.URI;
import java.util.concurrent.Callable;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;

import org.junit.Test;
import org.mockito.invocation.InvocationOnMock;
import org.mockito.stubbing.Answer;
import org.mockito.InOrder;

import org.neo4j.cluster.InstanceId;
import org.neo4j.cluster.member.ClusterMemberAvailability;
import org.neo4j.cluster.protocol.election.Election;
import org.neo4j.com.ComException;
import org.neo4j.helpers.CancellationRequest;
import org.neo4j.kernel.impl.util.StringLogger;
import org.neo4j.kernel.lifecycle.LifeSupport;
Expand Down Expand Up @@ -150,7 +161,7 @@ public void shouldNotBroadcastIfMasterAndReceivesSlaveIsAvailable() throws Excep
}

@Test
public void shouldReswitchToSlaveIfNewMasterBecameAvailableDuringSwitch() throws Throwable
public void shouldReswitchToSlaveIfNewMasterBecameElectedAndAvailableDuringSwitch() throws Throwable
{
// Given
final CountDownLatch switching = new CountDownLatch( 1 );
Expand Down Expand Up @@ -211,6 +222,103 @@ public URI answer( InvocationOnMock invocationOnMock ) throws Throwable
slaveAvailable.await();
}

@Test
public void shouldRecognizeNewMasterIfNewMasterBecameAvailableDuringSwitch() throws Throwable
{
// When messages coming in the following ordering, the slave should detect that the master id has changed
// M1: Get masterIsAvailable for instance 1 at PENDING state, changing PENDING -> TO_SLAVE
// M2: Get masterIsAvailable for instance 2 at TO_SLAVE state, changing TO_SLAVE -> TO_SLAVE

System.gc();
// Given
final CountDownLatch firstMasterAvailableHandled = new CountDownLatch( 1 );
final CountDownLatch secondMasterAvailableComes = new CountDownLatch( 1 );
final CountDownLatch secondMasterAvailableHandled = new CountDownLatch( 1 );

SwitchToSlave switchToSlave = mock( SwitchToSlave.class );

HighAvailabilityModeSwitcher toTest = new HighAvailabilityModeSwitcher( switchToSlave,
mock( SwitchToMaster.class ), mock( Election.class ), mock( ClusterMemberAvailability.class ),
StringLogger.DEV_NULL )
{
@Override
ScheduledExecutorService createExecutor()
{
final ScheduledExecutorService executor = mock( ScheduledExecutorService.class );
final ExecutorService realExecutor = Executors.newSingleThreadExecutor();

when( executor.submit( any( Runnable.class ) ) ).thenAnswer( new Answer<Future<?>>()
{
@Override
public Future<?> answer( final InvocationOnMock invocation ) throws Throwable
{
return realExecutor.submit( new Runnable() {
@Override
public void run()
{
((Runnable) invocation.getArguments()[0]).run();
}
});
}
} );

when( executor.schedule( any( Runnable.class ), anyLong(), any( TimeUnit.class ) ) ).thenAnswer(
new Answer<Future<?>>()
{
@Override
public Future<?> answer( final InvocationOnMock invocation ) throws Throwable
{
realExecutor.submit( new Callable<Void>()
{
@Override
public Void call() throws Exception
{
firstMasterAvailableHandled.countDown();

// wait until the second masterIsAvailable comes and then call switchToSlave method
secondMasterAvailableComes.await();
((Runnable) invocation.getArguments()[0]).run();
secondMasterAvailableHandled.countDown();
return null;
};
} );
return mock( ScheduledFuture.class );
}
} );
return executor;
}
};
toTest.init();
toTest.start();
toTest.listeningAt( URI.create( "ha://server3?serverId=3" ) );

// When

// masterIsAvailable for instance 1
URI uri1 = URI.create( "ha://server1" );
// The first masterIsAvailable should fail so that the slave instance stops at TO_SLAVE state
doThrow( new ComException( "Fail to switch to slave and reschedule to retry" ) )
.when( switchToSlave )
.switchToSlave( any( LifeSupport.class ), any( URI.class ), eq( uri1 ), any( CancellationRequest.class ) );

toTest.masterIsAvailable( new HighAvailabilityMemberChangeEvent( PENDING, TO_SLAVE, new InstanceId( 1 ), uri1 ) );
firstMasterAvailableHandled.await(); // wait until the first masterIsAvailable triggers the exception handling process
verify( switchToSlave ).switchToSlave( any( LifeSupport.class ), any( URI.class ), eq( uri1 ),
any( CancellationRequest.class ) );


// masterIsAvailable for instance 2
URI uri2 = URI.create( "ha://server2" );
toTest.masterIsAvailable( new HighAvailabilityMemberChangeEvent( TO_SLAVE, TO_SLAVE, new InstanceId( 2 ), uri2 ) );
secondMasterAvailableComes.countDown();
secondMasterAvailableHandled.await(); // wait until switchToSlave method is invoked again

// Then
// switchToSlave should be retried with new master id
verify( switchToSlave ).switchToSlave( any( LifeSupport.class ), any( URI.class ), eq( uri2 ),
any( CancellationRequest.class ) );
}

@Test
public void shouldTakeNoActionIfSwitchingToSlaveForItselfAsMaster() throws Throwable
{
Expand Down

0 comments on commit 99469b3

Please sign in to comment.