Skip to content

Commit

Permalink
Improved deadlock resolution strategy for enterprise lock manager.
Browse files Browse the repository at this point in the history
When a deadlock is detected, it's rather likely that multiple participants
in the deadlock realize at the same time that they are deadlocking. Because
we want to minimize aborts, we have code that chooses which of the
deadlocking parties should bail out.

This code is very sensitive, because if there is ever a code path where both
parties think the other will abort, we would end up in a real deadlock,
rather than averting it.

Currently, the strategy has been to look at wait lists - the client with the
highest number of waiting clients is allowed to continue, while the other
client aborts. However, this strategy is sub-optimal, partially because
the data we have on number of waiting clients is unreliable, but mainly
because there was no good tie breaker if both clients have the sized wait
lists - if both clients have the same number of clients waiting for them,
both will abort.

This modifies this, while hopefully making the code a bit clearer:

 - There is now a tie breaker, if both clients are on the same "level"
   we abort based on client id, ensuring only one client aborts.
 - There is now two strategies, chosen with a feature toggle, to allow
   real-world trialing of the two prominent deadlock resolution strategies.
   The two strategies introduced are both based on counts of locks held,
   rather than counts of current wait lists:

   - ABORT_YOUNG, assumes that higher throughput can be achieved by letting
                  transactions with more locks held (presumed older) finish.
                  Young transactions with fewer locks abort.
   - ABORT_OLD, assumes old monolithic transactions are "holding up the
                line", and that higher throughput can be achieved by
                aborting transactions with more locks.

This sets a nice precedent for introducing transaction priorities, where we
could choose to abort transactions with lower priorities. Something to
consider for future improvements.
  • Loading branch information
jakewins committed Feb 17, 2016
1 parent 42e4ec8 commit c8fec4c
Show file tree
Hide file tree
Showing 8 changed files with 391 additions and 222 deletions.

This file was deleted.

Expand Up @@ -28,6 +28,7 @@
import org.junit.rules.ExpectedException; import org.junit.rules.ExpectedException;


import java.util.concurrent.Callable; import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future; import java.util.concurrent.Future;


import org.neo4j.graphdb.ConstraintViolationException; import org.neo4j.graphdb.ConstraintViolationException;
Expand All @@ -53,8 +54,10 @@
import static java.lang.System.currentTimeMillis; import static java.lang.System.currentTimeMillis;
import static java.util.concurrent.TimeUnit.MINUTES; import static java.util.concurrent.TimeUnit.MINUTES;
import static java.util.concurrent.TimeUnit.SECONDS; import static java.util.concurrent.TimeUnit.SECONDS;
import static org.hamcrest.core.IsInstanceOf.instanceOf;
import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertThat;
import static org.junit.Assert.fail; import static org.junit.Assert.fail;
import static org.neo4j.helpers.collection.IteratorUtil.first; import static org.neo4j.helpers.collection.IteratorUtil.first;
import static org.neo4j.kernel.impl.ha.ClusterManager.allSeesAllAsAvailable; import static org.neo4j.kernel.impl.ha.ClusterManager.allSeesAllAsAvailable;
Expand Down Expand Up @@ -286,21 +289,20 @@ private void deadlockDetectionBetween( HighlyAvailableGraphDatabase slave1, fina
tx1.acquireReadLock( commonNode ); tx1.acquireReadLock( commonNode );
thread2.execute( new AcquireReadLockOnReferenceNode( tx2, commonNode ) ); thread2.execute( new AcquireReadLockOnReferenceNode( tx2, commonNode ) );
// -- and one of them wanting (and awaiting) to upgrade its read lock to a write lock // -- and one of them wanting (and awaiting) to upgrade its read lock to a write lock
Future<Lock> writeLockFuture = thread2.executeDontWait( new AcquireWriteLock( tx2, new Callable<Node>(){ Future<Lock> writeLockFuture = thread2.executeDontWait( state -> {
@Override try( Transaction ignored = tx2 ) // Close transaction no matter what happens
public Node call() throws Exception
{ {
return commonNode; return tx2.acquireWriteLock( commonNode );
} }
} ) ); } );


for ( int i = 0; i < 10; i++ ) for ( int i = 0; i < 10; i++ )
{ {
thread2.waitUntilThreadState( Thread.State.TIMED_WAITING, Thread.State.WAITING ); thread2.waitUntilThreadState( Thread.State.TIMED_WAITING, Thread.State.WAITING );
Thread.sleep(2); Thread.sleep(2);
} }


try try( Transaction ignored = tx1 ) // Close transaction no matter what happens
{ {
// WHEN // WHEN
tx1.acquireWriteLock( commonNode ); tx1.acquireWriteLock( commonNode );
Expand All @@ -313,12 +315,12 @@ public Node call() throws Exception
{ {
// THEN -- deadlock should be avoided with this exception // THEN -- deadlock should be avoided with this exception
} }
finally catch( ExecutionException e )
{ {
tx1.close(); // OR -- the tx2 thread fails with executionexception, caused by deadlock on its end
assertThat( e.getCause(), instanceOf( DeadlockDetectedException.class ) );
} }


thread2.execute( new FinishTx( tx2, true ) );
thread2.close(); thread2.close();
} }


Expand Down
@@ -0,0 +1,145 @@
/*
* Copyright (c) 2002-2016 "Neo Technology,"
* Network Engine for Objects in Lund AB [http://neotechnology.com]
*
* This file is part of Neo4j.
*
* Neo4j is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.neo4j.kernel.impl.enterprise.lock.forseti;

import org.neo4j.unsafe.impl.internal.dragons.FeatureToggles;

public enum DeadlockStrategies implements ForsetiLockManager.DeadlockResolutionStrategy
{
/**
* When a deadlock occurs, the client with the fewest number of held locks is aborted. If both clients hold the same number of
* locks, the client with the lowest client id is aborted.
*
* This is one side of a long academic argument, where the other says to abort the one with the most locks held, since it's old and monolithic and holding up
* the line.
*/
ABORT_YOUNG
{
@Override
public boolean shouldAbort( ForsetiClient clientThatsAsking, ForsetiClient clientWereDeadlockedWith )
{
if( isSameClient(clientThatsAsking, clientWereDeadlockedWith ))
{
return true;
}

int ourCount = clientThatsAsking.lockCount();
int otherCount = clientWereDeadlockedWith.lockCount();
if( ourCount > otherCount )
{
// We hold more locks than the other client, we stay the course!
return false;
}
else if( otherCount > ourCount )
{
// Other client holds more locks than us, yield to her seniority
return true;
}
else
{
return clientThatsAsking.id() >= clientWereDeadlockedWith.id(); // >= to guard against bugs where a client thinks its deadlocked itself
}
}
},

/**
* When a deadlock occurs, the client with the highest number of held locks is aborted. If both clients hold the same number of
* locks, the client with the highest client id is aborted.
*/
ABORT_OLD
{
@Override
public boolean shouldAbort( ForsetiClient clientThatsAsking, ForsetiClient clientWereDeadlockedWith )
{
if( isSameClient(clientThatsAsking, clientWereDeadlockedWith ))
{
return true;
}

return !ABORT_YOUNG.shouldAbort( clientThatsAsking, clientWereDeadlockedWith );
}
},

/**
* When a deadlock occurs, the client that is blocking the lowest number of other clients aborts.
* If both clients have the same sized wait lists, the one with the lowest client id is aborted.
*/
ABORT_SHORT_WAIT_LIST
{
@Override
public boolean shouldAbort( ForsetiClient clientThatsAsking, ForsetiClient clientWereDeadlockedWith )
{
if( isSameClient(clientThatsAsking, clientWereDeadlockedWith ))
{
return true;
}

int ourCount = clientThatsAsking.waitListSize();
int otherCount = clientWereDeadlockedWith.waitListSize();
if( ourCount > otherCount )
{
// We have a larger wait list than the other client, we stay the course
return false;
}
else if( otherCount > ourCount )
{
// Other client has a longer wait list, we yield
return true;
}
else
{
return clientThatsAsking.id() > clientWereDeadlockedWith.id();
}
}
},

/**
* When a deadlock occurs, the client that is blocking the highest number of other clients aborts.
* If both clients have the same sized wait lists, the one with the highest client id is aborted.
*/
ABORT_LONG_WAIT_LIST
{
@Override
public boolean shouldAbort( ForsetiClient clientThatsAsking, ForsetiClient clientWereDeadlockedWith )
{
if( isSameClient(clientThatsAsking, clientWereDeadlockedWith ))
{
return true;
}
return !ABORT_SHORT_WAIT_LIST.shouldAbort( clientThatsAsking, clientWereDeadlockedWith );
}
}
;

@Override
public abstract boolean shouldAbort( ForsetiClient clientThatsAsking, ForsetiClient clientWereDeadlockedWith );

/**
* To aid in experimental testing of strategies on different real workloads, allow toggling which strategy to use.
*/
public static ForsetiLockManager.DeadlockResolutionStrategy DEFAULT = FeatureToggles.flag( DeadlockStrategies.class, "strategy", ABORT_YOUNG );

private static boolean isSameClient( ForsetiClient a, ForsetiClient b )
{
// This should never happen, but as a safety net, guard against bugs
// where a client thinks it's deadlocked with itself.
return a.id() == b.id();
}
}
Expand Up @@ -37,15 +37,9 @@ public void copyHolderWaitListsInto( SimpleBitSet waitList )
} }


@Override @Override
public int holderWaitListSize() public int detectDeadlock( int client )
{ {
return owner.waitListSize(); return owner.isWaitingFor( client ) ? owner.id() : -1;
}

@Override
public boolean anyHolderIsWaitingFor( int client )
{
return owner.isWaitingFor( client );
} }


@Override @Override
Expand Down

0 comments on commit c8fec4c

Please sign in to comment.