Skip to content

Commit

Permalink
Reduce the probability of getting false positive deadlocks from Forseti
Browse files Browse the repository at this point in the history
Note that false positives are an inherent risk with the Dreadlocks algorithm.
With this change, we still use Dreadlocks for the initial deadlock test, but if
one is found we follow up with a more thorough search of the waiter/lock-holder
dependency graph. Unfortunately, this search is _also_ probabilistic and may
find false positives. However, we only act upon the deadlock result if we have
made at least 10 attempts at grabbing the lock.

The two different deadlock tests, combined requiring 10 consequitive positive
deadlock results, means that the probability of a false positive making it all
the way through, is now considerably lower than it was before.
  • Loading branch information
chrisvest committed Jun 2, 2017
1 parent 479e58d commit 9d58bb2
Show file tree
Hide file tree
Showing 5 changed files with 514 additions and 16 deletions.
Expand Up @@ -19,6 +19,8 @@
*/ */
package org.neo4j.kernel.impl.enterprise.lock.forseti; package org.neo4j.kernel.impl.enterprise.lock.forseti;


import java.util.Set;

import org.neo4j.kernel.impl.util.collection.SimpleBitSet; import org.neo4j.kernel.impl.util.collection.SimpleBitSet;


class ExclusiveLock implements ForsetiLockManager.Lock class ExclusiveLock implements ForsetiLockManager.Lock
Expand Down Expand Up @@ -48,6 +50,12 @@ public String describeWaitList()
return "ExclusiveLock[" + owner.describeWaitList() + "]"; return "ExclusiveLock[" + owner.describeWaitList() + "]";
} }


@Override
public void collectOwners( Set<ForsetiClient> owners )
{
owners.add( owner );
}

@Override @Override
public String toString() public String toString()
{ {
Expand Down
Expand Up @@ -19,6 +19,8 @@
*/ */
package org.neo4j.kernel.impl.enterprise.lock.forseti; package org.neo4j.kernel.impl.enterprise.lock.forseti;


import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.ConcurrentMap; import java.util.concurrent.ConcurrentMap;
import java.util.function.IntFunction; import java.util.function.IntFunction;


Expand Down Expand Up @@ -103,6 +105,12 @@ public class ForsetiClient implements Locks.Client


private volatile boolean hasLocks; private volatile boolean hasLocks;


/**
* When we *wait* for a specific lock to be released to us, we assign it to this field. This helps us during the
* secondary deadlock verification process, where we traverse the waiter/lock-owner dependency graph.
*/
private volatile ForsetiLockManager.Lock waitingForLock;

public ForsetiClient( int id, public ForsetiClient( int id,
ConcurrentMap<Long,ForsetiLockManager.Lock>[] lockMaps, ConcurrentMap<Long,ForsetiLockManager.Lock>[] lockMaps,
WaitStrategy<AcquireLockTimeoutException>[] waitStrategies, WaitStrategy<AcquireLockTimeoutException>[] waitStrategies,
Expand Down Expand Up @@ -223,10 +231,8 @@ else if ( existingLock instanceof ExclusiveLock )
throw new UnsupportedOperationException( "Unknown lock type: " + existingLock ); throw new UnsupportedOperationException( "Unknown lock type: " + existingLock );
} }


applyWaitStrategy( resourceType, tries++ );

// And take note of who we are waiting for. This is used for deadlock detection. // And take note of who we are waiting for. This is used for deadlock detection.
markAsWaitingFor( existingLock, resourceType, resourceId ); waitFor( existingLock, resourceType, resourceId, tries++ );
} }


// Got the lock, no longer waiting for anyone. // Got the lock, no longer waiting for anyone.
Expand Down Expand Up @@ -283,8 +289,7 @@ public void acquireExclusive( ResourceType resourceType, long... resourceIds ) t
} }
} }


applyWaitStrategy( resourceType, tries++ ); waitFor( existingLock, resourceType, resourceId, tries++ );
markAsWaitingFor( existingLock, resourceType, resourceId );
} }


clearWaitList(); clearWaitList();
Expand Down Expand Up @@ -598,13 +603,10 @@ public int waitListSize()
public void copyWaitListTo( SimpleBitSet other ) public void copyWaitListTo( SimpleBitSet other )
{ {
other.put( waitList ); other.put( waitList );
// TODO It might make sense to somehow put a StoreLoad barrier here,
// TODO to expedite the observation of the updated waitList in other clients.
} }


public boolean isWaitingFor( int clientId ) public boolean isWaitingFor( int clientId )
{ {
// TODO Similarly to the above, make reading the waitList a volatile load.
return clientId != this.clientId && waitList.contains( clientId ); return clientId != this.clientId && waitList.contains( clientId );
} }


Expand Down Expand Up @@ -730,8 +732,7 @@ private boolean tryUpgradeToExclusiveWithShareLockHeld(
// Now we just wait for all clients to release the the share lock // Now we just wait for all clients to release the the share lock
while ( sharedLock.numberOfHolders() > 1 ) while ( sharedLock.numberOfHolders() > 1 )
{ {
applyWaitStrategy( resourceType, tries++ ); waitFor( sharedLock, resourceType, resourceId, tries++ );
markAsWaitingFor( sharedLock, resourceType, resourceId );
} }


return true; return true;
Expand Down Expand Up @@ -770,10 +771,12 @@ private void clearWaitList()
waitList.put( clientId ); waitList.put( clientId );
} }


private void markAsWaitingFor( ForsetiLockManager.Lock lock, ResourceType type, long resourceId ) private void waitFor( ForsetiLockManager.Lock lock, ResourceType type, long resourceId, int tries )
{ {
waitingForLock = lock;
clearWaitList(); clearWaitList();
lock.copyHolderWaitListsInto( waitList ); lock.copyHolderWaitListsInto( waitList );
applyWaitStrategy( type, tries );


int b = lock.detectDeadlock( id() ); int b = lock.detectDeadlock( id() );
if ( b != -1 && deadlockResolutionStrategy.shouldAbort( this, clientById.apply( b ) ) ) if ( b != -1 && deadlockResolutionStrategy.shouldAbort( this, clientById.apply( b ) ) )
Expand All @@ -791,10 +794,61 @@ private void markAsWaitingFor( ForsetiLockManager.Lock lock, ResourceType type,
// after we've generated a description of it. // after we've generated a description of it.
if ( lock.detectDeadlock( id() ) != -1 ) if ( lock.detectDeadlock( id() ) != -1 )
{ {
waitList.clear(); // If the deadlock is real, then an owner of this lock must be (transitively) waiting on a lock that
throw new DeadlockDetectedException( message ); // we own. So to verify the deadlock, we traverse the lock owners and their `waitingForLock` fields,
// to find a lock that has us among the owners.
// We only act upon the result of this method if the `tries` count is above some threshold. The reason
// is that the Lock.collectOwners, which is algorithm relies upon, is inherently racy, and so only
// reduces the probably of a false positive, but does not eliminate them.
if ( isDeadlockReal( lock ) && tries > 10 )
{
// After checking several times, this really does look like a real deadlock.
waitList.clear();
waitingForLock = null;
throw new DeadlockDetectedException( message );
}
}
}
waitingForLock = null;
}

private boolean isDeadlockReal( ForsetiLockManager.Lock lock )
{
Set<ForsetiLockManager.Lock> waitedUpon = new HashSet<>();
Set<ForsetiClient> owners = new HashSet<>();
Set<ForsetiLockManager.Lock> nextWaitedUpon = new HashSet<>();
Set<ForsetiClient> nextOwners = new HashSet<>();
lock.collectOwners( owners );

do
{
waitedUpon.addAll( nextWaitedUpon );
nextWaitedUpon.clear();
for ( ForsetiClient owner : owners )
{
ForsetiLockManager.Lock waitingForLock = owner.waitingForLock;
if ( waitingForLock != null && !waitedUpon.contains( waitingForLock ) )
{
nextWaitedUpon.add( waitingForLock );
}
} }
for ( ForsetiLockManager.Lock lck : nextWaitedUpon )
{
lck.collectOwners( nextOwners );
}
if ( nextOwners.contains( this ) )
{
// Yes, deadlock looks real.
return true;
}
owners.clear();
Set<ForsetiClient> ownersTmp = owners;
owners = nextOwners;
nextOwners = ownersTmp;
} }
while ( !nextWaitedUpon.isEmpty() );
// Nope, we didn't find any real wait cycles.
return false;
} }


/** /**
Expand Down
Expand Up @@ -21,6 +21,7 @@


import java.util.Map; import java.util.Map;
import java.util.Queue; import java.util.Queue;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue; import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ConcurrentMap; import java.util.concurrent.ConcurrentMap;
Expand Down Expand Up @@ -76,14 +77,31 @@
* <p/> * <p/>
* A.waitlist = [A] U [B] => [A,B] * A.waitlist = [A] U [B] => [A,B]
* <p/> * <p/>
* It will do this in a loop, continiously figuring out the union of wait lists for all clients it waits for. The magic * It will do this in a loop, continuously figuring out the union of wait lists for all clients it waits for. The magic
* then happens whenever one of those clients become blocked on client A. Assuming client B now has to wait for A, * then happens whenever one of those clients become blocked on client A. Assuming client B now has to wait for A,
* it will also perform a union of A's wait list (which is [A,B] at this point): * it will also perform a union of A's wait list (which is [A,B] at this point):
* <p/> * <p/>
* B.waitlist = [B] U [A,B] * B.waitlist = [B] U [A,B]
* <p/> * <p/>
* As it performs this union, B will find itself in A's waiting list, and when it does, it has detected a deadlock. * As it performs this union, B will find itself in A's waiting list, and when it does, it has detected a deadlock.
* <p/> * <p/>
* This algorithm always identifies real deadlocks, but it may also mistakenly identify a deadlock where there is none;
* a false positive. For this reason, we have a secondary deadlock verification algorithm that only runs if the
* algorithm above found what appears to be a deadlock.
* <p/>
* The secondary deadlock verification algorithm works like this: Whenever a lock client blocks to wait on a lock, the
* lock is stored in the clients `waitsFor` field, and the field is cleared when the client unblocks. Since every lock
* track their owners, we now have all the information we need to traverse the waiter/lock-holder dependency graph to
* verify that a cycle really does exist.
* <p/>
* We first collect the owners of the lock that we are blocking upon. From there, we need to find a lock that one of
* these lock-owners are waiting on, and have us amongst its owners. So to recap, we collect the immediate owners of
* the lock that we are immediately blocked upon, then we collect the set of locks that they are waiting upon, and then
* we collect the combined set of owners of <em>those</em> locks, and if we are amongst those, then we consider the
* deadlock is real. If we are not amongst those owners, then we take another step out into the graph, collect the next
* frontier of locks that are waited upon, and their owners, and then we check again in this new owner set. We continue
* traversing the graph like this until we either find ourselves amongst the owners - a deadlock - or we run out of
* locks that are being waited upon - no deadlock.
* <p/> * <p/>
* <h2>Future work</h2> * <h2>Future work</h2>
* <p/> * <p/>
Expand Down Expand Up @@ -121,6 +139,17 @@ interface Lock
* for the lock. * for the lock.
*/ */
String describeWaitList(); String describeWaitList();

/**
* Collect the current owners of this lock into the given set. This is used for verifying that apparent
* deadlocks really do involve circular wait dependencies.
*
* Note that the owner set may change while this method is running, and thus it is not guaranteed to reflect any
* particular snapshot of the set of lock owners. Furthermore, the set may change arbitrarily after the method
* returns, immediately rendering the result outdated.
* @param owners The set into which to collect the current owners of this lock.
*/
void collectOwners( Set<ForsetiClient> owners );
} }


/** /**
Expand Down Expand Up @@ -254,8 +283,6 @@ private static class ForsetiClientFlyweightPool extends LinkedQueuePool<ForsetiC
private final AtomicInteger clientIds = new AtomicInteger( 0 ); private final AtomicInteger clientIds = new AtomicInteger( 0 );


/** Re-use ids, forseti uses these in arrays, so we want to keep them low and not loose them. */ /** Re-use ids, forseti uses these in arrays, so we want to keep them low and not loose them. */
// TODO we could use a synchronised SimpleBitSet instead, since we know that we only care about reusing a
// very limited set of integers.
private final Queue<Integer> unusedIds = new ConcurrentLinkedQueue<>(); private final Queue<Integer> unusedIds = new ConcurrentLinkedQueue<>();
private final ConcurrentMap<Integer,ForsetiClient> clientsById = new ConcurrentHashMap<>(); private final ConcurrentMap<Integer,ForsetiClient> clientsById = new ConcurrentHashMap<>();
private final ConcurrentMap<Long,ForsetiLockManager.Lock>[] lockMaps; private final ConcurrentMap<Long,ForsetiLockManager.Lock>[] lockMaps;
Expand Down
Expand Up @@ -19,6 +19,7 @@
*/ */
package org.neo4j.kernel.impl.enterprise.lock.forseti; package org.neo4j.kernel.impl.enterprise.lock.forseti;


import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReferenceArray; import java.util.concurrent.atomic.AtomicReferenceArray;


Expand Down Expand Up @@ -213,6 +214,26 @@ public String describeWaitList()
return sb.append( "]" ).toString(); return sb.append( "]" ).toString();
} }


@Override
public void collectOwners( Set<ForsetiClient> owners )
{
for ( AtomicReferenceArray<ForsetiClient> ownerArray : clientsHoldingThisLock )
{
if ( ownerArray != null )
{
int len = ownerArray.length();
for ( int i = 0; i < len; i++ )
{
ForsetiClient owner = ownerArray.get( i );
if ( owner != null )
{
owners.add( owner );
}
}
}
}
}

@Override @Override
public String toString() public String toString()
{ {
Expand Down

0 comments on commit 9d58bb2

Please sign in to comment.