diff --git a/enterprise/kernel/src/main/java/org/neo4j/kernel/impl/enterprise/lock/forseti/ExclusiveLock.java b/enterprise/kernel/src/main/java/org/neo4j/kernel/impl/enterprise/lock/forseti/ExclusiveLock.java index 5e79999fde1b..58e262efd749 100644 --- a/enterprise/kernel/src/main/java/org/neo4j/kernel/impl/enterprise/lock/forseti/ExclusiveLock.java +++ b/enterprise/kernel/src/main/java/org/neo4j/kernel/impl/enterprise/lock/forseti/ExclusiveLock.java @@ -19,6 +19,8 @@ */ package org.neo4j.kernel.impl.enterprise.lock.forseti; +import java.util.Set; + import org.neo4j.kernel.impl.util.collection.SimpleBitSet; class ExclusiveLock implements ForsetiLockManager.Lock @@ -48,6 +50,12 @@ public String describeWaitList() return "ExclusiveLock[" + owner.describeWaitList() + "]"; } + @Override + public void collectOwners( Set owners ) + { + owners.add( owner ); + } + @Override public String toString() { diff --git a/enterprise/kernel/src/main/java/org/neo4j/kernel/impl/enterprise/lock/forseti/ForsetiClient.java b/enterprise/kernel/src/main/java/org/neo4j/kernel/impl/enterprise/lock/forseti/ForsetiClient.java index 014bd969e3b9..84e2818f8f49 100644 --- a/enterprise/kernel/src/main/java/org/neo4j/kernel/impl/enterprise/lock/forseti/ForsetiClient.java +++ b/enterprise/kernel/src/main/java/org/neo4j/kernel/impl/enterprise/lock/forseti/ForsetiClient.java @@ -19,6 +19,8 @@ */ package org.neo4j.kernel.impl.enterprise.lock.forseti; +import java.util.HashSet; +import java.util.Set; import java.util.concurrent.ConcurrentMap; import java.util.function.IntFunction; @@ -103,6 +105,12 @@ public class ForsetiClient implements Locks.Client private volatile boolean hasLocks; + /** + * When we *wait* for a specific lock to be released to us, we assign it to this field. This helps us during the + * secondary deadlock verification process, where we traverse the waiter/lock-owner dependency graph. + */ + private volatile ForsetiLockManager.Lock waitingForLock; + public ForsetiClient( int id, ConcurrentMap[] lockMaps, WaitStrategy[] waitStrategies, @@ -223,10 +231,8 @@ else if ( existingLock instanceof ExclusiveLock ) throw new UnsupportedOperationException( "Unknown lock type: " + existingLock ); } - applyWaitStrategy( resourceType, tries++ ); - // And take note of who we are waiting for. This is used for deadlock detection. - markAsWaitingFor( existingLock, resourceType, resourceId ); + waitFor( existingLock, resourceType, resourceId, tries++ ); } // Got the lock, no longer waiting for anyone. @@ -283,8 +289,7 @@ public void acquireExclusive( ResourceType resourceType, long... resourceIds ) t } } - applyWaitStrategy( resourceType, tries++ ); - markAsWaitingFor( existingLock, resourceType, resourceId ); + waitFor( existingLock, resourceType, resourceId, tries++ ); } clearWaitList(); @@ -598,13 +603,10 @@ public int waitListSize() public void copyWaitListTo( SimpleBitSet other ) { other.put( waitList ); - // TODO It might make sense to somehow put a StoreLoad barrier here, - // TODO to expedite the observation of the updated waitList in other clients. } public boolean isWaitingFor( int clientId ) { - // TODO Similarly to the above, make reading the waitList a volatile load. return clientId != this.clientId && waitList.contains( clientId ); } @@ -730,8 +732,7 @@ private boolean tryUpgradeToExclusiveWithShareLockHeld( // Now we just wait for all clients to release the the share lock while ( sharedLock.numberOfHolders() > 1 ) { - applyWaitStrategy( resourceType, tries++ ); - markAsWaitingFor( sharedLock, resourceType, resourceId ); + waitFor( sharedLock, resourceType, resourceId, tries++ ); } return true; @@ -770,10 +771,12 @@ private void clearWaitList() waitList.put( clientId ); } - private void markAsWaitingFor( ForsetiLockManager.Lock lock, ResourceType type, long resourceId ) + private void waitFor( ForsetiLockManager.Lock lock, ResourceType type, long resourceId, int tries ) { + waitingForLock = lock; clearWaitList(); lock.copyHolderWaitListsInto( waitList ); + applyWaitStrategy( type, tries ); int b = lock.detectDeadlock( id() ); if ( b != -1 && deadlockResolutionStrategy.shouldAbort( this, clientById.apply( b ) ) ) @@ -791,10 +794,61 @@ private void markAsWaitingFor( ForsetiLockManager.Lock lock, ResourceType type, // after we've generated a description of it. if ( lock.detectDeadlock( id() ) != -1 ) { - waitList.clear(); - throw new DeadlockDetectedException( message ); + // If the deadlock is real, then an owner of this lock must be (transitively) waiting on a lock that + // we own. So to verify the deadlock, we traverse the lock owners and their `waitingForLock` fields, + // to find a lock that has us among the owners. + // We only act upon the result of this method if the `tries` count is above some threshold. The reason + // is that the Lock.collectOwners, which is algorithm relies upon, is inherently racy, and so only + // reduces the probably of a false positive, but does not eliminate them. + if ( isDeadlockReal( lock ) && tries > 10 ) + { + // After checking several times, this really does look like a real deadlock. + waitList.clear(); + waitingForLock = null; + throw new DeadlockDetectedException( message ); + } + } + } + waitingForLock = null; + } + + private boolean isDeadlockReal( ForsetiLockManager.Lock lock ) + { + Set waitedUpon = new HashSet<>(); + Set owners = new HashSet<>(); + Set nextWaitedUpon = new HashSet<>(); + Set nextOwners = new HashSet<>(); + lock.collectOwners( owners ); + + do + { + waitedUpon.addAll( nextWaitedUpon ); + nextWaitedUpon.clear(); + for ( ForsetiClient owner : owners ) + { + ForsetiLockManager.Lock waitingForLock = owner.waitingForLock; + if ( waitingForLock != null && !waitedUpon.contains( waitingForLock ) ) + { + nextWaitedUpon.add( waitingForLock ); + } } + for ( ForsetiLockManager.Lock lck : nextWaitedUpon ) + { + lck.collectOwners( nextOwners ); + } + if ( nextOwners.contains( this ) ) + { + // Yes, deadlock looks real. + return true; + } + owners.clear(); + Set ownersTmp = owners; + owners = nextOwners; + nextOwners = ownersTmp; } + while ( !nextWaitedUpon.isEmpty() ); + // Nope, we didn't find any real wait cycles. + return false; } /** diff --git a/enterprise/kernel/src/main/java/org/neo4j/kernel/impl/enterprise/lock/forseti/ForsetiLockManager.java b/enterprise/kernel/src/main/java/org/neo4j/kernel/impl/enterprise/lock/forseti/ForsetiLockManager.java index e6248485835f..37208a7a3f02 100644 --- a/enterprise/kernel/src/main/java/org/neo4j/kernel/impl/enterprise/lock/forseti/ForsetiLockManager.java +++ b/enterprise/kernel/src/main/java/org/neo4j/kernel/impl/enterprise/lock/forseti/ForsetiLockManager.java @@ -21,6 +21,7 @@ import java.util.Map; import java.util.Queue; +import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentLinkedQueue; import java.util.concurrent.ConcurrentMap; @@ -76,7 +77,7 @@ *

* A.waitlist = [A] U [B] => [A,B] *

- * It will do this in a loop, continiously figuring out the union of wait lists for all clients it waits for. The magic + * It will do this in a loop, continuously figuring out the union of wait lists for all clients it waits for. The magic * then happens whenever one of those clients become blocked on client A. Assuming client B now has to wait for A, * it will also perform a union of A's wait list (which is [A,B] at this point): *

@@ -84,6 +85,23 @@ *

* As it performs this union, B will find itself in A's waiting list, and when it does, it has detected a deadlock. *

+ * This algorithm always identifies real deadlocks, but it may also mistakenly identify a deadlock where there is none; + * a false positive. For this reason, we have a secondary deadlock verification algorithm that only runs if the + * algorithm above found what appears to be a deadlock. + *

+ * The secondary deadlock verification algorithm works like this: Whenever a lock client blocks to wait on a lock, the + * lock is stored in the clients `waitsFor` field, and the field is cleared when the client unblocks. Since every lock + * track their owners, we now have all the information we need to traverse the waiter/lock-holder dependency graph to + * verify that a cycle really does exist. + *

+ * We first collect the owners of the lock that we are blocking upon. From there, we need to find a lock that one of + * these lock-owners are waiting on, and have us amongst its owners. So to recap, we collect the immediate owners of + * the lock that we are immediately blocked upon, then we collect the set of locks that they are waiting upon, and then + * we collect the combined set of owners of those locks, and if we are amongst those, then we consider the + * deadlock is real. If we are not amongst those owners, then we take another step out into the graph, collect the next + * frontier of locks that are waited upon, and their owners, and then we check again in this new owner set. We continue + * traversing the graph like this until we either find ourselves amongst the owners - a deadlock - or we run out of + * locks that are being waited upon - no deadlock. *

*

Future work

*

@@ -121,6 +139,17 @@ interface Lock * for the lock. */ String describeWaitList(); + + /** + * Collect the current owners of this lock into the given set. This is used for verifying that apparent + * deadlocks really do involve circular wait dependencies. + * + * Note that the owner set may change while this method is running, and thus it is not guaranteed to reflect any + * particular snapshot of the set of lock owners. Furthermore, the set may change arbitrarily after the method + * returns, immediately rendering the result outdated. + * @param owners The set into which to collect the current owners of this lock. + */ + void collectOwners( Set owners ); } /** @@ -254,8 +283,6 @@ private static class ForsetiClientFlyweightPool extends LinkedQueuePool unusedIds = new ConcurrentLinkedQueue<>(); private final ConcurrentMap clientsById = new ConcurrentHashMap<>(); private final ConcurrentMap[] lockMaps; diff --git a/enterprise/kernel/src/main/java/org/neo4j/kernel/impl/enterprise/lock/forseti/SharedLock.java b/enterprise/kernel/src/main/java/org/neo4j/kernel/impl/enterprise/lock/forseti/SharedLock.java index c2ec00afc091..cec98478ea2a 100644 --- a/enterprise/kernel/src/main/java/org/neo4j/kernel/impl/enterprise/lock/forseti/SharedLock.java +++ b/enterprise/kernel/src/main/java/org/neo4j/kernel/impl/enterprise/lock/forseti/SharedLock.java @@ -19,6 +19,7 @@ */ package org.neo4j.kernel.impl.enterprise.lock.forseti; +import java.util.Set; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicReferenceArray; @@ -213,6 +214,26 @@ public String describeWaitList() return sb.append( "]" ).toString(); } + @Override + public void collectOwners( Set owners ) + { + for ( AtomicReferenceArray ownerArray : clientsHoldingThisLock ) + { + if ( ownerArray != null ) + { + int len = ownerArray.length(); + for ( int i = 0; i < len; i++ ) + { + ForsetiClient owner = ownerArray.get( i ); + if ( owner != null ) + { + owners.add( owner ); + } + } + } + } + } + @Override public String toString() { diff --git a/enterprise/kernel/src/test/java/org/neo4j/kernel/impl/enterprise/lock/forseti/ForsetiFalseDeadlockTest.java b/enterprise/kernel/src/test/java/org/neo4j/kernel/impl/enterprise/lock/forseti/ForsetiFalseDeadlockTest.java new file mode 100644 index 000000000000..211a197db51f --- /dev/null +++ b/enterprise/kernel/src/test/java/org/neo4j/kernel/impl/enterprise/lock/forseti/ForsetiFalseDeadlockTest.java @@ -0,0 +1,388 @@ +/* + * Copyright (c) 2002-2017 "Neo Technology," + * Network Engine for Objects in Lund AB [http://neotechnology.com] + * + * This file is part of Neo4j. + * + * Neo4j is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ +package org.neo4j.kernel.impl.enterprise.lock.forseti; + +import org.junit.Ignore; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.stream.Collectors; + +import org.neo4j.concurrent.BinaryLatch; +import org.neo4j.function.ThrowingAction; +import org.neo4j.kernel.impl.locking.Locks; +import org.neo4j.kernel.impl.locking.community.CommunityLockManger; +import org.neo4j.kernel.impl.util.concurrent.LockWaitStrategies; +import org.neo4j.storageengine.api.lock.ResourceType; +import org.neo4j.storageengine.api.lock.WaitStrategy; + +@RunWith( Parameterized.class ) +public class ForsetiFalseDeadlockTest +{ + public static class Fixture + { + private final int iterations; + private final LockManager lockManager; + private final WaitStrategy waitStrategy; + private final LockType lockTypeAX; + private final LockType lockTypeAY; + private final LockType lockTypeBX; + private final LockType lockTypeBY; + + Fixture( int iterations, + LockManager lockManager, + WaitStrategy waitStrategy, + LockType lockTypeAX, + LockType lockTypeAY, + LockType lockTypeBX, + LockType lockTypeBY ) + { + this.iterations = iterations; + this.lockManager = lockManager; + this.waitStrategy = waitStrategy; + this.lockTypeAX = lockTypeAX; + this.lockTypeAY = lockTypeAY; + this.lockTypeBX = lockTypeBX; + this.lockTypeBY = lockTypeBY; + } + + int iterations() + { + return iterations; + } + + Locks createLockManager( ResourceType resourceType ) + { + return lockManager.create( resourceType ); + } + + ResourceType createResourceType() + { + return new ResourceType() + { + @Override + public int typeId() + { + return 0; + } + + @Override + public WaitStrategy waitStrategy() + { + return waitStrategy; + } + }; + } + + void acquireAX( Locks.Client client, ResourceType resourceType ) + { + lockTypeAX.acquire( client, resourceType, 1 ); + } + + void releaseAX( Locks.Client client, ResourceType resourceType ) + { + lockTypeAX.release( client, resourceType, 1 ); + } + + void acquireAY( Locks.Client client, ResourceType resourceType ) + { + lockTypeAY.acquire( client, resourceType, 2 ); + } + + void releaseAY( Locks.Client client, ResourceType resourceType ) + { + lockTypeAY.release( client, resourceType, 2 ); + } + + void acquireBX( Locks.Client client, ResourceType resourceType ) + { + lockTypeBX.acquire( client, resourceType, 1 ); + } + + void releaseBX( Locks.Client client, ResourceType resourceType ) + { + lockTypeBX.release( client, resourceType, 1 ); + } + + void acquireBY( Locks.Client client, ResourceType resourceType ) + { + lockTypeBY.acquire( client, resourceType, 2 ); + } + + void releaseBY( Locks.Client client, ResourceType resourceType ) + { + lockTypeBY.release( client, resourceType, 2 ); + } + + @Override + public String toString() + { + return "iterations=" + iterations + + ", lockManager=" + lockManager + + ", waitStrategy=" + waitStrategy + + ", lockTypeAX=" + lockTypeAX + + ", lockTypeAY=" + lockTypeAY + + ", lockTypeBX=" + lockTypeBX + + ", lockTypeBY=" + lockTypeBY; + } + } + + public enum LockType + { + EXCLUSIVE + { + @Override + public void acquire( Locks.Client client, ResourceType resourceType, int resource ) + { + client.acquireExclusive( resourceType, resource ); + } + + @Override + public void release( Locks.Client client, ResourceType resourceType, int resource ) + { + client.releaseExclusive( resourceType, resource ); + } + }, + SHARED + { + @Override + public void acquire( Locks.Client client, ResourceType resourceType, int resource ) + { + client.acquireShared( resourceType, resource ); + } + + @Override + public void release( Locks.Client client, ResourceType resourceType, int resource ) + { + client.releaseShared( resourceType, resource ); + } + }; + + public abstract void acquire( Locks.Client client, ResourceType resourceType, int resource ); + + public abstract void release( Locks.Client client, ResourceType resourceType, int resource ); + } + + public enum LockManager + { + COMMUNITY + { + @Override + public Locks create( ResourceType resourceType ) + { + return new CommunityLockManger(); + } + }, + FORSETI + { + @Override + public Locks create( ResourceType resourceType ) + { + return new ForsetiLockManager( resourceType ); + } + }; + + public abstract Locks create( ResourceType resourceType ); + } + + @Parameterized.Parameters( name = "{0}" ) + public static Iterable parameters() + { + List fixtures = new ArrayList<>(); + + // During development I also had iteration counts 1 and 2 here, but they never found anything, so for actually + // running this test, I leave only iteration count 100 enabled. + int[] iterations = new int[]{100}; + LockManager[] lockManagers = LockManager.values(); + LockWaitStrategies[] lockWaitStrategies = LockWaitStrategies.values(); + LockType[] lockTypes = LockType.values(); + for ( LockManager lockManager : lockManagers ) + { + for ( LockWaitStrategies waitStrategy : lockWaitStrategies ) + { + for ( LockType lockTypeAX : lockTypes ) + { + for ( LockType lockTypeAY : lockTypes ) + { + for ( LockType lockTypeBX : lockTypes ) + { + for ( LockType lockTypeBY : lockTypes ) + { + for ( int iteration : iterations ) + { + fixtures.add( new Fixture( + iteration, lockManager, waitStrategy, + lockTypeAX, lockTypeAY, lockTypeBX, lockTypeBY ) ); + } + } + } + } + } + } + } + + return fixtures.stream().map( f -> new Object[]{f} ).collect( Collectors.toList() ); + } + + private static ExecutorService executor = Executors.newCachedThreadPool( r -> + { + Thread thread = new Thread( r ); + thread.setDaemon( true ); + return thread; + } ); + + public ForsetiFalseDeadlockTest( Fixture fixture ) + { + this.fixture = fixture; + } + + private final Fixture fixture; + + /** + * This takes a fair bit of time, and we don't want to wait that long. But more importantly, false deadlocks are + * still only very unlikely; they are not impossible, though. + * + * So this test is technically flaky, but the probability is, I think, quite low for the 'mild' test, but it is too + * high for this aggressive test. So therefor I have marked it as @Ignored, but I still keep it here for the sake + * of this comment, and to allow others to run it and get a feel for the probabilities involved. + */ + @Ignore + @Test + public void testAggressivelyForFalseDeadlocks() throws Exception + { + int testRuns = 2000; + loopRunTest( testRuns ); + } + + @Test + public void testMildlyForFalseDeadlocks() throws Exception + { + int testRuns = 10; + loopRunTest( testRuns ); + } + + private void loopRunTest( int testRuns ) throws InterruptedException, java.util.concurrent.ExecutionException + { + for ( int i = 0; i < testRuns; i++ ) + { + try + { + runTest(); + } + catch ( Throwable th ) + { + th.addSuppressed( new Exception( "Failed at iteration " + i ) ); + throw th; + } + } + } + + private void runTest() throws InterruptedException, java.util.concurrent.ExecutionException + { + int iterations = fixture.iterations(); + ResourceType resourceType = fixture.createResourceType(); + Locks manager = fixture.createLockManager( resourceType ); + try ( Locks.Client a = manager.newClient(); + Locks.Client b = manager.newClient() ) + { + BinaryLatch startLatch = new BinaryLatch(); + BlockedCallable callA = new BlockedCallable( startLatch, + () -> workloadA( a, resourceType, iterations ) ); + BlockedCallable callB = new BlockedCallable( startLatch, + () -> workloadB( b, resourceType, iterations ) ); + + Future futureA = executor.submit( callA ); + Future futureB = executor.submit( callB ); + + callA.awaitBlocked(); + callB.awaitBlocked(); + + startLatch.release(); + + futureA.get(); + futureB.get(); + } + finally + { + manager.close(); + } + } + + private void workloadA( Locks.Client a, ResourceType resourceType, int iterations ) + { + for ( int i = 0; i < iterations; i++ ) + { + fixture.acquireAX( a, resourceType ); + fixture.acquireAY( a, resourceType ); + fixture.releaseAY( a, resourceType ); + fixture.releaseAX( a, resourceType ); + } + } + + private void workloadB( Locks.Client b, ResourceType resourceType, int iterations ) + { + for ( int i = 0; i < iterations; i++ ) + { + fixture.acquireBX( b, resourceType ); + fixture.releaseBX( b, resourceType ); + fixture.acquireBY( b, resourceType ); + fixture.releaseBY( b, resourceType ); + } + } + + public static class BlockedCallable implements Callable + { + private final BinaryLatch startLatch; + private final ThrowingAction delegate; + private volatile Thread runner; + + BlockedCallable( BinaryLatch startLatch, ThrowingAction delegate ) + { + this.startLatch = startLatch; + this.delegate = delegate; + } + + @Override + public Void call() throws Exception + { + runner = Thread.currentThread(); + startLatch.await(); + delegate.apply(); + return null; + } + + void awaitBlocked() + { + Thread t; + do + { + t = runner; + } + while ( t == null || t.getState() != Thread.State.WAITING ); + } + } +}