Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Resolves a deadlock scenario applying constraint #9110

Merged
merged 4 commits into from
Apr 11, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,14 @@

import org.neo4j.kernel.impl.store.counts.CountsTracker;
import org.neo4j.kernel.impl.transaction.command.Command;
import org.neo4j.kernel.impl.transaction.command.Command.SchemaRuleCommand;
import org.neo4j.storageengine.api.TransactionApplicationMode;

public class CountsStoreTransactionApplier extends TransactionApplier.Adapter
{
private final TransactionApplicationMode mode;
private final CountsTracker.Updater countsUpdater;
private boolean haveUpdates;

public CountsStoreTransactionApplier( TransactionApplicationMode mode, CountsAccessor.Updater countsUpdater )
{
Expand All @@ -40,6 +42,11 @@ public CountsStoreTransactionApplier( TransactionApplicationMode mode, CountsAcc
public void close() throws Exception
{
assert countsUpdater != null || mode == TransactionApplicationMode.RECOVERY : "You must call begin first";
closeCountsUpdaterIfOpen();
}

private void closeCountsUpdaterIfOpen()
{
if ( countsUpdater != null )
{ // CountsUpdater is null if we're in recovery and the counts store already has had this transaction applied.
countsUpdater.close();
Expand All @@ -50,6 +57,7 @@ public void close() throws Exception
public boolean visitNodeCountsCommand( Command.NodeCountsCommand command )
{
assert countsUpdater != null || mode == TransactionApplicationMode.RECOVERY : "You must call begin first";
haveUpdates = true;
if ( countsUpdater != null )
{ // CountsUpdater is null if we're in recovery and the counts store already has had this transaction applied.
countsUpdater.incrementNodeCount( command.labelId(), command.delta() );
Expand All @@ -61,11 +69,25 @@ public boolean visitNodeCountsCommand( Command.NodeCountsCommand command )
public boolean visitRelationshipCountsCommand( Command.RelationshipCountsCommand command ) throws IOException
{
assert countsUpdater != null || mode == TransactionApplicationMode.RECOVERY : "You must call begin first";
haveUpdates = true;
if ( countsUpdater != null )
{ // CountsUpdater is null if we're in recovery and the counts store already has had this transaction applied.
countsUpdater.incrementRelationshipCount(
command.startLabelId(), command.typeId(), command.endLabelId(), command.delta() );
}
return false;
}

@Override
public boolean visitSchemaRuleCommand( SchemaRuleCommand command ) throws IOException
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

also as we just talk, would be nice to have guarding assertion that will fail in case when this assumption will brake

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yup, this has been added

{
// This shows that this transaction is a schema transaction, so it cannot have commands
// updating any counts anyway. Therefore the counts updater is closed right away.
// This also breaks an otherwise deadlocking scenario between check pointer, this applier
// and an index population thread wanting to apply index sampling to the counts store.
assert !haveUpdates : "Assumed that a schema transaction wouldn't also contain data commands affecting " +
"counts store, but was proven wrong with this transaction";
closeCountsUpdaterIfOpen();
return false;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,9 @@ protected BatchTransactionApplierFacade applier( TransactionApplicationMode mode
appliers.add( new CacheInvalidationBatchTransactionApplier( neoStores, cacheAccess ) );
}

// Counts store application
appliers.add( new CountsStoreBatchTransactionApplier( neoStores.getCounts(), mode ) );

// Schema index application
appliers.add( new IndexBatchTransactionApplier( indexingService, labelScanStoreSync, indexUpdatesSync,
neoStores.getNodeStore(), new PropertyLoader( neoStores ),
Expand All @@ -384,9 +387,6 @@ protected BatchTransactionApplierFacade applier( TransactionApplicationMode mode
new LegacyBatchIndexApplier( indexConfigStore, legacyIndexApplierLookup, legacyIndexTransactionOrdering,
mode ) );

// Counts store application
appliers.add( new CountsStoreBatchTransactionApplier( neoStores.getCounts(), mode ) );

// Perform the application
return new BatchTransactionApplierFacade(
appliers.toArray( new BatchTransactionApplier[appliers.size()] ) );
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,233 @@
/*
* Copyright (c) 2002-2017 "Neo Technology,"
* Network Engine for Objects in Lund AB [http://neotechnology.com]
*
* This file is part of Neo4j.
*
* Neo4j is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.neo4j.kernel.impl.transaction.log.checkpoint;

import org.junit.Rule;
import org.junit.Test;

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Future;

import org.neo4j.graphdb.ConstraintViolationException;
import org.neo4j.graphdb.Label;
import org.neo4j.graphdb.Transaction;
import org.neo4j.graphdb.schema.ConstraintDefinition;
import org.neo4j.kernel.api.exceptions.TransactionFailureException;
import org.neo4j.kernel.impl.api.TransactionCommitProcess;
import org.neo4j.kernel.impl.api.TransactionToApply;
import org.neo4j.kernel.impl.api.index.IndexingService;
import org.neo4j.kernel.impl.locking.LockWrapper;
import org.neo4j.kernel.impl.transaction.TransactionRepresentation;
import org.neo4j.kernel.impl.transaction.log.LogicalTransactionStore;
import org.neo4j.kernel.impl.transaction.log.TransactionCursor;
import org.neo4j.kernel.impl.transaction.log.TransactionIdStore;
import org.neo4j.kernel.internal.GraphDatabaseAPI;
import org.neo4j.kernel.monitoring.Monitors;
import org.neo4j.test.Barrier;
import org.neo4j.test.EphemeralFileSystemRule;
import org.neo4j.test.OtherThreadRule;
import org.neo4j.test.TestGraphDatabaseFactory;
import org.neo4j.test.TestLabels;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.fail;

import static java.util.concurrent.TimeUnit.SECONDS;

import static org.neo4j.helpers.collection.Iterables.single;
import static org.neo4j.kernel.impl.transaction.tracing.CommitEvent.NULL;
import static org.neo4j.storageengine.api.TransactionApplicationMode.EXTERNAL;

/**
* The scenario, which takes place on database instance applying constraint
* creation as an external transaction, looks like this:
*
* <ol>
* <li>
* Transaction T1 creates the constraint index and population P starts
* </li>
* <li>
* Transaction T2 which activates the constraint starts applying and now has a read lock on the counts store
* </li>
* <li>
* Check point triggers, wants to rotate counts store and so acquires its write lock.
* It will have to block, but doing so will also blocks further read lock requests
* </li>
* <li>
* T2 moves on to activate the constraint. Doing so means first waiting for the index to come online
* </li>
* <li>
* P moves on to flip after population, something which includes initializing some sample data in counts store
* for this index. Will block on the counts store read lock, completing the deadlock
* </li>
* </ol>
*/
public class CheckPointerConstraintCreationDeadlockIT
{
private static final Label LABEL = TestLabels.LABEL_ONE;
private static final String KEY = "key";

@Rule
public final EphemeralFileSystemRule fs = new EphemeralFileSystemRule();
@Rule
public final OtherThreadRule<Void> t2 = new OtherThreadRule<>( "T2" );
@Rule
public final OtherThreadRule<Void> t3 = new OtherThreadRule<>( "T3" );

@Test( timeout = 30_000 )
public void shouldNotDeadlock() throws Exception
{
List<TransactionRepresentation> transactions = createConstraintCreatingTransactions();
Monitors monitors = new Monitors();
GraphDatabaseAPI db = (GraphDatabaseAPI) new TestGraphDatabaseFactory()
.setMonitors( monitors ).newImpermanentDatabase();
Barrier.Control controller = new Barrier.Control();
boolean success = false;
try
{
IndexingService.Monitor monitor = new IndexingService.MonitorAdapter()
{
@Override
public void indexPopulationScanComplete()
{
controller.reached();
}
};
monitors.addMonitorListener( monitor );
Future<Object> applier = applyInT2( db, transactions );

controller.await();

// At this point the index population has completed and the population thread is ready to
// acquire the counts store read lock for initializing some samples there. We're starting the
// check pointer, which will eventually put itself in queue for acquiring the write lock

Future<Object> checkPointer = t3.execute( state ->
db.getDependencyResolver().resolveDependency( CheckPointer.class )
.forceCheckPoint( new SimpleTriggerInfo( "MANUAL" ) ) );
try
{
t3.get().waitUntilWaiting( details -> details.isAt( LockWrapper.class, "writeLock" ) );
}
catch ( IllegalStateException e )
{
// Thrown when the fix is in, basically it's thrown if the check pointer didn't get blocked
checkPointer.get(); // to assert that no exception was thrown during in check point thread
}

// Alright the trap is set. Let the population thread move on and seal the deal
controller.release();

// THEN these should complete
applier.get( 10, SECONDS );
checkPointer.get( 10, SECONDS );
success = true;

try ( Transaction tx = db.beginTx() )
{
ConstraintDefinition constraint = single( db.schema().getConstraints( LABEL ) );
assertEquals( KEY, single( constraint.getPropertyKeys() ) );
tx.success();
}

createNode( db, "A" );
try
{
createNode( db, "A" );
fail( "Should have failed" );
}
catch ( ConstraintViolationException e )
{
// THEN good
}
}
finally
{
if ( !success )
{
t2.interrupt();
t3.interrupt();
// so that shutdown won't hang too
}
db.shutdown();
}
}

private void createNode( GraphDatabaseAPI db, String name )
{
try ( Transaction tx = db.beginTx() )
{
db.createNode( LABEL ).setProperty( KEY, name );
tx.success();
}
}

private Future<Object> applyInT2( GraphDatabaseAPI db, List<TransactionRepresentation> transactions )
{
TransactionCommitProcess commitProcess =
db.getDependencyResolver().resolveDependency( TransactionCommitProcess.class );
return t2.execute( state ->
{
transactions.forEach( tx ->
{
try
{
// It will matter if the transactions are supplied all in the same batch or one by one
// since the CountsTracker#apply lock is held and released per transaction
commitProcess.commit( new TransactionToApply( tx ), NULL, EXTERNAL );
}
catch ( TransactionFailureException e )
{
throw new RuntimeException( e );
}
} );
return null;
} );
}

private static List<TransactionRepresentation> createConstraintCreatingTransactions() throws Exception
{
GraphDatabaseAPI db = (GraphDatabaseAPI) new TestGraphDatabaseFactory().newImpermanentDatabase();
try
{
try ( Transaction tx = db.beginTx() )
{
db.schema().constraintFor( LABEL ).assertPropertyIsUnique( KEY ).create();
tx.success();
}

LogicalTransactionStore txStore = db.getDependencyResolver().resolveDependency( LogicalTransactionStore.class );
List<TransactionRepresentation> result = new ArrayList<>();
try ( TransactionCursor cursor = txStore.getTransactions( TransactionIdStore.BASE_TX_ID + 1 ) )
{
while ( cursor.next() )
{
result.add( cursor.get().getTransactionRepresentation() );
}
}
return result;
}
finally
{
db.shutdown();
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import java.io.Closeable;
import java.io.PrintStream;
import java.lang.Thread.State;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.ExecutionException;
Expand All @@ -30,11 +31,13 @@
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.locks.LockSupport;
import java.util.function.Predicate;

import org.neo4j.logging.Logger;

import static java.lang.String.format;
import static java.lang.System.currentTimeMillis;
import static java.util.Arrays.asList;
import static java.util.concurrent.Executors.newSingleThreadExecutor;
import static java.util.concurrent.TimeUnit.MILLISECONDS;
Expand Down Expand Up @@ -217,19 +220,47 @@ public String toString()

public WaitDetails waitUntilWaiting() throws TimeoutException
{
return waitUntilThreadState( Thread.State.WAITING, Thread.State.TIMED_WAITING );
return waitUntilWaiting( details -> true );
}

public WaitDetails waitUntilBlocked() throws TimeoutException
{
return waitUntilThreadState( Thread.State.BLOCKED );
return waitUntilBlocked( details -> true );
}

public WaitDetails waitUntilWaiting( Predicate<WaitDetails> correctWait ) throws TimeoutException
{
return waitUntilThreadState( correctWait, Thread.State.WAITING, Thread.State.TIMED_WAITING );
}

public WaitDetails waitUntilBlocked( Predicate<WaitDetails> correctWait ) throws TimeoutException
{
return waitUntilThreadState( correctWait, Thread.State.BLOCKED );
}

public WaitDetails waitUntilThreadState( final Thread.State... possibleStates ) throws TimeoutException
{
return waitUntil( new AnyThreadState( possibleStates ) );
}

public WaitDetails waitUntilThreadState( Predicate<WaitDetails> correctWait,
final Thread.State... possibleStates ) throws TimeoutException
{
long end = currentTimeMillis() + timeout;
WaitDetails details = null;
while ( !correctWait.test( details = waitUntil( new AnyThreadState( possibleStates )) ) )
{
LockSupport.parkNanos( MILLISECONDS.toNanos( 20 ) );
if ( currentTimeMillis() > end )
{
throw new TimeoutException( "Wanted to wait for any of " + Arrays.toString( possibleStates ) +
" over at " + correctWait + ", but didn't managed to get there in " + timeout + "ms. " +
"instead ended up waiting in " + details );
}
}
return details;
}

public WaitDetails waitUntil( Predicate<Thread> condition ) throws TimeoutException
{
long end = System.currentTimeMillis() + timeout;
Expand Down