Skip to content

Commit

Permalink
Add setting unsupported.dbms.tx_log.fail_on_corrupted_log_files
Browse files Browse the repository at this point in the history
This setting is true by default and tells the database to shut down
if any errors in the transaction logs are encountered. If set to false,
the database will try to recover as much as possible and remove the
corrupted part of the transaction log. This might leave the database in
an inconsistent state, so it is opt-in and not enabled by default.
  • Loading branch information
klaren authored and MishaDemianenko committed Jul 23, 2018
1 parent 3332eca commit 93fd175
Show file tree
Hide file tree
Showing 6 changed files with 75 additions and 30 deletions.
Expand Up @@ -602,6 +602,12 @@ public String providerName()
public static final Setting<Long> logical_log_rotation_threshold =
buildSetting( "dbms.tx_log.rotation.size", BYTES, "250M" ).constraint( min( ByteUnit.mebiBytes( 1 ) ) ).build();

// Internal boolean setting (note the "unsupported." key prefix) that defaults to TRUE.
// It decides whether errors found in the transaction logs abort recovery (true) or are
// tolerated by recovering what is readable and ignoring the rest (false).
@Description( "If `true`, Neo4j will abort recovery if any errors are encountered in the logical log. Setting " +
"this to `false` will allow Neo4j to restore as much as possible from the corrupted log files and ignore " +
"the rest, but, the integrity of the database might be compromised." )
@Internal
public static final Setting<Boolean> fail_on_corrupted_log_files = setting("unsupported.dbms.tx_log.fail_on_corrupted_log_files", BOOLEAN, TRUE );

@Description( "Use a quick approach for rebuilding the ID generators. This give quicker recovery time, " +
"but will limit the ability to reuse the space of deleted entities." )
@Internal
Expand Down
Expand Up @@ -160,7 +160,6 @@
import org.neo4j.storageengine.api.StoreFileMetadata;
import org.neo4j.storageengine.api.StoreReadLayer;
import org.neo4j.time.SystemNanoClock;
import org.neo4j.util.FeatureToggles;

import static org.neo4j.helpers.Exceptions.throwIfUnchecked;

Expand Down Expand Up @@ -224,8 +223,6 @@ boolean applicable( DiagnosticsPhase phase )
}

public static final String DEFAULT_DATA_SOURCE_NAME = "nioneodb";
private final boolean failOnCorruptedLogFiles = FeatureToggles.flag( NeoStoreDataSource.class,
"failOnCorruptedLogFiles", false );

private final Monitors monitors;
private final Tracers tracers;
Expand Down Expand Up @@ -280,6 +277,8 @@ boolean applicable( DiagnosticsPhase phase )
private NeoStoreTransactionLogModule transactionLogModule;
private NeoStoreKernelModule kernelModule;

private final boolean failOnCorruptedLogFiles;

public NeoStoreDataSource( File storeDir, Config config, IdGeneratorFactory idGeneratorFactory,
LogService logService, JobScheduler scheduler, TokenNameLookup tokenNameLookup,
DependencyResolver dependencyResolver, PropertyKeyTokenHolder propertyKeyTokens,
Expand Down Expand Up @@ -363,6 +362,7 @@ public Iterable<IndexImplementation> all()
this.pageCache = pageCache;
this.monitors.addMonitorListener( new LoggingLogFileMonitor( msgLog ) );
this.collectionsFactorySupplier = collectionsFactorySupplier;
this.failOnCorruptedLogFiles = config.get( GraphDatabaseSettings.fail_on_corrupted_log_files );
}

@Override
Expand Down
Expand Up @@ -20,6 +20,7 @@
package org.neo4j.kernel.recovery;

import java.io.IOException;
import java.nio.channels.ClosedByInterruptException;

import org.neo4j.kernel.impl.store.UnderlyingStorageException;
import org.neo4j.kernel.impl.transaction.log.LogEntryCursor;
Expand All @@ -37,8 +38,8 @@
import org.neo4j.kernel.impl.transaction.log.files.LogFiles;
import org.neo4j.kernel.monitoring.Monitors;

import static org.neo4j.helpers.Exceptions.throwIfUnchecked;
import static org.neo4j.kernel.impl.transaction.log.LogVersionRepository.INITIAL_LOG_VERSION;
import static org.neo4j.kernel.recovery.Recovery.throwUnableToCleanRecover;

/**
* This class collects information about the latest entries in the transaction log. Since the only way we have to collect
Expand Down Expand Up @@ -132,14 +133,18 @@ else if ( entry instanceof LogEntryStart )
{
corruptedTransactionLogs = true;
}
}
catch ( Error | ClosedByInterruptException e )
{
// These should not be parsing errors
throw e;
}
catch ( Throwable t )
{
monitor.corruptedLogFile( version, t );
if ( failOnCorruptedLogFiles )
{
throwIfUnchecked( t );
throw new RuntimeException( t );
throwUnableToCleanRecover( t );
}
corruptedTransactionLogs = true;
}
Expand Down
Expand Up @@ -20,7 +20,9 @@
package org.neo4j.kernel.recovery;

import java.io.IOException;
import java.nio.channels.ClosedByInterruptException;

import org.neo4j.graphdb.factory.GraphDatabaseSettings;
import org.neo4j.kernel.impl.core.StartupStatisticsProvider;
import org.neo4j.kernel.impl.transaction.CommittedTransactionRepresentation;
import org.neo4j.kernel.impl.transaction.log.LogPosition;
Expand All @@ -30,7 +32,6 @@
import org.neo4j.kernel.impl.util.monitoring.ProgressReporter;
import org.neo4j.kernel.lifecycle.LifecycleAdapter;

import static org.neo4j.helpers.Exceptions.throwIfUnchecked;
import static org.neo4j.storageengine.api.TransactionApplicationMode.RECOVERY;
import static org.neo4j.storageengine.api.TransactionApplicationMode.REVERSE_RECOVERY;

Expand Down Expand Up @@ -116,12 +117,17 @@ public void init() throws IOException
recoveryToPosition = transactionsToRecover.position();
}
}
catch ( Error | ClosedByInterruptException e )
{
// We do not want to truncate logs based on these exceptions. Since users can influence them with config changes,
// they are able to work around this if truncation is really needed.
throw e;
}
catch ( Throwable t )
{
if ( failOnCorruptedLogFiles )
{
throwIfUnchecked( t );
throw new RuntimeException( t );
throwUnableToCleanRecover( t );
}
if ( lastTransaction != null )
{
Expand All @@ -142,6 +148,15 @@ public void init() throws IOException
monitor.recoveryCompleted( numberOfRecoveredTransactions );
}

/**
 * Always throws: wraps the given failure in a {@link RuntimeException} whose message explains
 * that the transaction logs could not be read and names the setting that can be set to
 * {@code false} to force a start-up anyway.
 *
 * @param t the underlying failure; it is preserved as the cause of the thrown exception
 * @throws RuntimeException unconditionally
 */
static void throwUnableToCleanRecover( Throwable t )
{
    // Look up the setting key once so the message always matches the actual configuration name.
    String settingName = GraphDatabaseSettings.fail_on_corrupted_log_files.name();
    String message = "Error reading transaction logs, recovery not possible. " +
            "To force the database to start anyway, you can specify '" + settingName + "=false'. " +
            "This will try to recover as much as possible and then truncate the corrupt part of the transaction log. " +
            "Doing this means your database integrity might be compromised, " +
            "please consider restoring from a consistent backup instead.";
    throw new RuntimeException( message, t );
}

private void initProgressReporter( RecoveryStartInformation recoveryStartInformation,
CommittedTransactionRepresentation lastReversedTransaction )
{
Expand Down
Expand Up @@ -20,7 +20,6 @@
package org.neo4j.kernel;

import org.hamcrest.Matchers;
import org.junit.After;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
Expand All @@ -40,6 +39,7 @@
import org.neo4j.graphdb.Node;
import org.neo4j.graphdb.RelationshipType;
import org.neo4j.graphdb.Transaction;
import org.neo4j.graphdb.factory.GraphDatabaseSettings;
import org.neo4j.helpers.collection.MultiSet;
import org.neo4j.io.fs.OpenMode;
import org.neo4j.io.fs.StoreChannel;
Expand Down Expand Up @@ -79,7 +79,6 @@
import org.neo4j.test.rule.RandomRule;
import org.neo4j.test.rule.TestDirectory;
import org.neo4j.test.rule.fs.DefaultFileSystemRule;
import org.neo4j.util.FeatureToggles;

import static org.hamcrest.Matchers.emptyArray;
import static org.hamcrest.Matchers.greaterThan;
Expand Down Expand Up @@ -114,12 +113,6 @@ public void setUp() throws Exception
logFiles = buildDefaultLogFiles();
}

@After
public void tearDown()
{
FeatureToggles.set( NeoStoreDataSource.class, "failOnCorruptedLogFiles", false );
}

@Test
public void evenTruncateNewerTransactionLogFile() throws IOException
{
Expand All @@ -136,7 +129,8 @@ public void evenTruncateNewerTransactionLogFile() throws IOException
removeLastCheckpointRecordFromLastLogFile();
addRandomBytesToLastLogFile( this::randomBytes );

database = (GraphDatabaseAPI) databaseFactory.newEmbeddedDatabase( storeDir );
database = (GraphDatabaseAPI) databaseFactory.newEmbeddedDatabaseBuilder( storeDir )
.setConfig( GraphDatabaseSettings.fail_on_corrupted_log_files, "false" ).newGraphDatabase();
database.shutdown();

assertEquals( numberOfClosedTransactions, recoveryMonitor.getNumberOfRecoveredTransactions() );
Expand All @@ -154,7 +148,6 @@ public void doNotTruncateNewerTransactionLogFileWhenFailOnError() throws IOExcep
removeLastCheckpointRecordFromLastLogFile();
addRandomBytesToLastLogFile( this::randomPositiveBytes );

FeatureToggles.set( NeoStoreDataSource.class, "failOnCorruptedLogFiles", true );
expectedException.expectCause( new RootCauseMatcher<>( UnsupportedLogVersionException.class ) );

database = (GraphDatabaseAPI) databaseFactory.newEmbeddedDatabase( storeDir );
Expand All @@ -176,7 +169,8 @@ public void truncateNewerTransactionLogFileWhenForced() throws IOException
removeLastCheckpointRecordFromLastLogFile();
addRandomBytesToLastLogFile( this::randomBytes );

database = (GraphDatabaseAPI) databaseFactory.newEmbeddedDatabase( storeDir );
database = (GraphDatabaseAPI) databaseFactory.newEmbeddedDatabaseBuilder( storeDir )
.setConfig( GraphDatabaseSettings.fail_on_corrupted_log_files, "false" ).newGraphDatabase();
database.shutdown();

logProvider.assertContainsMessageContaining( "Fail to read transaction log version 0." );
Expand All @@ -189,7 +183,8 @@ public void recoverFirstCorruptedTransactionSingleFileNoCheckpoint() throws IOEx
{
addCorruptedCommandsToLastLogFile();

GraphDatabaseService recoveredDatabase = databaseFactory.newEmbeddedDatabase( storeDir );
GraphDatabaseService recoveredDatabase = databaseFactory.newEmbeddedDatabaseBuilder( storeDir )
.setConfig( GraphDatabaseSettings.fail_on_corrupted_log_files, "false" ).newGraphDatabase();
recoveredDatabase.shutdown();

logProvider.assertContainsMessageContaining( "Fail to read transaction log version 0." );
Expand All @@ -210,7 +205,6 @@ public void failToRecoverFirstCorruptedTransactionSingleFileNoCheckpointIfFailOn
{
addCorruptedCommandsToLastLogFile();

FeatureToggles.set( NeoStoreDataSource.class, "failOnCorruptedLogFiles", true );
expectedException.expectCause( new RootCauseMatcher<>( NegativeArraySizeException.class ) );

GraphDatabaseService recoveredDatabase = databaseFactory.newEmbeddedDatabase( storeDir );
Expand Down Expand Up @@ -239,7 +233,8 @@ public void recoverNotAFirstCorruptedTransactionSingleFileNoCheckpoint() throws

assertThat( modifiedFileLength, greaterThan( originalFileLength ) );

database = (GraphDatabaseAPI) databaseFactory.newEmbeddedDatabase( storeDir );
database = (GraphDatabaseAPI) databaseFactory.newEmbeddedDatabaseBuilder( storeDir )
.setConfig( GraphDatabaseSettings.fail_on_corrupted_log_files, "false" ).newGraphDatabase();
database.shutdown();

logProvider.assertContainsMessageContaining( "Fail to read transaction log version 0." );
Expand Down Expand Up @@ -280,7 +275,8 @@ public void recoverNotAFirstCorruptedTransactionMultipleFilesNoCheckpoints() thr

assertThat( modifiedFileLength, greaterThan( originalFileLength ) );

database = (GraphDatabaseAPI) databaseFactory.newEmbeddedDatabase( storeDir );
database = (GraphDatabaseAPI) databaseFactory.newEmbeddedDatabaseBuilder( storeDir )
.setConfig( GraphDatabaseSettings.fail_on_corrupted_log_files, "false" ).newGraphDatabase();
database.shutdown();

logProvider.assertContainsMessageContaining( "Fail to read transaction log version 3." );
Expand Down Expand Up @@ -318,7 +314,8 @@ public void recoverNotAFirstCorruptedTransactionMultipleFilesMultipleCheckpoints

assertThat( modifiedFileLength, greaterThan( originalFileLength ) );

database = (GraphDatabaseAPI) databaseFactory.newEmbeddedDatabase( storeDir );
database = (GraphDatabaseAPI) databaseFactory.newEmbeddedDatabaseBuilder( storeDir )
.setConfig( GraphDatabaseSettings.fail_on_corrupted_log_files, "false" ).newGraphDatabase();
database.shutdown();

logProvider.assertContainsMessageContaining( "Fail to read transaction log version 3." );
Expand Down Expand Up @@ -349,7 +346,8 @@ public void recoverFirstCorruptedTransactionAfterCheckpointInLastLogFile() throw

assertThat( modifiedFileLength, greaterThan( originalFileLength ) );

database = (GraphDatabaseAPI) databaseFactory.newEmbeddedDatabase( storeDir );
database = (GraphDatabaseAPI) databaseFactory.newEmbeddedDatabaseBuilder( storeDir )
.setConfig( GraphDatabaseSettings.fail_on_corrupted_log_files, "false" ).newGraphDatabase();
database.shutdown();

logProvider.assertContainsMessageContaining( "Fail to read transaction log version 5." );
Expand Down Expand Up @@ -438,7 +436,7 @@ private void removeLastCheckpointRecordFromLastLogFile() throws IOException
LogPosition checkpointPosition = null;

LogFile transactionLogFile = logFiles.getLogFile();
VersionAwareLogEntryReader entryReader = new VersionAwareLogEntryReader();
VersionAwareLogEntryReader<ReadableLogChannel> entryReader = new VersionAwareLogEntryReader<>();
LogPosition startPosition = LogPosition.start( logFiles.getHighestLogVersion() );
try ( ReadableLogChannel reader = transactionLogFile.getReader( startPosition ) )
{
Expand Down Expand Up @@ -527,7 +525,7 @@ private MultiSet<Class> getLogEntriesDistribution( LogFiles logFiles ) throws IO
LogFile transactionLogFile = logFiles.getLogFile();

LogPosition fileStartPosition = new LogPosition( 0, LogHeader.LOG_HEADER_SIZE );
VersionAwareLogEntryReader entryReader = new VersionAwareLogEntryReader();
VersionAwareLogEntryReader<ReadableLogChannel> entryReader = new VersionAwareLogEntryReader<>();

MultiSet<Class> multiset = new MultiSet<>();
try ( ReadableLogChannel fileReader = transactionLogFile.getReader( fileStartPosition ) )
Expand Down
@@ -1,3 +1,25 @@
/*
* Copyright (c) 2002-2018 "Neo4j,"
* Neo4j Sweden AB [http://neo4j.com]
*
* This file is part of Neo4j Enterprise Edition. The included source
* code can be redistributed and/or modified under the terms of the
* GNU AFFERO GENERAL PUBLIC LICENSE Version 3
* (http://www.fsf.org/licensing/licenses/agpl-3.0.html) with the
* Commons Clause, as found in the associated LICENSE.txt file.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* Neo4j object code can be licensed independently from the source
* under separate terms from the AGPL. Inquiries can be directed to:
* licensing@neo4j.com
*
* More information is also available at:
* https://neo4j.com/licensing/
*/
package org.neo4j.causalclustering.discovery;

import org.junit.Test;
Expand All @@ -17,7 +39,7 @@

public class HazelcastCoreTopologyServiceTest
{
@Test( timeout = 120_000)
@Test( timeout = 120_000 )
public void shouldBeAbleToStartAndStoreWithoutSuccessfulJoin()
{
CentralJobScheduler jobScheduler = new CentralJobScheduler();
Expand All @@ -36,7 +58,6 @@ public void shouldBeAbleToStartAndStoreWithoutSuccessfulJoin()
hostnameResolver,
new TopologyServiceNoRetriesStrategy() );
service.start();
Thread.yield();
service.stop();
}

Expand Down

0 comments on commit 93fd175

Please sign in to comment.