Skip to content

Commit

Permalink
Don't log tx terminated stack on client disconnect
Browse files Browse the repository at this point in the history
When a client disconnects mid-operation, the bolt server
stack will call "halt" on the state machine. That will, in turn,
terminate any running transaction, and the error from that termination
will propagate up the stack in the worker thread, which will then
try to write the error back to the client.

TL;DR: Any time a connection is closed mid-operation, a massive
stack trace fills up the debug log.

These become reasonably common in production logs, for instance
if users of the Neo4j Browser just close the window in the middle of a query.

The problem isn't *harmless* - it could be indicative of a misconfigured
application that is exiting before a query completes, which might mean
it's losing writes.

Hence, don't get rid of the log message entirely, but replace the big
stack dump with a more helpful `warn` message explaining how to handle.
  • Loading branch information
jakewins committed Mar 16, 2018
1 parent 90ed89a commit 6eb10c8
Show file tree
Hide file tree
Showing 4 changed files with 89 additions and 44 deletions.
Expand Up @@ -20,21 +20,28 @@
package org.neo4j.bolt.v1.messaging; package org.neo4j.bolt.v1.messaging;


import java.io.IOException; import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet;
import java.util.Map; import java.util.Map;
import java.util.Set;


import org.neo4j.bolt.runtime.BoltConnection; import org.neo4j.bolt.runtime.BoltConnection;
import org.neo4j.bolt.v1.packstream.PackOutputClosedException; import org.neo4j.bolt.v1.packstream.PackOutputClosedException;
import org.neo4j.bolt.v1.runtime.BoltResponseHandler; import org.neo4j.bolt.v1.runtime.BoltResponseHandler;
import org.neo4j.bolt.v1.runtime.Neo4jError; import org.neo4j.bolt.v1.runtime.Neo4jError;
import org.neo4j.bolt.v1.runtime.spi.BoltResult; import org.neo4j.bolt.v1.runtime.spi.BoltResult;
import org.neo4j.kernel.api.exceptions.Status;
import org.neo4j.logging.Log; import org.neo4j.logging.Log;
import org.neo4j.values.AnyValue; import org.neo4j.values.AnyValue;
import org.neo4j.values.virtual.MapValue; import org.neo4j.values.virtual.MapValue;
import org.neo4j.values.virtual.VirtualValues; import org.neo4j.values.virtual.VirtualValues;


class MessageProcessingHandler implements BoltResponseHandler class MessageProcessingHandler implements BoltResponseHandler
{ {
// Errors that are expected when the client disconnects mid-operation
private static final Set<Status> CLIENT_MID_OP_DISCONNECT_ERRORS = new HashSet<>( Arrays.asList(
Status.Transaction.Terminated, Status.Transaction.LockClientStopped ) );
protected final Map<String,AnyValue> metadata = new HashMap<>(); protected final Map<String,AnyValue> metadata = new HashMap<>();


protected final Log log; protected final Log log;
Expand Down Expand Up @@ -139,8 +146,23 @@ private void publishError( BoltResponseMessageHandler<IOException> out, Neo4jErr
} }
catch ( PackOutputClosedException e ) catch ( PackOutputClosedException e )
{ {
// we tried to write error back to the client and realized that the underlying channel is closed // Can't write error to the client, because the connection is closed.
// log a warning, client driver might have just been stopped and closed all socket connections // Very likely our error is related to the connection being closed.

// If the error is that the transaction was terminated, then the error is a side-effect of
// us cleaning up stuff that was running when the client disconnected. Log a warning without
// stack trace to highlight clients are disconnecting while stuff is running:
if ( CLIENT_MID_OP_DISCONNECT_ERRORS.contains( error.status() ) )
{
log.warn( "Client %s disconnected while query was running. Session has been cleaned up. " +
"This can be caused by temporary network problems, but if you see this often, " +
"ensure your applications are properly waiting for operations to complete before exiting.",
e.clientAddress() );
return;
}

// If the error isn't that the tx was terminated, log it to the console for debugging. It's likely
// there are other "ok" errors that we can whitelist into the conditional above over time.
log.warn( "Unable to send error back to the client. " + e.getMessage(), error.cause() ); log.warn( "Unable to send error back to the client. " + e.getMessage(), error.cause() );
} }
catch ( Throwable t ) catch ( Throwable t )
Expand Down
Expand Up @@ -23,8 +23,16 @@


public class PackOutputClosedException extends IOException public class PackOutputClosedException extends IOException
{ {
public PackOutputClosedException( String message ) private final String clientAddress;

public PackOutputClosedException( String message, String clientAddress )
{ {
super( message ); super( message );
this.clientAddress = clientAddress;
}

public String clientAddress()
{
return clientAddress;
} }
} }
Expand Up @@ -50,9 +50,9 @@ public class ChunkedOutput implements PackOutput, BoltResponseMessageBoundaryHoo
private final int maxChunkSize; private final int maxChunkSize;
private final AtomicBoolean closed = new AtomicBoolean( false ); private final AtomicBoolean closed = new AtomicBoolean( false );
private final TransportThrottleGroup throttleGroup; private final TransportThrottleGroup throttleGroup;
private final Channel channel;


private ByteBuf buffer; private ByteBuf buffer;
private Channel channel;
private int currentChunkHeaderOffset; private int currentChunkHeaderOffset;


/** Are currently in the middle of writing a chunk? */ /** Are currently in the middle of writing a chunk? */
Expand Down Expand Up @@ -186,7 +186,9 @@ private void ensure( int size ) throws IOException
assert size <= maxChunkSize : size + " > " + maxChunkSize; assert size <= maxChunkSize : size + " > " + maxChunkSize;
if ( closed.get() ) if ( closed.get() )
{ {
throw new PackOutputClosedException( "Network channel towards " + channel.remoteAddress() + " is closed. " + "Client has probably been stopped." ); throw new PackOutputClosedException(
String.format( "Network channel towards %s is closed. Client has probably been stopped.",
channel.remoteAddress() ), String.format( "%s", channel.remoteAddress() ) );
} }
int toWriteSize = chunkOpen ? size : size + CHUNK_HEADER_SIZE; int toWriteSize = chunkOpen ? size : size + CHUNK_HEADER_SIZE;
synchronized ( this ) synchronized ( this )
Expand Down
Expand Up @@ -26,6 +26,7 @@
import org.neo4j.bolt.runtime.BoltConnection; import org.neo4j.bolt.runtime.BoltConnection;
import org.neo4j.bolt.v1.packstream.PackOutputClosedException; import org.neo4j.bolt.v1.packstream.PackOutputClosedException;
import org.neo4j.bolt.v1.runtime.Neo4jError; import org.neo4j.bolt.v1.runtime.Neo4jError;
import org.neo4j.graphdb.TransactionTerminatedException;
import org.neo4j.kernel.api.exceptions.Status; import org.neo4j.kernel.api.exceptions.Status;
import org.neo4j.logging.AssertableLogProvider; import org.neo4j.logging.AssertableLogProvider;
import org.neo4j.logging.Log; import org.neo4j.logging.Log;
Expand All @@ -49,14 +50,12 @@ public void shouldCallHaltOnUnexpectedFailures() throws Exception
{ {
// Given // Given
BoltResponseMessageHandler<IOException> msgHandler = newResponseHandlerMock(); BoltResponseMessageHandler<IOException> msgHandler = newResponseHandlerMock();
doThrow( new RuntimeException( "Something went horribly wrong" ) ) doThrow( new RuntimeException( "Something went horribly wrong" ) ).when( msgHandler ).onSuccess(
.when( msgHandler ) any( MapValue.class ) );
.onSuccess( any( MapValue.class ) );


BoltConnection connection = mock( BoltConnection.class ); BoltConnection connection = mock( BoltConnection.class );
MessageProcessingHandler handler = MessageProcessingHandler handler =
new MessageProcessingHandler( msgHandler, mock( Runnable.class ), new MessageProcessingHandler( msgHandler, mock( Runnable.class ), connection, mock( Log.class ) );
connection, mock( Log.class ) );


// When // When
handler.onFinish(); handler.onFinish();
Expand All @@ -68,70 +67,84 @@ public void shouldCallHaltOnUnexpectedFailures() throws Exception
@Test @Test
public void shouldLogOriginalErrorWhenOutputIsClosed() throws Exception public void shouldLogOriginalErrorWhenOutputIsClosed() throws Exception
{ {
testLoggingOfOriginalErrorWhenOutputIsClosed( false ); testLoggingOfOriginalErrorWhenOutputIsClosed( Neo4jError.from( new RuntimeException( "Non-fatal error" ) ) );
} }


@Test @Test
public void shouldLogOriginalFatalErrorWhenOutputIsClosed() throws Exception public void shouldLogOriginalFatalErrorWhenOutputIsClosed() throws Exception
{ {
testLoggingOfOriginalErrorWhenOutputIsClosed( true ); testLoggingOfOriginalErrorWhenOutputIsClosed( Neo4jError.fatalFrom( new RuntimeException( "Fatal error" ) ) );
} }


@Test @Test
public void shouldLogWriteErrorAndOriginalErrorWhenUnknownFailure() throws Exception public void shouldLogWriteErrorAndOriginalErrorWhenUnknownFailure() throws Exception
{ {
testLoggingOfWriteErrorAndOriginalErrorWhenUnknownFailure( false ); testLoggingOfWriteErrorAndOriginalErrorWhenUnknownFailure(
Neo4jError.from( new RuntimeException( "Non-fatal error" ) ) );
} }


@Test @Test
public void shouldLogWriteErrorAndOriginalFatalErrorWhenUnknownFailure() throws Exception public void shouldLogWriteErrorAndOriginalFatalErrorWhenUnknownFailure() throws Exception
{ {
testLoggingOfWriteErrorAndOriginalErrorWhenUnknownFailure( true ); testLoggingOfWriteErrorAndOriginalErrorWhenUnknownFailure(
Neo4jError.fatalFrom( new RuntimeException( "Fatal error" ) ) );
} }


private static void testLoggingOfOriginalErrorWhenOutputIsClosed( boolean fatalError ) throws Exception @Test
public void shouldLogShortWarningOnClientDisconnectMidwayThroughQuery() throws Exception
{ {
AssertableLogProvider logProvider = new AssertableLogProvider(); // Connections dying is not exceptional per-se, so we don't need to fill the log with
Log log = logProvider.getLog( "Test" ); // eye-catching stack traces; but it could be indicative of some issue, so log a brief

// warning in the debug log at least.
PackOutputClosedException outputClosed = new PackOutputClosedException( "Output closed" );
BoltResponseMessageHandler<IOException> responseHandler = newResponseHandlerMock( fatalError, outputClosed );


MessageProcessingHandler handler = new MessageProcessingHandler( responseHandler, mock( Runnable.class ), // Given
mock( BoltConnection.class ), log ); PackOutputClosedException outputClosed = new PackOutputClosedException( "Output closed", "<client>" );
Neo4jError txTerminated =
Neo4jError.from( new TransactionTerminatedException( Status.Transaction.Terminated ) );


RuntimeException originalError = new RuntimeException( "Hi, I'm the original error" ); // When
markFailed( handler, fatalError, originalError ); AssertableLogProvider logProvider = emulateFailureWritingError( txTerminated, outputClosed );


logProvider.assertExactly( inLog( "Test" ).warn( // Then
startsWith( "Unable to send error back to the client" ), logProvider.assertExactly( inLog( "Test" ).warn( equalTo(
equalTo( originalError ) ) ); "Client %s disconnected while query was running. Session has been cleaned up. " +
"This can be caused by temporary network problems, but if you see this often, ensure your " +
"applications are properly waiting for operations to complete before exiting." ),
equalTo( "<client>" ) ) );
} }


private static void testLoggingOfWriteErrorAndOriginalErrorWhenUnknownFailure( boolean fatalError ) throws Exception private static void testLoggingOfOriginalErrorWhenOutputIsClosed( Neo4jError original ) throws Exception
{ {
AssertableLogProvider logProvider = new AssertableLogProvider(); PackOutputClosedException outputClosed = new PackOutputClosedException( "Output closed", "<client>" );
Log log = logProvider.getLog( "Test" ); AssertableLogProvider logProvider = emulateFailureWritingError( original, outputClosed );
logProvider.assertExactly( inLog( "Test" ).warn( startsWith( "Unable to send error back to the client" ),
equalTo( original.cause() ) ) );
}


private static void testLoggingOfWriteErrorAndOriginalErrorWhenUnknownFailure( Neo4jError original )
throws Exception
{
RuntimeException outputError = new RuntimeException( "Output failed" ); RuntimeException outputError = new RuntimeException( "Output failed" );
BoltResponseMessageHandler<IOException> responseHandler = newResponseHandlerMock( fatalError, outputError ); AssertableLogProvider logProvider = emulateFailureWritingError( original, outputError );

logProvider.assertExactly( inLog( "Test" ).error( startsWith( "Unable to send error back to the client" ),
MessageProcessingHandler handler = new MessageProcessingHandler( responseHandler, mock( Runnable.class ), both( equalTo( outputError ) ).and( hasSuppressed( original.cause() ) ) ) );
mock( BoltConnection.class ), log );

RuntimeException originalError = new RuntimeException( "Hi, I'm the original error" );
markFailed( handler, fatalError, originalError );

logProvider.assertExactly( inLog( "Test" ).error(
startsWith( "Unable to send error back to the client" ),
both( equalTo( outputError ) ).and( hasSuppressed( originalError ) ) ) );
} }


private static void markFailed( MessageProcessingHandler handler, boolean fatalError, Throwable error ) private static AssertableLogProvider emulateFailureWritingError( Neo4jError error, Throwable errorDuringWrite )
throws Exception
{ {
Neo4jError neo4jError = fatalError ? Neo4jError.fatalFrom( error ) : Neo4jError.from( error ); AssertableLogProvider logProvider = new AssertableLogProvider();
handler.markFailed( neo4jError ); BoltResponseMessageHandler<IOException> responseHandler =
newResponseHandlerMock( error.isFatal(), errorDuringWrite );

MessageProcessingHandler handler =
new MessageProcessingHandler( responseHandler, mock( Runnable.class ), mock( BoltConnection.class ),
logProvider.getLog( "Test" ) );

handler.markFailed( error );
handler.onFinish(); handler.onFinish();

return logProvider;
} }


private static BoltResponseMessageHandler<IOException> newResponseHandlerMock( boolean fatalError, Throwable error ) private static BoltResponseMessageHandler<IOException> newResponseHandlerMock( boolean fatalError, Throwable error )
Expand Down

0 comments on commit 6eb10c8

Please sign in to comment.