Skip to content

Commit

Permalink
Merge pull request #6418 from jakewins/3.0-utf8-garbage
Browse files Browse the repository at this point in the history
Introduce a custom UTF8 encoder if sun.misc tools are available.
  • Loading branch information
Zhen Li committed Feb 19, 2016
2 parents c1ad817 + d051a20 commit a0e28c9
Show file tree
Hide file tree
Showing 7 changed files with 313 additions and 141 deletions.
Expand Up @@ -20,8 +20,11 @@
package org.neo4j.bolt.v1.packstream;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;

import org.neo4j.bolt.v1.packstream.utf8.UTF8Encoder;

/**
* PackStream is a messaging serialisation format heavily inspired by MessagePack.
* The key differences are in the type system itself which (among other things) replaces extensions with structures.
Expand Down Expand Up @@ -162,6 +165,7 @@ private PackStream()
public static class Packer
{
private PackOutput out;
private UTF8Encoder utf8 = UTF8Encoder.fastestAvailableEncoder();

public Packer( PackOutput out )
{
Expand All @@ -173,11 +177,6 @@ public void flush() throws IOException
out.flush();
}

/**
 * Hands the whole array to the output as-is; no marker or length header is written here.
 *
 * @param data the bytes to write, from index 0 through the end of the array
 * @throws IOException if the output fails to accept the bytes
 */
private void packRaw( byte[] data ) throws IOException
{
    final int length = data.length;
    out.writeBytes( data, 0, length );
}

public void packNull() throws IOException
{
out.writeByte( NULL );
Expand Down Expand Up @@ -217,50 +216,14 @@ public void pack( double value ) throws IOException
out.writeByte( FLOAT_64 ).writeDouble( value );
}

/**
 * Packs a byte array: a bytes header carrying the array length, followed by the raw content.
 * A {@code null} array is packed through {@link #packNull()} instead.
 *
 * @param values the bytes to pack, may be {@code null}
 * @throws IOException if writing to the output fails
 */
public void pack( byte[] values ) throws IOException
{
    if ( values == null )
    {
        packNull();
        return;
    }

    packBytesHeader( values.length );
    packRaw( values );
}

/**
 * Packs a string: a string header carrying the length of the UTF-8 encoded form,
 * followed by the encoded bytes. A {@code null} string is packed through
 * {@link #packNull()} instead.
 *
 * @param value the string to pack, may be {@code null}
 * @throws IOException if writing to the output fails
 */
public void pack( String value ) throws IOException
{
    if ( value == null )
    {
        packNull();
        return;
    }

    byte[] encoded = value.getBytes( StandardCharsets.UTF_8 );
    packStringHeader( encoded.length );
    packRaw( encoded );
}

/**
 * Packs bytes the caller supplies as string content: a string header carrying the array
 * length, followed by the bytes exactly as given. A {@code null} array is packed through
 * {@link #packNull()} instead.
 *
 * @param utf8 the string content to pack, may be {@code null}
 * @throws IOException if writing to the output fails
 */
public void packString( byte[] utf8 ) throws IOException
{
    if ( utf8 == null )
    {
        packNull();
        return;
    }

    packStringHeader( utf8.length );
    packRaw( utf8 );
}

private void packBytesHeader( int size ) throws IOException
{
if ( size <= Byte.MAX_VALUE )
{
out.writeShort( (short) (BYTES_8 << 8 | (byte) size) );
}
else if ( size <= Short.MAX_VALUE )
{
out.writeByte( BYTES_16 ).writeShort( (short) size );
}
else
{
out.writeByte( BYTES_32 ).writeInt( size );
ByteBuffer encoded = utf8.encode( value );
packStringHeader( encoded.remaining() );
out.writeBytes( encoded );
}
}

Expand Down Expand Up @@ -501,46 +464,6 @@ public String unpackString() throws IOException
return new String( unpackUTF8(), StandardCharsets.UTF_8 );
}

/**
 * Reads one marker byte and then the size field that the marker announces.
 *
 * @return the size read after the marker
 * @throws IOException if reading from the input fails, if the BYTES_32 size exceeds
 *         {@link Integer#MAX_VALUE}, or if the marker is none of the BYTES_* markers
 */
private int unpackBytesHeader() throws IOException
{
    final byte marker = in.readByte();

    switch ( marker )
    {
    case BYTES_8:
        return unpackUINT8();
    case BYTES_16:
        return unpackUINT16();
    case BYTES_32:
    {
        long wideSize = unpackUINT32();
        // The size field may be larger than anything a Java array can hold.
        if ( wideSize > Integer.MAX_VALUE )
        {
            throw new Overflow( "BYTES_32 too long for Java" );
        }
        return (int) wideSize;
    }
    default:
        throw new Unexpected( PackType.BYTES, marker );
    }
}

/**
 * Reads a bytes header and then as many raw bytes as that header announces.
 *
 * @return the bytes read
 * @throws IOException if reading the header or the content fails
 */
public byte[] unpackBytes() throws IOException
{
    return unpackRawBytes( unpackBytesHeader() );
}

public int unpackStringHeader() throws IOException
{
final byte markerByte = in.readByte();
Expand Down
Expand Up @@ -51,7 +51,10 @@ public PackOutput writeByte( byte value ) throws IOException
/**
 * Writes the remaining content of the buffer - everything from its current position
 * up to its limit - and leaves the buffer fully consumed.
 *
 * @param buffer the buffer whose remaining bytes are written
 * @return this output
 * @throws IOException if the underlying sink rejects a byte
 */
@Override
public PackOutput writeBytes( ByteBuffer buffer ) throws IOException
{
    // Only the bytes between position and limit are payload. Writing buffer.array() would
    // emit the entire backing array regardless of position/limit (and throws for buffers
    // without an accessible array), so the remaining bytes are copied and nothing else.
    while ( buffer.hasRemaining() )
    {
        data.writeByte( buffer.get() );
    }
    return this;
}

Expand Down
@@ -0,0 +1,145 @@
/*
* Copyright (c) 2002-2016 "Neo Technology,"
* Network Engine for Objects in Lund AB [http://neotechnology.com]
*
* This file is part of Neo4j.
*
* Neo4j is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.neo4j.bolt.v1.packstream.utf8;

import java.lang.invoke.MethodHandle;
import java.lang.invoke.MethodHandles;
import java.lang.reflect.Field;
import java.nio.ByteBuffer;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.StandardCharsets;

import static org.neo4j.unsafe.impl.internal.dragons.FeatureToggles.getInteger;

/**
* This is a specialized UTF-8 encoder that solves two predicaments:
*
* 1) There's no way using public APIs to do GC-free string encoding unless
* you build a custom encoder, and GC output from UTF-8 encoding causes
* production instability
* 2) The ArrayEncoder provided by HotSpot is two orders of magnitude faster for UTF-8
* encoding for a massive amount of real-world strings due to specialized handling of
* ASCII, and we can't import that since we need to compile on IBM J9
*
* We can't solve (1) without solving (2), because the default GC-spewing String#getBytes()
* uses the optimized ArrayEncoder, meaning it's easy to write an encoder that
* is GC-free, but then it'll be two orders of magnitude slower than the stdlib, and vice
* versa.
*
* This solves both issues using MethodHandles. Future work here could include
* writing a custom UTF-8 encoder (which could then avoid using ArrayEncoder),
* as well as stopping use of String's for the main database paths.
* We already have Token, which
* could easily contain pre-encoded UTF-8 data, and "runtime" Strings could be
* handled with a custom type that is more stability friendly, for instance
* by building on to StringProperty.
*/
public class SunMiscUTF8Encoder implements UTF8Encoder
{
    /** Size in bytes of the reusable output array, supplied by FeatureToggles.getInteger with a default of 1024*16. */
    private static final int BUFFER_SIZE = getInteger( SunMiscUTF8Encoder.class, "buffer_size", 1024*16 );

    /** Strings with more chars than this are handed straight to the fallback encoder. */
    private static final int MAX_FAST_PATH_CHARS =
            (int) (BUFFER_SIZE / StandardCharsets.UTF_8.newEncoder().averageBytesPerChar());

    /** Getter handle for the private "value" field of java.lang.String. */
    private static final MethodHandle STRING_VALUE_GETTER = lookupStringValueGetter();

    /** Handle for sun.nio.cs.ArrayEncoder#encode(char[], int, int, byte[]). */
    private static final MethodHandle ARRAY_ENCODE = lookupArrayEncode();

    private final CharsetEncoder charsetEncoder = StandardCharsets.UTF_8.newEncoder();

    // The encoded bytes land in this array; outBuf is a view over the same memory that is
    // re-positioned and handed out on every successful fast-path call.
    private final byte[] out = new byte[BUFFER_SIZE];
    private final ByteBuffer outBuf = ByteBuffer.wrap( out );
    private final UTF8Encoder fallbackEncoder = new VanillaUTF8Encoder();

    /**
     * Encodes the string into this encoder's own reusable buffer when it is expected to fit,
     * and delegates to the fallback encoder otherwise.
     *
     * @param input the string to encode
     * @return the shared buffer, positioned at 0 and limited to the encoded length, or
     *         whatever the fallback encoder returns when the fast path is not taken
     */
    @Override
    public ByteBuffer encode( String input )
    {
        try
        {
            // Only attempt the fast path when the encoded form is likely to fit in the buffer
            if ( input.length() <= MAX_FAST_PATH_CHARS )
            {
                char[] chars = (char[]) STRING_VALUE_GETTER.invoke( input );
                int encodedLength = (int) ARRAY_ENCODE.invoke( charsetEncoder, chars, 0, chars.length, out );

                if ( encodedLength != -1 )
                {
                    outBuf.position( 0 );
                    outBuf.limit( encodedLength );
                    return outBuf;
                }
            }

            return fallbackEncoder.encode( input );
        }
        catch ( ArrayIndexOutOfBoundsException e )
        {
            // The encoded string did not fit in the buffer. The length check above is only
            // an estimate - the real size is unknown until encoding has been attempted - so
            // a wrong guess ends up here and is served by the vanilla encoder instead.
            return fallbackEncoder.encode( input );
        }
        catch ( Throwable e )
        {
            throw new AssertionError( "This encoder depends on sun.nio.cs.ArrayEncoder, which failed to load: " +
                                      e.getMessage(), e );
        }
    }

    /**
     * Resolves the encode method of sun.nio.cs.ArrayEncoder by name.
     *
     * @throws AssertionError if the class or method cannot be resolved or accessed
     */
    private static MethodHandle lookupArrayEncode()
    {
        // ArrayEncoder cannot be referenced at compile time since this has to build on
        // IBM's JVM as well. Its encode method is roughly two orders of magnitude faster
        // than the regular encoders for ascii, which is why it is worth the trouble of
        // reaching it through a MethodHandle.
        MethodHandles.Lookup lookup = MethodHandles.lookup();
        try
        {
            Class<?> encoderType = Class.forName( "sun.nio.cs.ArrayEncoder" );
            return lookup.unreflect(
                    encoderType.getMethod( "encode", char[].class, int.class, int.class, byte[].class ) );
        }
        catch ( Throwable e )
        {
            throw new AssertionError(
                    "This encoder depends on sun.nio.cs.ArrayEncoder, which failed to load: " +
                    e.getMessage(), e );
        }
    }

    /**
     * Builds a getter handle for the private "value" field of java.lang.String.
     *
     * @throws AssertionError if the field cannot be found or made accessible
     */
    private static MethodHandle lookupStringValueGetter()
    {
        MethodHandles.Lookup lookup = MethodHandles.lookup();
        try
        {
            Field valueField = String.class.getDeclaredField( "value" );
            valueField.setAccessible( true );
            return lookup.unreflectGetter( valueField );
        }
        catch ( Throwable e )
        {
            throw new AssertionError(
                    "This encoder depends being able to access raw char[] in java.lang.String, which failed: " +
                    e.getMessage(), e );
        }
    }
}
@@ -0,0 +1,67 @@
/*
* Copyright (c) 2002-2016 "Neo Technology,"
* Network Engine for Objects in Lund AB [http://neotechnology.com]
*
* This file is part of Neo4j.
*
* Neo4j is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.neo4j.bolt.v1.packstream.utf8;

import java.nio.ByteBuffer;

/**
* A non-thread-safe UTF8 encoding interface, delegates to near-zero GC overhead
* UTF8 implementations on HotSpot, falls back to stdlib encoder if on other JVM.
*
* This implementation solves a major GC bottleneck in that we don't have to
* allocate objects to encode most strings.
*
* We currently do "bulk" encoding, where the whole string is turned
* into UTF-8 before it gets returned. This is simply a limitation in
* PackStream currently in that we need to know the length of utf-8
* strings up-front, so we can't stream them out.
*
* This becomes an issue for very large strings, and should be remedied
* in Bolt V2 by introducing streaming options for Strings in the same
* manner we've discussed adding streaming lists.
*
* Once that is resolved, we could have a method here that took a
* WritableByteChannel or similar instead.
*/
public interface UTF8Encoder
{
    /**
     * Encodes the given string as UTF-8.
     *
     * @param input the string to encode
     * @return a ByteBuffer with the encoded string. This will be overwritten
     * the next time you call this method, so use it or lose it!
     */
    ByteBuffer encode( String input );

    /**
     * Instantiates SunMiscUTF8Encoder reflectively and returns it; if that is not possible
     * on the running JVM, a {@link VanillaUTF8Encoder} is returned instead.
     *
     * @return a new encoder instance, never {@code null}
     */
    static UTF8Encoder fastestAvailableEncoder()
    {
        try
        {
            return (UTF8Encoder)Class
                    .forName("org.neo4j.bolt.v1.packstream.utf8.SunMiscUTF8Encoder")
                    .getConstructor()
                    .newInstance();
        }
        catch ( Exception | LinkageError e )
        {
            // SunMiscUTF8Encoder resolves its JVM-internal dependencies in static initialisers
            // and reports failure with an AssertionError. Class.forName surfaces that as
            // ExceptionInInitializerError (NoClassDefFoundError on later attempts); both are
            // LinkageErrors, not Exceptions, so catching Exception alone would let them
            // escape and the fallback below would never be used.
            return new VanillaUTF8Encoder();
        }
    }
}
@@ -0,0 +1,35 @@
/*
* Copyright (c) 2002-2016 "Neo Technology,"
* Network Engine for Objects in Lund AB [http://neotechnology.com]
*
* This file is part of Neo4j.
*
* Neo4j is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.neo4j.bolt.v1.packstream.utf8;

import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;

/**
* Simple encoder that delegates to String#getBytes()
*/
public class VanillaUTF8Encoder implements UTF8Encoder
{
    /**
     * Encodes via {@link String#getBytes(java.nio.charset.Charset)} and wraps the resulting array.
     *
     * @param input the string to encode
     * @return a buffer wrapping the freshly encoded bytes
     */
    @Override
    public ByteBuffer encode( String input )
    {
        byte[] encoded = input.getBytes( StandardCharsets.UTF_8 );
        return ByteBuffer.wrap( encoded );
    }
}

0 comments on commit a0e28c9

Please sign in to comment.