-
Notifications
You must be signed in to change notification settings - Fork 2.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #6418 from jakewins/3.0-utf8-garbage
Introduce a custom UTF8 encoder if sun.misc tools are available.
- Loading branch information
Showing
7 changed files
with
313 additions
and
141 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
145 changes: 145 additions & 0 deletions
145
community/bolt/src/main/java/org/neo4j/bolt/v1/packstream/utf8/SunMiscUTF8Encoder.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,145 @@ | ||
/* | ||
* Copyright (c) 2002-2016 "Neo Technology," | ||
* Network Engine for Objects in Lund AB [http://neotechnology.com] | ||
* | ||
* This file is part of Neo4j. | ||
* | ||
* Neo4j is free software: you can redistribute it and/or modify | ||
* it under the terms of the GNU General Public License as published by | ||
* the Free Software Foundation, either version 3 of the License, or | ||
* (at your option) any later version. | ||
* | ||
* This program is distributed in the hope that it will be useful, | ||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
* GNU General Public License for more details. | ||
* | ||
* You should have received a copy of the GNU General Public License | ||
* along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
*/ | ||
package org.neo4j.bolt.v1.packstream.utf8; | ||
|
||
import java.lang.invoke.MethodHandle; | ||
import java.lang.invoke.MethodHandles; | ||
import java.lang.reflect.Field; | ||
import java.nio.ByteBuffer; | ||
import java.nio.charset.CharsetEncoder; | ||
import java.nio.charset.StandardCharsets; | ||
|
||
import static org.neo4j.unsafe.impl.internal.dragons.FeatureToggles.getInteger; | ||
|
||
/** | ||
* This is a specialized UTF-8 encoder that solves two predicaments: | ||
* | ||
* 1) There's no way using public APIs to do GC-free string encoding unless | ||
* you build a custom encoder, and GC output from UTF-8 encoding causes | ||
* production instability | ||
* 2) The ArrayEncoder provided by HotSpot is 2 orders faster for UTF-8 encoding | ||
* for a massive amount of real-world strings due to specialized handling of | ||
* ascii, and we can't import that since we need to compile on IBM J9 | ||
* | ||
* We can't solve (1) without solving (2), because the default GC-spewing String#getBytes() | ||
* uses the optimized ArrayEncoder, meaning it's easy to write an encoder that | ||
* is GC-free, but then it'll be two orders slower than the stdlib, and vice | ||
* versa. | ||
* | ||
* This solves both issues using MethodHandles. Future work here could include | ||
* writing a custom UTF-8 encoder (which could then avoid using ArrayEncoder), | ||
* as well as stopping use of String's for the main database paths. | ||
* We already have Token, which | ||
* could easily contain pre-encoded UTF-8 data, and "runtime" Strings could be | ||
* handled with a custom type that is more stability friendly, for instance | ||
* by building on to StringProperty. | ||
*/ | ||
public class SunMiscUTF8Encoder implements UTF8Encoder | ||
{ | ||
private static final int BUFFER_SIZE = getInteger( SunMiscUTF8Encoder.class, "buffer_size", 1024*16 ); | ||
private static final int fallbackAtStringLength = | ||
(int)(BUFFER_SIZE / StandardCharsets.UTF_8.newEncoder().averageBytesPerChar()); | ||
private static final MethodHandle getCharArray = charArrayGetter(); | ||
private static final MethodHandle arrayEncode = arrayEncode(); | ||
|
||
private final CharsetEncoder charsetEncoder = StandardCharsets.UTF_8.newEncoder(); | ||
|
||
private final byte[] out = new byte[BUFFER_SIZE]; | ||
private final ByteBuffer outBuf = ByteBuffer.wrap( out ); | ||
private final UTF8Encoder fallbackEncoder = new VanillaUTF8Encoder(); | ||
|
||
@Override | ||
public ByteBuffer encode( String input ) | ||
{ | ||
try | ||
{ | ||
// If it's unlikely we will fit the encoded data, just use stdlib encoder | ||
if( input.length() > fallbackAtStringLength ) | ||
{ | ||
return fallbackEncoder.encode( input ); | ||
} | ||
|
||
char[] rawChars = (char[]) getCharArray.invoke( input ); | ||
int len = (int)arrayEncode.invoke( charsetEncoder, rawChars, 0, rawChars.length, out ); | ||
|
||
if( len == -1 ) | ||
{ | ||
return fallbackEncoder.encode( input ); | ||
} | ||
|
||
outBuf.position(0); | ||
outBuf.limit(len); | ||
return outBuf; | ||
} | ||
catch( ArrayIndexOutOfBoundsException e ) | ||
{ | ||
// This happens when we can't fit the encoded string. | ||
// We try and avoid this altogether by falling back to the | ||
// vanilla encoder if the string looks like it'll not fit - | ||
// but this is probabilistic since we don't know until we've encoded. | ||
// So, if our guess is wrong, we fall back here instead. | ||
return fallbackEncoder.encode( input ); | ||
} | ||
catch ( Throwable e ) | ||
{ | ||
throw new AssertionError( "This encoder depends on sun.nio.cs.ArrayEncoder, which failed to load: " + | ||
e.getMessage(), e ); | ||
} | ||
} | ||
|
||
private static MethodHandle arrayEncode() | ||
{ | ||
// Because we need to be able to compile on IBM's JVM, we can't | ||
// depend on ArrayEncoder. Unfortunately, ArrayEncoders encode method | ||
// is twoish orders of magnitude faster than regular encoders for ascii | ||
// so we go through the hurdle of calling that encode method via | ||
// a MethodHandle. | ||
MethodHandles.Lookup lookup = MethodHandles.lookup(); | ||
try | ||
{ | ||
|
||
return lookup.unreflect( Class.forName( "sun.nio.cs.ArrayEncoder" ) | ||
.getMethod( "encode", char[].class, int.class, int.class, byte[].class ) ); | ||
} | ||
catch ( Throwable e ) | ||
{ | ||
throw new AssertionError( | ||
"This encoder depends on sun.nio.cs.ArrayEncoder, which failed to load: " + | ||
e.getMessage(), e ); | ||
} | ||
} | ||
|
||
private static MethodHandle charArrayGetter() | ||
{ | ||
MethodHandles.Lookup lookup = MethodHandles.lookup(); | ||
try | ||
{ | ||
Field value = String.class.getDeclaredField( "value" ); | ||
value.setAccessible( true ); | ||
return lookup.unreflectGetter( value ); | ||
} | ||
catch ( Throwable e ) | ||
{ | ||
throw new AssertionError( | ||
"This encoder depends being able to access raw char[] in java.lang.String, which failed: " + | ||
e.getMessage(), e ); | ||
} | ||
} | ||
} |
67 changes: 67 additions & 0 deletions
67
community/bolt/src/main/java/org/neo4j/bolt/v1/packstream/utf8/UTF8Encoder.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
/* | ||
* Copyright (c) 2002-2016 "Neo Technology," | ||
* Network Engine for Objects in Lund AB [http://neotechnology.com] | ||
* | ||
* This file is part of Neo4j. | ||
* | ||
* Neo4j is free software: you can redistribute it and/or modify | ||
* it under the terms of the GNU General Public License as published by | ||
* the Free Software Foundation, either version 3 of the License, or | ||
* (at your option) any later version. | ||
* | ||
* This program is distributed in the hope that it will be useful, | ||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
* GNU General Public License for more details. | ||
* | ||
* You should have received a copy of the GNU General Public License | ||
* along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
*/ | ||
package org.neo4j.bolt.v1.packstream.utf8; | ||
|
||
import java.nio.ByteBuffer; | ||
|
||
/** | ||
* A non-thread-safe UTF8 encoding interface, delegates to near-zero GC overhead | ||
* UTF8 implementations on HotSpot, falls back to stdlib encoder if on other JVM. | ||
* | ||
* This implementation solves a major GC bottleneck in that we don't have to | ||
* allocate objects to encode most strings. | ||
* | ||
* We currently do "bulk" encoding, where the whole string is turned | ||
* into UTF-8 before it gets returned. This is simply a limitation in | ||
* PackStream currently in that we need to know the length of utf-8 | ||
* strings up-front, so we can't stream them out. | ||
* | ||
* This becomes an issue for very large strings, and should be remedied | ||
* in Bolt V2 by introducing streaming options for Strings in the same | ||
* manner we've discussed adding streaming lists. | ||
* | ||
* Once that is resolved, we could have a method here that took a | ||
* WritableByteChannel or similar instead. | ||
*/ | ||
public interface UTF8Encoder | ||
{ | ||
/** | ||
* @return a ByteBuffer with the encoded string. This will be overwritten | ||
* the next time you call this method, so use it or loose it! | ||
*/ | ||
ByteBuffer encode( String input ); | ||
|
||
static UTF8Encoder fastestAvailableEncoder() | ||
{ | ||
try | ||
{ | ||
return (UTF8Encoder)Class | ||
.forName("org.neo4j.bolt.v1.packstream.utf8.SunMiscUTF8Encoder") | ||
.getConstructor() | ||
.newInstance(); | ||
} | ||
catch ( Exception e ) | ||
{ | ||
return new VanillaUTF8Encoder(); | ||
} | ||
} | ||
|
||
|
||
} |
35 changes: 35 additions & 0 deletions
35
community/bolt/src/main/java/org/neo4j/bolt/v1/packstream/utf8/VanillaUTF8Encoder.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
/* | ||
* Copyright (c) 2002-2016 "Neo Technology," | ||
* Network Engine for Objects in Lund AB [http://neotechnology.com] | ||
* | ||
* This file is part of Neo4j. | ||
* | ||
* Neo4j is free software: you can redistribute it and/or modify | ||
* it under the terms of the GNU General Public License as published by | ||
* the Free Software Foundation, either version 3 of the License, or | ||
* (at your option) any later version. | ||
* | ||
* This program is distributed in the hope that it will be useful, | ||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
* GNU General Public License for more details. | ||
* | ||
* You should have received a copy of the GNU General Public License | ||
* along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
*/ | ||
package org.neo4j.bolt.v1.packstream.utf8; | ||
|
||
import java.nio.ByteBuffer; | ||
import java.nio.charset.StandardCharsets; | ||
|
||
/** | ||
* Simple encoder that delegates to String#getBytes() | ||
*/ | ||
public class VanillaUTF8Encoder implements UTF8Encoder | ||
{ | ||
@Override | ||
public ByteBuffer encode( String input ) | ||
{ | ||
return ByteBuffer.wrap( input.getBytes( StandardCharsets.UTF_8 ) ); | ||
} | ||
} |
Oops, something went wrong.