Skip to content

Commit

Permalink
Faster decoding of short strings
Browse files Browse the repository at this point in the history
by not using Bits, which means a bit more tailored decoding code
as well as skipping one data copy and the Bits instance itself.
  • Loading branch information
tinwelint committed May 10, 2016
1 parent f1aaf17 commit 9066806
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 36 deletions.
Expand Up @@ -194,8 +194,7 @@ long longValue()
String shortStringValue()
{
assertOfType( SHORT_STRING );
Bits bits = valueAsBits();
return LongerShortString.decode( bits );
return LongerShortString.decode( data, position, currentBlocksUsed() );
}

String stringValue()
Expand Down Expand Up @@ -356,9 +355,6 @@ private static Object readArrayFromBuffer( ByteBuffer buffer )

private void assertOfType( PropertyType expected )
{
if ( type() != expected )
{
throw new IllegalStateException( "Expected type " + expected + " but was " + type() );
}
assert type() == expected : "Expected type " + expected + " but was " + type();
}
}
Expand Up @@ -497,6 +497,7 @@ char decTranslate( byte codePoint )
public static final int ALL_BIT_MASK = bitMask( LongerShortString.values() );
public static final int ENCODING_UTF8 = 0;
public static final int ENCODING_LATIN1 = 10;
private static final int HEADER_SIZE = 39; // bits

final int encodingHeader;
final long mask;
Expand Down Expand Up @@ -763,37 +764,48 @@ private static void writeHeader( Bits bits, int keyId, int encoding, int stringL
*/
public static String decode( PropertyBlock block )
{
Bits bits = Bits.bitsFromLongs( block.getValueBlocks() );
return decode( bits );
return decode( block.getValueBlocks(), 0, block.getValueBlocks().length );
}

public static String decode(Bits bits)
public static String decode( long[] blocks, int offset, int length )
{
long firstLong = bits.getLongs()[0];
long firstLong = blocks[offset];
if ( ( firstLong & 0xFFFFFF0FFFFFFFFFL ) == 0 ) return "";
bits.getInt( 24 ); // Get rid of the key
bits.getByte( 4 ); // Get rid of the type
int encoding = bits.getByte( 5 ); //(int) ( ( firstLong & 0xF00000000L ) >>> 32 );
int stringLength = bits.getByte( 6 ); //(int) ( ( firstLong & 0xFC000000L ) >>> 26 );
if ( encoding == LongerShortString.ENCODING_UTF8 ) return decodeUTF8( bits, stringLength );
if ( encoding == ENCODING_LATIN1 ) return decodeLatin1( bits, stringLength );
// key(24b) + type(4) = 28
int encoding = (int) ((firstLong & 0x1F0000000L) >>> 28); // 5 bits of encoding
int stringLength = (int) ((firstLong & 0x7E00000000L) >>> 33); // 6 bits of stringLength
if ( encoding == LongerShortString.ENCODING_UTF8 ) return decodeUTF8( blocks, offset, stringLength );
if ( encoding == ENCODING_LATIN1 ) return decodeLatin1( blocks, offset, stringLength );

LongerShortString table = getEncodingTable( encoding );
assert table != null: "We only decode LongerShortStrings after we have consistently read the PropertyBlock " +
"data from the page cache. Thus, we should never have an invalid encoding header here.";
char[] result = new char[stringLength];
// encode shifts in the bytes with the first char at the MSB, therefore
// we must "unshift" in the reverse order
for ( int i = 0; i < stringLength; i++ )
{
byte codePoint = bits.getByte( table.step );
result[i] = table.decTranslate( codePoint );
}
decode( result, blocks, offset, table );

// We know the char array is unshared, so use sharing constructor explicitly
return UnsafeUtil.newSharedArrayString( result );
}

/**
 * Unpacks {@code result.length} code points from {@code blocks} into {@code result}.
 * Reading starts {@code HEADER_SIZE} bits into {@code blocks[offset]}; every code point
 * occupies {@code table.step} bits and is mapped to a char by {@code table.decTranslate}.
 * A code point whose bits run past bit 63 of the current block gets its remaining
 * high-order bits from the low end of the following block.
 *
 * @param result destination array; its length decides how many code points are read
 * @param blocks array of longs holding the packed data
 * @param offset index in {@code blocks} of the first block, the one carrying the header
 * @param table encoding table supplying the step, the mask and the char translation
 */
private static void decode( char[] result, long[] blocks, int offset, LongerShortString table )
{
    final int bitsPerChar = table.step;
    // NOTE(review): presumably table.mask has exactly the low table.step bits set - confirm
    final long charMask = table.mask;
    int blockIndex = offset;
    int bitPosition = HEADER_SIZE;
    int charIndex = 0;
    while ( charIndex < result.length )
    {
        // The part of this code point that is stored in the current block
        long bits = (blocks[blockIndex] >>> bitPosition) & charMask;
        bitPosition += bitsPerChar;

        if ( bitPosition >= 64 && blockIndex + 1 < blocks.length )
        {
            // Moved past the end of the current block: wrap the bit position and OR in
            // whatever is left of the code point from the start of the next block
            bitPosition %= 64;
            int bitsAlreadyRead = bitsPerChar - bitPosition;
            blockIndex++;
            bits |= (blocks[blockIndex] & (charMask >>> bitsAlreadyRead)) << bitsAlreadyRead;
        }

        // Narrow once here; only the low 8 bits reach decTranslate
        result[charIndex++] = table.decTranslate( (byte) bits );
    }
}

// lookup table by encoding header
// +2 because of ENCODING_LATIN1 gap and one based index
Expand Down Expand Up @@ -896,22 +908,40 @@ private void translateData(Bits bits, byte[] data, int length, final int step)
}
}

private static String decodeLatin1( Bits bits, int stringLength )
{ // see decode
private static String decodeLatin1( long[] blocks, int offset, int stringLength )
{
char[] result = new char[stringLength];
for ( int i = 0; i < stringLength; i++ )
int block = offset;
int maskShift = HEADER_SIZE;
for ( int i = 0; i < result.length; i++ )
{
result[i] = (char) bits.getShort( 8 );
char codePoint = (char) ((blocks[block] >>> maskShift) & 0xFF);
maskShift += 8;
if ( maskShift >= 64 )
{
maskShift %= 64;
codePoint |= (blocks[++block] & (0xFF >>> (8-maskShift))) << (8-maskShift);
}
result[i] = codePoint;
}
return new String( result );
return UnsafeUtil.newSharedArrayString( result );
}

private static String decodeUTF8( Bits bits, int stringLength )
private static String decodeUTF8( long[] blocks, int offset, int stringLength )
{
byte[] result = new byte[stringLength];
for ( int i = 0; i < stringLength; i++ )
int block = offset;
int maskShift = HEADER_SIZE;
for ( int i = 0; i < result.length; i++ )
{
result[i] = bits.getByte();
byte codePoint = (byte) (blocks[block] >>> maskShift);
maskShift += 8;
if ( maskShift >= 64 )
{
maskShift %= 64;
codePoint |= (blocks[++block] & (0xFF >>> (8-maskShift))) << (8-maskShift);
}
result[i] = codePoint;
}
try
{
Expand All @@ -930,13 +960,6 @@ public static int calculateNumberOfBlocksUsed( long firstBlock )
*/
int encoding = (int) ( ( firstBlock & 0x1F0000000L ) >> 28 );
int length = (int) ( ( firstBlock & 0x7E00000000L ) >> 33 );
/*
Bits bits = Bits.bitsFromLongs( new long[] {firstBlock} );
bits.getInt( 24 ); // key
bits.getByte( 4 ); // type
int encoding = bits.getByte( 5 );
int length = bits.getByte( 6 );
*/
if ( encoding == ENCODING_UTF8 || encoding == ENCODING_LATIN1 )
{
return calculateNumberOfBlocksUsedForStep8(length);
Expand Down

0 comments on commit 9066806

Please sign in to comment.