Skip to content

Commit

Permalink
Reduced overhead of dynamic-size tree node format
Browse files Browse the repository at this point in the history
The dynamic tree node has an offset array, which is 2 bytes per entry.
It also had 2 bytes key size and 2 bytes value size. These key/value sizes
is what this PR reduces, from 4 down to a minimum of 1 byte.

Also since the dynamic tree node format hasn't been released yet the
committed tree to compare with is also updated to this new format.
  • Loading branch information
burqen authored and tinwelint committed Feb 7, 2018
1 parent f6a87ae commit 0e1817b
Show file tree
Hide file tree
Showing 6 changed files with 340 additions and 88 deletions.
Expand Up @@ -28,28 +28,73 @@
* Gather utility methods for reading and writing individual dynamic sized
* keys. It thus define the layout for:
* - Key pointer in offset array (K*), 2B
* - Key size, 2B
* - Value size, 2B
* - Key tombstone, first bit in key size
* - keyValueSize, 1B-4B
* - Key tombstone, first bit in keyValueSize
*
* Format of key/value size is dynamic in itself, first byte being:
* <pre>
* [T,K,V,k,k,k,k,k]
* </pre>
* If {@code T} is set key is dead.
* If {@code K} is set the next byte contains the higher order bits of the key size.
* If {@code V} is set there is a value size to be read directly after key size.
* This first byte can fit key size < 32 and we only need the second byte if key size is larger.
* Together with the second byte we can fit key size < 8192.
*
* Byte following key size bytes (second or third byte depending on how many bytes needed for key size):
* <pre>
* [V,v,v,v,v,v,v,v]
* </pre>
* If {@code V} is set the next byte contains the higher order bits of the value size.
* This first value size byte can fit value size < 128 and with the second byte we can fit value size < 32768.
*
* So in total key/value size has six different looks (not including tombstone being set or not set):
* <pre>
* One byte key, no value
* [0,0,0,k,k,k,k,k]
*
* One byte key, one byte value
* [0,0,1,k,k,k,k,k][0,v,v,v,v,v,v,v]
*
* One byte key, two byte value
* [0,0,1,k,k,k,k,k][1,v,v,v,v,v,v,v][v,v,v,v,v,v,v,v]
*
* Two byte key, no value
* [0,1,0,k,k,k,k,k][k,k,k,k,k,k,k,k]
*
* Two byte key, one byte value
* [0,1,1,k,k,k,k,k][k,k,k,k,k,k,k,k][0,v,v,v,v,v,v,v]
*
* Two byte key, two byte value
* [0,1,1,k,k,k,k,k][k,k,k,k,k,k,k,k][1,v,v,v,v,v,v,v][v,v,v,v,v,v,v,v]
* </pre>
* This key/value size format is used, both for leaves and internal nodes even though internal nodes can never have values.
*
* Relative layout of key and key_value
* KeyOffset points to the exact offset where key entry or key_value entry
* can be read.
* key entry - [keySize 2B|actualKey]
* key_value entry - [keySize 2B|valueSize 2B|actualKey|actualValue]
* key entry - [keyValueSize 1B-2B|actualKey]
* key_value entry - [keyValueSize 1B-4B|actualKey|actualValue]
*
* Tombstone
* First bit in keySize is used as a tombstone, set to 1 if key is dead.
* This leaves 15 bits for actual size -> max possible key size is 32768 bytes
* which is more than page size and therefore be enough.
* First bit in keyValueSize is used as a tombstone, set to 1 if key is dead.
*/
class DynamicSizeUtil
{
static final int SIZE_OFFSET = 2;
static final int SIZE_KEY_SIZE = 2;
static final int SIZE_VALUE_SIZE = 2;
static final int SIZE_TOTAL_OVERHEAD = SIZE_OFFSET + SIZE_KEY_SIZE + SIZE_VALUE_SIZE;
private static final int FLAG_TOMBSTONE = 0x8000;

private static final int FLAG_FIRST_BYTE_TOMBSTONE = 0x80;
private static final long FLAG_READ_TOMBSTONE = 0x80000000_00000000L;
private static final int MASK_ONE_BYTE_KEY_SIZE = 0x1F;
private static final int MASK_ONE_BYTE_VALUE_SIZE = 0x7F;
private static final int FLAG_HAS_VALUE_SIZE = 0x20;
private static final int FLAG_ADDITIONAL_KEY_SIZE = 0x40;
private static final int FLAG_ADDITIONAL_VALUE_SIZE = 0x80;
private static final int SHIFT_LSB_KEY_SIZE = 5;
private static final int SHIFT_LSB_VALUE_SIZE = 7;

static void putKeyOffset( PageCursor cursor, int keyOffset )
{
Expand All @@ -63,28 +108,110 @@ static int readKeyOffset( PageCursor cursor )

static void putKeySize( PageCursor cursor, int keySize )
{
putUnsignedShort( cursor, keySize );
putKeyValueSize( cursor, keySize, 0 );
}

static void putValueSize( PageCursor cursor, int valueSize )
static void putKeyValueSize( PageCursor cursor, int keySize, int valueSize )
{
putUnsignedShort( cursor, valueSize );
boolean hasAdditionalKeySize = keySize > MASK_ONE_BYTE_KEY_SIZE;
boolean hasValueSize = valueSize > 0;

// Key size
{
byte firstByte = (byte) (keySize & MASK_ONE_BYTE_KEY_SIZE); // Least significant 5 bits
if ( hasAdditionalKeySize )
{
firstByte |= FLAG_ADDITIONAL_KEY_SIZE;
}
if ( hasValueSize )
{
firstByte |= FLAG_HAS_VALUE_SIZE;
}
cursor.putByte( firstByte );

if ( hasAdditionalKeySize )
{
// Assuming no key size larger than 4k
cursor.putByte( (byte) (keySize >> SHIFT_LSB_KEY_SIZE) );
}
}

// Value size
{
if ( hasValueSize )
{
boolean needsAdditionalValueSize = valueSize > MASK_ONE_BYTE_VALUE_SIZE;
byte firstByte = (byte) (valueSize & MASK_ONE_BYTE_VALUE_SIZE); // Least significant 7 bits
if ( needsAdditionalValueSize )
{
firstByte |= FLAG_ADDITIONAL_VALUE_SIZE;
}
cursor.putByte( firstByte );

if ( needsAdditionalValueSize )
{
// Assuming no value size larger than 16k
cursor.putByte( (byte) (valueSize >> SHIFT_LSB_VALUE_SIZE) );
}
}
}
}

/**
* Read key size including possible tombstone.
* Check for tombstone with {@link #hasTombstone(int)}.
* @param cursor On offset from where to read key size.
* @return Key size including possible tombstone.
*/
static int readKeySize( PageCursor cursor )
static long readKeyValueSize( PageCursor cursor )
{
return getUnsignedShort( cursor );
byte firstByte = cursor.getByte();
boolean hasTombstone = hasTombstone( firstByte );
boolean hasAdditionalKeySize = (firstByte & FLAG_ADDITIONAL_KEY_SIZE) != 0;
boolean hasValueSize = (firstByte & FLAG_HAS_VALUE_SIZE) != 0;
int keySizeLsb = firstByte & MASK_ONE_BYTE_KEY_SIZE;
long keySize;
if ( hasAdditionalKeySize )
{
int keySizeMsb = cursor.getByte() & 0xFF;
keySize = (keySizeMsb << SHIFT_LSB_KEY_SIZE) | keySizeLsb;
}
else
{
keySize = keySizeLsb;
}

long valueSize;
if ( hasValueSize )
{
byte firstValueByte = cursor.getByte();
int valueSizeLsb = firstValueByte & MASK_ONE_BYTE_VALUE_SIZE;
boolean hasAdditionalValueSize = (firstValueByte & FLAG_ADDITIONAL_VALUE_SIZE) != 0;
if ( hasAdditionalValueSize )
{
int valueSizeMsb = cursor.getByte() & 0xFF;
valueSize = (valueSizeMsb << SHIFT_LSB_VALUE_SIZE) | valueSizeLsb;
}
else
{
valueSize = valueSizeLsb;
}
}
else
{
valueSize = 0;
}

return (hasTombstone ? FLAG_READ_TOMBSTONE : 0) | (keySize << Integer.SIZE) | valueSize;
}

static int readValueSize( PageCursor cursor )
static int extractValueSize( long keyValueSize )
{
return getUnsignedShort( cursor );
return (int) keyValueSize;
}

static int extractKeySize( long keyValueSize )
{
return (int) ((keyValueSize & ~FLAG_READ_TOMBSTONE) >>> Integer.SIZE);
}

static boolean extractTombstone( long keyValueSize )
{
return (keyValueSize & FLAG_READ_TOMBSTONE) != 0;
}

/**
Expand All @@ -94,29 +221,24 @@ static int readValueSize( PageCursor cursor )
static void putTombstone( PageCursor cursor )
{
int offset = cursor.getOffset();
int keySize = readKeySize( cursor );
keySize = withTombstoneFlag( keySize );
byte firstByte = cursor.getByte();
firstByte = withTombstoneFlag( firstByte );
cursor.setOffset( offset );
putKeySize( cursor, keySize );
cursor.putByte( firstByte );
}

/**
* Check read key size for tombstone.
* @return True if read key size has tombstone.
*/
static boolean hasTombstone( int readKeySize )
{
return (readKeySize & FLAG_TOMBSTONE) != 0;
}

static int stripTombstone( int keySize )
private static boolean hasTombstone( byte firstKeySizeByte )
{
return keySize & ~FLAG_TOMBSTONE;
return (firstKeySizeByte & FLAG_FIRST_BYTE_TOMBSTONE) != 0;
}

private static int withTombstoneFlag( int keySize )
private static byte withTombstoneFlag( byte firstByte )
{
assert (keySize & FLAG_TOMBSTONE) == 0 : "Key size " + keySize + " is to large to fit tombstone.";
return keySize | FLAG_TOMBSTONE;
assert (firstByte & FLAG_FIRST_BYTE_TOMBSTONE) == 0 : "First key size byte " + firstByte + " is too large to fit tombstone.";
return (byte) (firstByte | FLAG_FIRST_BYTE_TOMBSTONE);
}
}

0 comments on commit 0e1817b

Please sign in to comment.