From 2784675a4e67ecc443ce09e7d980ca9ec0208e00 Mon Sep 17 00:00:00 2001 From: Chris Vest Date: Wed, 30 Mar 2016 15:22:06 +0200 Subject: [PATCH] Add comment explaining how the compressed reference decoding works --- .../store/format/highlimit/Reference.java | 85 +++++++++++++++++-- 1 file changed, 76 insertions(+), 9 deletions(-) diff --git a/enterprise/kernel/src/main/java/org/neo4j/kernel/impl/store/format/highlimit/Reference.java b/enterprise/kernel/src/main/java/org/neo4j/kernel/impl/store/format/highlimit/Reference.java index d78f53301814e..101947b6b419b 100644 --- a/enterprise/kernel/src/main/java/org/neo4j/kernel/impl/store/format/highlimit/Reference.java +++ b/enterprise/kernel/src/main/java/org/neo4j/kernel/impl/store/format/highlimit/Reference.java @@ -197,21 +197,88 @@ private static int maxBits() public static long decode( SOURCE source, DataAdapter adapter ) { - int header = adapter.getByte( source ) & 0xFF; - int sizeMarks = Integer.numberOfLeadingZeros( (~(header & 0xF8)) & 0xFF ) - 24; - int signShift = 8 - sizeMarks - (sizeMarks == 5 ? 1 : 2); - long signBit = ~((header >>> signShift) & 1) + 1; - long register = (header & ((1 << signShift) - 1)) << 16; - register += adapter.getShort( source ) & 0xFFFFL; // 3 bytes - - while ( sizeMarks > 0 ) + // Dear future maintainers, this code is a little complicated so I'm going to take some time and explain it to + // you. Make sure you have some coffee ready. + // + // Before we start, I have one plea: Please don't extract the constants out of this function. It is easier to + // make sense of them when they are embedded within the context of the code. Also, while some of the constants + // have the same value, they might change for different reasons, so let's just keep them inlined. + // + // The code is easier to read when it's all together, so I'll keep the code and the comment separate, and make + // the comment refer to the code with marks. 
+ // + // <1> + // The first byte of a reference is the header byte. It is an unsigned byte where all the bits matter, but Java + // has no such concept as an unsigned byte, so we instead store the byte in a 32-bit int, and mask it with 0xFF + // to read it as if it was unsigned. The 0xFF mask makes sure that the highest-order bit, which would otherwise + // be used as a sign-bit, stays together with the other 7 bits in the lowest-order byte of the int. + // + // <2> + // The header determines how many bytes go into the reference. These are the size marks. If the first bit of + // the header is zero, then we have zero size marks and the reference takes up 3 bytes. If the header starts + // with the bits 10, then we have one size mark and the reference takes up 4 bytes. We can have up to 5 size + // marks, where the last two options are 11110 for a 7 byte reference, and 11111 for an 8 byte reference. + // We count the size marks as follows: + // 1. First extract the 5 high-bits. 0xF8 is 11111000, so xxxx_xxxx & 0xF8 => xxxx_x000. + // 2. The x'es are a number of ones, possibly zero, followed by a zero. There's an instruction to count + // leading zeros, but not leading ones, so we have to invert the 1 size marks into 0s, and the possible 0 + // end mark into a 1. We use the `& 0xFF` trick to prevent the leading size mark from turning into a + // sign-bit. So (~xxxx_x000) & 0xFF => XXXX_X111, e.g. 0111_1000 (no size marks) becomes 1000_0111, and + // 1101_1000 (two size marks) becomes 0010_0111. + // 3. Now we can count the leading zeros to find the end mark. Remember that the end-mark is the zero-bit after + // the size marks. We *always* hit a 1-bit, because the three low bits of the header were masked to 0 in + // step 1 and inverted to 1 in step 2. This caps the count at 5, even when there is no end-mark. + // 4. When we count the number of leading zeros, we have thus far been thinking about the header as a single + // byte. 
However, the register we have been working on is a 32-bit integer, so we have to subtract 3 times 8 + // bits to get the number of size marks in the original header *byte*. + // + // <3> + // The sign-bit is located after the end-mark, or after the last size mark in the case of an 8 byte reference. + // We have 8 bits in the header byte, so if we want to place the sign-bit at the lowest-order bit location, + // then we can think of the size marks and optional end-mark as a pre-shift, pushing the sign-bit towards the + // low end. We just have to figure out how many bits are left to shift over. + // + // <4> + // If the sign-bit is 1, then we want to produce the 64-bit signed integer number -1, which consists of 64 + // consecutive 1-bits. If the sign-bit is 0, then we want to produce 0, which in binary is 64 consecutive + // 0-bits. The reason we do this is how negative numbers work. It turns out that -X == -1 ^ (X - 1). Since + // our compression scheme is all about avoiding the storage of unnecessary high-order zeros, we can more easily + // store the (X - 1) part plus a sign bit, than a long string of 1-bits followed by useful data. For example, + // the negative number -42 is 1111111111111111111111111111111111111111111111111111111111010110 in binary, + // while 41 is just 101001. And given our equation above, -1 ^ 41 == -42. + // + // <5> + // After the size marks, the end-mark and the sign-bit come a few bits of payload data. The sign-bit location + // marks the end of the meta-data bits, so we use that as a base for computing a mask that will remove all the + // meta-data bits. Since the smallest reference takes up 3 bytes, we can immediately shift those payload bits + // up 16 places to make room for the next two bytes of payload. + // + // <6> + // Then we read the next two bytes (with unsigned mask) and, save for the sign-bit manipulation, we now have a + // complete 3-byte reference. 
+ // + // <7> + // The size marks determine how many more bytes the reference takes up, so we loop through them and shift the + // register up 8 places every time, and add in the next byte with an unsigned mask. + // + // <8> + // Finally XOR the register with the sign component and we have our final value. + + int header = adapter.getByte( source ) & 0xFF; // <1> + int sizeMarks = Integer.numberOfLeadingZeros( (~(header & 0xF8)) & 0xFF ) - 24; // <2> + int signShift = 8 - sizeMarks - (sizeMarks == 5 ? 1 : 2); // <3> + long signComponent = ~((header >>> signShift) & 1) + 1; // <4> + long register = (header & ((1 << signShift) - 1)) << 16; // <5> + register += adapter.getShort( source ) & 0xFFFFL; // <6> + + while ( sizeMarks > 0 ) // <7> { register <<= 8; register += adapter.getByte( source ) & 0xFF; sizeMarks--; } - return signBit ^ register; + return signComponent ^ register; // <8> } /**