Rework fast matching
This makes several changes that in combination give close to the same compression, but with a big speedup in most cases.

We change the hash table to contain hashes of 6-byte sequences. The speed is about the same, but this usually gives better compression since the hashes are of better quality. This typically also makes the output faster to decode, since longer matches are preferred.

Hash table size is now defined separately from the window size. I found that 16 bits was a good value, especially since the better hash quality opens up other optimizations.
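
As a concrete illustration, here is a minimal, self-contained sketch of the new hashing scheme. The prime, shifts and `hashLog`/`htSize` values mirror `blockHash` in block.go and the constants in lz4.go in the diff below; the `hash6` name and the demo input are mine.
```go
package main

import (
	"encoding/binary"
	"fmt"
)

const (
	hashLog = 16           // table size is picked independently of the 64 KB window
	htSize  = 1 << hashLog // number of entries in the hash table
)

// hash6 mirrors blockHash in block.go: it keeps the lower 6 bytes of a
// 64-bit load, multiplies by a 6-byte prime and returns the top hashLog
// bits, so the result is always a valid index into a [htSize]int table.
func hash6(x uint64) uint32 {
	const prime6bytes = 227718039650203
	return uint32(((x << (64 - 48)) * prime6bytes) >> (64 - hashLog))
}

func main() {
	src := []byte("hash the first six bytes of this input")
	var table [htSize]int

	h := hash6(binary.LittleEndian.Uint64(src)) // only src[0:6] influence the result
	table[h] = 0                                // remember that this sequence starts at offset 0
	fmt.Println(h, h < htSize)
}
```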

We check for a match at 3 consecutive positions, then skip one byte (plus more if the data is hard to compress). This gives most of the speedup, but also loses us some compression.
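
To make the skipping behaviour concrete, here is a small sketch of the adaptive step. `adaptSkipLog` and the increment expression are taken from block.go in the diff below; the positions in the loop are made up.
```go
package main

import "fmt"

func main() {
	const adaptSkipLog = 7 // from block.go
	anchor := 0            // start of the literals that have not been matched yet

	// block.go skips `1 + (si-anchor)>>adaptSkipLog` bytes when the first
	// candidate is invalid, and `2 + (si-anchor)>>adaptSkipLog` after all
	// three candidate positions fail, so the step grows the longer the
	// scan goes without finding a match.
	for _, si := range []int{2, 130, 258, 1026} {
		fmt.Printf("si=%d -> advance %d\n", si, 2+(si-anchor)>>adaptSkipLog)
	}
}
```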

We index the position 2 bytes before the end of each match. This doesn't impact speed much and gives a nice compression boost.
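
The end-of-match indexing is a single extra table store per match; a minimal sketch of the same lines that appear near the bottom of the match loop in block.go (`hash6` as in the earlier sketch, the position and input are made up):
```go
package main

import (
	"encoding/binary"
	"fmt"
)

const hashLog = 16

func hash6(x uint64) uint32 {
	const prime6bytes = 227718039650203
	return uint32(((x << (64 - 48)) * prime6bytes) >> (64 - hashLog))
}

func main() {
	src := []byte("........a match just ended here, keep indexing........")
	hashTable := make([]int, 1<<hashLog)

	si := 30 // hypothetical position right after a match ended

	// Mirrors the new lines in block.go: also record the sequence that
	// starts two bytes before the match end, so later scans can find
	// matches that overlap the tail of this one.
	h := hash6(binary.LittleEndian.Uint64(src[si-2:]))
	hashTable[h] = si - 2
	fmt.Println("indexed position", si-2, "under hash", h)
}
```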

This combines well with #49 (not included in this benchmark).

Now for the numbers. They are all before/after, best of 2 runs.
```
file	out	level	insize	outsize	millis	mb/s
consensus.db.10gb	lz4	0	10737418240	5057961420	35446	288.88
consensus.db.10gb	lz4	0	10737418240	5077608378	23226	440.87

file	out	level	insize	outsize	millis	mb/s
rawstudio-mint14.tar	lz4	0	8558382592	4568741520	25369	321.73
rawstudio-mint14.tar	lz4	0	8558382592	4592776475	17168	475.41

file	out	level	insize	outsize	millis	mb/s
github-ranks-backup.bin	lz4	0	1862623243	579273817	4074	436.02
github-ranks-backup.bin	lz4	0	1862623243	627056167	3522	504.35

file	out	level	insize	outsize	millis	mb/s
github-june-2days-2019.json	lz4	0	6273951764	1355117284	10763	555.86
github-june-2days-2019.json	lz4	0	6273951764	1293582359	9136	654.91

file	out	level	insize	outsize	millis	mb/s
gob-stream	lz4	0	1911399616	384235547	3481	523.66
gob-stream	lz4	0	1911399616	384292384	2827	644.80

file	out	level	insize	outsize	millis	mb/s
10gb.tar	lz4	0	10065157632	6481808453	23629	406.23
10gb.tar	lz4	0	10065157632	5902162074	22592	424.88

file	out	level	insize	outsize	millis	mb/s
enwik9	lz4	0	1000000000	489160425	3733	255.47
enwik9	lz4	0	1000000000	482276927	3520	270.93

file	out	level	insize	outsize	millis	mb/s
silesia.tar	lz4	0	211947520	99218419	691	292.51
silesia.tar	lz4	0	211947520	96766005	590	342.01

file	out	level	insize	outsize	millis	mb/s
sharnd.out	lz4	0	500000000	500000495	169	2821.52
sharnd.out	lz4	0	500000000	500000495	166	2872.51
```

Only [github-ranks-backup.bin](https://files.klauspost.com/compress/github-ranks-backup.bin.zst) has a significant size increase (roughly 8% larger). The others are very close to or better than before.

All show a minor to significant speedup, from under 2% on the incompressible sharnd.out to over 50% on consensus.db.10gb.
klauspost committed Aug 1, 2019
1 parent 377214e commit a207029
Showing 5 changed files with 81 additions and 33 deletions.
4 changes: 2 additions & 2 deletions bench_test.go
@@ -10,7 +10,7 @@ import (
)

func BenchmarkCompress(b *testing.B) {
var hashTable [1 << 16]int
var hashTable [htSize]int
buf := make([]byte, len(pg1661))

b.ReportAllocs()
@@ -22,7 +22,7 @@ func BenchmarkCompress(b *testing.B) {
}

func BenchmarkCompressRandom(b *testing.B) {
var hashTable [1 << 16]int
var hashTable [htSize]int
buf := make([]byte, len(randomLZ4))

b.ReportAllocs()
83 changes: 63 additions & 20 deletions block.go
@@ -2,13 +2,14 @@ package lz4

import (
"encoding/binary"
"fmt"
"math/bits"
)

// blockHash hashes 4 bytes into a value < winSize.
func blockHash(x uint32) uint32 {
const hasher uint32 = 2654435761 // Knuth multiplicative hash.
return x * hasher >> hashShift
// blockHash hashes the lower 6 bytes into a value < htSize.
func blockHash(x uint64) uint32 {
const prime6bytes = 227718039650203
return uint32(((x << (64 - 48)) * prime6bytes) >> (64 - hashLog))
}

// CompressBlockBound returns the maximum size of a given buffer of size n, when not compressible.
@@ -46,33 +47,62 @@ func CompressBlock(src, dst []byte, hashTable []int) (di int, err error) {
// This significantly speeds up incompressible data and usually has very small impact on compression.
// bytes to skip = 1 + (bytes since last match >> adaptSkipLog)
const adaptSkipLog = 7

sn, dn := len(src)-mfLimit, len(dst)
if sn <= 0 || dn == 0 {
return 0, nil
}
var si int

// Fast scan strategy: the hash table only stores the last 4 bytes sequences.
if len(hashTable) < htSize {
return 0, fmt.Errorf("hash table too small, should be at least %d in size", htSize)
}
// Prove to the compiler the table has at least htSize elements.
// The compiler can see that "uint32() >> hashShift" cannot be out of bounds.
hashTable = hashTable[:htSize]

anchor := si // Position of the current literals.
// si: Current position of the search.
// anchor: Position of the current literals.
var si, anchor int

// Fast scan strategy: the hash table only stores the last 4 bytes sequences.
for si < sn {
// Hash the next 4 bytes (sequence)...
match := binary.LittleEndian.Uint32(src[si:])
// Hash the next 6 bytes (sequence)...
match := binary.LittleEndian.Uint64(src[si:])
h := blockHash(match)
h2 := blockHash(match >> 8)

// We check a match at si, si+1 and si+2 and pick the first one we get.
// Checking 3 only requires us to load the source once.
ref := hashTable[h]
ref2 := hashTable[h2]
hashTable[h] = si
if ref >= sn { // Invalid reference (dirty hashtable).
si += 1 + (si-anchor)>>adaptSkipLog
continue
}
hashTable[h2] = si + 1
offset := si - ref

// If offset <= 0 we got an old entry in the hash table.
if offset <= 0 || offset >= winSize || // Out of window.
match != binary.LittleEndian.Uint32(src[ref:]) { // Hash collision on different matches.
si += 1 + (si-anchor)>>adaptSkipLog
continue
uint32(match) != binary.LittleEndian.Uint32(src[ref:]) { // Hash collision on different matches.
// No match. Start calculating another hash.
// The processor can usually do this out-of-order.
h = blockHash(match >> 16)
ref = hashTable[h]

// Check the second match at si+1
si += 1
offset = si - ref2

if offset <= 0 || offset >= winSize ||
uint32(match>>8) != binary.LittleEndian.Uint32(src[ref2:]) {
// No match. Check the third match at si+2
si += 1
offset = si - ref
hashTable[h] = si

if offset <= 0 || offset >= winSize ||
uint32(match>>16) != binary.LittleEndian.Uint32(src[ref:]) {
// Skip one extra byte (at si+3) before we check 3 matches again.
si += 2 + (si-anchor)>>adaptSkipLog
continue
}
}
}

// Match found.
@@ -134,6 +164,13 @@ func CompressBlock(src, dst []byte, hashTable []int) (di int, err error) {
dst[di] = byte(mLen)
di++
}
// Check if we can load next values.
if si >= sn {
break
}
// Hash match end-2
h = blockHash(binary.LittleEndian.Uint64(src[si-2:]))
hashTable[h] = si - 2
}

if anchor == 0 {
@@ -165,6 +202,12 @@ func CompressBlock(src, dst []byte, hashTable []int) (di int, err error) {
return di, nil
}

// blockHashHC hashes 4 bytes into a value < winSize.
func blockHashHC(x uint32) uint32 {
const hasher uint32 = 2654435761 // Knuth multiplicative hash.
return x * hasher >> (32 - winSizeLog)
}

// CompressBlockHC compresses the source buffer src into the destination dst
// with max search depth (use 0 or negative value for no max).
//
@@ -199,7 +242,7 @@ func CompressBlockHC(src, dst []byte, depth int) (di int, err error) {
for si < sn {
// Hash the next 4 bytes (sequence).
match := binary.LittleEndian.Uint32(src[si:])
h := blockHash(match)
h := blockHashHC(match)

// Follow the chain until out of window and give the longest match.
mLen := 0
@@ -251,7 +294,7 @@ func CompressBlockHC(src, dst []byte, depth int) (di int, err error) {
for si, ml := winStart, si+mLen; si < ml; {
match >>= 8
match |= uint32(src[si+3]) << 24
h := blockHash(match)
h := blockHashHC(match)
chainTable[si&winMask] = hashTable[h]
hashTable[h] = si
si++
21 changes: 13 additions & 8 deletions block_test.go
@@ -11,8 +11,11 @@ import (
"github.com/pierrec/lz4"
)

// Hash table size.
const htSize = 1 << 16 // 64kb
const (
// Should match values in lz4.go
hashLog = 16
htSize = 1 << hashLog
)

type testcase struct {
file string
@@ -22,11 +25,11 @@ type testcase struct {

var rawFiles = []testcase{
// {"testdata/207326ba-36f8-11e7-954a-aca46ba8ca73.png", true, nil},
{"testdata/e.txt", true, nil},
{"testdata/e.txt", false, nil},
{"testdata/gettysburg.txt", true, nil},
{"testdata/Mark.Twain-Tom.Sawyer.txt", true, nil},
{"testdata/pg1661.txt", true, nil},
{"testdata/pi.txt", true, nil},
{"testdata/pi.txt", false, nil},
{"testdata/random.data", false, nil},
{"testdata/repeat.txt", true, nil},
{"testdata/pg1661.txt", true, nil},
@@ -125,10 +128,12 @@ func TestCompressCornerCase_CopyDstUpperBound(t *testing.T) {
t.Helper()

// Compress the data.
zbuf := make([]byte, int(float64(len(src))*0.85))
// We provide a destination that is too small to trigger an out-of-bounds,
// which makes it return the error we want.
zbuf := make([]byte, int(float64(len(src))*0.40))
_, err := compress(src, zbuf)
if err != lz4.ErrInvalidSourceShortBuffer {
t.Fatal("err should be ErrInvalidSourceShortBuffer")
t.Fatal("err should be ErrInvalidSourceShortBuffer, was", err)
}
}

@@ -154,9 +159,9 @@ func TestCompressCornerCase_CopyDstUpperBound(t *testing.T) {
}

func TestIssue23(t *testing.T) {
compressBuf := make([]byte, lz4.CompressBlockBound(htSize))
compressBuf := make([]byte, lz4.CompressBlockBound(1<<16))
for j := 1; j < 16; j++ {
var buf [htSize]byte
var buf [1 << 16]byte
var ht [htSize]int

for i := 0; i < len(buf); i += j {
6 changes: 3 additions & 3 deletions lz4.go
@@ -30,9 +30,9 @@ const (
// hashLog determines the size of the hash table used to quickly find a previous match position.
// Its value influences the compression speed and memory usage, the lower the faster,
// but at the expense of the compression ratio.
// 16 seems to be the best compromise.
hashLog = 16
hashShift = uint((minMatch * 8) - hashLog)
// 16 seems to be the best compromise for fast compression.
hashLog = 16
htSize = 1 << hashLog

mfLimit = 8 + minMatch // The last match cannot start within the last 12 bytes.
)
Binary file modified testdata/upperbound.data
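
For completeness, a hedged sketch of how a caller sizes the table against the public API after this change, in the spirit of the updated bench_test.go. The local `htSize` constant is an assumption that mirrors `hashLog = 16` in lz4.go, since the package-level constant itself is unexported.
```go
package main

import (
	"fmt"

	"github.com/pierrec/lz4"
)

// htSize mirrors 1 << hashLog from lz4.go; the package does not export it,
// so callers declare their own constant, as block_test.go does.
const htSize = 1 << 16

func main() {
	data := []byte("some reasonably compressible input, repeated: some reasonably compressible input")

	var ht [htSize]int // reusable hash table; use one per goroutine
	dst := make([]byte, lz4.CompressBlockBound(len(data)))

	n, err := lz4.CompressBlock(data, dst, ht[:])
	if err != nil {
		panic(err)
	}
	if n == 0 {
		fmt.Println("data was not compressible")
		return
	}
	fmt.Printf("compressed %d bytes into %d\n", len(data), n)
}
```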
