forked from attic-labs/noms
-
Notifications
You must be signed in to change notification settings - Fork 0
/
sloppy.go
240 lines (207 loc) · 7.32 KB
/
sloppy.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
// Copyright 2017 Attic Labs, Inc. All rights reserved.
// Licensed under the Apache License, version 2.0:
// http://www.apache.org/licenses/LICENSE-2.0
package sloppy
import (
"github.com/ndau/noms/go/d"
)
const (
maxOffsetPOT = uint16(12)
maxTableSize = 1 << 14
maxLength = 1<<12 - 1
tableMask = maxTableSize - 1
shift = uint32(20)
)
// TODO: Make this configurable
var maxOffset = int(1<<maxOffsetPOT - 1)
// Sloppy is a logical variant of Snappy. Its purpose to provide a kind of
// estimate of how a given byte sequence *will be* compressed by Snappy. It is
// useful when a byte stream is fed into a rolling hash with the goal of
// achieving a given average chunk byte length *after compression*. Sloppy is
// logically similar to snappy, but prefers "copies" which are closer to the
// repeated byte sequence (snappy prefers to refer to the *first* instance of a
// repeated byte sequence). This is important for mitigating the likelihood that
// altering any byte in an input stream will cause chunk boundaries to be
// redrawn downstream.
//
// The high-level approach is to maintain a logical mapping between four-byte
// sequences which have been observed in the stream so-far and the integer
// offset of observed sequence (the mapping is done with a "cheap" hash-function
// which permits false-positives because they can be trivial filtered out). In
// the non-matched state, for each new byte consumed, a uint32 is computed from
// the next 4 bytes and then a look-up is performed to check for a matching 4
// bytes earlier in the stream. Snappy and sloppy behave roughly identical thus
// far.
//
// When in the "matched state" (attempting to extend the current match), Snappy
// does not re-index new 4-byte sequences, but Sloppy does. The reason for this
// is that Sloppy would like match the most recent occurence as it moves
// forward.
//
// Lastly, Sloppy adds two novel heuritics, both aimed at further mitigating
// the chance of chunk boundaries being redrawn because of byte value changes:
//
// 1) During the first 2 bytes of match, it *continues* to look for closer
// matches (effectively prefering a closer but shorter copy to a further but
// longer one). The reason for this is that when sequences repeat frequently in
// a byte stream, randomness provides for a good chance that a one or two byte
// prefix on a repeated sequence will match "far away". E.g.
//
// "23hello my friend, 12hello my friend, 01hello my friend, 23hello my friend"
//
// In the above sequence, sloppy would prefer to copy the final
// "hello my friend" 19 bytes backwards rather than "23hello my friend" quite a
// bit further.
//
// 2) Sloppy will only emit copies which are "worth it". I.e. The longer the
// reference back, the longer the length of the copy must be.
type Sloppy struct {
enc encoder
idx int
matching bool
matchOffset, matchLength int
table [maxTableSize]uint32
}
// New returns a new sloppy encoder which will encode to |f|. If |f| ever
// returns false, then encoding ends immediately. |f| is a callback because
// the primary use is that the "encoded" byte stream is fed byte-by-byte
// into a rolling hash function.
func New(f func(b byte) bool) *Sloppy {
return &Sloppy{
binaryEncoder{f},
0,
false,
0, 0,
[maxTableSize]uint32{},
}
}
// Update continues the encoding of a given input stream. The caller is expected
// to call update after having (ONLY) appended bytes to |src|. When |Update|
// returns, sloppy will have emitted 0 or more literals or copies by calling
// the |sf.f|. Note that sloppy will ALWAYS buffer the final three bytes of
// input.
func (sl *Sloppy) Update(src []byte) {
// Only consume up to the point that a "look-ahead" can include 4 bytes.
for ; sl.idx < len(src)-3; sl.idx++ {
nextHash := fbhash(load32(src, sl.idx))
if sl.matching && (sl.matchLength > maxLength || src[sl.idx] != src[sl.matchOffset+sl.matchLength]) {
// End Match
if sl.maybeCopy(src) {
return // terminate if consumer has "closed"
}
}
// Look for a match if we are beyond the first byte AND either there is no
// match yet, OR we are matching, but fewer than 3 bytes have been
// matched. The later condition allows for giving up to 2 bytes of a copy
// in order to reference a "closer" sequence. Empirical tests on
// structured data, suggests this reduces the average offset by ~2/3.
if sl.idx > 0 && (!sl.matching || sl.matchLength < 3) {
matchPos := int(sl.table[nextHash&tableMask])
if sl.idx > matchPos &&
src[sl.idx] == src[matchPos] && // filter false positives
sl.idx-matchPos <= maxOffset && // don't refer back beyond maxOffset
(!sl.matching || matchPos >= sl.matchOffset+4) { // if we are "rematching", ensure the new match is at least 4 bytes closer
if sl.matching {
// We are dropping an existing match for a closer one. Emit the
// matched bytes as literals
if sl.dontCopy(src, sl.idx-sl.matchLength, sl.idx) {
return // terminate if consumer has "closed"
}
}
// Begin a new match
sl.matching = true
sl.matchOffset = matchPos
sl.matchLength = 0
}
}
// Store new hashed offset
sl.table[nextHash&tableMask] = uint32(sl.idx)
if sl.matching {
sl.matchLength++
} else {
if sl.enc.emitLiteral(src[sl.idx]) {
return // terminate if consumer has "closed"
}
}
}
}
func (sl *Sloppy) Reset() {
sl.idx = 0
sl.matching = false
sl.matchOffset = 0
sl.matchLength = 0
sl.table = [maxTableSize]uint32{}
}
// len >= 2^(2 + log2(maxOffset) - log2(maxOffset-off)). IOW, for the first 1/2
// of the maxOffset, a copy must be >= 4. For 1/2 of what remains, a copy must
// be >= 8, etc...
func copyLongEnough(off, len uint16) bool {
d.PanicIfTrue(off == 0)
p := uint16(0)
x := (1 << maxOffsetPOT) - off
for x > 0 {
x = x >> 1
p++
}
i := maxOffsetPOT - p
min := 4
for i > 0 {
min = min << 1
i--
}
return int(len) >= min
}
// Emit matches bytes as literals.
func (sl *Sloppy) dontCopy(src []byte, from, to int) bool {
for ; from < to; from++ {
if sl.enc.emitLiteral(src[from]) {
return true
}
}
return false
}
// Emit a copy if the length is sufficient for a given offset
func (sl *Sloppy) maybeCopy(src []byte) bool {
off, len := uint16(sl.idx-(sl.matchOffset+sl.matchLength)), uint16(sl.matchLength)
sl.matching = false
sl.matchOffset = 0
sl.matchLength = 0
if !copyLongEnough(off, len) {
return sl.dontCopy(src, sl.idx-int(len), sl.idx)
}
return sl.enc.emitCopy(off, len)
}
type encoder interface {
emitLiteral(b byte) bool
emitCopy(offset, length uint16) bool
}
type binaryEncoder struct {
f func(b byte) bool
}
func (be binaryEncoder) emitLiteral(b byte) bool {
return be.f(b)
}
func (be binaryEncoder) emitCopy(offset, length uint16) bool {
// all copies are encoded as 3 bytes.
// 12 bits for offset and 12 bits for length
// 8 MSBits of offset
if be.f(byte(offset >> 4)) {
return true
}
// 4 LSBits offset | 4 MSBits length
if be.f(byte(offset<<4) | byte(length>>4)) {
return true
}
// 8 LSBits of length
if be.f(byte(length)) {
return true
}
return false
}
func fbhash(u uint32) uint32 {
return (u * 0x1e35a7bd) >> shift
}
func load32(b []byte, i int) uint32 {
b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line.
return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
}