/
bloom.go
183 lines (159 loc) · 5.88 KB
/
bloom.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
package bloom
import (
"encoding/binary"
"math"
)
const (
// Used for a sanity check that we don't allocate too much when determining
// bloom filter size in NewFilterWithFalsePositiveRate.
maxBuckets = 104857600
// We use uint32 for bucket size internally
bytesPerBucket = 4
)
func optimalSubhashes(numBuckets int, elementCount int) int {
return int(math.Ceil(float64(numBuckets) / float64(elementCount) * math.Ln2))
}
func requiredBuckets(elementCount int, falsePositiveRate float64) int {
return int(math.Ceil(-(float64(elementCount) * math.Log(falsePositiveRate) / (math.Pow(math.Ln2, 2)))))
}
// FilterSizeForFalsePositiveRate returns the memory footprint for a filter with
// the given constraints.
func FilterSizeForFalsePositiveRate(falsePositiveRate float64, elementCount int) int {
return requiredBuckets(elementCount, falsePositiveRate) * bytesPerBucket
}
// NewFilterWithSize constructs a new bloom filter with the given constraints.
// * 'bytes' - the number of bytes to allocate for the filter buckets
// * 'elementCount' - the expected number of elements that will be present in
// the filter
// Given these, the filter will choose the optimal number of hashes to perform
// to get the best possible false-positive rate.
func NewFilterWithSize(bytes int, elementCount int) *BloomFilter {
if elementCount <= 0 {
panic("Element count must be greater than 0")
}
if bytes <= 0 {
panic("Bytes must be greater than 0")
}
numBuckets := bytes / bytesPerBucket
filter := &BloomFilter{
NumSubhashes: uint32(optimalSubhashes(numBuckets, elementCount)),
Buckets: make([]uint32, numBuckets),
}
return filter
}
// NewFilterWithFalsePositiveRate constructs a new bloom filter with the given
// constraints:
// * 'falsePositiveRate' - the expected false positive rate when the filter is
// loaded with 'elementCount' elements
// * 'elementCount' - the expected number of elements that will be present in
// the filter
// * 'maxBytes' - the maximum memory footprint to allocate for this filter.
// Given these, the filter will choose a size that will provide the specified
// constraints. If the memory footprint is capped by 'maxBytes', the
// falsePositiveRate will increase.
func NewFilterWithFalsePositiveRate(falsePositiveRate float64, elementCount int, maxBytes int) *BloomFilter {
if falsePositiveRate <= 0.0 || falsePositiveRate >= 1.0 {
panic("Expected false positive rate to be a value between 0 and 1")
}
if elementCount <= 0 {
panic("Element count must be greater than 0")
}
if maxBytes <= 0 {
panic("Max bytes must be greater than 0")
}
maxBuckets := maxBytes / bytesPerBucket
numBuckets := int(math.Min(float64(requiredBuckets(elementCount, falsePositiveRate)), float64(maxBuckets)))
filter := &BloomFilter{
NumSubhashes: uint32(optimalSubhashes(numBuckets, elementCount)),
Buckets: make([]uint32, numBuckets),
}
return filter
}
// FalsePositiveRate returns the expected rate of false positives if the filter
// contains the given number of elements.
func (f *BloomFilter) FalsePositiveRate(elementCount int) float64 {
// m: number of buckets, k: number of hashes
m := float64(len(f.Buckets))
k := float64(f.NumSubhashes)
return math.Pow(1-math.Pow(1-1/m, k*float64(elementCount)), k)
}
// OverflowRate iterates over all buckets in the filter and returns the ratio
// of buckets that are in overflow to the total number of buckets. Once a
// bucket overflows, it will never return to normal, even if all items
// corresponding to that bucket are removed - a new filter must be constructed
// instead.
func (f *BloomFilter) OverflowRate() float64 {
numOverflow := 0
for _, value := range f.Buckets {
if value == math.MaxUint32 {
numOverflow++
}
}
return float64(numOverflow) / float64(len(f.Buckets))
}
// Add adds an item to the filter.
func (f *BloomFilter) Add(hash []byte) {
f.forEachSubhash(hash, func(i int) {
// If we overflow the bucket, we don't want it to wrap back to zero, just
// leave it at MaxUint32 and it can never be removed from the filter.
value := f.Buckets[i]
if value != math.MaxUint32 {
f.Buckets[i] = value + 1
}
})
}
// Remove removes an item from the filter
func (f *BloomFilter) Remove(hash []byte) {
f.forEachSubhash(hash, func(i int) {
// We can't remove from an overflowed bucket, as the information has already
// been lost - just leave it at MaxUint32
value := f.Buckets[i]
if value != 0 && value != math.MaxUint32 {
f.Buckets[i] = value - 1
}
})
}
// UpperBoundCount returns the maximum number of items in the filter matching
// the given hash.
func (f *BloomFilter) UpperBoundCount(hash []byte) int {
lowerBound := int(math.MaxUint32)
f.forEachSubhash(hash, func(i int) {
value := int(f.Buckets[i])
if value < lowerBound {
lowerBound = value
}
})
return lowerBound
}
// IsNotPresent returns true if an item corresponding to the given hash has
// definitely not been added to the set.
func (f *BloomFilter) IsNotPresent(hash []byte) bool {
return f.UpperBoundCount(hash) == 0
}
func (f *BloomFilter) forEachSubhash(hash []byte, cb func(int)) {
// This is required so that we can guarantee hash generation in overflow
if len(hash) < 8 {
panic("Bloom filter hashes must be at least 8 bytes")
}
index := uint(0)
overflowInitialized := false
var overflowFirst, overflowSecond uint32
for hashes := uint32(0); hashes < f.NumSubhashes; hashes++ {
var value uint32
if index+4 < uint(len(hash)) {
subhash := hash[index : index+4]
value = binary.LittleEndian.Uint32(subhash)
index += 4
} else {
// This should be an ok way to extend the hash, via https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf
if !overflowInitialized {
subhash := hash[0:4]
overflowFirst = binary.LittleEndian.Uint32(subhash)
subhash = hash[4:8]
overflowSecond = binary.LittleEndian.Uint32(subhash)
}
value = overflowFirst + hashes*(overflowSecond)
}
cb(int(value) % len(f.Buckets))
}
}