forked from seiflotfy/loglogbeta
-
Notifications
You must be signed in to change notification settings - Fork 0
/
loglogbeta.go
141 lines (119 loc) · 2.88 KB
/
loglogbeta.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
package loglogbeta
import (
"bytes"
"encoding/gob"
"math"
bits "github.com/dgryski/go-bits"
metro "github.com/dgryski/go-metro"
)
const (
precision = 14
m = uint32(1 << precision)
max = 64 - precision
maxX = math.MaxUint64 >> max
version = 1
)
func beta(ez float64) float64 {
zl := math.Log(ez + 1)
return -0.370393911*ez +
0.070471823*zl +
0.17393686*math.Pow(zl, 2) +
0.16339839*math.Pow(zl, 3) +
-0.09237745*math.Pow(zl, 4) +
0.03738027*math.Pow(zl, 5) +
-0.005384159*math.Pow(zl, 6) +
0.00042419*math.Pow(zl, 7)
}
func alpha(m float64) float64 {
switch m {
case 16:
return 0.673
case 32:
return 0.697
case 64:
return 0.709
}
return 0.7213 / (1 + 1.079/m)
}
func regSumAndZeros(registers [m]uint8) (float64, float64) {
sum, ez := 0.0, 0.0
for _, val := range registers {
if val == 0 {
ez++
}
sum += 1.0 / math.Pow(2.0, float64(val))
}
return sum, ez
}
func getPosVal(x uint64) (uint64, uint8) {
val := uint8(bits.Clz((x<<precision)^maxX)) + 1
k := x >> uint(max)
return k, val
}
// LogLogBeta is a sketch for cardinality estimation based on LogLog counting
type LogLogBeta struct {
registers [m]uint8
alpha float64
}
type savedLLB struct {
Registers [m]uint8
Alpha float64
Version int
}
// New returns a LogLogBeta
func New() *LogLogBeta {
return &LogLogBeta{
registers: [m]uint8{},
alpha: alpha(float64(m)),
}
}
// AddHash ...
func (llb *LogLogBeta) AddHash(x uint64) {
k, val := getPosVal(x)
if llb.registers[k] < val {
llb.registers[k] = val
}
}
// Add inserts a value into the sketch
func (llb *LogLogBeta) Add(value []byte) {
x := metro.Hash64(value, 1337)
llb.AddHash(x)
}
// Cardinality returns the number of unique elements added to the sketch
func (llb *LogLogBeta) Cardinality() uint64 {
sum, ez := regSumAndZeros(llb.registers)
m := float64(m)
return uint64(llb.alpha * m * (m - ez) / (beta(ez) + sum))
}
// Merge takes another LogLogBeta and combines it with llb one, making llb the union of both.
func (llb *LogLogBeta) Merge(other *LogLogBeta) {
for i, v := range llb.registers {
if v < other.registers[i] {
llb.registers[i] = other.registers[i]
}
}
}
// MarshalBinary implements the encoding.BinaryMarshaler interface.
func (llb *LogLogBeta) MarshalBinary() (data []byte, err error) {
var buf bytes.Buffer
enc := gob.NewEncoder(&buf)
err = enc.Encode(savedLLB{
Version: version,
Alpha: llb.alpha,
Registers: llb.registers})
return buf.Bytes(), err
}
// UnmarshalBinary implements the encoding.BinaryUnmarshaler interface.
func (llb *LogLogBeta) UnmarshalBinary(data []byte) error {
// Unmarshal version. We may need this in the future if we make
// non-compatible changes.
var sllb savedLLB
dec := gob.NewDecoder(bytes.NewReader(data))
err := dec.Decode(&sllb)
if err != nil {
return err
}
llb.registers = sllb.Registers
llb.alpha = sllb.Alpha
return nil
}