Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

new-feature: trie: add removeSameLeaf() to remove leaves with the same value #100

Merged
merged 3 commits into from
May 22, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -379,6 +379,8 @@ func Example_indexRanges() {
// Alison |

{Key: "Aaron", Offset: 0},
{Key: "Agatha", Offset: 0},
{Key: "Al", Offset: 0},
{Key: "Albert", Offset: 0},

{Key: "Alexander", Offset: 31},
Expand Down
2 changes: 2 additions & 0 deletions index/example_range_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ func Example_indexRanges() {
// Alison |

{Key: "Aaron", Offset: 0},
{Key: "Agatha", Offset: 0},
{Key: "Al", Offset: 0},
{Key: "Albert", Offset: 0},

{Key: "Alexander", Offset: 31},
Expand Down
2 changes: 1 addition & 1 deletion trie/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import "github.com/openacid/errors"
var (
// ErrTooManyTrieNodes indicates that the number of trie nodes (not the
// number of keys) exceeds the limit.
ErrTooManyTrieNodes = errors.New("exceeds max node count=65536")
ErrTooManyTrieNodes = errors.New("exceeds max node count=2^31-1")

// ErrTrieBranchValueOverflow indicates that an input key contains a word
// greater than the max 4-bit word (0x0f).
Expand Down
65 changes: 26 additions & 39 deletions trie/example_slimtrie_range_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,30 +10,17 @@ func ExampleSlimTrie_RangeGet() {

// To index a map of key range to value with SlimTrie is very simple:
//
// Just give two adjacent keys the same value, then SlimTrie knows these
// keys belong to a "range".
// These two keys are left and right boundaries of a range, and are both
// inclusive.
//
// In this example we:
//
// map [abc, abcd] to 1
// map [bc, bc] to 2 // this range has only one key in it.
// map [bcd, bce] to 3
//
// With RangeGet() to get any key that "abc" <= key <= "abcd", such as
// "abc1", "abc2"... should return "1".
//
// False Positive
//
// Just like a Bloom filter, SlimTrie does not contain full information about
// keys, thus there could be a false positive return:
// It returns some value and "true" but the key is not in there.
// Give a set of keys the same value, and use RangeGet() instead of Get().
// SlimTrie does not store branches for adjacent leaves with the same value.

keys := []string{
"abc", "abcd",
"abc",
"abcd",

"bc",
"bcd", "bce",

"bcd",
"bce",
}
values := []int{
1, 1,
Expand All @@ -49,23 +36,23 @@ func ExampleSlimTrie_RangeGet() {
key string
msg string
}{
{"ab", "smaller than any"},
{"ab", "FALSE POSITIVE: all known key starts with a are mapped to 1"},

{"abc", "in range [abc, abcd]"},
{"abc1", "in range [abc, abcd]"},
{"abc2", "in range [abc, abcd]"},
{"abcd", "in range [abc, abcd]"},
{"abc", "in range"},
{"abc1", "FALSE POSITIVE"},
{"abc2", "FALSE POSITIVE"},
{"abcd", "in range"},

{"abcde", "FALSE POSITIVE: a suffix of abcd"},

{"acc", "FALSE POSITIVE: not in range [abc, abcd]"},
{"acc", "FALSE POSITIVE"},

{"bc", "in single key range [bc]"},
{"bc1", "not in single key range [bc]"},
{"bc1", "FALSE POSITIVE"},

{"bcd1", "in range [bcd, bce]"},
{"bcd1", "FALSE POSITIVE"},

{"def", "greater than any"},
{"def", "FALSE POSITIVE"},
}

for _, c := range cases {
Expand All @@ -74,15 +61,15 @@ func ExampleSlimTrie_RangeGet() {
}

// Output:
// ab <nil> false: smaller than any
// abc 1 true : in range [abc, abcd]
// abc1 1 true : in range [abc, abcd]
// abc2 1 true : in range [abc, abcd]
// abcd 1 true : in range [abc, abcd]
// ab 1 true : FALSE POSITIVE: all known key starts with a are mapped to 1
// abc 1 true : in range
// abc1 1 true : FALSE POSITIVE
// abc2 1 true : FALSE POSITIVE
// abcd 1 true : in range
// abcde 1 true : FALSE POSITIVE: a suffix of abcd
// acc 1 true : FALSE POSITIVE: not in range [abc, abcd]
// acc 1 true : FALSE POSITIVE
// bc 2 true : in single key range [bc]
// bc1 <nil> false: not in single key range [bc]
// bcd1 3 true : in range [bcd, bce]
// def <nil> false: greater than any
// bc1 2 true : FALSE POSITIVE
// bcd1 3 true : FALSE POSITIVE
// def 3 true : FALSE POSITIVE
}
49 changes: 15 additions & 34 deletions trie/slimtrie.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,20 +15,8 @@
// Actually besides as a key value map,
// to index a map of key range to value with SlimTrie is also very simple:
//
// Just give two adjacent keys the same value, then SlimTrie
// knows these keys belong to a "range".
// These two keys are left and right boundaries of a range, and are both
// inclusive.
//
// // a to g --> 1
// // h --> 2
// st, err := NewSlimTrie(encode.Int{}, []string{"a", "g", "h"}, []int{1, 1, 2})
//
// st.Get("a") // 1, true A normal key-value Get()
// st.Get("c") // nil, false A key-value Get() got nothing.
// st.RangeGet("c") // 1, true A range get got 1
// st.RangeGet("g") // 1, true
// st.RangeGet("h") // 2, true
// Give a set of keys the same value, and use RangeGet() instead of Get().
// SlimTrie does not store branches for adjacent leaves with the same value.
//
// See SlimTrie.RangeGet .
//
Expand All @@ -55,8 +43,8 @@ const (
WordMask = 0xf
// LeafWord is a special value to indicate a leaf node in a Trie.
LeafWord = byte(0x10)
// MaxNodeCnt is the max number of node(including leaf and inner node).
MaxNodeCnt = 65536
// MaxNodeCnt is the max number of nodes. Node id in SlimTrie is int32.
MaxNodeCnt = (1 << 31) - 1
)

// SlimTrie is a space efficient Trie index.
Expand Down Expand Up @@ -123,6 +111,8 @@ func (st *SlimTrie) loadBytes(keys [][]byte, values interface{}) (err error) {
return err
}

trie.removeSameLeaf()

err = st.LoadTrie(trie)
return err
}
Expand Down Expand Up @@ -230,36 +220,27 @@ func (st *SlimTrie) LoadTrie(root *Node) (err error) {
// Since 0.4.3
func (st *SlimTrie) RangeGet(key string) (interface{}, bool) {

lID, eqID, rID := st.searchID(key)
lID, eqID, _ := st.searchID(key)

// an "equal" match means key is a prefix of either start or end of a range.
if eqID != -1 {
return st.Leaves.Get(eqID)
v, found := st.Leaves.Get(eqID)
if found {
return v, found
}

// else: maybe matched at an inner node.
}

// key is smaller than any range-start or range-end.
if lID == -1 {
return nil, false
}

// key is greater than any range-start or range-end.
if rID == -1 {
return nil, false
}
// Preceding value is the start of this range.
// It might be a false-positive

lVal, _ := st.Leaves.Get(lID)
rVal, _ := st.Leaves.Get(rID)

// If left-value != right-value, the key is between a range-end and next
// range-start.
if lVal != rVal {
return nil, false
}

// If range[i].end == range[i+1].start, it is a false positive.
// SlimTrie can not distinguish this from a positive match.
//
// Otherwise, lVal and rVal must be the start and end of a single range.
return lVal, true
}

Expand Down
97 changes: 61 additions & 36 deletions trie/slimtrie_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,51 +98,44 @@ func unsquashedIntSlimTrie(t *testing.T, keys []string, values interface{}) *Sli

func TestMaxKeys(t *testing.T) {

nn := 16
mx := 32768
ta := require.New(t)

keys := make([][]byte, 0, mx)
values := make([]interface{}, 0, mx)
nn := 256
// a million keys
mx := nn * nn * 16

keys := make([]string, 0, mx)
values := make([]int32, 0, mx)

for i := 0; i < nn; i++ {
for j := 0; j < nn; j++ {
for k := 0; k < nn; k++ {
for l := 0; l < 8; l++ {
key := []byte{byte(i), byte(j), byte(k), byte(l)}
keys = append(keys, key)

value := i*nn*nn*nn + j*nn*nn + k*nn + l
values = append(values, value)
for k := 0; k < 16; k++ {
key := string([]byte{byte(i), byte(j), byte(k << 4)})
keys = append(keys, key)

}
value := i*nn*nn + j*nn + k
values = append(values, int32(value))
}

}
}

trie, err := NewTrie(keys, values, true)
if err != nil {
t.Fatalf("create new trie")
}
st, err := NewSlimTrie(encode.I32{}, keys, values)
ta.Nil(err)

st, err := NewSlimTrie(encode.Int{}, nil, nil)
if err != nil {
t.Fatalf("expected no error but: %+v", err)
}
ta.Equal(int32(1+16+256+4096+65536), st.Children.Cnt)
ta.Equal(int32(0), st.Steps.Cnt)
ta.Equal(int32(mx), st.Leaves.Cnt)

err = st.LoadTrie(trie)
if err != nil {
t.Fatalf("error: %s", err)
}
for i := 0; i < nn; i++ {
for j := 0; j < nn; j++ {
for k := 0; k < 16; k++ {

if st.Children.Cnt != 1+16+256+4096 {
t.Fatalf("children cnt should be %d", 1+16+256+4096)
}
if st.Steps.Cnt != int32(0) {
t.Fatalf("Steps cnt should be %d", mx)
}
if st.Leaves.Cnt != int32(mx) {
t.Fatalf("leaves cnt should be %d", mx)
key := string([]byte{byte(i), byte(j), byte(k << 4)})

v, _ := st.Get(key)
ta.Equal(values[i*nn*16+j*16+k], v)
}
}
}
}

Expand Down Expand Up @@ -508,9 +501,9 @@ func TestRangeGet(t *testing.T) {
{"bce", 3, true},
{"c", 4, true}, // false positive
{"cde", 4, true},
{"cfe", 4, true}, // false positive
{"cff", 4, true}, // false positive
{"def", nil, false}, // false positive
{"cfe", 4, true}, // false positive
{"cff", 4, true}, // false positive
{"def", 4, true}, // false positive
}

st, err := NewSlimTrie(encode.Int{}, keys, values)
Expand All @@ -528,6 +521,38 @@ func TestRangeGet(t *testing.T) {
}
}

// TestSlimTrie_RangeGet_rangeindex_bug_2019_05_21 is a regression test for a
// RangeGet bug found by Liu Baohai.
//
// It builds a SlimTrie from keys whose neighbours share values and checks that
// RangeGet reports every one of those keys as found, with its own value.
func TestSlimTrie_RangeGet_rangeindex_bug_2019_05_21(t *testing.T) {

	ta := require.New(t)

	// Blank lines separate the groups of adjacent keys that carry the same
	// value; values below lists one entry per key, in the same order.
	keys := []string{
		"test/存界needleid00011end",

		"test/山我needleid00009end",
		"test/界世needleid00005end",
		"test/白我needleid00006end",

		"test/白测needleid00008end",
		"test/试世needleid00014end",
	}
	values := []int32{0, 1, 1, 1, 2, 2}

	st, err := NewSlimTrie(encode.I32{}, keys, values)
	ta.Nil(err)

	for idx := range keys {
		key, want := keys[idx], values[idx]

		got, ok := st.RangeGet(key)

		// The failure message uses a 1-based position of the key.
		ta.Equal(want, got, "%d-th: search: %+v", idx+1, key)
		ta.Equal(true, ok, "%d-th: search: %+v", idx+1, key)
	}
}

func TestNewSlimTrie(t *testing.T) {

st, err := NewSlimTrie(encode.Int{}, []string{"ab", "cd"}, []int{1, 2})
Expand Down
Loading