From 3af030a9ce6a9ece65deed3dd2b3bc1ebfc852c5 Mon Sep 17 00:00:00 2001 From: drdr xp Date: Tue, 21 May 2019 20:45:25 +0800 Subject: [PATCH 1/3] new-feature: trie: add removeSameLeaf() to remove leaves with the same value --- trie/trie.go | 57 ++++++++++++++++++++++++++++++++++++++++++++++- trie/trie_test.go | 41 ++++++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+), 1 deletion(-) diff --git a/trie/trie.go b/trie/trie.go index 81766e8c..f78718fd 100644 --- a/trie/trie.go +++ b/trie/trie.go @@ -14,6 +14,7 @@ type Node struct { squash bool + // TODO inner node count. fix it NodeCnt int } @@ -26,7 +27,7 @@ const leafBranch = -1 // key. func NewTrie(keys [][]byte, values interface{}, squash bool) (root *Node, err error) { - root = &Node{Children: make(map[int]*Node), Step: 1, squash: squash} + root = &Node{Children: make(map[int]*Node), Step: 1, squash: squash, NodeCnt: 1} if keys == nil { return @@ -81,6 +82,60 @@ func (r *Node) Squash() int { return cnt } +// removeSameLeaf removes leaf that has the same value as preceding leaf. +// +// a ------->e =1 +// `>b------>f =2 +// `>c->d->g =2 // "g" and "d" is removed, c has other child and is kept. +// `--->h =3 +// +// Since 0.5.5 +func (r *Node) removeSameLeaf() { + + var prevValue interface{} = nil + + // wrapped as a generalized tree + s := &trieStringly{tnode: r} + + DepthFirst(s, + func(t Tree, parent, branch, node interface{}) { + + n := node.(*Node) + needRemove := false + + v, isLeaf := t.LeafVal(node) + if isLeaf { + if v == prevValue { + // same value no need to store + needRemove = true + } else { + prevValue = v + } + } else { + if len(n.Branches) == 0 { + needRemove = true + } + } + + if needRemove && parent != nil && branch != nil { + p := parent.(*Node) + b := branch.(int) + + delete(p.Children, b) + + for i, bb := range p.Branches { + if bb == b { + p.Branches = append(p.Branches[:i], p.Branches[i+1:]...) + } + } + if !isLeaf { + r.NodeCnt-- + } + + } + }) +} + // Search for `key` in a Trie. 
// // It returns 3 values of: diff --git a/trie/trie_test.go b/trie/trie_test.go index d104985e..4c8f0b56 100644 --- a/trie/trie_test.go +++ b/trie/trie_test.go @@ -6,6 +6,7 @@ import ( "github.com/openacid/errors" "github.com/openacid/slim/benchhelper" + "github.com/stretchr/testify/require" ) func TestTrie(t *testing.T) { @@ -527,3 +528,43 @@ func TestToStrings(t *testing.T) { t.Fatalf("expect: \n%v\n; but: \n%v\n", expect, trie.String()) } } + +func TestTrie_removeSameLeaf(t *testing.T) { + + ta := require.New(t) + + var keys = [][]byte{ + {'a', 'b', 'c'}, + {'a', 'b', 'c', 'd'}, + {'a', 'b', 'd'}, + {'a', 'b', 'd', 'e'}, + {'b', 'c'}, + {'b', 'c', 'd'}, + {'b', 'c', 'd', 'e'}, + {'c', 'd', 'e'}, + } + var values = []int{0, 0, 0, 3, 4, 5, 5, 5} + + want := ` +*2 +-097-> + -098->*2 + -099-> + -00$->=0 + -100-> + -101-> + -00$->=3 +-098-> + -099->*2 + -00$->=4 + -100-> + -00$->=5`[1:] + + trie, err := NewTrie(keys, values, false) + ta.Nil(err) + + trie.removeSameLeaf() + + ta.Equal(want, trie.String()) + ta.Equal(9, trie.NodeCnt, "non-leaf node count") +} From 2e6fb7c70bcd983eb0eae469930a3dde0ddf224d Mon Sep 17 00:00:00 2001 From: drdr xp Date: Tue, 21 May 2019 22:01:10 +0800 Subject: [PATCH 2/3] api-change: slimtrie: range based SlimTrie must provides all keys --- README.md | 2 + index/example_range_test.go | 2 + trie/example_slimtrie_range_test.go | 65 ++++++++++++----------------- trie/slimtrie.go | 45 ++++++-------------- trie/slimtrie_test.go | 38 +++++++++++++++-- 5 files changed, 78 insertions(+), 74 deletions(-) diff --git a/README.md b/README.md index 387420bb..830617f2 100644 --- a/README.md +++ b/README.md @@ -379,6 +379,8 @@ func Example_indexRanges() { // Alison | {Key: "Aaron", Offset: 0}, + {Key: "Agatha", Offset: 0}, + {Key: "Al", Offset: 0}, {Key: "Albert", Offset: 0}, {Key: "Alexander", Offset: 31}, diff --git a/index/example_range_test.go b/index/example_range_test.go index e04c6883..46879c30 100644 --- a/index/example_range_test.go +++ 
b/index/example_range_test.go @@ -43,6 +43,8 @@ func Example_indexRanges() { // Alison | {Key: "Aaron", Offset: 0}, + {Key: "Agatha", Offset: 0}, + {Key: "Al", Offset: 0}, {Key: "Albert", Offset: 0}, {Key: "Alexander", Offset: 31}, diff --git a/trie/example_slimtrie_range_test.go b/trie/example_slimtrie_range_test.go index 4c3db0d8..c7f28b9c 100644 --- a/trie/example_slimtrie_range_test.go +++ b/trie/example_slimtrie_range_test.go @@ -10,30 +10,17 @@ func ExampleSlimTrie_RangeGet() { // To index a map of key range to value with SlimTrie is very simple: // - // Just give two adjacent keys the same value, then SlimTrie knows these - // keys belong to a "range". - // These two keys are left and right boundaries of a range, and are both - // inclusive. - // - // In this example we: - // - // map [abc, abcd] to 1 - // map [bc, bc] to 2 // this range has only one key in it. - // map [bcd, bce] to 3 - // - // With RangeGet() to get any key that "abc" <= key <= "abcd", such as - // "abc1", "abc2"... should return "1". - // - // False Positive - // - // Just like Bloomfilter, SlimTrie does not contains full information of keys, - // thus there could be a false positive return: - // It returns some value and "true" but the key is not in there. + // Give a set of keys the same value, and use RangeGet() instead of Get(). + // SlimTrie does not store branches for adjacent leaves with the same value. 
keys := []string{ - "abc", "abcd", + "abc", + "abcd", + "bc", - "bcd", "bce", + + "bcd", + "bce", } values := []int{ 1, 1, @@ -49,23 +36,23 @@ func ExampleSlimTrie_RangeGet() { key string msg string }{ - {"ab", "smaller than any"}, + {"ab", "FALSE POSITIVE: all known key starts with a are mapped to 1"}, - {"abc", "in range [abc, abcd]"}, - {"abc1", "in range [abc, abcd]"}, - {"abc2", "in range [abc, abcd]"}, - {"abcd", "in range [abc, abcd]"}, + {"abc", "in range"}, + {"abc1", "FALSE POSITIVE"}, + {"abc2", "FALSE POSITIVE"}, + {"abcd", "in range"}, {"abcde", "FALSE POSITIVE: a suffix of abcd"}, - {"acc", "FALSE POSITIVE: not in range [abc, abcd]"}, + {"acc", "FALSE POSITIVE"}, {"bc", "in single key range [bc]"}, - {"bc1", "not in single key range [bc]"}, + {"bc1", "FALSE POSITIVE"}, - {"bcd1", "in range [bcd, bce]"}, + {"bcd1", "FALSE POSITIVE"}, - {"def", "greater than any"}, + {"def", "FALSE POSITIVE"}, } for _, c := range cases { @@ -74,15 +61,15 @@ func ExampleSlimTrie_RangeGet() { } // Output: - // ab false: smaller than any - // abc 1 true : in range [abc, abcd] - // abc1 1 true : in range [abc, abcd] - // abc2 1 true : in range [abc, abcd] - // abcd 1 true : in range [abc, abcd] + // ab 1 true : FALSE POSITIVE: all known key starts with a are mapped to 1 + // abc 1 true : in range + // abc1 1 true : FALSE POSITIVE + // abc2 1 true : FALSE POSITIVE + // abcd 1 true : in range // abcde 1 true : FALSE POSITIVE: a suffix of abcd - // acc 1 true : FALSE POSITIVE: not in range [abc, abcd] + // acc 1 true : FALSE POSITIVE // bc 2 true : in single key range [bc] - // bc1 false: not in single key range [bc] - // bcd1 3 true : in range [bcd, bce] - // def false: greater than any + // bc1 2 true : FALSE POSITIVE + // bcd1 3 true : FALSE POSITIVE + // def 3 true : FALSE POSITIVE } diff --git a/trie/slimtrie.go b/trie/slimtrie.go index 6836d9ad..37e9bb23 100644 --- a/trie/slimtrie.go +++ b/trie/slimtrie.go @@ -15,20 +15,8 @@ // Actually besides as a key value map, // to 
index a map of key range to value with SlimTrie is also very simple: // -// Just give two adjacent keys the same value, then SlimTrie -// knows these keys belong to a "range". -// These two keys are left and right boundaries of a range, and are both -// inclusive. -// -// // a to g --> 1 -// // h --> 2 -// st, err := NewSlimTrie(encode.Int{}, []string{"a", "g", "h"}, []int{1, 1, 2}) -// -// st.Get("a") // 1, true A normal key-value Get() -// st.Get("c") // nil, false A key-value Get() got nothing. -// st.RangeGet("c") // 1, true A range get got 1 -// st.RangeGet("g") // 1, true -// st.RangeGet("h") // 2, true +// Give a set of keys the same value, and use RangeGet() instead of Get(). +// SlimTrie does not store branches for adjacent leaves with the same value. // // See SlimTrie.RangeGet . // @@ -123,6 +111,8 @@ func (st *SlimTrie) loadBytes(keys [][]byte, values interface{}) (err error) { return err } + trie.removeSameLeaf() + err = st.LoadTrie(trie) return err } @@ -230,11 +220,16 @@ func (st *SlimTrie) LoadTrie(root *Node) (err error) { // Since 0.4.3 func (st *SlimTrie) RangeGet(key string) (interface{}, bool) { - lID, eqID, rID := st.searchID(key) + lID, eqID, _ := st.searchID(key) // an "equal" macth means key is a prefix of either start or end of a range. if eqID != -1 { - return st.Leaves.Get(eqID) + v, found := st.Leaves.Get(eqID) + if found { + return v, found + } + + // else: maybe matched at an inner node. } // key is smaller than any range-start or range-end. @@ -242,24 +237,10 @@ func (st *SlimTrie) RangeGet(key string) (interface{}, bool) { return nil, false } - // key is greater than any range-start or range-end. - if rID == -1 { - return nil, false - } + // Preceding value is the start of this range. + // It might be a false-positive lVal, _ := st.Leaves.Get(lID) - rVal, _ := st.Leaves.Get(rID) - - // If left-value != right-value, the key is between a range-end and next - // range-start. 
- if lVal != rVal { - return nil, false - } - - // If range[i].end == range[i+1].start, it is a false positive. - // SlimTrie can not distinguish this from a positive match. - // - // Otherwise, lVal and rVal must be the start and end of a single range. return lVal, true } diff --git a/trie/slimtrie_test.go b/trie/slimtrie_test.go index 83f47163..65202ca3 100644 --- a/trie/slimtrie_test.go +++ b/trie/slimtrie_test.go @@ -508,9 +508,9 @@ func TestRangeGet(t *testing.T) { {"bce", 3, true}, {"c", 4, true}, // false positive {"cde", 4, true}, - {"cfe", 4, true}, // false positive - {"cff", 4, true}, // false positive - {"def", nil, false}, // false positive + {"cfe", 4, true}, // false positive + {"cff", 4, true}, // false positive + {"def", 4, true}, // false positive } st, err := NewSlimTrie(encode.Int{}, keys, values) @@ -528,6 +528,38 @@ func TestRangeGet(t *testing.T) { } } +func TestSlimTrie_RangeGet_rangeindex_bug_2019_05_21(t *testing.T) { + + // RangeGet has bug found by Liu Baohai: + + ta := require.New(t) + + keys := []string{ + "test/存界needleid00011end", + + "test/山我needleid00009end", + "test/界世needleid00005end", + "test/白我needleid00006end", + + "test/白测needleid00008end", + "test/试世needleid00014end", + } + values := []int32{ + 0, + 1, 1, 1, + 2, 2, + } + + st, err := NewSlimTrie(encode.I32{}, keys, values) + ta.Nil(err) + + for i, c := range keys { + rst, found := st.RangeGet(c) + ta.Equal(values[i], rst, "%d-th: search: %+v", i+1, c) + ta.Equal(true, found, "%d-th: search: %+v", i+1, c) + } +} + func TestNewSlimTrie(t *testing.T) { st, err := NewSlimTrie(encode.Int{}, []string{"ab", "cd"}, []int{1, 2}) From fe98e14906000c799ee7bea76a24f5e3b9947a3f Mon Sep 17 00:00:00 2001 From: drdr xp Date: Tue, 21 May 2019 22:36:37 +0800 Subject: [PATCH 3/3] new-feature: slimtrie: max key limit extends to 2^31 --- trie/errors.go | 2 +- trie/slimtrie.go | 4 +-- trie/slimtrie_test.go | 59 +++++++++++++++++++------------------------ 3 files changed, 29 insertions(+), 36 
deletions(-) diff --git a/trie/errors.go b/trie/errors.go index 9392de7e..cc1501e7 100644 --- a/trie/errors.go +++ b/trie/errors.go @@ -5,7 +5,7 @@ import "github.com/openacid/errors" var ( // ErrTooManyTrieNodes indicates the number of trie nodes(not number of // keys) exceeded. - ErrTooManyTrieNodes = errors.New("exceeds max node count=65536") + ErrTooManyTrieNodes = errors.New("exceeds max node count=2^31-1") // ErrTrieBranchValueOverflow indicate input key consists of a word greater // than the max 4-bit word(0x0f). diff --git a/trie/slimtrie.go b/trie/slimtrie.go index 37e9bb23..a34b37d8 100644 --- a/trie/slimtrie.go +++ b/trie/slimtrie.go @@ -43,8 +43,8 @@ const ( WordMask = 0xf // LeafWord is a special value to indicate a leaf node in a Trie. LeafWord = byte(0x10) - // MaxNodeCnt is the max number of node(including leaf and inner node). - MaxNodeCnt = 65536 + // MaxNodeCnt is the max number of nodes. Node id in SlimTrie is int32. + MaxNodeCnt = (1 << 31) - 1 ) // SlimTrie is a space efficient Trie index. 
diff --git a/trie/slimtrie_test.go b/trie/slimtrie_test.go index 65202ca3..6db0155e 100644 --- a/trie/slimtrie_test.go +++ b/trie/slimtrie_test.go @@ -98,51 +98,44 @@ func unsquashedIntSlimTrie(t *testing.T, keys []string, values interface{}) *Sli func TestMaxKeys(t *testing.T) { - nn := 16 - mx := 32768 + ta := require.New(t) - keys := make([][]byte, 0, mx) - values := make([]interface{}, 0, mx) + nn := 256 + // a million keys + mx := nn * nn * 16 + + keys := make([]string, 0, mx) + values := make([]int32, 0, mx) for i := 0; i < nn; i++ { for j := 0; j < nn; j++ { - for k := 0; k < nn; k++ { - for l := 0; l < 8; l++ { - key := []byte{byte(i), byte(j), byte(k), byte(l)} - keys = append(keys, key) + for k := 0; k < 16; k++ { + key := string([]byte{byte(i), byte(j), byte(k << 4)}) + keys = append(keys, key) - value := i*nn*nn*nn + j*nn*nn + k*nn + l - values = append(values, value) - - } + value := i*nn*nn + j*nn + k + values = append(values, int32(value)) } - } } - trie, err := NewTrie(keys, values, true) - if err != nil { - t.Fatalf("create new trie") - } + st, err := NewSlimTrie(encode.I32{}, keys, values) + ta.Nil(err) - st, err := NewSlimTrie(encode.Int{}, nil, nil) - if err != nil { - t.Fatalf("expected no error but: %+v", err) - } + ta.Equal(int32(1+16+256+4096+65536), st.Children.Cnt) + ta.Equal(int32(0), st.Steps.Cnt) + ta.Equal(int32(mx), st.Leaves.Cnt) - err = st.LoadTrie(trie) - if err != nil { - t.Fatalf("error: %s", err) - } + for i := 0; i < nn; i++ { + for j := 0; j < nn; j++ { + for k := 0; k < 16; k++ { - if st.Children.Cnt != 1+16+256+4096 { - t.Fatalf("children cnt should be %d", 1+16+256+4096) - } - if st.Steps.Cnt != int32(0) { - t.Fatalf("Steps cnt should be %d", mx) - } - if st.Leaves.Cnt != int32(mx) { - t.Fatalf("leaves cnt should be %d", mx) + key := string([]byte{byte(i), byte(j), byte(k << 4)}) + + v, _ := st.Get(key) + ta.Equal(values[i*nn*16+j*16+k], v) + } + } } }