From 840f8f26d17b3c80e50cf25975ff9242d2a531fb Mon Sep 17 00:00:00 2001 From: drdr xp Date: Mon, 27 May 2019 23:45:17 +0800 Subject: [PATCH] refactor: slimtrie: create without trie, speed up by 30 times; add slimtrie building benchmark --- trie/bench_slimtrie_new_test.go | 29 ++++ trie/errors.go | 2 +- trie/slimtrie.go | 225 +++++++++++++++++++++++++++++--- trie/slimtrie_test.go | 76 +++++++++-- 4 files changed, 300 insertions(+), 32 deletions(-) create mode 100644 trie/bench_slimtrie_new_test.go diff --git a/trie/bench_slimtrie_new_test.go b/trie/bench_slimtrie_new_test.go new file mode 100644 index 00000000..176193d6 --- /dev/null +++ b/trie/bench_slimtrie_new_test.go @@ -0,0 +1,29 @@ +package trie_test + +import ( + "testing" + + "github.com/openacid/slim/encode" + "github.com/openacid/slim/trie" +) + +var Output int + +func BenchmarkNewSlimTrie(b *testing.B) { + keys := words2 + values := make([]uint32, len(keys)) + for i := 0; i < len(keys); i++ { + values[i] = uint32(i) + } + b.ResetTimer() + var s int + for i := 0; i < b.N; i++ { + st, err := trie.NewSlimTrie(encode.U32{}, keys, values) + if err != nil { + panic(err) + } + s += int(st.Children.Cnt) + } + + Output = s +} diff --git a/trie/errors.go b/trie/errors.go index cc1501e7..c2d8e043 100644 --- a/trie/errors.go +++ b/trie/errors.go @@ -1,6 +1,6 @@ package trie -import "github.com/openacid/errors" +import "errors" var ( // ErrTooManyTrieNodes indicates the number of trie nodes(not number of diff --git a/trie/slimtrie.go b/trie/slimtrie.go index 458332c0..db8b5b40 100644 --- a/trie/slimtrie.go +++ b/trie/slimtrie.go @@ -29,7 +29,9 @@ package trie import ( "bytes" + "fmt" "math/bits" + "reflect" "github.com/openacid/errors" "github.com/openacid/low/bitword" @@ -85,40 +87,223 @@ type SlimTrie struct { // // Since 0.2.0 func NewSlimTrie(e encode.Encoder, keys []string, values interface{}) (*SlimTrie, error) { + return newSlimTrie(e, keys, values) +} - st := &SlimTrie{ - Steps: array.U16{}, - Leaves: array.Array{}, +type subset struct { + keyStart int + keyEnd int + fromIndex int +} + +func newSlimTrie(e encode.Encoder, keys []string, values interface{}) (*SlimTrie, error) { + + n := len(keys) + if n == 0 { + return emptySlimTrie(e), nil + } + + for i := 0; i < len(keys)-1; i++ { + if keys[i] >= keys[i+1] { + return nil, errors.Wrapf(ErrKeyOutOfOrder, + "keys[%d] >= keys[%d] %s %s", i, i+1, keys[i], keys[i+1]) + } + } + + rvals := checkValues(reflect.ValueOf(values), n) + tokeep := newValueToKeep(rvals) + + childi := make([]int32, 0, n) + childv := make([]uint64, 0, n) + + stepi := make([]int32, 0, n) + stepv := make([]uint16, 0, n) + + leavesi := make([]int32, 0, n) + leavesv := make([]interface{}, 0, n) + + queue := make([]subset, 0, n*2) + queue = append(queue, subset{0, n, 0}) + + for i := 0; i < len(queue); i++ { + nid := int32(i) + o := queue[i] + s, e := o.keyStart, o.keyEnd + + // single key, it is a leaf + if e-s == 1 { + if tokeep[s] { + leavesi = append(leavesi, nid) + leavesv = append(leavesv, getV(rvals, s)) + } + continue + } + + // need to create an inner node + + prefI := prefixIndex(keys[s:e], o.fromIndex) + + // the first key is a prefix of all other keys, which makes it a leaf. + isFirstKeyALeaf := len(keys[s])*8/4 == prefI + if isFirstKeyALeaf { + if tokeep[s] { + leavesi = append(leavesi, nid) + leavesv = append(leavesv, getV(rvals, s)) + } + s += 1 + } + + // create inner node from following keys + + labels, labelBitmap := getLabels(keys[s:e], prefI, tokeep[s:e]) + + hasChildren := len(labels) > 0 + + if hasChildren { + childi = append(childi, nid) + childv = append(childv, uint64(labelBitmap)) + + // put keys with the same starting word to queue. + + for _, label := range labels { + + // Find the first key starting with label + for ; s < e; s++ { + word := bw4.Get(keys[s], prefI) + if word == label { + break + } + } + + // Continue looking for the first key not starting with label + var j int + for j = s + 1; j < e; j++ { + word := bw4.Get(keys[j], prefI) + if word != label { + break + } + } + + p := subset{ + keyStart: s, + keyEnd: j, + fromIndex: prefI + 1, // skip the label word + } + queue = append(queue, p) + s = j + } + + // 1 for the label word at parent node + step := (prefI - o.fromIndex) + 1 + if step > 0xffff { + panic(fmt.Sprintf("step=%d is too large. must < 2^16", step)) + } + + // By default to move 1 step forward, thus no need to store 1 + hasStep := step > 1 + if hasStep { + stepi = append(stepi, nid) + stepv = append(stepv, uint16(step)) + } + } + } + + ch, err := array.NewBitmap16(childi, childv, 16) + if err != nil { + return nil, err } - st.Leaves.EltEncoder = e - if keys != nil { - return st, st.load(keys, values) + steps, err := array.NewU16(stepi, stepv) + if err != nil { + return nil, err } + leaves := array.Array{} + leaves.EltEncoder = e + + err = leaves.Init(leavesi, leavesv) + if err != nil { + return nil, errors.Wrapf(err, "failure init leaves") + } + + st := &SlimTrie{ + Children: *ch, + Steps: *steps, + Leaves: leaves, + } return st, nil } -// load Loads keys and values and builds a SlimTrie. -// -// values must be a slice of data-type of fixed size or compatible with -// SlimTrie.Leaves.Encoder. -func (st *SlimTrie) load(keys []string, values interface{}) (err error) { - ks := bw4.FromStrs(keys) - return st.load4bitWords(ks, values) +func checkValues(rvals reflect.Value, n int) reflect.Value { + + if rvals.Kind() != reflect.Slice { + panic("values is not a slice") + } + + valn := rvals.Len() + + if n != valn { + panic(fmt.Sprintf("len(keys) != len(values): %d, %d", n, valn)) + } + return rvals + } -func (st *SlimTrie) load4bitWords(keys [][]byte, values interface{}) (err error) { +// newValueToKeep creates a slice indicating which key to keep. +// Value of key[i+1] with the same value with key[i] do not need to keep. +func newValueToKeep(rvals reflect.Value) []bool { - trie, err := NewTrie(keys, values, true) - if err != nil { - return err + n := rvals.Len() + + tokeep := make([]bool, n) + tokeep[0] = true + + for i := 0; i < n-1; i++ { + tokeep[i+1] = getV(rvals, i+1) != getV(rvals, i) + } + return tokeep +} + +func getV(reflectSlice reflect.Value, i int) interface{} { + return reflectSlice.Index(i).Interface() +} + +func emptySlimTrie(e encode.Encoder) *SlimTrie { + st := &SlimTrie{} + st.Leaves.EltEncoder = e + return st +} + +func prefixIndex(keys []string, from int) int { + if len(keys) == 1 { + return len(keys[0]) } - trie.removeSameLeaf() + n := len(keys) - err = st.LoadTrie(trie) - return err + end := bw4.FirstDiff(keys[0], keys[n-1], from, -1) + return end +} + +func getLabels(keys []string, from int, tokeep []bool) ([]byte, uint16) { + labels := make([]byte, 0, 1<<4) + bitmap := uint16(0) + + for i, k := range keys { + + if !tokeep[i] { + continue + } + + word := bw4.Get(k, from) + b := uint16(1) << word + if bitmap&b == 0 { + labels = append(labels, word) + bitmap |= b + } + + } + return labels, bitmap } // LoadTrie compress a standard Trie and store compressed data in it. diff --git a/trie/slimtrie_test.go b/trie/slimtrie_test.go index 8507650c..7302f561 100644 --- a/trie/slimtrie_test.go +++ b/trie/slimtrie_test.go @@ -460,7 +460,9 @@ func TestSlimTrieSearch(t *testing.T) { } } -func TestRangeGet(t *testing.T) { +func TestRangeGet_search(t *testing.T) { + + ta := require.New(t) keys := []string{ "abc", @@ -506,9 +508,19 @@ func TestRangeGet(t *testing.T) { } st, err := NewSlimTrie(encode.Int{}, keys, values) - if err != nil { - t.Fatalf("expected no error but: %+v", err) - } + ta.Nil(err) + + wantstr := ` +#000+2*3 + -001->#001+4*2 + -003->#004=0 + -004->#005=1 + -002->#002+3=2 + -006->#006 + -004->#007=3 + -003->#003=4`[1:] + ta.Equal(wantstr, st.String()) + for i, c := range searches { rst, found := st.RangeGet(c.key) if c.want != rst { @@ -519,6 +531,43 @@ func TestRangeGet(t *testing.T) { } } } +func TestSlimTrie_RangeGet_leafNotToKeep(t *testing.T) { + + ta := require.New(t) + + // case: a key not to keep and is a leaf: "Al" + keys := []string{ + "Aaron", + "Agatha", + "Al", + "Albert", + + "Alexander", + "Alison", + } + values := []int32{ + 0, 0, 0, 0, + 1, 1, + } + + st, err := NewSlimTrie(encode.I32{}, keys, values) + ta.Nil(err) + + wantstr := ` +#000+4*2 + -001->#001=0 + -012->#002 + -006->#003 + -005->#004=1`[1:] + + ta.Equal(wantstr, st.String()) + + for i, c := range keys { + rst, found := st.RangeGet(c) + ta.Equal(values[i], rst, "%d-th: search: %+v", i+1, c) + ta.Equal(true, found, "%d-th: search: %+v", i+1, c) + } +} func TestSlimTrie_RangeGet_rangeindex_bug_2019_05_21(t *testing.T) { @@ -545,6 +594,17 @@ func TestSlimTrie_RangeGet_rangeindex_bug_2019_05_21(t *testing.T) { st, err := NewSlimTrie(encode.I32{}, keys, values) ta.Nil(err) + wantstr := ` +#000+12*2 + -005->#001*2 + -010->#003=0 + -011->#004=1 + -007->#002+2 + -009->#005+5 + -011->#006=2`[1:] + + ta.Equal(wantstr, st.String()) + for i, c := range keys { rst, found := st.RangeGet(c) ta.Equal(values[i], rst, "%d-th: search: %+v", i+1, c) @@ -597,21 +657,15 @@ func TestNewSlimTrie(t *testing.T) { } func TestSlimTrieError(t *testing.T) { - cases := []struct { keys []string values []int wanterr error }{ - { - []string{"a", "a"}, - []int{1}, - ErrKVLenNotMatch, - }, { []string{"a", "a"}, []int{1, 2}, - ErrDuplicateKeys, + ErrKeyOutOfOrder, }, { []string{"ab", "a"},