forked from cayleygraph/cayley
-
Notifications
You must be signed in to change notification settings - Fork 0
/
and_iterator_optimize.go
355 lines (315 loc) · 10.8 KB
/
and_iterator_optimize.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
// Copyright 2014 The Cayley Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package iterator
import (
"sort"
"github.com/barakmich/glog"
"github.com/google/cayley/graph"
)
// Perhaps the most tricky file in this entire module. Really a method on the
// And, but important enough to deserve its own file.
//
// Calling Optimize() on an And iterator, like any iterator, requires that we
// preserve the underlying meaning. However, the And has many choices, namely,
// which one of it's subiterators will be the branch that does the Next()ing,
// and which ordering of the remaining iterators is the most efficient. In
// short, this is where a lot of the query optimization happens, and there are
// many wins to be had here, as well as many bad bugs. The worst class of bug
// changes the meaning of the query. The second worst class makes things really
// slow.
//
// The good news is this: If Optimize() is never called (turned off, perhaps) we can
// be sure the results are as good as the query language called for.
//
// In short, tread lightly.
// Optimizes the And, by picking the most efficient way to Next() and
// Contains() its subiterators. For SQL fans, this is equivalent to JOIN.
func (it *And) Optimize() (graph.Iterator, bool) {
// First, let's get the slice of iterators, in order (first one is Next()ed,
// the rest are Contains()ed)
old := it.SubIterators()
// And call Optimize() on our subtree, replacing each one in the order we
// found them. it_list is the newly optimized versions of these, and changed
// is another list, of only the ones that have returned replacements and
// changed.
its := optimizeSubIterators(old)
// Close the replaced iterators (they ought to close themselves, but Close()
// is idempotent, so this just protects against any machinations).
closeIteratorList(old, nil)
// If we can find only one subiterator which is equivalent to this whole and,
// we can replace the And...
out := it.optimizeReplacement(its)
if out != nil {
// ...Move the tags to the replacement...
moveTagsTo(out, it)
// ...Close everyone except `out`, our replacement...
closeIteratorList(its, out)
// ...And return it.
return out, true
}
// And now, without changing any of the iterators, we reorder them. it_list is
// now a permutation of itself, but the contents are unchanged.
its = it.optimizeOrder(its)
its = materializeIts(its)
// Okay! At this point we have an optimized order.
// The easiest thing to do at this point is merely to create a new And iterator
// and replace ourselves with our (reordered, optimized) clone.
newAnd := NewAnd()
// Add the subiterators in order.
for _, sub := range its {
newAnd.AddSubIterator(sub)
}
// Move the tags hanging on us (like any good replacement).
newAnd.tags.CopyFrom(it)
newAnd.optimizeContains()
glog.V(3).Infoln(it.UID(), "became", newAnd.UID())
// And close ourselves but not our subiterators -- some may still be alive in
// the new And (they were unchanged upon calling Optimize() on them, at the
// start).
it.cleanUp()
return newAnd, true
}
// Closes a list of iterators, except the one passed in `except`. Closes all
// of the iterators in the list if `except` is nil.
func closeIteratorList(its []graph.Iterator, except graph.Iterator) {
for _, it := range its {
if it != except {
it.Close()
}
}
}
// Find if there is a single subiterator which is a valid replacement for this
// And.
func (_ *And) optimizeReplacement(its []graph.Iterator) graph.Iterator {
// If we were created with no SubIterators, we're as good as Null.
if len(its) == 0 {
return &Null{}
}
if len(its) == 1 {
// When there's only one iterator, there's only one choice.
return its[0]
}
// If any of our subiterators, post-optimization, are also Null, then
// there's no point in continuing the branch, we will have no results
// and we are null as well.
if hasAnyNullIterators(its) {
return &Null{}
}
// If we have one useful iterator, use that.
it := hasOneUsefulIterator(its)
if it != nil {
return it
}
return nil
}
// optimizeOrder(l) takes a list and returns a list, containing the same contents
// but with a new ordering, however it wishes.
func (it *And) optimizeOrder(its []graph.Iterator) []graph.Iterator {
var (
// bad contains iterators that can't be (efficiently) nexted, such as
// graph.Optional or graph.Not. Separate them out and tack them on at the end.
out, bad []graph.Iterator
best graph.Iterator
bestCost = int64(1 << 62)
)
// Find the iterator with the projected "best" total cost.
// Total cost is defined as The Next()ed iterator's cost to Next() out
// all of it's contents, and to Contains() each of those against everyone
// else.
for _, root := range its {
if _, canNext := root.(graph.Nexter); !canNext {
bad = append(bad, root)
continue
}
rootStats := root.Stats()
cost := rootStats.NextCost
for _, f := range its {
if _, canNext := f.(graph.Nexter); !canNext {
continue
}
if f == root {
continue
}
stats := f.Stats()
cost += stats.ContainsCost * (1 + (rootStats.Size / (stats.Size + 1)))
}
cost *= rootStats.Size
if glog.V(3) {
glog.V(3).Infoln("And:", it.UID(), "Root:", root.UID(), "Total Cost:", cost, "Best:", bestCost)
}
if cost < bestCost {
best = root
bestCost = cost
}
}
if glog.V(3) {
glog.V(3).Infoln("And:", it.UID(), "Choosing:", best.UID(), "Best:", bestCost)
}
// TODO(barakmich): Optimization of order need not stop here. Picking a smart
// Contains() order based on probability of getting a false Contains() first is
// useful (fail faster).
// Put the best iterator (the one we wish to Next()) at the front...
out = append(out, best)
// ... push everyone else after...
for _, it := range its {
if _, canNext := it.(graph.Nexter); !canNext {
continue
}
if it != best {
out = append(out, it)
}
}
// ...and finally, the difficult children on the end.
return append(out, bad...)
}
type byCost []graph.Iterator
func (c byCost) Len() int { return len(c) }
func (c byCost) Less(i, j int) bool { return c[i].Stats().ContainsCost < c[j].Stats().ContainsCost }
func (c byCost) Swap(i, j int) { c[i], c[j] = c[j], c[i] }
// optimizeContains() creates an alternate check list, containing the same contents
// but with a new ordering, however it wishes.
func (it *And) optimizeContains() {
// GetSubIterators allocates, so this is currently safe.
// TODO(kortschak) Reuse it.checkList if possible.
// This involves providing GetSubIterators with a slice to fill.
// Generally this is a worthwhile thing to do in other places as well.
it.checkList = it.SubIterators()
sort.Sort(byCost(it.checkList))
}
// If we're replacing ourselves by a single iterator, we need to grab the
// result tags from the iterators that, while still valid and would hold
// the same values as this and, are not going to stay.
// getSubTags() returns a map of the tags for all the subiterators.
func (it *And) getSubTags() map[string]struct{} {
tags := make(map[string]struct{})
for _, sub := range it.SubIterators() {
for _, tag := range sub.Tagger().Tags() {
tags[tag] = struct{}{}
}
}
for _, tag := range it.tags.Tags() {
tags[tag] = struct{}{}
}
return tags
}
// moveTagsTo() gets the tags for all of the src's subiterators and the
// src itself, and moves them to dst.
func moveTagsTo(dst graph.Iterator, src *And) {
tags := src.getSubTags()
for _, tag := range dst.Tagger().Tags() {
if _, ok := tags[tag]; ok {
delete(tags, tag)
}
}
dt := dst.Tagger()
for k := range tags {
dt.Add(k)
}
}
// optimizeSubIterators(l) takes a list of iterators and calls Optimize() on all
// of them. It returns two lists -- the first contains the same list as l, where
// any replacements are made by Optimize() and the second contains the originals
// which were replaced.
func optimizeSubIterators(its []graph.Iterator) []graph.Iterator {
var optIts []graph.Iterator
for _, it := range its {
o, changed := it.Optimize()
if changed {
optIts = append(optIts, o)
} else {
optIts = append(optIts, it.Clone())
}
}
return optIts
}
// Check a list of iterators for any Null iterators.
func hasAnyNullIterators(its []graph.Iterator) bool {
for _, it := range its {
if it.Type() == graph.Null {
return true
}
}
return false
}
// There are two "not-useful" iterators -- namely graph.Null which returns
// nothing, and graph.All which returns everything. Particularly, we want
// to see if we're intersecting with a bunch of graph.All iterators, and,
// if we are, then we have only one useful iterator.
func hasOneUsefulIterator(its []graph.Iterator) graph.Iterator {
usefulCount := 0
var usefulIt graph.Iterator
for _, it := range its {
switch it.Type() {
case graph.Null, graph.All:
continue
case graph.Optional:
// Optional is weird -- it's not useful, but we can't optimize
// away from it. Therefore, we skip this optimization
// if we see one.
return nil
default:
usefulCount++
usefulIt = it
}
}
if usefulCount == 1 {
return usefulIt
}
return nil
}
func materializeIts(its []graph.Iterator) []graph.Iterator {
var out []graph.Iterator
allStats := getStatsForSlice(its)
out = append(out, its[0])
for _, it := range its[1:] {
stats := it.Stats()
if stats.Size*stats.NextCost < (stats.ContainsCost * (1 + (stats.Size / (allStats.Size + 1)))) {
if graph.Height(it, graph.Materialize) > 10 {
out = append(out, NewMaterialize(it))
continue
}
}
out = append(out, it)
}
return out
}
func getStatsForSlice(its []graph.Iterator) graph.IteratorStats {
primary := its[0]
primaryStats := primary.Stats()
ContainsCost := primaryStats.ContainsCost
NextCost := primaryStats.NextCost
Size := primaryStats.Size
for _, sub := range its[1:] {
stats := sub.Stats()
NextCost += stats.ContainsCost * (1 + (primaryStats.Size / (stats.Size + 1)))
ContainsCost += stats.ContainsCost
if Size > stats.Size {
Size = stats.Size
}
}
return graph.IteratorStats{
ContainsCost: ContainsCost,
NextCost: NextCost,
Size: Size,
}
}
// and.Stats() lives here in and-iterator-optimize.go because it may
// in the future return different statistics based on how it is optimized.
// For now, however, it's pretty static.
func (it *And) Stats() graph.IteratorStats {
stats := getStatsForSlice(it.SubIterators())
stats.Next = it.runstats.Next
stats.Contains = it.runstats.Contains
return stats
}