From d68cada7db07688959a08a52022c853ded6fa1a4 Mon Sep 17 00:00:00 2001 From: Nikita Melekhin Date: Fri, 3 Sep 2021 18:28:14 +0300 Subject: [PATCH] cmd/compile: boost inlining into FORs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As Than McIntosh already mentioned, it's a common practice to boost inlining into FORs, since the call site could be "hotter". This patch implements this functionality. The implementation uses a stack of FORs to recognise calls which are in a loop. The stack is maintained while inlnode runs and contains information about the ancestor FORs of the current node in inlnode. A FOR whose cost is >= inlineBigForCost (105) is considered "big". In such FORs no boost is applied. Updates #17566 The following are the results on go1; binary size did not increase significantly: 10454800 -> 10475120, which is less than 0.3%. goos: linux goarch: amd64 pkg: test/bench/go1 cpu: Intel(R) Xeon(R) Gold 6230N CPU @ 2.30GHz name old time/op new time/op delta BinaryTree17-8 2.15s ± 1% 2.17s ± 1% ~ (p=0.065 n=6+6) Fannkuch11-8 2.70s ± 0% 2.69s ± 0% -0.25% (p=0.010 n=6+4) FmtFprintfEmpty-8 31.9ns ± 0% 31.4ns ± 0% -1.61% (p=0.008 n=5+5) FmtFprintfString-8 57.0ns ± 0% 57.1ns ± 0% +0.26% (p=0.013 n=6+5) FmtFprintfInt-8 65.2ns ± 0% 63.9ns ± 0% -1.95% (p=0.008 n=5+5) FmtFprintfIntInt-8 103ns ± 0% 102ns ± 0% -1.01% (p=0.000 n=5+4) FmtFprintfPrefixedInt-8 119ns ± 0% 118ns ± 0% -0.50% (p=0.008 n=5+5) FmtFprintfFloat-8 169ns ± 0% 174ns ± 0% +2.75% (p=0.008 n=5+5) FmtManyArgs-8 445ns ± 0% 447ns ± 0% +0.46% (p=0.002 n=6+6) GobDecode-8 4.37ms ± 1% 4.40ms ± 0% +0.62% (p=0.009 n=6+6) GobEncode-8 3.07ms ± 0% 3.04ms ± 0% -0.78% (p=0.004 n=5+6) Gzip-8 195ms ± 0% 195ms ± 0% ~ (p=0.429 n=5+6) Gunzip-8 28.2ms ± 0% 28.2ms ± 0% ~ (p=0.662 n=5+6) HTTPClientServer-8 45.0µs ± 1% 45.4µs ± 1% ~ (p=0.093 n=6+6) JSONEncode-8 8.01ms ± 0% 8.03ms ± 0% +0.31% (p=0.008 n=5+5) JSONDecode-8 35.3ms ± 1% 35.1ms ± 0% -0.72% (p=0.008 n=5+5) Mandelbrot200-8 
4.50ms ± 0% 4.49ms ± 1% ~ (p=0.937 n=6+6) GoParse-8 3.03ms ± 1% 3.00ms ± 1% ~ (p=0.180 n=6+6) RegexpMatchEasy0_32-8 55.4ns ± 0% 53.2ns ± 3% -3.92% (p=0.004 n=5+6) RegexpMatchEasy0_1K-8 178ns ± 0% 175ns ± 1% -1.57% (p=0.004 n=5+6) RegexpMatchEasy1_32-8 50.1ns ± 0% 48.3ns ± 5% ~ (p=0.082 n=5+6) RegexpMatchEasy1_1K-8 271ns ± 1% 262ns ± 1% -3.26% (p=0.004 n=6+5) RegexpMatchMedium_32-8 949ns ± 0% 886ns ± 7% ~ (p=0.329 n=5+6) RegexpMatchMedium_1K-8 27.1µs ± 7% 28.1µs ± 6% ~ (p=0.394 n=6+6) RegexpMatchHard_32-8 1.28µs ± 2% 1.29µs ± 0% ~ (p=0.056 n=6+6) RegexpMatchHard_1K-8 38.5µs ± 0% 38.4µs ± 0% -0.25% (p=0.009 n=6+5) Revcomp-8 397ms ± 0% 396ms ± 0% ~ (p=0.429 n=6+5) Template-8 48.1ms ± 1% 48.1ms ± 0% ~ (p=0.222 n=5+5) TimeParse-8 213ns ± 0% 213ns ± 0% ~ (p=0.210 n=4+6) TimeFormat-8 295ns ± 1% 259ns ± 0% -12.22% (p=0.002 n=6+6) [Geo mean] 40.5µs 40.1µs -1.00% name old speed new speed delta GobDecode-8 176MB/s ± 1% 174MB/s ± 0% -0.61% (p=0.009 n=6+6) GobEncode-8 250MB/s ± 0% 252MB/s ± 0% +0.79% (p=0.004 n=5+6) Gzip-8 100MB/s ± 0% 100MB/s ± 0% ~ (p=0.351 n=5+6) Gunzip-8 687MB/s ± 0% 687MB/s ± 0% ~ (p=0.662 n=5+6) JSONEncode-8 242MB/s ± 0% 242MB/s ± 0% -0.31% (p=0.008 n=5+5) JSONDecode-8 54.9MB/s ± 1% 55.3MB/s ± 0% +0.71% (p=0.008 n=5+5) GoParse-8 19.1MB/s ± 1% 19.3MB/s ± 1% ~ (p=0.143 n=6+6) RegexpMatchEasy0_32-8 578MB/s ± 0% 601MB/s ± 3% +4.10% (p=0.004 n=5+6) RegexpMatchEasy0_1K-8 5.74GB/s ± 1% 5.85GB/s ± 1% +1.90% (p=0.002 n=6+6) RegexpMatchEasy1_32-8 639MB/s ± 0% 663MB/s ± 4% ~ (p=0.082 n=5+6) RegexpMatchEasy1_1K-8 3.78GB/s ± 1% 3.91GB/s ± 1% +3.38% (p=0.004 n=6+5) RegexpMatchMedium_32-8 33.7MB/s ± 0% 36.2MB/s ± 7% ~ (p=0.268 n=5+6) RegexpMatchMedium_1K-8 37.9MB/s ± 6% 36.5MB/s ± 6% ~ (p=0.411 n=6+6) RegexpMatchHard_32-8 24.9MB/s ± 2% 24.8MB/s ± 0% ~ (p=0.063 n=6+6) RegexpMatchHard_1K-8 26.6MB/s ± 0% 26.7MB/s ± 0% +0.25% (p=0.009 n=6+5) Revcomp-8 640MB/s ± 0% 641MB/s ± 0% ~ (p=0.429 n=6+5) Template-8 40.4MB/s ± 1% 40.3MB/s ± 0% ~ (p=0.222 n=5+5) [Geo mean] 175MB/s 
177MB/s +1.05% --- src/cmd/compile/internal/inline/inl.go | 207 +++++++++++++++++++------ test/inline.go | 54 +++++++ test/inline_for.go | 39 +++++ 3 files changed, 255 insertions(+), 45 deletions(-) create mode 100644 test/inline_for.go diff --git a/src/cmd/compile/internal/inline/inl.go b/src/cmd/compile/internal/inline/inl.go index a2268a5465e02..c4b07b1ddcb66 100644 --- a/src/cmd/compile/internal/inline/inl.go +++ b/src/cmd/compile/internal/inline/inl.go @@ -45,14 +45,107 @@ const ( inlineMaxBudget = 80 inlineExtraAppendCost = 0 // default is to inline if there's at most one call. -l=4 overrides this by using 1 instead. - inlineExtraCallCost = 57 // 57 was benchmarked to provided most benefit with no bad surprises; see https://github.com/golang/go/issues/19348#issuecomment-439370742 + inlineExtraCallCost = 57 // 57 was benchmarked to provide most benefit with no bad surprises; see https://github.com/golang/go/issues/19348#issuecomment-439370742 inlineExtraPanicCost = 1 // do not penalize inlining panics. inlineExtraThrowCost = inlineMaxBudget // with current (2018-05/1.11) code, inlining runtime.throw does not help. inlineBigFunctionNodes = 5000 // Functions with this many nodes are considered "big". inlineBigFunctionMaxCost = 20 // Max cost of inlinee when inlining into a "big" function. + + // These values were benchmarked to provide most benefit with no bad surprises. + inlineBigForCost = 105 // FORs with at least this cost are considered "big". + inlineIntoForExtraCallCost = 14 + inlineIntoForExtraBudget = 18 // Extra budget when inlining into FORs which are not "big". + + // The upper budget for a visitor. It accounts the maximum cost with which a function could be inlined. + inlineVisitorBudget = inlineMaxBudget + inlineIntoForExtraBudget ) +// isInlinable checks if the function can be inlined in a 'typical' scenario +// when no boosts are applied. 
+func isInlinable(fn *ir.Func) bool { + return fn != nil && fn.Inl != nil && fn.Inl.Cost <= inlineMaxBudget +} + +type forContext struct { + cost int32 // Helps to determine if FOR is a "big" one. +} + +type inlContext struct { + // Map to keep track of functions that have been inlined at a particular + // call site, in order to stop inlining when we reach the beginning of a + // recursion cycle again. We don't inline immediately recursive functions, + // but allow inlining if there is a recursion cycle of many functions. + // Most likely, the inlining will stop before we even hit the beginning of + // the cycle again, but the map catches the unusual case. + inlinedCallees map[*ir.Func]bool + + // Stack to recognise which call nodes are located inside fors, while doing inlnode. + forsStack []forContext + initialInlineBudget int32 // Initial inline budget. Boosts are calculated related to this. +} + +// Current decision is made on whether all FORs in current scope are not "big". +func (ctx inlContext) canBoostInliningIntoFor() bool { + for i := 0; i < len(ctx.forsStack); i++ { + if ctx.forsStack[i].cost >= inlineBigForCost { + return false + } + } + return len(ctx.forsStack) > 0 +} + +func (ctx *inlContext) Init(fn *ir.Func) { + ctx.inlinedCallees = make(map[*ir.Func]bool) + + if isBigFunc(fn) { + ctx.initialInlineBudget = inlineBigFunctionMaxCost + } else { + ctx.initialInlineBudget = inlineMaxBudget + } +} + +func (ctx *inlContext) PushFor(n ir.Node) { + ctx.forsStack = append(ctx.forsStack, forContext{forCost(n)}) + + if base.Flag.LowerM > 1 { + fmt.Printf("%v: add FOR to stack %v\n", ir.Line(n), ctx.forsStack) + } +} + +func (ctx *inlContext) PopFor() { + ctx.forsStack = ctx.forsStack[:len(ctx.forsStack)-1] +} + +func (ctx inlContext) InlineBudget() int32 { + finalBudget := ctx.initialInlineBudget + if ctx.canBoostInliningIntoFor() && ctx.initialInlineBudget == inlineMaxBudget { + // Boosts only regular functions + finalBudget += inlineIntoForExtraBudget + } + + 
return finalBudget +} + +// forCost calculates the cost of FORs. It is used to determine if functions +// will be boosted to inline into the FOR. +// We don't want to boost inlining into "big" FORs to keep their body +// in the instruction cache. +func forCost(n ir.Node) int32 { + exceededCostReason := func(remainingBudget int32) string { + return fmt.Sprintf("FOR is big: cost %d exceeds maximum cost %d", inlineBigForCost-remainingBudget, inlineBigForCost) + } + + visitor := hairyVisitor{ + budget: inlineBigForCost, + extraCallCost: inlineIntoForExtraCallCost, + onlyCost: true, + exceededCostReasonCallback: exceededCostReason, + } + visitor.tooHairy(n) + return inlineBigForCost - visitor.budget +} + // InlinePackage finds functions that can be inlined and clones them before walk expands them. func InlinePackage() { ir.VisitFuncsBottomUp(typecheck.Target.Decls, func(list []*ir.Func, recursive bool) { @@ -166,9 +259,15 @@ func CanInline(fn *ir.Func) { // locals, and we use this map to produce a pruned Inline.Dcl // list. See issue 25249 for more context. 
+ exceededCostReason := func(remainingBudget int32) string { + return fmt.Sprintf("function too complex: cost %d exceeds budget %d", inlineVisitorBudget-remainingBudget, inlineVisitorBudget) + } + visitor := hairyVisitor{ - budget: inlineMaxBudget, - extraCallCost: cc, + budget: inlineVisitorBudget, + extraCallCost: cc, + onlyCost: false, + exceededCostReasonCallback: exceededCostReason, } if visitor.tooHairy(fn) { reason = visitor.reason @@ -176,7 +275,7 @@ func CanInline(fn *ir.Func) { } n.Func.Inl = &ir.Inline{ - Cost: inlineMaxBudget - visitor.budget, + Cost: inlineVisitorBudget - visitor.budget, Dcl: pruneUnusedAutos(n.Defn.(*ir.Func).Dcl, &visitor), Body: inlcopylist(fn.Body), @@ -184,12 +283,16 @@ func CanInline(fn *ir.Func) { } if base.Flag.LowerM > 1 { - fmt.Printf("%v: can inline %v with cost %d as: %v { %v }\n", ir.Line(fn), n, inlineMaxBudget-visitor.budget, fn.Type(), ir.Nodes(n.Func.Inl.Body)) - } else if base.Flag.LowerM != 0 { + if isInlinable(n.Func) { + fmt.Printf("%v: can inline %v with cost %d as: %v { %v }\n", ir.Line(fn), n, n.Func.Inl.Cost, fn.Type(), ir.Nodes(n.Func.Inl.Body)) + } else { + fmt.Printf("%v: can inline only into small FORs %v with cost %d as: %v { %v }\n", ir.Line(fn), n, n.Func.Inl.Cost, fn.Type(), ir.Nodes(n.Func.Inl.Body)) + } + } else if base.Flag.LowerM != 0 && isInlinable(n.Func) { fmt.Printf("%v: can inline %v\n", ir.Line(fn), n) } if logopt.Enabled() { - logopt.LogOpt(fn.Pos(), "canInlineFunction", "inline", ir.FuncName(fn), fmt.Sprintf("cost: %d", inlineMaxBudget-visitor.budget)) + logopt.LogOpt(fn.Pos(), "canInlineFunction", "inline", ir.FuncName(fn), fmt.Sprintf("cost: %d", n.Func.Inl.Cost)) } } @@ -228,20 +331,22 @@ func canDelayResults(fn *ir.Func) bool { // hairyVisitor visits a function body to determine its inlining // hairiness and whether or not it can be inlined. 
type hairyVisitor struct { - budget int32 - reason string - extraCallCost int32 - usedLocals ir.NameSet - do func(ir.Node) bool + budget int32 + extraCallCost int32 + onlyCost bool // If set, tooHairy does NOT reject non-inlinable nodes; it only computes cost. + reason string + usedLocals ir.NameSet + do func(ir.Node) bool + exceededCostReasonCallback func(remainingBudget int32) string } -func (v *hairyVisitor) tooHairy(fn *ir.Func) bool { +func (v *hairyVisitor) tooHairy(n ir.Node) bool { v.do = v.doNode // cache closure - if ir.DoChildren(fn, v.do) { + if ir.DoChildren(n, v.do) { return true } if v.budget < 0 { - v.reason = fmt.Sprintf("function too complex: cost %d exceeds budget %d", inlineMaxBudget-v.budget, inlineMaxBudget) + v.reason = v.exceededCostReasonCallback(v.budget) return true } return false @@ -264,8 +369,12 @@ func (v *hairyVisitor) doNode(n ir.Node) bool { if name.Class == ir.PFUNC && types.IsRuntimePkg(name.Sym().Pkg) { fn := name.Sym().Name if fn == "getcallerpc" || fn == "getcallersp" { - v.reason = "call to " + fn - return true + if !v.onlyCost { + v.reason = "call to " + fn + return true + } else { + break + } } if fn == "throw" { v.budget -= inlineExtraThrowCost @@ -309,7 +418,7 @@ func (v *hairyVisitor) doNode(n ir.Node) bool { break } - if fn := inlCallee(n.X); fn != nil && fn.Inl != nil { + if fn := inlCallee(n.X); isInlinable(fn) { v.budget -= fn.Inl.Cost break } @@ -338,13 +447,19 @@ func (v *hairyVisitor) doNode(n ir.Node) bool { case ir.ORECOVER: // recover matches the argument frame pointer to find // the right panic value, so it needs an argument frame. 
- v.reason = "call to recover" - return true + if !v.onlyCost { + v.reason = "call to recover" + return true + } case ir.OCLOSURE: if base.Debug.InlFuncsWithClosures == 0 { - v.reason = "not inlining functions with closures" - return true + if !v.onlyCost { + v.reason = "not inlining functions with closures" + return true + } else { + break + } } // TODO(danscales): Maybe make budget proportional to number of closure @@ -355,7 +470,9 @@ func (v *hairyVisitor) doNode(n ir.Node) bool { // do) to check for disallowed ops in the body and include the // body in the budget. if doList(n.(*ir.ClosureExpr).Func.Body, v.do) { - return true + if !v.onlyCost { + return true + } } case ir.ORANGE, @@ -364,8 +481,10 @@ func (v *hairyVisitor) doNode(n ir.Node) bool { ir.ODEFER, ir.ODCLTYPE, // can't print yet ir.OTAILCALL: - v.reason = "unhandled op " + n.Op().String() - return true + if !v.onlyCost { + v.reason = "unhandled op " + n.Op().String() + return true + } case ir.OAPPEND: v.budget -= inlineExtraAppendCost @@ -493,20 +612,13 @@ func inlcopy(n ir.Node) ir.Node { func InlineCalls(fn *ir.Func) { savefn := ir.CurFunc ir.CurFunc = fn - maxCost := int32(inlineMaxBudget) - if isBigFunc(fn) { - maxCost = inlineBigFunctionMaxCost - } - // Map to keep track of functions that have been inlined at a particular - // call site, in order to stop inlining when we reach the beginning of a - // recursion cycle again. We don't inline immediately recursive functions, - // but allow inlining if there is a recursion cycle of many functions. - // Most likely, the inlining will stop before we even hit the beginning of - // the cycle again, but the map catches the unusual case. 
- inlMap := make(map[*ir.Func]bool) + + var inlCtx inlContext + inlCtx.Init(fn) + var edit func(ir.Node) ir.Node edit = func(n ir.Node) ir.Node { - return inlnode(n, maxCost, inlMap, edit) + return inlnode(n, &inlCtx, edit) } ir.EditChildren(fn, edit) ir.CurFunc = savefn @@ -525,11 +637,16 @@ func InlineCalls(fn *ir.Func) { // shorter and less complicated. // The result of inlnode MUST be assigned back to n, e.g. // n.Left = inlnode(n.Left) -func inlnode(n ir.Node, maxCost int32, inlMap map[*ir.Func]bool, edit func(ir.Node) ir.Node) ir.Node { +func inlnode(n ir.Node, ctx *inlContext, edit func(ir.Node) ir.Node) ir.Node { if n == nil { return n } + if n.Op() == ir.OFOR { + ctx.PushFor(n) + defer ctx.PopFor() + } + switch n.Op() { case ir.ODEFER, ir.OGO: n := n.(*ir.GoDeferStmt) @@ -587,7 +704,7 @@ func inlnode(n ir.Node, maxCost int32, inlMap map[*ir.Func]bool, edit func(ir.No break } if fn := inlCallee(call.X); fn != nil && fn.Inl != nil { - n = mkinlcall(call, fn, maxCost, inlMap, edit) + n = mkinlcall(call, fn, ctx, edit) } } @@ -660,7 +777,7 @@ var NewInline = func(call *ir.CallExpr, fn *ir.Func, inlIndex int) *ir.InlinedCa // parameters. // The result of mkinlcall MUST be assigned back to n, e.g. // n.Left = mkinlcall(n.Left, fn, isddd) -func mkinlcall(n *ir.CallExpr, fn *ir.Func, maxCost int32, inlMap map[*ir.Func]bool, edit func(ir.Node) ir.Node) ir.Node { +func mkinlcall(n *ir.CallExpr, fn *ir.Func, ctx *inlContext, edit func(ir.Node) ir.Node) ir.Node { if fn.Inl == nil { if logopt.Enabled() { logopt.LogOpt(n.Pos(), "cannotInlineCall", "inline", ir.FuncName(ir.CurFunc), @@ -668,12 +785,12 @@ func mkinlcall(n *ir.CallExpr, fn *ir.Func, maxCost int32, inlMap map[*ir.Func]b } return n } - if fn.Inl.Cost > maxCost { + if fn.Inl.Cost > ctx.InlineBudget() { // The inlined function body is too big. Typically we use this check to restrict // inlining into very big functions. See issue 26546 and 17566. 
if logopt.Enabled() { logopt.LogOpt(n.Pos(), "cannotInlineCall", "inline", ir.FuncName(ir.CurFunc), - fmt.Sprintf("cost %d of %s exceeds max large caller cost %d", fn.Inl.Cost, ir.PkgFuncName(fn), maxCost)) + fmt.Sprintf("cost %d of %s exceeds max large caller cost %d", fn.Inl.Cost, ir.PkgFuncName(fn), ctx.InlineBudget())) } return n } @@ -696,15 +813,15 @@ func mkinlcall(n *ir.CallExpr, fn *ir.Func, maxCost int32, inlMap map[*ir.Func]b return n } - if inlMap[fn] { + if ctx.inlinedCallees[fn] { if base.Flag.LowerM > 1 { fmt.Printf("%v: cannot inline %v into %v: repeated recursive cycle\n", ir.Line(n), fn, ir.FuncName(ir.CurFunc)) } return n } - inlMap[fn] = true + ctx.inlinedCallees[fn] = true defer func() { - inlMap[fn] = false + ctx.inlinedCallees[fn] = false }() typecheck.FixVariadicCall(n) diff --git a/test/inline.go b/test/inline.go index 599d5233e0f3f..51a9011c0ca13 100644 --- a/test/inline.go +++ b/test/inline.go @@ -290,3 +290,57 @@ func conv2(v uint64) uint64 { // ERROR "can inline conv2" func conv1(v uint64) uint64 { // ERROR "can inline conv1" return uint64(uint64(uint64(uint64(uint64(uint64(uint64(uint64(uint64(uint64(uint64(v))))))))))) } + +// Inline into FORs +func func_with_cost_88() { + x := 200 + for i := 0; i < x; i++ { + if i%2 == 0 { + runtime.GC() + } else { + i += 2 + x += 1 + } + } +} + +func func_with_fors() { + func_with_cost_88() + + for i := 0; i < 100; i++ { + func_with_cost_88() // ERROR "inlining call to func_with_cost_88" + } + + func_with_cost_88() + func_with_cost_88() + + for i := 0; i < 100; i++ { + for j := 0; j < 100; j++ { + func_with_cost_88() // ERROR "inlining call to func_with_cost_88" + } + } + + for i := 0; i < 100; i++ { + for j := 0; j < 100; j++ { + func_with_cost_88() // ERROR "inlining call to func_with_cost_88" + func_with_cost_88() // ERROR "inlining call to func_with_cost_88" + func_with_cost_88() // ERROR "inlining call to func_with_cost_88" + } + } + + for i := 0; i < 100; i++ { + for j := 0; j < 100; j++ { + 
// Calls can't be inlined, since the outer FOR is a "big" one. + func_with_cost_88() + func_with_cost_88() + func_with_cost_88() + func_with_cost_88() + for j := 0; j < 100; j++ { + func_with_cost_88() + func_with_cost_88() + } + } + } + + func_with_cost_88() +} diff --git a/test/inline_for.go b/test/inline_for.go new file mode 100644 index 0000000000000..67778feaf2af0 --- /dev/null +++ b/test/inline_for.go @@ -0,0 +1,39 @@ +// errorcheck -0 -m=2 + +// Copyright 2021 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Test, using compiler diagnostic flags, that inlining is working. +// Compiles but does not run. + +package foo + +import "runtime" + +func func_with() int { // ERROR "can inline func_with .*" + return 10 +} + +func func_with_cost_88() { // ERROR "can inline only into small FORs .*" + x := 200 + for i := 0; i < x; i++ { // ERROR "add FOR to stack \[\{39\}\]" + if i%2 == 0 { + runtime.GC() + } else { + i += 2 + x += 1 + } + } +} + +func func_with_fors() { // ERROR "can inline .*" + for { // ERROR "add FOR to stack \[\{22\}\]" + for { // ERROR "add FOR to stack \[\{22\} \{16\}\]" + func_with_cost_88() // ERROR "inlining call to func_with_cost_88" "add FOR to stack \[\{22\} \{16\} \{39\}\]" + } + for { // ERROR "add FOR to stack" + func_with() // ERROR "inlining call to func_with" + } + } +}