Skip to content

Commit

Permalink
cmd/compile: boost inlining into FORs
Browse files Browse the repository at this point in the history
As Than McIntosh already mentioned, it is common practice to boost
inlining into FORs, since a call site inside a loop could be "hotter".
This patch implements this functionality.

The implementation uses a stack of FORs to recognise calls which are
in a loop. The stack is maintained while inlnode walks the function
and contains information about the ancestor FORs of the current
node in inlnode.

A FOR whose cost is >= inlineBigForCost (105) is considered "big". No boost
is applied to calls inside such FORs.

Updates golang#17566

The following results were obtained on GO1; the binary size did not increase
significantly: 10454800 -> 10475120, which is less than 0.3%.

goos: linux
goarch: amd64
pkg: test/bench/go1
cpu: Intel(R) Xeon(R) Gold 6230N CPU @ 2.30GHz
name                     old time/op    new time/op    delta
BinaryTree17-8              2.15s ± 1%     2.17s ± 1%     ~     (p=0.065 n=6+6)
Fannkuch11-8                2.70s ± 0%     2.69s ± 0%   -0.25%  (p=0.010 n=6+4)
FmtFprintfEmpty-8          31.9ns ± 0%    31.4ns ± 0%   -1.61%  (p=0.008 n=5+5)
FmtFprintfString-8         57.0ns ± 0%    57.1ns ± 0%   +0.26%  (p=0.013 n=6+5)
FmtFprintfInt-8            65.2ns ± 0%    63.9ns ± 0%   -1.95%  (p=0.008 n=5+5)
FmtFprintfIntInt-8          103ns ± 0%     102ns ± 0%   -1.01%  (p=0.000 n=5+4)
FmtFprintfPrefixedInt-8     119ns ± 0%     118ns ± 0%   -0.50%  (p=0.008 n=5+5)
FmtFprintfFloat-8           169ns ± 0%     174ns ± 0%   +2.75%  (p=0.008 n=5+5)
FmtManyArgs-8               445ns ± 0%     447ns ± 0%   +0.46%  (p=0.002 n=6+6)
GobDecode-8                4.37ms ± 1%    4.40ms ± 0%   +0.62%  (p=0.009 n=6+6)
GobEncode-8                3.07ms ± 0%    3.04ms ± 0%   -0.78%  (p=0.004 n=5+6)
Gzip-8                      195ms ± 0%     195ms ± 0%     ~     (p=0.429 n=5+6)
Gunzip-8                   28.2ms ± 0%    28.2ms ± 0%     ~     (p=0.662 n=5+6)
HTTPClientServer-8         45.0µs ± 1%    45.4µs ± 1%     ~     (p=0.093 n=6+6)
JSONEncode-8               8.01ms ± 0%    8.03ms ± 0%   +0.31%  (p=0.008 n=5+5)
JSONDecode-8               35.3ms ± 1%    35.1ms ± 0%   -0.72%  (p=0.008 n=5+5)
Mandelbrot200-8            4.50ms ± 0%    4.49ms ± 1%     ~     (p=0.937 n=6+6)
GoParse-8                  3.03ms ± 1%    3.00ms ± 1%     ~     (p=0.180 n=6+6)
RegexpMatchEasy0_32-8      55.4ns ± 0%    53.2ns ± 3%   -3.92%  (p=0.004 n=5+6)
RegexpMatchEasy0_1K-8       178ns ± 0%     175ns ± 1%   -1.57%  (p=0.004 n=5+6)
RegexpMatchEasy1_32-8      50.1ns ± 0%    48.3ns ± 5%     ~     (p=0.082 n=5+6)
RegexpMatchEasy1_1K-8       271ns ± 1%     262ns ± 1%   -3.26%  (p=0.004 n=6+5)
RegexpMatchMedium_32-8      949ns ± 0%     886ns ± 7%     ~     (p=0.329 n=5+6)
RegexpMatchMedium_1K-8     27.1µs ± 7%    28.1µs ± 6%     ~     (p=0.394 n=6+6)
RegexpMatchHard_32-8       1.28µs ± 2%    1.29µs ± 0%     ~     (p=0.056 n=6+6)
RegexpMatchHard_1K-8       38.5µs ± 0%    38.4µs ± 0%   -0.25%  (p=0.009 n=6+5)
Revcomp-8                   397ms ± 0%     396ms ± 0%     ~     (p=0.429 n=6+5)
Template-8                 48.1ms ± 1%    48.1ms ± 0%     ~     (p=0.222 n=5+5)
TimeParse-8                 213ns ± 0%     213ns ± 0%     ~     (p=0.210 n=4+6)
TimeFormat-8                295ns ± 1%     259ns ± 0%  -12.22%  (p=0.002 n=6+6)
[Geo mean]                 40.5µs         40.1µs        -1.00%

name                     old speed      new speed      delta
GobDecode-8               176MB/s ± 1%   174MB/s ± 0%   -0.61%  (p=0.009 n=6+6)
GobEncode-8               250MB/s ± 0%   252MB/s ± 0%   +0.79%  (p=0.004 n=5+6)
Gzip-8                    100MB/s ± 0%   100MB/s ± 0%     ~     (p=0.351 n=5+6)
Gunzip-8                  687MB/s ± 0%   687MB/s ± 0%     ~     (p=0.662 n=5+6)
JSONEncode-8              242MB/s ± 0%   242MB/s ± 0%   -0.31%  (p=0.008 n=5+5)
JSONDecode-8             54.9MB/s ± 1%  55.3MB/s ± 0%   +0.71%  (p=0.008 n=5+5)
GoParse-8                19.1MB/s ± 1%  19.3MB/s ± 1%     ~     (p=0.143 n=6+6)
RegexpMatchEasy0_32-8     578MB/s ± 0%   601MB/s ± 3%   +4.10%  (p=0.004 n=5+6)
RegexpMatchEasy0_1K-8    5.74GB/s ± 1%  5.85GB/s ± 1%   +1.90%  (p=0.002 n=6+6)
RegexpMatchEasy1_32-8     639MB/s ± 0%   663MB/s ± 4%     ~     (p=0.082 n=5+6)
RegexpMatchEasy1_1K-8    3.78GB/s ± 1%  3.91GB/s ± 1%   +3.38%  (p=0.004 n=6+5)
RegexpMatchMedium_32-8   33.7MB/s ± 0%  36.2MB/s ± 7%     ~     (p=0.268 n=5+6)
RegexpMatchMedium_1K-8   37.9MB/s ± 6%  36.5MB/s ± 6%     ~     (p=0.411 n=6+6)
RegexpMatchHard_32-8     24.9MB/s ± 2%  24.8MB/s ± 0%     ~     (p=0.063 n=6+6)
RegexpMatchHard_1K-8     26.6MB/s ± 0%  26.7MB/s ± 0%   +0.25%  (p=0.009 n=6+5)
Revcomp-8                 640MB/s ± 0%   641MB/s ± 0%     ~     (p=0.429 n=6+5)
Template-8               40.4MB/s ± 1%  40.3MB/s ± 0%     ~     (p=0.222 n=5+5)
[Geo mean]                175MB/s        177MB/s        +1.05%
  • Loading branch information
nimelehin committed Oct 19, 2021
1 parent bde0463 commit d68cada
Show file tree
Hide file tree
Showing 3 changed files with 255 additions and 45 deletions.
207 changes: 162 additions & 45 deletions src/cmd/compile/internal/inline/inl.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,14 +45,107 @@ const (
inlineMaxBudget = 80
inlineExtraAppendCost = 0
// default is to inline if there's at most one call. -l=4 overrides this by using 1 instead.
inlineExtraCallCost = 57 // 57 was benchmarked to provided most benefit with no bad surprises; see https://github.com/golang/go/issues/19348#issuecomment-439370742
inlineExtraCallCost = 57 // 57 was benchmarked to provide most benefit with no bad surprises; see https://github.com/golang/go/issues/19348#issuecomment-439370742
inlineExtraPanicCost = 1 // do not penalize inlining panics.
inlineExtraThrowCost = inlineMaxBudget // with current (2018-05/1.11) code, inlining runtime.throw does not help.

inlineBigFunctionNodes = 5000 // Functions with this many nodes are considered "big".
inlineBigFunctionMaxCost = 20 // Max cost of inlinee when inlining into a "big" function.

// These values were benchmarked to provide most benefit with no bad surprises.
inlineBigForCost = 105 // FORs with at least this cost are considered "big".
inlineIntoForExtraCallCost = 14
inlineIntoForExtraBudget = 18 // Extra budget when inlining into FORs which are not "big".

// The upper budget for a visitor. It accounts the maximum cost with which a function could be inlined.
inlineVisitorBudget = inlineMaxBudget + inlineIntoForExtraBudget
)

// isInlinable reports whether fn can be inlined in the 'typical' scenario,
// that is, without any boost applied: fn must carry inline information and
// its recorded cost must not exceed inlineMaxBudget.
func isInlinable(fn *ir.Func) bool {
	// Nothing to inline without a function or without its inline data.
	if fn == nil || fn.Inl == nil {
		return false
	}
	return fn.Inl.Cost <= inlineMaxBudget
}

// forContext describes a single FOR kept on inlContext.forsStack.
type forContext struct {
cost int32 // Cost of the FOR as computed by forCost; used to determine if the FOR is a "big" one.
}

// inlContext holds the state passed to inlnode and mkinlcall while the calls
// of a function are being inlined.
type inlContext struct {
// Map to keep track of functions that have been inlined at a particular
// call site, in order to stop inlining when we reach the beginning of a
// recursion cycle again. We don't inline immediately recursive functions,
// but allow inlining if there is a recursion cycle of many functions.
// Most likely, the inlining will stop before we even hit the beginning of
// the cycle again, but the map catches the unusual case.
inlinedCallees map[*ir.Func]bool

// Stack of the FORs pushed by inlnode and not yet popped. It is used to
// recognise which call nodes are located inside FORs.
forsStack []forContext
initialInlineBudget int32 // Initial inline budget. Boosts are calculated relative to this.
}

// canBoostInliningIntoFor reports whether inlining may be boosted at the
// current position. The boost is allowed only when there is at least one FOR
// on the stack and none of the FORs on the stack is "big", i.e. has a cost of
// inlineBigForCost or more.
func (ctx inlContext) canBoostInliningIntoFor() bool {
	// Outside of any FOR there is nothing to boost.
	if len(ctx.forsStack) == 0 {
		return false
	}
	for _, loop := range ctx.forsStack {
		if loop.cost >= inlineBigForCost {
			return false
		}
	}
	return true
}

// Init prepares ctx for inlining calls inside fn: it allocates the map of
// inlined callees and picks the initial inline budget, which is
// inlineBigFunctionMaxCost when isBigFunc(fn) holds and inlineMaxBudget
// otherwise.
func (ctx *inlContext) Init(fn *ir.Func) {
	ctx.inlinedCallees = make(map[*ir.Func]bool)

	budget := int32(inlineMaxBudget)
	if isBigFunc(fn) {
		budget = inlineBigFunctionMaxCost
	}
	ctx.initialInlineBudget = budget
}

// PushFor computes the cost of the FOR node n with forCost and pushes it onto
// the stack of FORs. With -m given more than once, the updated stack is
// printed for debugging.
func (ctx *inlContext) PushFor(n ir.Node) {
	entry := forContext{cost: forCost(n)}
	ctx.forsStack = append(ctx.forsStack, entry)

	if base.Flag.LowerM <= 1 {
		return
	}
	fmt.Printf("%v: add FOR to stack %v\n", ir.Line(n), ctx.forsStack)
}

// PopFor removes the most recently pushed FOR from the stack.
// The stack must not be empty.
func (ctx *inlContext) PopFor() {
	top := len(ctx.forsStack) - 1
	ctx.forsStack = ctx.forsStack[:top]
}

// InlineBudget returns the maximum cost a callee may have to be inlined at
// the current position. It is the initial budget, increased by
// inlineIntoForExtraBudget when inlining into FORs can be boosted.
func (ctx inlContext) InlineBudget() int32 {
	// Only regular functions are boosted: a budget other than
	// inlineMaxBudget is returned unchanged.
	if ctx.initialInlineBudget != inlineMaxBudget {
		return ctx.initialInlineBudget
	}
	if !ctx.canBoostInliningIntoFor() {
		return ctx.initialInlineBudget
	}
	return ctx.initialInlineBudget + inlineIntoForExtraBudget
}

// forCost returns the cost of the FOR node n. The result is used to decide
// whether inlining of functions into the FOR will be boosted.
// Inlining into "big" FORs is not boosted, to keep their body
// in the instruction cache.
func forCost(n ir.Node) int32 {
	// The visitor starts with a budget of inlineBigForCost and runs in
	// cost-only mode; whatever budget it has used up is the cost of the FOR.
	v := hairyVisitor{
		budget:        inlineBigForCost,
		extraCallCost: inlineIntoForExtraCallCost,
		onlyCost:      true,
		exceededCostReasonCallback: func(remainingBudget int32) string {
			return fmt.Sprintf("FOR is big: cost %d exceeds maximum cost %d", inlineBigForCost-remainingBudget, inlineBigForCost)
		},
	}
	// Only the remaining budget matters here; the verdict is ignored.
	v.tooHairy(n)

	spent := inlineBigForCost - v.budget
	return spent
}

// InlinePackage finds functions that can be inlined and clones them before walk expands them.
func InlinePackage() {
ir.VisitFuncsBottomUp(typecheck.Target.Decls, func(list []*ir.Func, recursive bool) {
Expand Down Expand Up @@ -166,30 +259,40 @@ func CanInline(fn *ir.Func) {
// locals, and we use this map to produce a pruned Inline.Dcl
// list. See issue 25249 for more context.

exceededCostReason := func(remainingBudget int32) string {
return fmt.Sprintf("function too complex: cost %d exceeds budget %d", inlineVisitorBudget-remainingBudget, inlineVisitorBudget)
}

visitor := hairyVisitor{
budget: inlineMaxBudget,
extraCallCost: cc,
budget: inlineVisitorBudget,
extraCallCost: cc,
onlyCost: false,
exceededCostReasonCallback: exceededCostReason,
}
if visitor.tooHairy(fn) {
reason = visitor.reason
return
}

n.Func.Inl = &ir.Inline{
Cost: inlineMaxBudget - visitor.budget,
Cost: inlineVisitorBudget - visitor.budget,
Dcl: pruneUnusedAutos(n.Defn.(*ir.Func).Dcl, &visitor),
Body: inlcopylist(fn.Body),

CanDelayResults: canDelayResults(fn),
}

if base.Flag.LowerM > 1 {
fmt.Printf("%v: can inline %v with cost %d as: %v { %v }\n", ir.Line(fn), n, inlineMaxBudget-visitor.budget, fn.Type(), ir.Nodes(n.Func.Inl.Body))
} else if base.Flag.LowerM != 0 {
if isInlinable(n.Func) {
fmt.Printf("%v: can inline %v with cost %d as: %v { %v }\n", ir.Line(fn), n, n.Func.Inl.Cost, fn.Type(), ir.Nodes(n.Func.Inl.Body))
} else {
fmt.Printf("%v: can inline only into small FORs %v with cost %d as: %v { %v }\n", ir.Line(fn), n, n.Func.Inl.Cost, fn.Type(), ir.Nodes(n.Func.Inl.Body))
}
} else if base.Flag.LowerM != 0 && isInlinable(n.Func) {
fmt.Printf("%v: can inline %v\n", ir.Line(fn), n)
}
if logopt.Enabled() {
logopt.LogOpt(fn.Pos(), "canInlineFunction", "inline", ir.FuncName(fn), fmt.Sprintf("cost: %d", inlineMaxBudget-visitor.budget))
logopt.LogOpt(fn.Pos(), "canInlineFunction", "inline", ir.FuncName(fn), fmt.Sprintf("cost: %d", n.Func.Inl.Cost))
}
}

Expand Down Expand Up @@ -228,20 +331,22 @@ func canDelayResults(fn *ir.Func) bool {
// hairyVisitor visits a function body to determine its inlining
// hairiness and whether or not it can be inlined.
type hairyVisitor struct {
budget int32
reason string
extraCallCost int32
usedLocals ir.NameSet
do func(ir.Node) bool
budget int32
extraCallCost int32
onlyCost bool // If set, tooHairy does NOT check inlinible nodes, only cost.
reason string
usedLocals ir.NameSet
do func(ir.Node) bool
exceededCostReasonCallback func(remainingBudget int32) string
}

func (v *hairyVisitor) tooHairy(fn *ir.Func) bool {
func (v *hairyVisitor) tooHairy(n ir.Node) bool {
v.do = v.doNode // cache closure
if ir.DoChildren(fn, v.do) {
if ir.DoChildren(n, v.do) {
return true
}
if v.budget < 0 {
v.reason = fmt.Sprintf("function too complex: cost %d exceeds budget %d", inlineMaxBudget-v.budget, inlineMaxBudget)
v.reason = v.exceededCostReasonCallback(v.budget)
return true
}
return false
Expand All @@ -264,8 +369,12 @@ func (v *hairyVisitor) doNode(n ir.Node) bool {
if name.Class == ir.PFUNC && types.IsRuntimePkg(name.Sym().Pkg) {
fn := name.Sym().Name
if fn == "getcallerpc" || fn == "getcallersp" {
v.reason = "call to " + fn
return true
if !v.onlyCost {
v.reason = "call to " + fn
return true
} else {
break
}
}
if fn == "throw" {
v.budget -= inlineExtraThrowCost
Expand Down Expand Up @@ -309,7 +418,7 @@ func (v *hairyVisitor) doNode(n ir.Node) bool {
break
}

if fn := inlCallee(n.X); fn != nil && fn.Inl != nil {
if fn := inlCallee(n.X); isInlinable(fn) {
v.budget -= fn.Inl.Cost
break
}
Expand Down Expand Up @@ -338,13 +447,19 @@ func (v *hairyVisitor) doNode(n ir.Node) bool {
case ir.ORECOVER:
// recover matches the argument frame pointer to find
// the right panic value, so it needs an argument frame.
v.reason = "call to recover"
return true
if !v.onlyCost {
v.reason = "call to recover"
return true
}

case ir.OCLOSURE:
if base.Debug.InlFuncsWithClosures == 0 {
v.reason = "not inlining functions with closures"
return true
if !v.onlyCost {
v.reason = "not inlining functions with closures"
return true
} else {
break
}
}

// TODO(danscales): Maybe make budget proportional to number of closure
Expand All @@ -355,7 +470,9 @@ func (v *hairyVisitor) doNode(n ir.Node) bool {
// do) to check for disallowed ops in the body and include the
// body in the budget.
if doList(n.(*ir.ClosureExpr).Func.Body, v.do) {
return true
if !v.onlyCost {
return true
}
}

case ir.ORANGE,
Expand All @@ -364,8 +481,10 @@ func (v *hairyVisitor) doNode(n ir.Node) bool {
ir.ODEFER,
ir.ODCLTYPE, // can't print yet
ir.OTAILCALL:
v.reason = "unhandled op " + n.Op().String()
return true
if !v.onlyCost {
v.reason = "unhandled op " + n.Op().String()
return true
}

case ir.OAPPEND:
v.budget -= inlineExtraAppendCost
Expand Down Expand Up @@ -493,20 +612,13 @@ func inlcopy(n ir.Node) ir.Node {
func InlineCalls(fn *ir.Func) {
savefn := ir.CurFunc
ir.CurFunc = fn
maxCost := int32(inlineMaxBudget)
if isBigFunc(fn) {
maxCost = inlineBigFunctionMaxCost
}
// Map to keep track of functions that have been inlined at a particular
// call site, in order to stop inlining when we reach the beginning of a
// recursion cycle again. We don't inline immediately recursive functions,
// but allow inlining if there is a recursion cycle of many functions.
// Most likely, the inlining will stop before we even hit the beginning of
// the cycle again, but the map catches the unusual case.
inlMap := make(map[*ir.Func]bool)

var inlCtx inlContext
inlCtx.Init(fn)

var edit func(ir.Node) ir.Node
edit = func(n ir.Node) ir.Node {
return inlnode(n, maxCost, inlMap, edit)
return inlnode(n, &inlCtx, edit)
}
ir.EditChildren(fn, edit)
ir.CurFunc = savefn
Expand All @@ -525,11 +637,16 @@ func InlineCalls(fn *ir.Func) {
// shorter and less complicated.
// The result of inlnode MUST be assigned back to n, e.g.
// n.Left = inlnode(n.Left)
func inlnode(n ir.Node, maxCost int32, inlMap map[*ir.Func]bool, edit func(ir.Node) ir.Node) ir.Node {
func inlnode(n ir.Node, ctx *inlContext, edit func(ir.Node) ir.Node) ir.Node {
if n == nil {
return n
}

if n.Op() == ir.OFOR {
ctx.PushFor(n)
defer ctx.PopFor()
}

switch n.Op() {
case ir.ODEFER, ir.OGO:
n := n.(*ir.GoDeferStmt)
Expand Down Expand Up @@ -587,7 +704,7 @@ func inlnode(n ir.Node, maxCost int32, inlMap map[*ir.Func]bool, edit func(ir.No
break
}
if fn := inlCallee(call.X); fn != nil && fn.Inl != nil {
n = mkinlcall(call, fn, maxCost, inlMap, edit)
n = mkinlcall(call, fn, ctx, edit)
}
}

Expand Down Expand Up @@ -660,20 +777,20 @@ var NewInline = func(call *ir.CallExpr, fn *ir.Func, inlIndex int) *ir.InlinedCa
// parameters.
// The result of mkinlcall MUST be assigned back to n, e.g.
// n.Left = mkinlcall(n.Left, fn, isddd)
func mkinlcall(n *ir.CallExpr, fn *ir.Func, maxCost int32, inlMap map[*ir.Func]bool, edit func(ir.Node) ir.Node) ir.Node {
func mkinlcall(n *ir.CallExpr, fn *ir.Func, ctx *inlContext, edit func(ir.Node) ir.Node) ir.Node {
if fn.Inl == nil {
if logopt.Enabled() {
logopt.LogOpt(n.Pos(), "cannotInlineCall", "inline", ir.FuncName(ir.CurFunc),
fmt.Sprintf("%s cannot be inlined", ir.PkgFuncName(fn)))
}
return n
}
if fn.Inl.Cost > maxCost {
if fn.Inl.Cost > ctx.InlineBudget() {
// The inlined function body is too big. Typically we use this check to restrict
// inlining into very big functions. See issue 26546 and 17566.
if logopt.Enabled() {
logopt.LogOpt(n.Pos(), "cannotInlineCall", "inline", ir.FuncName(ir.CurFunc),
fmt.Sprintf("cost %d of %s exceeds max large caller cost %d", fn.Inl.Cost, ir.PkgFuncName(fn), maxCost))
fmt.Sprintf("cost %d of %s exceeds max large caller cost %d", fn.Inl.Cost, ir.PkgFuncName(fn), ctx.InlineBudget()))
}
return n
}
Expand All @@ -696,15 +813,15 @@ func mkinlcall(n *ir.CallExpr, fn *ir.Func, maxCost int32, inlMap map[*ir.Func]b
return n
}

if inlMap[fn] {
if ctx.inlinedCallees[fn] {
if base.Flag.LowerM > 1 {
fmt.Printf("%v: cannot inline %v into %v: repeated recursive cycle\n", ir.Line(n), fn, ir.FuncName(ir.CurFunc))
}
return n
}
inlMap[fn] = true
ctx.inlinedCallees[fn] = true
defer func() {
inlMap[fn] = false
ctx.inlinedCallees[fn] = false
}()

typecheck.FixVariadicCall(n)
Expand Down
Loading

0 comments on commit d68cada

Please sign in to comment.