cmd/compile: boost inlining into FORs

As Than McIntosh already mentioned, it is common practice to boost inlining into FORs, since a call site inside a loop could be "hotter". This patch implements this functionality. The implementation uses a stack of FORs to recognise calls which are in a loop. The stack is maintained while inlnode walks the function and contains information about the ancestor FORs of the current node in inlnode. A FOR whose cost is >= inlineBigForCost (51) is considered "big"; in such FORs no boost is applied. Updates golang#17566 The following results were obtained on go1, while the binary size did not increase significantly: 10441232 -> 10465920, which is less than 0.3%. goos: linux goarch: amd64 pkg: test/bench/go1 cpu: Intel(R) Xeon(R) Gold 6230N CPU @ 2.30GHz name old time/op new time/op delta BinaryTree17-8 2.15s ± 1% 2.17s ± 1% +0.86% (p=0.041 n=6+6) Fannkuch11-8 2.70s ± 0% 2.72s ± 0% +0.71% (p=0.002 n=6+6) FmtFprintfEmpty-8 31.9ns ± 0% 31.6ns ± 0% -1.06% (p=0.008 n=5+5) FmtFprintfString-8 57.0ns ± 0% 58.3ns ± 0% +2.26% (p=0.004 n=6+5) FmtFprintfInt-8 65.2ns ± 0% 64.1ns ± 0% -1.65% (p=0.000 n=5+4) FmtFprintfIntInt-8 103ns ± 0% 102ns ± 0% -0.91% (p=0.000 n=5+6) FmtFprintfPrefixedInt-8 119ns ± 0% 118ns ± 0% -0.60% (p=0.008 n=5+5) FmtFprintfFloat-8 169ns ± 0% 171ns ± 0% +1.50% (p=0.004 n=5+6) FmtManyArgs-8 445ns ± 0% 445ns ± 0% ~ (p=0.506 n=6+5) GobDecode-8 4.37ms ± 1% 4.41ms ± 0% +0.79% (p=0.009 n=6+6) GobEncode-8 3.07ms ± 0% 3.05ms ± 0% -0.42% (p=0.004 n=5+6) Gzip-8 195ms ± 0% 194ms ± 0% -0.40% (p=0.009 n=5+6) Gunzip-8 28.2ms ± 0% 28.9ms ± 0% +2.22% (p=0.004 n=5+6) HTTPClientServer-8 45.0µs ± 1% 45.4µs ± 0% +0.97% (p=0.030 n=6+5) JSONEncode-8 8.01ms ± 0% 7.95ms ± 0% -0.78% (p=0.008 n=5+5) JSONDecode-8 35.3ms ± 1% 35.0ms ± 0% -1.04% (p=0.004 n=5+6) Mandelbrot200-8 4.50ms ± 0% 4.50ms ± 0% ~ (p=0.662 n=6+5) GoParse-8 3.03ms ± 1% 2.96ms ± 0% -2.41% (p=0.004 n=6+5) RegexpMatchEasy0_32-8 55.4ns ± 0% 53.8ns ± 0% -2.83% (p=0.004 n=5+6) RegexpMatchEasy0_1K-8 178ns ± 0% 162ns ± 1% -8.76% (p=0.004 n=5+6) RegexpMatchEasy1_32-8 
50.1ns ± 0% 49.6ns ± 0% -0.92% (p=0.004 n=5+6) RegexpMatchEasy1_1K-8 271ns ± 1% 268ns ± 0% -1.15% (p=0.002 n=6+6) RegexpMatchMedium_32-8 949ns ± 0% 862ns ± 0% -9.20% (p=0.008 n=5+5) RegexpMatchMedium_1K-8 27.1µs ± 7% 27.4µs ± 7% ~ (p=0.589 n=6+6) RegexpMatchHard_32-8 1.28µs ± 2% 1.27µs ± 1% ~ (p=0.065 n=6+6) RegexpMatchHard_1K-8 38.5µs ± 0% 38.5µs ± 0% ~ (p=0.132 n=6+6) Revcomp-8 397ms ± 0% 397ms ± 0% ~ (p=1.000 n=6+6) Template-8 48.1ms ± 1% 47.8ms ± 0% -0.48% (p=0.016 n=5+5) TimeParse-8 213ns ± 0% 213ns ± 0% ~ (p=0.467 n=4+6) TimeFormat-8 295ns ± 1% 294ns ± 0% ~ (p=0.554 n=6+5) [Geo mean] 40.5µs 40.2µs -0.81% name old speed new speed delta GobDecode-8 176MB/s ± 1% 174MB/s ± 0% -0.79% (p=0.009 n=6+6) GobEncode-8 250MB/s ± 0% 251MB/s ± 0% +0.42% (p=0.004 n=5+6) Gzip-8 100MB/s ± 0% 100MB/s ± 0% +0.40% (p=0.009 n=5+6) Gunzip-8 687MB/s ± 0% 672MB/s ± 0% -2.17% (p=0.004 n=5+6) JSONEncode-8 242MB/s ± 0% 244MB/s ± 0% +0.78% (p=0.008 n=5+5) JSONDecode-8 54.9MB/s ± 1% 55.5MB/s ± 0% +1.05% (p=0.004 n=5+6) GoParse-8 19.1MB/s ± 1% 19.6MB/s ± 0% +2.48% (p=0.004 n=6+5) RegexpMatchEasy0_32-8 578MB/s ± 0% 594MB/s ± 0% +2.89% (p=0.008 n=5+5) RegexpMatchEasy0_1K-8 5.74GB/s ± 1% 6.31GB/s ± 1% +9.95% (p=0.002 n=6+6) RegexpMatchEasy1_32-8 639MB/s ± 0% 645MB/s ± 0% +0.93% (p=0.004 n=5+6) RegexpMatchEasy1_1K-8 3.78GB/s ± 1% 3.82GB/s ± 0% +1.15% (p=0.002 n=6+6) RegexpMatchMedium_32-8 33.7MB/s ± 0% 37.1MB/s ± 0% +10.15% (p=0.008 n=5+5) RegexpMatchMedium_1K-8 37.9MB/s ± 6% 37.5MB/s ± 7% ~ (p=0.697 n=6+6) RegexpMatchHard_32-8 24.9MB/s ± 2% 25.1MB/s ± 1% ~ (p=0.058 n=6+6) RegexpMatchHard_1K-8 26.6MB/s ± 0% 26.6MB/s ± 0% ~ (p=0.195 n=6+6) Revcomp-8 640MB/s ± 0% 641MB/s ± 0% ~ (p=1.000 n=6+6) Template-8 40.4MB/s ± 1% 40.6MB/s ± 0% +0.47% (p=0.016 n=5+5) [Geo mean] 175MB/s 178MB/s +1.56%
nimelehin · Sep 10, 2021 · d7f3135 · d7f3135
1 parent ab7c904
commit d7f3135
Show file tree

Hide file tree

Showing 3 changed files with 203 additions and 26 deletions.
diff --git a/src/cmd/compile/internal/inline/inl.go b/src/cmd/compile/internal/inline/inl.go
@@ -51,8 +51,103 @@ const (
 
 	inlineBigFunctionNodes   = 5000 // Functions with this many nodes are considered "big".
 	inlineBigFunctionMaxCost = 20   // Max cost of inlinee when inlining into a "big" function.
+
+	inlineBigForCost                    = 51 // FORs with at least this cost are considered "big".
+	inlineForMaxCost                    = 37 // FORs should be cheaper than this to boost inlining into themselves.
+	inlineIntoForExtraCallCost          = 6  // These extra costs were benchmarked to provide the most benefit with no bad surprises.
+	inlineIntoForExtraInlinableCallCost = 10
+	inlineIntoForExtraBudget            = 16 // Extra budget when inlining into FORs which are not "big".
+
+	// The upper budget for a visitor. It accounts for the maximum cost at which a function could still be inlined.
+	inlineVisitorBudget = inlineMaxBudget + inlineIntoForExtraBudget
 )
 
+// isInlinable checks if the function can be inlined in a 'typical' scenario
+// when no boosts are applied.
+func isInlinable(fn *ir.Func) bool {
+	return fn != nil && fn.Inl != nil && fn.Inl.Cost <= inlineMaxBudget
+}
+
+type forContext struct {
+	cost int32
+}
+
+type inlContext struct {
+	// Map to keep track of functions that have been inlined at a particular
+	// call site, in order to stop inlining when we reach the beginning of a
+	// recursion cycle again. We don't inline immediately recursive functions,
+	// but allow inlining if there is a recursion cycle of many functions.
+	// Most likely, the inlining will stop before we even hit the beginning of
+	// the cycle again, but the map catches the unusual case.
+	inlinedCallees map[*ir.Func]bool
+
+	// Stack to recognise which call nodes are located inside fors, while doing inlnode.
+	forsStack           []forContext
+	initialInlineBudget int32 // Initial inline budget; boosts are calculated relative to this.
+}
+
+func (ctx inlContext) canBoostInliningIntoFor() bool {
+	// The decision is based on:
+	//   1) The first FOR in the stack is not "big".
+	//   2) The last FOR's cost should be less than inlineForMaxCost.
+	return len(ctx.forsStack) > 0 && ctx.forsStack[0].cost < inlineBigForCost && ctx.forsStack[len(ctx.forsStack)-1].cost < inlineForMaxCost
+}
+
+func (ctx *inlContext) Init(fn *ir.Func) {
+	ctx.inlinedCallees = make(map[*ir.Func]bool)
+
+	if isBigFunc(fn) {
+		ctx.initialInlineBudget = inlineBigFunctionMaxCost
+	} else {
+		ctx.initialInlineBudget = inlineMaxBudget
+	}
+}
+
+func (ctx *inlContext) PushFor(n ir.Node) {
+	ctx.forsStack = append(ctx.forsStack, forContext{forCost(n)})
+
+	if base.Flag.LowerM > 1 {
+		fmt.Printf("%v: add for to stack %v\n", ir.Line(n), ctx.forsStack)
+	}
+}
+
+func (ctx *inlContext) PopFor() {
+	ctx.forsStack = ctx.forsStack[:len(ctx.forsStack)-1]
+}
+
+func (ctx inlContext) InlineBudget() int32 {
+	finalBudget := ctx.initialInlineBudget
+	if ctx.canBoostInliningIntoFor() && ctx.initialInlineBudget == inlineMaxBudget {
+		// Boosts only regular functions
+		finalBudget += inlineIntoForExtraBudget
+	}
+
+	return finalBudget
+}
+
+func forCost(n ir.Node) int32 {
+	cost := int32(inlineBigForCost)
+	ir.Any(n, func(n ir.Node) bool {
+		cost--
+
+		switch n.Op() {
+		case ir.OCALLFUNC:
+			call := n.(*ir.CallExpr)
+			if ir.IsIntrinsicCall(call) {
+				// Treat like any other node.
+				break
+			}
+
+			cost -= inlineIntoForExtraCallCost
+			if fn := inlCallee(call.X); fn != nil && fn.Inl != nil {
+				cost -= inlineIntoForExtraInlinableCallCost
+			}
+		}
+		return cost < 0
+	})
+	return inlineBigForCost - cost
+}
+
 // InlinePackage finds functions that can be inlined and clones them before walk expands them.
 func InlinePackage() {
 	ir.VisitFuncsBottomUp(typecheck.Target.Decls, func(list []*ir.Func, recursive bool) {
@@ -167,7 +262,7 @@ func CanInline(fn *ir.Func) {
 	// list. See issue 25249 for more context.
 
 	visitor := hairyVisitor{
-		budget:        inlineMaxBudget,
+		budget:        inlineVisitorBudget,
 		extraCallCost: cc,
 	}
 	if visitor.tooHairy(fn) {
@@ -176,20 +271,24 @@ func CanInline(fn *ir.Func) {
 	}
 
 	n.Func.Inl = &ir.Inline{
-		Cost: inlineMaxBudget - visitor.budget,
+		Cost: inlineVisitorBudget - visitor.budget,
 		Dcl:  pruneUnusedAutos(n.Defn.(*ir.Func).Dcl, &visitor),
 		Body: inlcopylist(fn.Body),
 
 		CanDelayResults: canDelayResults(fn),
 	}
 
 	if base.Flag.LowerM > 1 {
-		fmt.Printf("%v: can inline %v with cost %d as: %v { %v }\n", ir.Line(fn), n, inlineMaxBudget-visitor.budget, fn.Type(), ir.Nodes(n.Func.Inl.Body))
-	} else if base.Flag.LowerM != 0 {
+		if isInlinable(n.Func) {
+			fmt.Printf("%v: can inline %v with cost %d as: %v { %v }\n", ir.Line(fn), n, n.Func.Inl.Cost, fn.Type(), ir.Nodes(n.Func.Inl.Body))
+		} else {
+			fmt.Printf("%v: can inline only into small FORs %v with cost %d as: %v { %v }\n", ir.Line(fn), n, n.Func.Inl.Cost, fn.Type(), ir.Nodes(n.Func.Inl.Body))
+		}
+	} else if base.Flag.LowerM != 0 && isInlinable(n.Func) {
 		fmt.Printf("%v: can inline %v\n", ir.Line(fn), n)
 	}
 	if logopt.Enabled() {
-		logopt.LogOpt(fn.Pos(), "canInlineFunction", "inline", ir.FuncName(fn), fmt.Sprintf("cost: %d", inlineMaxBudget-visitor.budget))
+		logopt.LogOpt(fn.Pos(), "canInlineFunction", "inline", ir.FuncName(fn), fmt.Sprintf("cost: %d", n.Func.Inl.Cost))
 	}
 }
 
@@ -241,7 +340,7 @@ func (v *hairyVisitor) tooHairy(fn *ir.Func) bool {
 		return true
 	}
 	if v.budget < 0 {
-		v.reason = fmt.Sprintf("function too complex: cost %d exceeds budget %d", inlineMaxBudget-v.budget, inlineMaxBudget)
+		v.reason = fmt.Sprintf("function too complex: cost %d exceeds budget %d", inlineVisitorBudget-v.budget, inlineVisitorBudget)
 		return true
 	}
 	return false
@@ -493,20 +592,13 @@ func inlcopy(n ir.Node) ir.Node {
 func InlineCalls(fn *ir.Func) {
 	savefn := ir.CurFunc
 	ir.CurFunc = fn
-	maxCost := int32(inlineMaxBudget)
-	if isBigFunc(fn) {
-		maxCost = inlineBigFunctionMaxCost
-	}
-	// Map to keep track of functions that have been inlined at a particular
-	// call site, in order to stop inlining when we reach the beginning of a
-	// recursion cycle again. We don't inline immediately recursive functions,
-	// but allow inlining if there is a recursion cycle of many functions.
-	// Most likely, the inlining will stop before we even hit the beginning of
-	// the cycle again, but the map catches the unusual case.
-	inlMap := make(map[*ir.Func]bool)
+
+	var inlCtx inlContext
+	inlCtx.Init(fn)
+
 	var edit func(ir.Node) ir.Node
 	edit = func(n ir.Node) ir.Node {
-		return inlnode(n, maxCost, inlMap, edit)
+		return inlnode(n, &inlCtx, edit)
 	}
 	ir.EditChildren(fn, edit)
 	ir.CurFunc = savefn
@@ -525,11 +617,16 @@ func InlineCalls(fn *ir.Func) {
 // shorter and less complicated.
 // The result of inlnode MUST be assigned back to n, e.g.
 // 	n.Left = inlnode(n.Left)
-func inlnode(n ir.Node, maxCost int32, inlMap map[*ir.Func]bool, edit func(ir.Node) ir.Node) ir.Node {
+func inlnode(n ir.Node, ctx *inlContext, edit func(ir.Node) ir.Node) ir.Node {
 	if n == nil {
 		return n
 	}
 
+	if n.Op() == ir.OFOR {
+		ctx.PushFor(n)
+		defer ctx.PopFor()
+	}
+
 	switch n.Op() {
 	case ir.ODEFER, ir.OGO:
 		n := n.(*ir.GoDeferStmt)
@@ -584,7 +681,7 @@ func inlnode(n ir.Node, maxCost int32, inlMap map[*ir.Func]bool, edit func(ir.No
 			break
 		}
 		if fn := inlCallee(call.X); fn != nil && fn.Inl != nil {
-			n = mkinlcall(call, fn, maxCost, inlMap, edit)
+			n = mkinlcall(call, fn, ctx, edit)
 		}
 	}
 
@@ -657,20 +754,20 @@ var NewInline = func(call *ir.CallExpr, fn *ir.Func, inlIndex int) *ir.InlinedCa
 // parameters.
 // The result of mkinlcall MUST be assigned back to n, e.g.
 // 	n.Left = mkinlcall(n.Left, fn, isddd)
-func mkinlcall(n *ir.CallExpr, fn *ir.Func, maxCost int32, inlMap map[*ir.Func]bool, edit func(ir.Node) ir.Node) ir.Node {
+func mkinlcall(n *ir.CallExpr, fn *ir.Func, ctx *inlContext, edit func(ir.Node) ir.Node) ir.Node {
 	if fn.Inl == nil {
 		if logopt.Enabled() {
 			logopt.LogOpt(n.Pos(), "cannotInlineCall", "inline", ir.FuncName(ir.CurFunc),
 				fmt.Sprintf("%s cannot be inlined", ir.PkgFuncName(fn)))
 		}
 		return n
 	}
-	if fn.Inl.Cost > maxCost {
+	if fn.Inl.Cost > ctx.InlineBudget() {
 		// The inlined function body is too big. Typically we use this check to restrict
 		// inlining into very big functions.  See issue 26546 and 17566.
 		if logopt.Enabled() {
 			logopt.LogOpt(n.Pos(), "cannotInlineCall", "inline", ir.FuncName(ir.CurFunc),
-				fmt.Sprintf("cost %d of %s exceeds max large caller cost %d", fn.Inl.Cost, ir.PkgFuncName(fn), maxCost))
+				fmt.Sprintf("cost %d of %s exceeds max large caller cost %d", fn.Inl.Cost, ir.PkgFuncName(fn), ctx.InlineBudget()))
 		}
 		return n
 	}
@@ -693,15 +790,15 @@ func mkinlcall(n *ir.CallExpr, fn *ir.Func, maxCost int32, inlMap map[*ir.Func]b
 		return n
 	}
 
-	if inlMap[fn] {
+	if ctx.inlinedCallees[fn] {
 		if base.Flag.LowerM > 1 {
 			fmt.Printf("%v: cannot inline %v into %v: repeated recursive cycle\n", ir.Line(n), fn, ir.FuncName(ir.CurFunc))
 		}
 		return n
 	}
-	inlMap[fn] = true
+	ctx.inlinedCallees[fn] = true
 	defer func() {
-		inlMap[fn] = false
+		ctx.inlinedCallees[fn] = false
 	}()
 
 	typecheck.FixVariadicCall(n)

diff --git a/test/inline.go b/test/inline.go
@@ -292,3 +292,44 @@ func conv2(v uint64) uint64 { // ERROR "can inline conv2"
 func conv1(v uint64) uint64 { // ERROR "can inline conv1"
 	return uint64(uint64(uint64(uint64(uint64(uint64(uint64(uint64(uint64(uint64(uint64(v)))))))))))
 }
+
+// Inline into FORs
+func func_with_cost_88() {
+	x := 200
+	for i := 0; i < x; i++ {
+		if i%2 == 0 {
+			runtime.GC()
+		} else {
+			i += 2
+			x += 1
+		}
+	}
+}
+
+func func_with_fors() {
+	func_with_cost_88()
+
+	for i := 0; i < 100; i++ {
+		func_with_cost_88() // ERROR "inlining call to func_with_cost_88"
+	}
+
+	func_with_cost_88()
+	func_with_cost_88()
+
+	for i := 0; i < 100; i++ {
+		for j := 0; j < 100; j++ {
+			func_with_cost_88() // ERROR "inlining call to func_with_cost_88"
+		}
+	}
+
+	for i := 0; i < 100; i++ {
+		for j := 0; j < 100; j++ {
+			// None of these calls can be inlined, since the FOR is too big.
+			func_with_cost_88()
+			func_with_cost_88()
+			func_with_cost_88()
+		}
+	}
+
+	func_with_cost_88()
+}
diff --git a/test/inline_for.go b/test/inline_for.go
@@ -0,0 +1,39 @@
+// errorcheck -0 -m=2
+
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Test, using compiler diagnostic flags, that inlining is working.
+// Compiles but does not run.
+
+package foo
+
+import "runtime"
+
+func func_with() int { // ERROR "can inline func_with .*"
+	return 10
+}
+
+func func_with_cost_88() { // ERROR "can inline only into small FORs .*"
+	x := 200
+	for i := 0; i < x; i++ { // ERROR "add for to stack \[\{32\}\]"
+		if i%2 == 0 {
+			runtime.GC()
+		} else {
+			i += 2
+			x += 1
+		}
+	}
+}
+
+func func_with_fors() { // ERROR "cannot inline .*"
+	for { // ERROR "add for to stack \[\{39\}\]"
+		for { // ERROR "add for to stack \[\{39\} \{19\}\]"
+			func_with_cost_88() // ERROR "inlining call to func_with_cost_88" "add for to stack \[\{39\} \{19\} \{32\}\]"
+		}
+		for { // ERROR "add for to stack"
+			func_with() // ERROR "inlining call to func_with"
+		}
+	}
+}