cmd/compile: boost inlining into FORs

As Than McIntosh has already mentioned, it is common practice to boost inlining into FORs, since a call site inside a loop could be "hotter". This patch implements this functionality.

The implementation uses a stack of FORs to recognize calls which are located in a loop. The stack is maintained while the inlnode function runs and contains information about the FORs that are ancestors of the node currently being processed by inlnode.

A FOR whose cost is >= inlineBigForCost (105) is considered "big". In such FORs no boost is applied.

Updates golang#17566

The following results were obtained on the go1 benchmarks. Binary size did not increase significantly: 10454800 -> 10475120, which is less than 0.3%.

goos: linux
goarch: amd64
pkg: test/bench/go1
cpu: Intel(R) Xeon(R) Gold 6230N CPU @ 2.30GHz

name old time/op new time/op delta
BinaryTree17-8 2.15s ± 1% 2.17s ± 1% ~ (p=0.065 n=6+6)
Fannkuch11-8 2.70s ± 0% 2.69s ± 0% -0.25% (p=0.010 n=6+4)
FmtFprintfEmpty-8 31.9ns ± 0% 31.4ns ± 0% -1.61% (p=0.008 n=5+5)
FmtFprintfString-8 57.0ns ± 0% 57.1ns ± 0% +0.26% (p=0.013 n=6+5)
FmtFprintfInt-8 65.2ns ± 0% 63.9ns ± 0% -1.95% (p=0.008 n=5+5)
FmtFprintfIntInt-8 103ns ± 0% 102ns ± 0% -1.01% (p=0.000 n=5+4)
FmtFprintfPrefixedInt-8 119ns ± 0% 118ns ± 0% -0.50% (p=0.008 n=5+5)
FmtFprintfFloat-8 169ns ± 0% 174ns ± 0% +2.75% (p=0.008 n=5+5)
FmtManyArgs-8 445ns ± 0% 447ns ± 0% +0.46% (p=0.002 n=6+6)
GobDecode-8 4.37ms ± 1% 4.40ms ± 0% +0.62% (p=0.009 n=6+6)
GobEncode-8 3.07ms ± 0% 3.04ms ± 0% -0.78% (p=0.004 n=5+6)
Gzip-8 195ms ± 0% 195ms ± 0% ~ (p=0.429 n=5+6)
Gunzip-8 28.2ms ± 0% 28.2ms ± 0% ~ (p=0.662 n=5+6)
HTTPClientServer-8 45.0µs ± 1% 45.4µs ± 1% ~ (p=0.093 n=6+6)
JSONEncode-8 8.01ms ± 0% 8.03ms ± 0% +0.31% (p=0.008 n=5+5)
JSONDecode-8 35.3ms ± 1% 35.1ms ± 0% -0.72% (p=0.008 n=5+5)
Mandelbrot200-8 4.50ms ± 0% 4.49ms ± 1% ~ (p=0.937 n=6+6)
GoParse-8 3.03ms ± 1% 3.00ms ± 1% ~ (p=0.180 n=6+6)
RegexpMatchEasy0_32-8 55.4ns ± 0% 53.2ns ± 3% -3.92% (p=0.004 n=5+6)
RegexpMatchEasy0_1K-8 178ns ± 0% 175ns ± 1% -1.57% (p=0.004 n=5+6)
RegexpMatchEasy1_32-8 50.1ns ± 0% 48.3ns ± 5% ~ (p=0.082 n=5+6)
RegexpMatchEasy1_1K-8 271ns ± 1% 262ns ± 1% -3.26% (p=0.004 n=6+5)
RegexpMatchMedium_32-8 949ns ± 0% 886ns ± 7% ~ (p=0.329 n=5+6)
RegexpMatchMedium_1K-8 27.1µs ± 7% 28.1µs ± 6% ~ (p=0.394 n=6+6)
RegexpMatchHard_32-8 1.28µs ± 2% 1.29µs ± 0% ~ (p=0.056 n=6+6)
RegexpMatchHard_1K-8 38.5µs ± 0% 38.4µs ± 0% -0.25% (p=0.009 n=6+5)
Revcomp-8 397ms ± 0% 396ms ± 0% ~ (p=0.429 n=6+5)
Template-8 48.1ms ± 1% 48.1ms ± 0% ~ (p=0.222 n=5+5)
TimeParse-8 213ns ± 0% 213ns ± 0% ~ (p=0.210 n=4+6)
TimeFormat-8 295ns ± 1% 259ns ± 0% -12.22% (p=0.002 n=6+6)
[Geo mean] 40.5µs 40.1µs -1.00%

name old speed new speed delta
GobDecode-8 176MB/s ± 1% 174MB/s ± 0% -0.61% (p=0.009 n=6+6)
GobEncode-8 250MB/s ± 0% 252MB/s ± 0% +0.79% (p=0.004 n=5+6)
Gzip-8 100MB/s ± 0% 100MB/s ± 0% ~ (p=0.351 n=5+6)
Gunzip-8 687MB/s ± 0% 687MB/s ± 0% ~ (p=0.662 n=5+6)
JSONEncode-8 242MB/s ± 0% 242MB/s ± 0% -0.31% (p=0.008 n=5+5)
JSONDecode-8 54.9MB/s ± 1% 55.3MB/s ± 0% +0.71% (p=0.008 n=5+5)
GoParse-8 19.1MB/s ± 1% 19.3MB/s ± 1% ~ (p=0.143 n=6+6)
RegexpMatchEasy0_32-8 578MB/s ± 0% 601MB/s ± 3% +4.10% (p=0.004 n=5+6)
RegexpMatchEasy0_1K-8 5.74GB/s ± 1% 5.85GB/s ± 1% +1.90% (p=0.002 n=6+6)
RegexpMatchEasy1_32-8 639MB/s ± 0% 663MB/s ± 4% ~ (p=0.082 n=5+6)
RegexpMatchEasy1_1K-8 3.78GB/s ± 1% 3.91GB/s ± 1% +3.38% (p=0.004 n=6+5)
RegexpMatchMedium_32-8 33.7MB/s ± 0% 36.2MB/s ± 7% ~ (p=0.268 n=5+6)
RegexpMatchMedium_1K-8 37.9MB/s ± 6% 36.5MB/s ± 6% ~ (p=0.411 n=6+6)
RegexpMatchHard_32-8 24.9MB/s ± 2% 24.8MB/s ± 0% ~ (p=0.063 n=6+6)
RegexpMatchHard_1K-8 26.6MB/s ± 0% 26.7MB/s ± 0% +0.25% (p=0.009 n=6+5)
Revcomp-8 640MB/s ± 0% 641MB/s ± 0% ~ (p=0.429 n=6+5)
Template-8 40.4MB/s ± 1% 40.3MB/s ± 0% ~ (p=0.222 n=5+5)
[Geo mean] 175MB/s 177MB/s +1.05%
nimelehin · Oct 19, 2021 · d68cada · d68cada
1 parent bde0463
commit d68cada
Show file tree

Hide file tree

Showing 3 changed files with 255 additions and 45 deletions.
diff --git a/src/cmd/compile/internal/inline/inl.go b/src/cmd/compile/internal/inline/inl.go
@@ -45,14 +45,107 @@ const (
 	inlineMaxBudget       = 80
 	inlineExtraAppendCost = 0
 	// default is to inline if there's at most one call. -l=4 overrides this by using 1 instead.
-	inlineExtraCallCost  = 57              // 57 was benchmarked to provided most benefit with no bad surprises; see https://github.com/golang/go/issues/19348#issuecomment-439370742
+	inlineExtraCallCost  = 57              // 57 was benchmarked to provide most benefit with no bad surprises; see https://github.com/golang/go/issues/19348#issuecomment-439370742
 	inlineExtraPanicCost = 1               // do not penalize inlining panics.
 	inlineExtraThrowCost = inlineMaxBudget // with current (2018-05/1.11) code, inlining runtime.throw does not help.
 
 	inlineBigFunctionNodes   = 5000 // Functions with this many nodes are considered "big".
 	inlineBigFunctionMaxCost = 20   // Max cost of inlinee when inlining into a "big" function.
+
+	// These values were benchmarked to provide most benefit with no bad surprises.
+	inlineBigForCost           = 105 // FORs with at least this cost are considered "big".
+	inlineIntoForExtraCallCost = 14
+	inlineIntoForExtraBudget   = 18 // Extra budget when inlining into FORs which are not "big".
+
+	// The upper budget for a visitor. It accounts for the maximum cost at which a function could still be inlined.
+	inlineVisitorBudget = inlineMaxBudget + inlineIntoForExtraBudget
 )
 
+// isInlinable checks if the function can be inlined in a 'typical' scenario
+// when no boosts are applied.
+func isInlinable(fn *ir.Func) bool {
+	return fn != nil && fn.Inl != nil && fn.Inl.Cost <= inlineMaxBudget
+}
+
+type forContext struct {
+	cost int32 // Helps to determine if FOR is a "big" one.
+}
+
+type inlContext struct {
+	// Map to keep track of functions that have been inlined at a particular
+	// call site, in order to stop inlining when we reach the beginning of a
+	// recursion cycle again. We don't inline immediately recursive functions,
+	// but allow inlining if there is a recursion cycle of many functions.
+	// Most likely, the inlining will stop before we even hit the beginning of
+	// the cycle again, but the map catches the unusual case.
+	inlinedCallees map[*ir.Func]bool
+
+	// Stack used to recognize which call nodes are located inside FORs while running inlnode.
+	forsStack           []forContext
+	initialInlineBudget int32 // Initial inline budget. Boosts are calculated relative to this.
+}
+
+// canBoostInliningIntoFor reports whether the current node is inside at least one FOR and none of the enclosing FORs is "big".
+func (ctx inlContext) canBoostInliningIntoFor() bool {
+	for i := 0; i < len(ctx.forsStack); i++ {
+		if ctx.forsStack[i].cost >= inlineBigForCost {
+			return false
+		}
+	}
+	return len(ctx.forsStack) > 0
+}
+
+func (ctx *inlContext) Init(fn *ir.Func) {
+	ctx.inlinedCallees = make(map[*ir.Func]bool)
+
+	if isBigFunc(fn) {
+		ctx.initialInlineBudget = inlineBigFunctionMaxCost
+	} else {
+		ctx.initialInlineBudget = inlineMaxBudget
+	}
+}
+
+func (ctx *inlContext) PushFor(n ir.Node) {
+	ctx.forsStack = append(ctx.forsStack, forContext{forCost(n)})
+
+	if base.Flag.LowerM > 1 {
+		fmt.Printf("%v: add FOR to stack %v\n", ir.Line(n), ctx.forsStack)
+	}
+}
+
+func (ctx *inlContext) PopFor() {
+	ctx.forsStack = ctx.forsStack[:len(ctx.forsStack)-1]
+}
+
+func (ctx inlContext) InlineBudget() int32 {
+	finalBudget := ctx.initialInlineBudget
+	if ctx.canBoostInliningIntoFor() && ctx.initialInlineBudget == inlineMaxBudget {
+		// Boost only when the initial budget is the regular inlineMaxBudget (i.e. the caller is not a "big" function).
+		finalBudget += inlineIntoForExtraBudget
+	}
+
+	return finalBudget
+}
+
+// forCost calculates the cost of FORs. It is used to determine if functions
+// will be boosted to inline into the FOR.
+// We don't want to boost inlining into "big" FORs to keep their body
+// in the instruction cache.
+func forCost(n ir.Node) int32 {
+	exceededCostReason := func(remainingBudget int32) string {
+		return fmt.Sprintf("FOR is big: cost %d exceeds maximum cost %d", inlineBigForCost-remainingBudget, inlineBigForCost)
+	}
+
+	visitor := hairyVisitor{
+		budget:                     inlineBigForCost,
+		extraCallCost:              inlineIntoForExtraCallCost,
+		onlyCost:                   true,
+		exceededCostReasonCallback: exceededCostReason,
+	}
+	visitor.tooHairy(n)
+	return inlineBigForCost - visitor.budget
+}
+
 // InlinePackage finds functions that can be inlined and clones them before walk expands them.
 func InlinePackage() {
 	ir.VisitFuncsBottomUp(typecheck.Target.Decls, func(list []*ir.Func, recursive bool) {
@@ -166,30 +259,40 @@ func CanInline(fn *ir.Func) {
 	// locals, and we use this map to produce a pruned Inline.Dcl
 	// list. See issue 25249 for more context.
 
+	exceededCostReason := func(remainingBudget int32) string {
+		return fmt.Sprintf("function too complex: cost %d exceeds budget %d", inlineVisitorBudget-remainingBudget, inlineVisitorBudget)
+	}
+
 	visitor := hairyVisitor{
-		budget:        inlineMaxBudget,
-		extraCallCost: cc,
+		budget:                     inlineVisitorBudget,
+		extraCallCost:              cc,
+		onlyCost:                   false,
+		exceededCostReasonCallback: exceededCostReason,
 	}
 	if visitor.tooHairy(fn) {
 		reason = visitor.reason
 		return
 	}
 
 	n.Func.Inl = &ir.Inline{
-		Cost: inlineMaxBudget - visitor.budget,
+		Cost: inlineVisitorBudget - visitor.budget,
 		Dcl:  pruneUnusedAutos(n.Defn.(*ir.Func).Dcl, &visitor),
 		Body: inlcopylist(fn.Body),
 
 		CanDelayResults: canDelayResults(fn),
 	}
 
 	if base.Flag.LowerM > 1 {
-		fmt.Printf("%v: can inline %v with cost %d as: %v { %v }\n", ir.Line(fn), n, inlineMaxBudget-visitor.budget, fn.Type(), ir.Nodes(n.Func.Inl.Body))
-	} else if base.Flag.LowerM != 0 {
+		if isInlinable(n.Func) {
+			fmt.Printf("%v: can inline %v with cost %d as: %v { %v }\n", ir.Line(fn), n, n.Func.Inl.Cost, fn.Type(), ir.Nodes(n.Func.Inl.Body))
+		} else {
+			fmt.Printf("%v: can inline only into small FORs %v with cost %d as: %v { %v }\n", ir.Line(fn), n, n.Func.Inl.Cost, fn.Type(), ir.Nodes(n.Func.Inl.Body))
+		}
+	} else if base.Flag.LowerM != 0 && isInlinable(n.Func) {
 		fmt.Printf("%v: can inline %v\n", ir.Line(fn), n)
 	}
 	if logopt.Enabled() {
-		logopt.LogOpt(fn.Pos(), "canInlineFunction", "inline", ir.FuncName(fn), fmt.Sprintf("cost: %d", inlineMaxBudget-visitor.budget))
+		logopt.LogOpt(fn.Pos(), "canInlineFunction", "inline", ir.FuncName(fn), fmt.Sprintf("cost: %d", n.Func.Inl.Cost))
 	}
 }
 
@@ -228,20 +331,22 @@ func canDelayResults(fn *ir.Func) bool {
 // hairyVisitor visits a function body to determine its inlining
 // hairiness and whether or not it can be inlined.
 type hairyVisitor struct {
-	budget        int32
-	reason        string
-	extraCallCost int32
-	usedLocals    ir.NameSet
-	do            func(ir.Node) bool
+	budget                     int32
+	extraCallCost              int32
+	onlyCost                   bool // If set, tooHairy does NOT check whether nodes are inlinable, only cost.
+	reason                     string
+	usedLocals                 ir.NameSet
+	do                         func(ir.Node) bool
+	exceededCostReasonCallback func(remainingBudget int32) string
 }
 
-func (v *hairyVisitor) tooHairy(fn *ir.Func) bool {
+func (v *hairyVisitor) tooHairy(n ir.Node) bool {
 	v.do = v.doNode // cache closure
-	if ir.DoChildren(fn, v.do) {
+	if ir.DoChildren(n, v.do) {
 		return true
 	}
 	if v.budget < 0 {
-		v.reason = fmt.Sprintf("function too complex: cost %d exceeds budget %d", inlineMaxBudget-v.budget, inlineMaxBudget)
+		v.reason = v.exceededCostReasonCallback(v.budget)
 		return true
 	}
 	return false
@@ -264,8 +369,12 @@ func (v *hairyVisitor) doNode(n ir.Node) bool {
 			if name.Class == ir.PFUNC && types.IsRuntimePkg(name.Sym().Pkg) {
 				fn := name.Sym().Name
 				if fn == "getcallerpc" || fn == "getcallersp" {
-					v.reason = "call to " + fn
-					return true
+					if !v.onlyCost {
+						v.reason = "call to " + fn
+						return true
+					} else {
+						break
+					}
 				}
 				if fn == "throw" {
 					v.budget -= inlineExtraThrowCost
@@ -309,7 +418,7 @@ func (v *hairyVisitor) doNode(n ir.Node) bool {
 			break
 		}
 
-		if fn := inlCallee(n.X); fn != nil && fn.Inl != nil {
+		if fn := inlCallee(n.X); isInlinable(fn) {
 			v.budget -= fn.Inl.Cost
 			break
 		}
@@ -338,13 +447,19 @@ func (v *hairyVisitor) doNode(n ir.Node) bool {
 	case ir.ORECOVER:
 		// recover matches the argument frame pointer to find
 		// the right panic value, so it needs an argument frame.
-		v.reason = "call to recover"
-		return true
+		if !v.onlyCost {
+			v.reason = "call to recover"
+			return true
+		}
 
 	case ir.OCLOSURE:
 		if base.Debug.InlFuncsWithClosures == 0 {
-			v.reason = "not inlining functions with closures"
-			return true
+			if !v.onlyCost {
+				v.reason = "not inlining functions with closures"
+				return true
+			} else {
+				break
+			}
 		}
 
 		// TODO(danscales): Maybe make budget proportional to number of closure
@@ -355,7 +470,9 @@ func (v *hairyVisitor) doNode(n ir.Node) bool {
 		// do) to check for disallowed ops in the body and include the
 		// body in the budget.
 		if doList(n.(*ir.ClosureExpr).Func.Body, v.do) {
-			return true
+			if !v.onlyCost {
+				return true
+			}
 		}
 
 	case ir.ORANGE,
@@ -364,8 +481,10 @@ func (v *hairyVisitor) doNode(n ir.Node) bool {
 		ir.ODEFER,
 		ir.ODCLTYPE, // can't print yet
 		ir.OTAILCALL:
-		v.reason = "unhandled op " + n.Op().String()
-		return true
+		if !v.onlyCost {
+			v.reason = "unhandled op " + n.Op().String()
+			return true
+		}
 
 	case ir.OAPPEND:
 		v.budget -= inlineExtraAppendCost
@@ -493,20 +612,13 @@ func inlcopy(n ir.Node) ir.Node {
 func InlineCalls(fn *ir.Func) {
 	savefn := ir.CurFunc
 	ir.CurFunc = fn
-	maxCost := int32(inlineMaxBudget)
-	if isBigFunc(fn) {
-		maxCost = inlineBigFunctionMaxCost
-	}
-	// Map to keep track of functions that have been inlined at a particular
-	// call site, in order to stop inlining when we reach the beginning of a
-	// recursion cycle again. We don't inline immediately recursive functions,
-	// but allow inlining if there is a recursion cycle of many functions.
-	// Most likely, the inlining will stop before we even hit the beginning of
-	// the cycle again, but the map catches the unusual case.
-	inlMap := make(map[*ir.Func]bool)
+
+	var inlCtx inlContext
+	inlCtx.Init(fn)
+
 	var edit func(ir.Node) ir.Node
 	edit = func(n ir.Node) ir.Node {
-		return inlnode(n, maxCost, inlMap, edit)
+		return inlnode(n, &inlCtx, edit)
 	}
 	ir.EditChildren(fn, edit)
 	ir.CurFunc = savefn
@@ -525,11 +637,16 @@ func InlineCalls(fn *ir.Func) {
 // shorter and less complicated.
 // The result of inlnode MUST be assigned back to n, e.g.
 // 	n.Left = inlnode(n.Left)
-func inlnode(n ir.Node, maxCost int32, inlMap map[*ir.Func]bool, edit func(ir.Node) ir.Node) ir.Node {
+func inlnode(n ir.Node, ctx *inlContext, edit func(ir.Node) ir.Node) ir.Node {
 	if n == nil {
 		return n
 	}
 
+	if n.Op() == ir.OFOR {
+		ctx.PushFor(n)
+		defer ctx.PopFor()
+	}
+
 	switch n.Op() {
 	case ir.ODEFER, ir.OGO:
 		n := n.(*ir.GoDeferStmt)
@@ -587,7 +704,7 @@ func inlnode(n ir.Node, maxCost int32, inlMap map[*ir.Func]bool, edit func(ir.No
 			break
 		}
 		if fn := inlCallee(call.X); fn != nil && fn.Inl != nil {
-			n = mkinlcall(call, fn, maxCost, inlMap, edit)
+			n = mkinlcall(call, fn, ctx, edit)
 		}
 	}
 
@@ -660,20 +777,20 @@ var NewInline = func(call *ir.CallExpr, fn *ir.Func, inlIndex int) *ir.InlinedCa
 // parameters.
 // The result of mkinlcall MUST be assigned back to n, e.g.
 // 	n.Left = mkinlcall(n.Left, fn, isddd)
-func mkinlcall(n *ir.CallExpr, fn *ir.Func, maxCost int32, inlMap map[*ir.Func]bool, edit func(ir.Node) ir.Node) ir.Node {
+func mkinlcall(n *ir.CallExpr, fn *ir.Func, ctx *inlContext, edit func(ir.Node) ir.Node) ir.Node {
 	if fn.Inl == nil {
 		if logopt.Enabled() {
 			logopt.LogOpt(n.Pos(), "cannotInlineCall", "inline", ir.FuncName(ir.CurFunc),
 				fmt.Sprintf("%s cannot be inlined", ir.PkgFuncName(fn)))
 		}
 		return n
 	}
-	if fn.Inl.Cost > maxCost {
+	if fn.Inl.Cost > ctx.InlineBudget() {
 		// The inlined function body is too big. Typically we use this check to restrict
 		// inlining into very big functions.  See issue 26546 and 17566.
 		if logopt.Enabled() {
 			logopt.LogOpt(n.Pos(), "cannotInlineCall", "inline", ir.FuncName(ir.CurFunc),
-				fmt.Sprintf("cost %d of %s exceeds max large caller cost %d", fn.Inl.Cost, ir.PkgFuncName(fn), maxCost))
+				fmt.Sprintf("cost %d of %s exceeds max large caller cost %d", fn.Inl.Cost, ir.PkgFuncName(fn), ctx.InlineBudget()))
 		}
 		return n
 	}
@@ -696,15 +813,15 @@ func mkinlcall(n *ir.CallExpr, fn *ir.Func, maxCost int32, inlMap map[*ir.Func]b
 		return n
 	}
 
-	if inlMap[fn] {
+	if ctx.inlinedCallees[fn] {
 		if base.Flag.LowerM > 1 {
 			fmt.Printf("%v: cannot inline %v into %v: repeated recursive cycle\n", ir.Line(n), fn, ir.FuncName(ir.CurFunc))
 		}
 		return n
 	}
-	inlMap[fn] = true
+	ctx.inlinedCallees[fn] = true
 	defer func() {
-		inlMap[fn] = false
+		ctx.inlinedCallees[fn] = false
 	}()
 
 	typecheck.FixVariadicCall(n)