From 7de33c316120b99f74b36164f57e5b88288a0496 Mon Sep 17 00:00:00 2001 From: Nikita Melekhin Date: Fri, 3 Sep 2021 18:28:14 +0300 Subject: [PATCH] cmd/compile: boost inlining into FORs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As Than McIntosh already mentioned, it's a common practice to boost inlining into FORs, since the callsite could be "hotter". This patch implements this functionality. The implementation uses a stack of FORs to recognise calls which are in a loop. The stack is maintained alongside the inlnode function's traversal and contains information about ancestor FORs relative to the current node in inlnode. The forContext contains a liveCounter which shows for how many nodes this FOR is an ancestor. Current constants are the following: A "big" FOR is a FOR which contains >=inlineBigForNodes(37) nodes or has more than inlineBigForCallNodes(3) inlinable call nodes. In such FORs no boost is applied. Other FORs are considered to be small and boost callsites with an extra budget equal to inlineExtraForBudget(13). Updates #17566 The following are results on GO1 benchmarks, while the binary size has not increased significantly: 10441232 -> 10465920, which is less than 0.3%. 
goos: linux goarch: amd64 pkg: test/bench/go1 cpu: Intel(R) Xeon(R) Gold 6230N CPU @ 2.30GHz name old time/op new time/op delta BinaryTree17-8 2.15s ± 1% 2.16s ± 1% ~ (p=0.132 n=6+6) Fannkuch11-8 2.70s ± 0% 2.70s ± 0% +0.12% (p=0.004 n=6+5) FmtFprintfEmpty-8 31.9ns ± 0% 31.3ns ± 0% -2.05% (p=0.008 n=5+5) FmtFprintfString-8 57.0ns ± 0% 57.7ns ± 1% +1.30% (p=0.002 n=6+6) FmtFprintfInt-8 65.2ns ± 0% 64.1ns ± 0% -1.63% (p=0.008 n=5+5) FmtFprintfIntInt-8 103ns ± 0% 102ns ± 0% -1.01% (p=0.000 n=5+6) FmtFprintfPrefixedInt-8 119ns ± 0% 119ns ± 0% +0.31% (p=0.026 n=5+6) FmtFprintfFloat-8 169ns ± 0% 169ns ± 0% +0.14% (p=0.008 n=5+5) FmtManyArgs-8 445ns ± 0% 452ns ± 0% +1.39% (p=0.004 n=6+5) GobDecode-8 4.37ms ± 1% 4.42ms ± 1% +1.03% (p=0.002 n=6+6) GobEncode-8 3.07ms ± 0% 3.03ms ± 0% -1.07% (p=0.004 n=5+6) Gzip-8 195ms ± 0% 195ms ± 0% ~ (p=0.063 n=5+4) Gunzip-8 28.2ms ± 0% 28.8ms ± 0% +2.13% (p=0.004 n=5+6) HTTPClientServer-8 45.0µs ± 1% 45.4µs ± 1% +0.94% (p=0.030 n=6+5) JSONEncode-8 8.01ms ± 0% 8.00ms ± 1% ~ (p=0.429 n=5+6) JSONDecode-8 35.3ms ± 1% 35.2ms ± 0% ~ (p=0.841 n=5+5) Mandelbrot200-8 4.50ms ± 0% 4.49ms ± 0% ~ (p=0.093 n=6+6) GoParse-8 3.03ms ± 1% 2.97ms ± 1% -1.97% (p=0.004 n=6+5) RegexpMatchEasy0_32-8 55.4ns ± 0% 53.2ns ± 1% -3.89% (p=0.008 n=5+5) RegexpMatchEasy0_1K-8 178ns ± 0% 162ns ± 1% -8.72% (p=0.004 n=5+6) RegexpMatchEasy1_32-8 50.1ns ± 0% 47.4ns ± 1% -5.32% (p=0.008 n=5+5) RegexpMatchEasy1_1K-8 271ns ± 1% 261ns ± 0% -3.67% (p=0.002 n=6+6) RegexpMatchMedium_32-8 949ns ± 0% 904ns ± 5% -4.81% (p=0.004 n=5+6) RegexpMatchMedium_1K-8 27.1µs ± 7% 27.3µs ± 6% ~ (p=0.818 n=6+6) RegexpMatchHard_32-8 1.28µs ± 2% 1.27µs ± 1% ~ (p=0.180 n=6+6) RegexpMatchHard_1K-8 38.5µs ± 0% 38.5µs ± 0% ~ (p=0.329 n=6+5) Revcomp-8 397ms ± 0% 396ms ± 0% -0.33% (p=0.026 n=6+6) Template-8 48.1ms ± 1% 48.2ms ± 1% ~ (p=0.222 n=5+5) TimeParse-8 213ns ± 0% 214ns ± 0% ~ (p=0.076 n=4+6) TimeFormat-8 295ns ± 1% 292ns ± 0% -1.13% (p=0.000 n=6+5) [Geo mean] 40.5µs 40.1µs -0.96% name old speed 
new speed delta GobDecode-8 176MB/s ± 1% 174MB/s ± 1% -1.02% (p=0.002 n=6+6) GobEncode-8 250MB/s ± 0% 253MB/s ± 0% +1.08% (p=0.004 n=5+6) Gzip-8 100MB/s ± 0% 100MB/s ± 0% +0.23% (p=0.048 n=5+4) Gunzip-8 687MB/s ± 0% 673MB/s ± 0% -2.08% (p=0.004 n=5+6) JSONEncode-8 242MB/s ± 0% 243MB/s ± 1% ~ (p=0.429 n=5+6) JSONDecode-8 54.9MB/s ± 1% 55.1MB/s ± 0% ~ (p=0.873 n=5+5) GoParse-8 19.1MB/s ± 1% 19.5MB/s ± 1% +2.01% (p=0.004 n=6+5) RegexpMatchEasy0_32-8 578MB/s ± 0% 601MB/s ± 1% +4.06% (p=0.008 n=5+5) RegexpMatchEasy0_1K-8 5.74GB/s ± 1% 6.30GB/s ± 1% +9.90% (p=0.002 n=6+6) RegexpMatchEasy1_32-8 639MB/s ± 0% 675MB/s ± 1% +5.63% (p=0.008 n=5+5) RegexpMatchEasy1_1K-8 3.78GB/s ± 1% 3.92GB/s ± 0% +3.81% (p=0.002 n=6+6) RegexpMatchMedium_32-8 33.7MB/s ± 0% 35.5MB/s ± 5% +5.30% (p=0.004 n=5+6) RegexpMatchMedium_1K-8 37.9MB/s ± 6% 37.6MB/s ± 5% ~ (p=0.818 n=6+6) RegexpMatchHard_32-8 24.9MB/s ± 2% 25.2MB/s ± 1% ~ (p=0.167 n=6+6) RegexpMatchHard_1K-8 26.6MB/s ± 0% 26.6MB/s ± 0% ~ (p=0.355 n=6+5) Revcomp-8 640MB/s ± 0% 642MB/s ± 0% +0.33% (p=0.026 n=6+6) Template-8 40.4MB/s ± 1% 40.2MB/s ± 1% ~ (p=0.222 n=5+5) [Geo mean] 175MB/s 178MB/s +1.69% --- src/cmd/compile/internal/inline/inl.go | 134 ++++++++++++++++++++++--- test/inline.go | 43 ++++++++ test/inline_for.go | 39 +++++++ 3 files changed, 202 insertions(+), 14 deletions(-) create mode 100644 test/inline_for.go diff --git a/src/cmd/compile/internal/inline/inl.go b/src/cmd/compile/internal/inline/inl.go index d50d8b3516f1ae..3d4a07ad883756 100644 --- a/src/cmd/compile/internal/inline/inl.go +++ b/src/cmd/compile/internal/inline/inl.go @@ -51,8 +51,102 @@ const ( inlineBigFunctionNodes = 5000 // Functions with this many nodes are considered "big". inlineBigFunctionMaxCost = 20 // Max cost of inlinee when inlining into a "big" function. + + inlineBigForNodes = 37 // FORs with this many nodes are considered "big" and functions are not forced to be inlined. 
+ inlineBigForCallNodes = 3 // FORs with this many call nodes are considered "big" and functions are not forced to be inlined. + inlineExtraForBudget = 13 // Extra budget to inline into not a "big" FOR. + + // The upper budget for a visitor. It accounts the maximum cost with which a function could be inlined. + inlineVisitorBudget = inlineMaxBudget + inlineExtraForBudget ) +type forContext struct { + liveCounter int + totalNodes int + callNodes int +} + +type inlContext struct { + inlMap map[*ir.Func]bool + forsStack []forContext +} + +// isinlinable checks if the function can be inlined in a 'typical' scenario +// when no boosts are applied. +func isinlinable(fn *ir.Func) bool { + return fn != nil && fn.Inl != nil && fn.Inl.Cost <= inlineMaxBudget +} + +// countNodes returns count of child nodes and inlinable child call nodes. +func countInlinableCallNodes(n ir.Node) (int, int) { + child_nodes := 0 + child_inlinable_call_nodes := 0 + ir.Any(n, func(n ir.Node) bool { + child_nodes++ + switch n.Op() { + case ir.OCALLFUNC: + call := n.(*ir.CallExpr) + if call.NoInline { + break + } + if ir.IsIntrinsicCall(call) { + break + } + if fn := inlCallee(call.X); fn != nil && fn.Inl != nil { + child_inlinable_call_nodes++ + } + } + return false + }) + return child_nodes, child_inlinable_call_nodes +} + +// updateForsStack maintains forsStack, which is used to recognise +// which call nodes are located inside fors, while doing inlnode. +func updateForsStack(inlCtx *inlContext, n ir.Node) { + outdated := 0 + for i := len(inlCtx.forsStack) - 1; i >= 0; i-- { + inlCtx.forsStack[i].liveCounter-- + if inlCtx.forsStack[i].liveCounter < 0 { + outdated++ + } + } + inlCtx.forsStack = inlCtx.forsStack[:len(inlCtx.forsStack)-outdated] + + // If we are in a "big" FOR, it's useless to calculate node count + // for this FOR, since no function will be inlined. 
+ if n.Op() == ir.OFOR && (len(inlCtx.forsStack) == 0 || ancestorForsAreSmall(inlCtx)) { + child_nodes, child_inlinable_call_nodes := countInlinableCallNodes(n) + inlCtx.forsStack = append(inlCtx.forsStack, forContext{child_nodes - 1, child_nodes - 1, child_inlinable_call_nodes}) + + if base.Flag.LowerM > 1 { + fmt.Printf("%v: add for to stack %v\n", ir.Line(n), inlCtx.forsStack) + } + } +} + +// fixupForsStackAfterInline fixes forsStack after a call node was replaced with inlined node. +func fixupForsStackAfterInline(inlCtx *inlContext, n ir.Node, call *ir.InlinedCallExpr) { + if len(inlCtx.forsStack) == 0 { + return + } + + child_nodes, child_inlinable_call_nodes := countInlinableCallNodes(call) + + for i := 0; i < len(inlCtx.forsStack); i++ { + inlCtx.forsStack[i].liveCounter += child_nodes - 1 + inlCtx.forsStack[i].callNodes += child_inlinable_call_nodes + } + + if base.Flag.LowerM > 1 { + fmt.Printf("%v: fixup inline %v\n", ir.Line(n), inlCtx.forsStack) + } +} + +func ancestorForsAreSmall(inlCtx *inlContext) bool { + return len(inlCtx.forsStack) > 0 && inlCtx.forsStack[0].totalNodes < inlineBigForNodes && inlCtx.forsStack[0].callNodes < inlineBigForCallNodes +} + // InlinePackage finds functions that can be inlined and clones them before walk expands them. func InlinePackage() { ir.VisitFuncsBottomUp(typecheck.Target.Decls, func(list []*ir.Func, recursive bool) { @@ -167,7 +261,7 @@ func CanInline(fn *ir.Func) { // list. See issue 25249 for more context. 
visitor := hairyVisitor{ - budget: inlineMaxBudget, + budget: inlineVisitorBudget, extraCallCost: cc, } if visitor.tooHairy(fn) { @@ -176,7 +270,7 @@ func CanInline(fn *ir.Func) { } n.Func.Inl = &ir.Inline{ - Cost: inlineMaxBudget - visitor.budget, + Cost: inlineVisitorBudget - visitor.budget, Dcl: pruneUnusedAutos(n.Defn.(*ir.Func).Dcl, &visitor), Body: inlcopylist(fn.Body), @@ -184,12 +278,16 @@ func CanInline(fn *ir.Func) { } if base.Flag.LowerM > 1 { - fmt.Printf("%v: can inline %v with cost %d as: %v { %v }\n", ir.Line(fn), n, inlineMaxBudget-visitor.budget, fn.Type(), ir.Nodes(n.Func.Inl.Body)) - } else if base.Flag.LowerM != 0 { + if isinlinable(n.Func) { + fmt.Printf("%v: can inline %v with cost %d as: %v { %v }\n", ir.Line(fn), n, n.Func.Inl.Cost, fn.Type(), ir.Nodes(n.Func.Inl.Body)) + } else { + fmt.Printf("%v: can inline only into small FORs %v with cost %d as: %v { %v }\n", ir.Line(fn), n, n.Func.Inl.Cost, fn.Type(), ir.Nodes(n.Func.Inl.Body)) + } + } else if base.Flag.LowerM != 0 && isinlinable(n.Func) { fmt.Printf("%v: can inline %v\n", ir.Line(fn), n) } if logopt.Enabled() { - logopt.LogOpt(fn.Pos(), "canInlineFunction", "inline", ir.FuncName(fn), fmt.Sprintf("cost: %d", inlineMaxBudget-visitor.budget)) + logopt.LogOpt(fn.Pos(), "canInlineFunction", "inline", ir.FuncName(fn), fmt.Sprintf("cost: %d", n.Func.Inl.Cost)) } } @@ -241,7 +339,7 @@ func (v *hairyVisitor) tooHairy(fn *ir.Func) bool { return true } if v.budget < 0 { - v.reason = fmt.Sprintf("function too complex: cost %d exceeds budget %d", inlineMaxBudget-v.budget, inlineMaxBudget) + v.reason = fmt.Sprintf("function too complex: cost %d exceeds budget %d", inlineVisitorBudget-v.budget, inlineVisitorBudget) return true } return false @@ -503,10 +601,11 @@ func InlineCalls(fn *ir.Func) { // but allow inlining if there is a recursion cycle of many functions. // Most likely, the inlining will stop before we even hit the beginning of // the cycle again, but the map catches the unusual case. 
- inlMap := make(map[*ir.Func]bool) + inlCtx := inlContext{make(map[*ir.Func]bool), make([]forContext, 0)} + var edit func(ir.Node) ir.Node edit = func(n ir.Node) ir.Node { - return inlnode(n, maxCost, inlMap, edit) + return inlnode(n, maxCost, &inlCtx, edit) } ir.EditChildren(fn, edit) ir.CurFunc = savefn @@ -525,11 +624,16 @@ func InlineCalls(fn *ir.Func) { // shorter and less complicated. // The result of inlnode MUST be assigned back to n, e.g. // n.Left = inlnode(n.Left) -func inlnode(n ir.Node, maxCost int32, inlMap map[*ir.Func]bool, edit func(ir.Node) ir.Node) ir.Node { +func inlnode(n ir.Node, maxCost int32, inlCtx *inlContext, edit func(ir.Node) ir.Node) ir.Node { if n == nil { return n } + if updateForsStack(inlCtx, n); ancestorForsAreSmall(inlCtx) && maxCost == inlineMaxBudget { + // Boosts only regular functions + maxCost += inlineExtraForBudget + } + switch n.Op() { case ir.ODEFER, ir.OGO: n := n.(*ir.GoDeferStmt) @@ -584,7 +688,7 @@ func inlnode(n ir.Node, maxCost int32, inlMap map[*ir.Func]bool, edit func(ir.No break } if fn := inlCallee(call.X); fn != nil && fn.Inl != nil { - n = mkinlcall(call, fn, maxCost, inlMap, edit) + n = mkinlcall(call, fn, maxCost, inlCtx, edit) } } @@ -657,7 +761,7 @@ var NewInline = func(call *ir.CallExpr, fn *ir.Func, inlIndex int) *ir.InlinedCa // parameters. // The result of mkinlcall MUST be assigned back to n, e.g. 
// n.Left = mkinlcall(n.Left, fn, isddd) -func mkinlcall(n *ir.CallExpr, fn *ir.Func, maxCost int32, inlMap map[*ir.Func]bool, edit func(ir.Node) ir.Node) ir.Node { +func mkinlcall(n *ir.CallExpr, fn *ir.Func, maxCost int32, inlCtx *inlContext, edit func(ir.Node) ir.Node) ir.Node { if fn.Inl == nil { if logopt.Enabled() { logopt.LogOpt(n.Pos(), "cannotInlineCall", "inline", ir.FuncName(ir.CurFunc), @@ -693,15 +797,15 @@ func mkinlcall(n *ir.CallExpr, fn *ir.Func, maxCost int32, inlMap map[*ir.Func]b return n } - if inlMap[fn] { + if inlCtx.inlMap[fn] { if base.Flag.LowerM > 1 { fmt.Printf("%v: cannot inline %v into %v: repeated recursive cycle\n", ir.Line(n), fn, ir.FuncName(ir.CurFunc)) } return n } - inlMap[fn] = true + inlCtx.inlMap[fn] = true defer func() { - inlMap[fn] = false + inlCtx.inlMap[fn] = false }() typecheck.FixVariadicCall(n) @@ -730,6 +834,8 @@ func mkinlcall(n *ir.CallExpr, fn *ir.Func, maxCost int32, inlMap map[*ir.Func]b res = oldInline(n, fn, inlIndex) } + fixupForsStackAfterInline(inlCtx, n, res) + // transitive inlining // might be nice to do this before exporting the body, // but can't emit the body with inlining expanded. 
diff --git a/test/inline.go b/test/inline.go index a73c0ba7b1da0f..7a139b87520ee5 100644 --- a/test/inline.go +++ b/test/inline.go @@ -292,3 +292,46 @@ func conv2(v uint64) uint64 { // ERROR "can inline conv2" func conv1(v uint64) uint64 { // ERROR "can inline conv1" return uint64(uint64(uint64(uint64(uint64(uint64(uint64(uint64(uint64(uint64(uint64(v))))))))))) } + +// Inline into FORs +func func_with_cost_88() { + x := 200 + for i := 0; i < x; i++ { + if i%2 == 0 { + runtime.GC() + } else { + i += 2 + x += 1 + } + } +} + +func func_with_fors() { + func_with_cost_88() + + for i := 0; i < 100; i++ { + func_with_cost_88() // ERROR "inlining call to func_with_cost_88" + for j := 0; j < 100; j++ { + func_with_cost_88() // ERROR "inlining call to func_with_cost_88" + } + } + + func_with_cost_88() + func_with_cost_88() + + for i := 0; i < 100; i++ { + for j := 0; j < 100; j++ { + func_with_cost_88() // ERROR "inlining call to func_with_cost_88" + } + } + + for i := 0; i < 100; i++ { + for j := 0; j < 100; j++ { + func_with_cost_88() + func_with_cost_88() + func_with_cost_88() + } + } + + func_with_cost_88() +} diff --git a/test/inline_for.go b/test/inline_for.go new file mode 100644 index 00000000000000..05abde05846171 --- /dev/null +++ b/test/inline_for.go @@ -0,0 +1,39 @@ +// errorcheck -0 -m=2 + +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Test, using compiler diagnostic flags, that inlining is working. +// Compiles but does not run. 
+ +package foo + +import "runtime" + +func func_with() int { // ERROR "can inline func_with .*" + return 10 +} + +func func_with_cost_88() { // ERROR "can inline only into small FORs .*" + x := 200 + for i := 0; i < x; i++ { // ERROR "add for to stack \[\{25 25 0\}\]" + if i%2 == 0 { + runtime.GC() + } else { + i += 2 + x += 1 + } + } +} + +func func_with_fors() { // ERROR "cannot inline .*" + for { // ERROR "add for to stack \[\{6 6 2\}\]" + for { // ERROR "add for to stack \[\{5 6 2\} \{2 2 1\}\]" + func_with_cost_88() // ERROR "inlining call to func_with_cost_88" "fixup inline \[\{36 6 2\} \{33 2 1\}\]" "add for to stack \[\{29 6 2\} \{26 2 1\} \{25 25 0\}\]" + } + for { // ERROR "add for to stack" + func_with() // ERROR "inlining call to func_with" "fixup inline \[\{10 6 2\} \{10 2 1\}\]" + } + } +}