From d68cada7db07688959a08a52022c853ded6fa1a4 Mon Sep 17 00:00:00 2001
From: Nikita Melekhin <nimelehin@gmail.com>
Date: Fri, 3 Sep 2021 18:28:14 +0300
Subject: [PATCH] cmd/compile: boost inlining into FORs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As Than McIntosh has already mentioned, it's a common practice to boost
inlining into FORs, since such a callsite could be "hotter". This patch
implements this functionality.

The implementation uses a stack of FORs to recognise calls which are
in a loop. The stack is maintained while the inlnode function runs
and contains information about the ancestor FORs of the current
node in inlnode.

A FOR whose cost is >= inlineBigForCost (105) is considered "big". No
boost is applied inside such FORs.

Updates #17566

The following are the results on GO1; the binary size has not increased
significantly: 10454800 -> 10475120, which is less than 0.3%.

goos: linux
goarch: amd64
pkg: test/bench/go1
cpu: Intel(R) Xeon(R) Gold 6230N CPU @ 2.30GHz
name                     old time/op    new time/op    delta
BinaryTree17-8              2.15s ± 1%     2.17s ± 1%     ~     (p=0.065 n=6+6)
Fannkuch11-8                2.70s ± 0%     2.69s ± 0%   -0.25%  (p=0.010 n=6+4)
FmtFprintfEmpty-8          31.9ns ± 0%    31.4ns ± 0%   -1.61%  (p=0.008 n=5+5)
FmtFprintfString-8         57.0ns ± 0%    57.1ns ± 0%   +0.26%  (p=0.013 n=6+5)
FmtFprintfInt-8            65.2ns ± 0%    63.9ns ± 0%   -1.95%  (p=0.008 n=5+5)
FmtFprintfIntInt-8          103ns ± 0%     102ns ± 0%   -1.01%  (p=0.000 n=5+4)
FmtFprintfPrefixedInt-8     119ns ± 0%     118ns ± 0%   -0.50%  (p=0.008 n=5+5)
FmtFprintfFloat-8           169ns ± 0%     174ns ± 0%   +2.75%  (p=0.008 n=5+5)
FmtManyArgs-8               445ns ± 0%     447ns ± 0%   +0.46%  (p=0.002 n=6+6)
GobDecode-8                4.37ms ± 1%    4.40ms ± 0%   +0.62%  (p=0.009 n=6+6)
GobEncode-8                3.07ms ± 0%    3.04ms ± 0%   -0.78%  (p=0.004 n=5+6)
Gzip-8                      195ms ± 0%     195ms ± 0%     ~     (p=0.429 n=5+6)
Gunzip-8                   28.2ms ± 0%    28.2ms ± 0%     ~     (p=0.662 n=5+6)
HTTPClientServer-8         45.0µs ± 1%    45.4µs ± 1%     ~     (p=0.093 n=6+6)
JSONEncode-8               8.01ms ± 0%    8.03ms ± 0%   +0.31%  (p=0.008 n=5+5)
JSONDecode-8               35.3ms ± 1%    35.1ms ± 0%   -0.72%  (p=0.008 n=5+5)
Mandelbrot200-8            4.50ms ± 0%    4.49ms ± 1%     ~     (p=0.937 n=6+6)
GoParse-8                  3.03ms ± 1%    3.00ms ± 1%     ~     (p=0.180 n=6+6)
RegexpMatchEasy0_32-8      55.4ns ± 0%    53.2ns ± 3%   -3.92%  (p=0.004 n=5+6)
RegexpMatchEasy0_1K-8       178ns ± 0%     175ns ± 1%   -1.57%  (p=0.004 n=5+6)
RegexpMatchEasy1_32-8      50.1ns ± 0%    48.3ns ± 5%     ~     (p=0.082 n=5+6)
RegexpMatchEasy1_1K-8       271ns ± 1%     262ns ± 1%   -3.26%  (p=0.004 n=6+5)
RegexpMatchMedium_32-8      949ns ± 0%     886ns ± 7%     ~     (p=0.329 n=5+6)
RegexpMatchMedium_1K-8     27.1µs ± 7%    28.1µs ± 6%     ~     (p=0.394 n=6+6)
RegexpMatchHard_32-8       1.28µs ± 2%    1.29µs ± 0%     ~     (p=0.056 n=6+6)
RegexpMatchHard_1K-8       38.5µs ± 0%    38.4µs ± 0%   -0.25%  (p=0.009 n=6+5)
Revcomp-8                   397ms ± 0%     396ms ± 0%     ~     (p=0.429 n=6+5)
Template-8                 48.1ms ± 1%    48.1ms ± 0%     ~     (p=0.222 n=5+5)
TimeParse-8                 213ns ± 0%     213ns ± 0%     ~     (p=0.210 n=4+6)
TimeFormat-8                295ns ± 1%     259ns ± 0%  -12.22%  (p=0.002 n=6+6)
[Geo mean]                 40.5µs         40.1µs        -1.00%

name                     old speed      new speed      delta
GobDecode-8               176MB/s ± 1%   174MB/s ± 0%   -0.61%  (p=0.009 n=6+6)
GobEncode-8               250MB/s ± 0%   252MB/s ± 0%   +0.79%  (p=0.004 n=5+6)
Gzip-8                    100MB/s ± 0%   100MB/s ± 0%     ~     (p=0.351 n=5+6)
Gunzip-8                  687MB/s ± 0%   687MB/s ± 0%     ~     (p=0.662 n=5+6)
JSONEncode-8              242MB/s ± 0%   242MB/s ± 0%   -0.31%  (p=0.008 n=5+5)
JSONDecode-8             54.9MB/s ± 1%  55.3MB/s ± 0%   +0.71%  (p=0.008 n=5+5)
GoParse-8                19.1MB/s ± 1%  19.3MB/s ± 1%     ~     (p=0.143 n=6+6)
RegexpMatchEasy0_32-8     578MB/s ± 0%   601MB/s ± 3%   +4.10%  (p=0.004 n=5+6)
RegexpMatchEasy0_1K-8    5.74GB/s ± 1%  5.85GB/s ± 1%   +1.90%  (p=0.002 n=6+6)
RegexpMatchEasy1_32-8     639MB/s ± 0%   663MB/s ± 4%     ~     (p=0.082 n=5+6)
RegexpMatchEasy1_1K-8    3.78GB/s ± 1%  3.91GB/s ± 1%   +3.38%  (p=0.004 n=6+5)
RegexpMatchMedium_32-8   33.7MB/s ± 0%  36.2MB/s ± 7%     ~     (p=0.268 n=5+6)
RegexpMatchMedium_1K-8   37.9MB/s ± 6%  36.5MB/s ± 6%     ~     (p=0.411 n=6+6)
RegexpMatchHard_32-8     24.9MB/s ± 2%  24.8MB/s ± 0%     ~     (p=0.063 n=6+6)
RegexpMatchHard_1K-8     26.6MB/s ± 0%  26.7MB/s ± 0%   +0.25%  (p=0.009 n=6+5)
Revcomp-8                 640MB/s ± 0%   641MB/s ± 0%     ~     (p=0.429 n=6+5)
Template-8               40.4MB/s ± 1%  40.3MB/s ± 0%     ~     (p=0.222 n=5+5)
[Geo mean]                175MB/s        177MB/s        +1.05%
---
 src/cmd/compile/internal/inline/inl.go | 207 +++++++++++++++++++------
 test/inline.go                         |  54 +++++++
 test/inline_for.go                     |  39 +++++
 3 files changed, 255 insertions(+), 45 deletions(-)
 create mode 100644 test/inline_for.go

diff --git a/src/cmd/compile/internal/inline/inl.go b/src/cmd/compile/internal/inline/inl.go
index a2268a5465e02..c4b07b1ddcb66 100644
--- a/src/cmd/compile/internal/inline/inl.go
+++ b/src/cmd/compile/internal/inline/inl.go
@@ -45,14 +45,107 @@ const (
 	inlineMaxBudget       = 80
 	inlineExtraAppendCost = 0
 	// default is to inline if there's at most one call. -l=4 overrides this by using 1 instead.
-	inlineExtraCallCost  = 57              // 57 was benchmarked to provided most benefit with no bad surprises; see https://github.com/golang/go/issues/19348#issuecomment-439370742
+	inlineExtraCallCost  = 57              // 57 was benchmarked to provide most benefit with no bad surprises; see https://github.com/golang/go/issues/19348#issuecomment-439370742
 	inlineExtraPanicCost = 1               // do not penalize inlining panics.
 	inlineExtraThrowCost = inlineMaxBudget // with current (2018-05/1.11) code, inlining runtime.throw does not help.
 
 	inlineBigFunctionNodes   = 5000 // Functions with this many nodes are considered "big".
 	inlineBigFunctionMaxCost = 20   // Max cost of inlinee when inlining into a "big" function.
+
+	// These values were benchmarked to provide most benefit with no bad surprises.
+	inlineBigForCost           = 105 // FORs with at least this cost are considered "big".
+	inlineIntoForExtraCallCost = 14
+	inlineIntoForExtraBudget   = 18 // Extra budget when inlining into FORs which are not "big".
+
+	// The upper budget for a visitor. It accounts for the maximum cost with which a function could be inlined.
+	inlineVisitorBudget = inlineMaxBudget + inlineIntoForExtraBudget
 )
 
+// isInlinable checks if the function can be inlined in a 'typical' scenario
+// when no boosts are applied.
+func isInlinable(fn *ir.Func) bool {
+	return fn != nil && fn.Inl != nil && fn.Inl.Cost <= inlineMaxBudget
+}
+
+type forContext struct {
+	cost int32 // Helps to determine if FOR is a "big" one.
+}
+
+type inlContext struct {
+	// Map to keep track of functions that have been inlined at a particular
+	// call site, in order to stop inlining when we reach the beginning of a
+	// recursion cycle again. We don't inline immediately recursive functions,
+	// but allow inlining if there is a recursion cycle of many functions.
+	// Most likely, the inlining will stop before we even hit the beginning of
+	// the cycle again, but the map catches the unusual case.
+	inlinedCallees map[*ir.Func]bool
+
+	// Stack to recognise which call nodes are located inside fors, while doing inlnode.
+	forsStack           []forContext
+	initialInlineBudget int32 // Initial inline budget. Boosts are calculated related to this.
+}
+
+// canBoostInliningIntoFor reports whether the current node is inside at least one FOR and none of the enclosing FORs is "big".
+func (ctx inlContext) canBoostInliningIntoFor() bool {
+	for i := 0; i < len(ctx.forsStack); i++ {
+		if ctx.forsStack[i].cost >= inlineBigForCost {
+			return false
+		}
+	}
+	return len(ctx.forsStack) > 0
+}
+
+func (ctx *inlContext) Init(fn *ir.Func) {
+	ctx.inlinedCallees = make(map[*ir.Func]bool)
+
+	if isBigFunc(fn) {
+		ctx.initialInlineBudget = inlineBigFunctionMaxCost
+	} else {
+		ctx.initialInlineBudget = inlineMaxBudget
+	}
+}
+
+func (ctx *inlContext) PushFor(n ir.Node) {
+	ctx.forsStack = append(ctx.forsStack, forContext{forCost(n)})
+
+	if base.Flag.LowerM > 1 {
+		fmt.Printf("%v: add FOR to stack %v\n", ir.Line(n), ctx.forsStack)
+	}
+}
+
+func (ctx *inlContext) PopFor() {
+	ctx.forsStack = ctx.forsStack[:len(ctx.forsStack)-1]
+}
+
+func (ctx inlContext) InlineBudget() int32 {
+	finalBudget := ctx.initialInlineBudget
+	if ctx.canBoostInliningIntoFor() && ctx.initialInlineBudget == inlineMaxBudget {
+		// Boosts only regular functions
+		finalBudget += inlineIntoForExtraBudget
+	}
+
+	return finalBudget
+}
+
+// forCost calculates the cost of FORs. It is used to determine if functions
+// will be boosted to inline into the FOR.
+// We don't want to boost inlining into "big" FORs to keep their body
+// in the instruction cache.
+func forCost(n ir.Node) int32 {
+	exceededCostReason := func(remainingBudget int32) string {
+		return fmt.Sprintf("FOR is big: cost %d exceeds maximum cost %d", inlineBigForCost-remainingBudget, inlineBigForCost)
+	}
+
+	visitor := hairyVisitor{
+		budget:                     inlineBigForCost,
+		extraCallCost:              inlineIntoForExtraCallCost,
+		onlyCost:                   true,
+		exceededCostReasonCallback: exceededCostReason,
+	}
+	visitor.tooHairy(n)
+	return inlineBigForCost - visitor.budget
+}
+
 // InlinePackage finds functions that can be inlined and clones them before walk expands them.
 func InlinePackage() {
 	ir.VisitFuncsBottomUp(typecheck.Target.Decls, func(list []*ir.Func, recursive bool) {
@@ -166,9 +259,15 @@ func CanInline(fn *ir.Func) {
 	// locals, and we use this map to produce a pruned Inline.Dcl
 	// list. See issue 25249 for more context.
 
+	exceededCostReason := func(remainingBudget int32) string {
+		return fmt.Sprintf("function too complex: cost %d exceeds budget %d", inlineVisitorBudget-remainingBudget, inlineVisitorBudget)
+	}
+
 	visitor := hairyVisitor{
-		budget:        inlineMaxBudget,
-		extraCallCost: cc,
+		budget:                     inlineVisitorBudget,
+		extraCallCost:              cc,
+		onlyCost:                   false,
+		exceededCostReasonCallback: exceededCostReason,
 	}
 	if visitor.tooHairy(fn) {
 		reason = visitor.reason
@@ -176,7 +275,7 @@ func CanInline(fn *ir.Func) {
 	}
 
 	n.Func.Inl = &ir.Inline{
-		Cost: inlineMaxBudget - visitor.budget,
+		Cost: inlineVisitorBudget - visitor.budget,
 		Dcl:  pruneUnusedAutos(n.Defn.(*ir.Func).Dcl, &visitor),
 		Body: inlcopylist(fn.Body),
 
@@ -184,12 +283,16 @@ func CanInline(fn *ir.Func) {
 	}
 
 	if base.Flag.LowerM > 1 {
-		fmt.Printf("%v: can inline %v with cost %d as: %v { %v }\n", ir.Line(fn), n, inlineMaxBudget-visitor.budget, fn.Type(), ir.Nodes(n.Func.Inl.Body))
-	} else if base.Flag.LowerM != 0 {
+		if isInlinable(n.Func) {
+			fmt.Printf("%v: can inline %v with cost %d as: %v { %v }\n", ir.Line(fn), n, n.Func.Inl.Cost, fn.Type(), ir.Nodes(n.Func.Inl.Body))
+		} else {
+			fmt.Printf("%v: can inline only into small FORs %v with cost %d as: %v { %v }\n", ir.Line(fn), n, n.Func.Inl.Cost, fn.Type(), ir.Nodes(n.Func.Inl.Body))
+		}
+	} else if base.Flag.LowerM != 0 && isInlinable(n.Func) {
 		fmt.Printf("%v: can inline %v\n", ir.Line(fn), n)
 	}
 	if logopt.Enabled() {
-		logopt.LogOpt(fn.Pos(), "canInlineFunction", "inline", ir.FuncName(fn), fmt.Sprintf("cost: %d", inlineMaxBudget-visitor.budget))
+		logopt.LogOpt(fn.Pos(), "canInlineFunction", "inline", ir.FuncName(fn), fmt.Sprintf("cost: %d", n.Func.Inl.Cost))
 	}
 }
 
@@ -228,20 +331,22 @@ func canDelayResults(fn *ir.Func) bool {
 // hairyVisitor visits a function body to determine its inlining
 // hairiness and whether or not it can be inlined.
 type hairyVisitor struct {
-	budget        int32
-	reason        string
-	extraCallCost int32
-	usedLocals    ir.NameSet
-	do            func(ir.Node) bool
+	budget                     int32
+	extraCallCost              int32
+	onlyCost                   bool // If set, tooHairy does NOT check inlinible nodes, only cost.
+	reason                     string
+	usedLocals                 ir.NameSet
+	do                         func(ir.Node) bool
+	exceededCostReasonCallback func(remainingBudget int32) string
 }
 
-func (v *hairyVisitor) tooHairy(fn *ir.Func) bool {
+func (v *hairyVisitor) tooHairy(n ir.Node) bool {
 	v.do = v.doNode // cache closure
-	if ir.DoChildren(fn, v.do) {
+	if ir.DoChildren(n, v.do) {
 		return true
 	}
 	if v.budget < 0 {
-		v.reason = fmt.Sprintf("function too complex: cost %d exceeds budget %d", inlineMaxBudget-v.budget, inlineMaxBudget)
+		v.reason = v.exceededCostReasonCallback(v.budget)
 		return true
 	}
 	return false
@@ -264,8 +369,12 @@ func (v *hairyVisitor) doNode(n ir.Node) bool {
 			if name.Class == ir.PFUNC && types.IsRuntimePkg(name.Sym().Pkg) {
 				fn := name.Sym().Name
 				if fn == "getcallerpc" || fn == "getcallersp" {
-					v.reason = "call to " + fn
-					return true
+					if !v.onlyCost {
+						v.reason = "call to " + fn
+						return true
+					} else {
+						break
+					}
 				}
 				if fn == "throw" {
 					v.budget -= inlineExtraThrowCost
@@ -309,7 +418,7 @@ func (v *hairyVisitor) doNode(n ir.Node) bool {
 			break
 		}
 
-		if fn := inlCallee(n.X); fn != nil && fn.Inl != nil {
+		if fn := inlCallee(n.X); isInlinable(fn) {
 			v.budget -= fn.Inl.Cost
 			break
 		}
@@ -338,13 +447,19 @@ func (v *hairyVisitor) doNode(n ir.Node) bool {
 	case ir.ORECOVER:
 		// recover matches the argument frame pointer to find
 		// the right panic value, so it needs an argument frame.
-		v.reason = "call to recover"
-		return true
+		if !v.onlyCost {
+			v.reason = "call to recover"
+			return true
+		}
 
 	case ir.OCLOSURE:
 		if base.Debug.InlFuncsWithClosures == 0 {
-			v.reason = "not inlining functions with closures"
-			return true
+			if !v.onlyCost {
+				v.reason = "not inlining functions with closures"
+				return true
+			} else {
+				break
+			}
 		}
 
 		// TODO(danscales): Maybe make budget proportional to number of closure
@@ -355,7 +470,9 @@ func (v *hairyVisitor) doNode(n ir.Node) bool {
 		// do) to check for disallowed ops in the body and include the
 		// body in the budget.
 		if doList(n.(*ir.ClosureExpr).Func.Body, v.do) {
-			return true
+			if !v.onlyCost {
+				return true
+			}
 		}
 
 	case ir.ORANGE,
@@ -364,8 +481,10 @@ func (v *hairyVisitor) doNode(n ir.Node) bool {
 		ir.ODEFER,
 		ir.ODCLTYPE, // can't print yet
 		ir.OTAILCALL:
-		v.reason = "unhandled op " + n.Op().String()
-		return true
+		if !v.onlyCost {
+			v.reason = "unhandled op " + n.Op().String()
+			return true
+		}
 
 	case ir.OAPPEND:
 		v.budget -= inlineExtraAppendCost
@@ -493,20 +612,13 @@ func inlcopy(n ir.Node) ir.Node {
 func InlineCalls(fn *ir.Func) {
 	savefn := ir.CurFunc
 	ir.CurFunc = fn
-	maxCost := int32(inlineMaxBudget)
-	if isBigFunc(fn) {
-		maxCost = inlineBigFunctionMaxCost
-	}
-	// Map to keep track of functions that have been inlined at a particular
-	// call site, in order to stop inlining when we reach the beginning of a
-	// recursion cycle again. We don't inline immediately recursive functions,
-	// but allow inlining if there is a recursion cycle of many functions.
-	// Most likely, the inlining will stop before we even hit the beginning of
-	// the cycle again, but the map catches the unusual case.
-	inlMap := make(map[*ir.Func]bool)
+
+	var inlCtx inlContext
+	inlCtx.Init(fn)
+
 	var edit func(ir.Node) ir.Node
 	edit = func(n ir.Node) ir.Node {
-		return inlnode(n, maxCost, inlMap, edit)
+		return inlnode(n, &inlCtx, edit)
 	}
 	ir.EditChildren(fn, edit)
 	ir.CurFunc = savefn
@@ -525,11 +637,16 @@ func InlineCalls(fn *ir.Func) {
 // shorter and less complicated.
 // The result of inlnode MUST be assigned back to n, e.g.
 // 	n.Left = inlnode(n.Left)
-func inlnode(n ir.Node, maxCost int32, inlMap map[*ir.Func]bool, edit func(ir.Node) ir.Node) ir.Node {
+func inlnode(n ir.Node, ctx *inlContext, edit func(ir.Node) ir.Node) ir.Node {
 	if n == nil {
 		return n
 	}
 
+	if n.Op() == ir.OFOR {
+		ctx.PushFor(n)
+		defer ctx.PopFor()
+	}
+
 	switch n.Op() {
 	case ir.ODEFER, ir.OGO:
 		n := n.(*ir.GoDeferStmt)
@@ -587,7 +704,7 @@ func inlnode(n ir.Node, maxCost int32, inlMap map[*ir.Func]bool, edit func(ir.No
 			break
 		}
 		if fn := inlCallee(call.X); fn != nil && fn.Inl != nil {
-			n = mkinlcall(call, fn, maxCost, inlMap, edit)
+			n = mkinlcall(call, fn, ctx, edit)
 		}
 	}
 
@@ -660,7 +777,7 @@ var NewInline = func(call *ir.CallExpr, fn *ir.Func, inlIndex int) *ir.InlinedCa
 // parameters.
 // The result of mkinlcall MUST be assigned back to n, e.g.
 // 	n.Left = mkinlcall(n.Left, fn, isddd)
-func mkinlcall(n *ir.CallExpr, fn *ir.Func, maxCost int32, inlMap map[*ir.Func]bool, edit func(ir.Node) ir.Node) ir.Node {
+func mkinlcall(n *ir.CallExpr, fn *ir.Func, ctx *inlContext, edit func(ir.Node) ir.Node) ir.Node {
 	if fn.Inl == nil {
 		if logopt.Enabled() {
 			logopt.LogOpt(n.Pos(), "cannotInlineCall", "inline", ir.FuncName(ir.CurFunc),
@@ -668,12 +785,12 @@ func mkinlcall(n *ir.CallExpr, fn *ir.Func, maxCost int32, inlMap map[*ir.Func]b
 		}
 		return n
 	}
-	if fn.Inl.Cost > maxCost {
+	if fn.Inl.Cost > ctx.InlineBudget() {
 		// The inlined function body is too big. Typically we use this check to restrict
 		// inlining into very big functions.  See issue 26546 and 17566.
 		if logopt.Enabled() {
 			logopt.LogOpt(n.Pos(), "cannotInlineCall", "inline", ir.FuncName(ir.CurFunc),
-				fmt.Sprintf("cost %d of %s exceeds max large caller cost %d", fn.Inl.Cost, ir.PkgFuncName(fn), maxCost))
+				fmt.Sprintf("cost %d of %s exceeds max large caller cost %d", fn.Inl.Cost, ir.PkgFuncName(fn), ctx.InlineBudget()))
 		}
 		return n
 	}
@@ -696,15 +813,15 @@ func mkinlcall(n *ir.CallExpr, fn *ir.Func, maxCost int32, inlMap map[*ir.Func]b
 		return n
 	}
 
-	if inlMap[fn] {
+	if ctx.inlinedCallees[fn] {
 		if base.Flag.LowerM > 1 {
 			fmt.Printf("%v: cannot inline %v into %v: repeated recursive cycle\n", ir.Line(n), fn, ir.FuncName(ir.CurFunc))
 		}
 		return n
 	}
-	inlMap[fn] = true
+	ctx.inlinedCallees[fn] = true
 	defer func() {
-		inlMap[fn] = false
+		ctx.inlinedCallees[fn] = false
 	}()
 
 	typecheck.FixVariadicCall(n)
diff --git a/test/inline.go b/test/inline.go
index 599d5233e0f3f..51a9011c0ca13 100644
--- a/test/inline.go
+++ b/test/inline.go
@@ -290,3 +290,57 @@ func conv2(v uint64) uint64 { // ERROR "can inline conv2"
 func conv1(v uint64) uint64 { // ERROR "can inline conv1"
 	return uint64(uint64(uint64(uint64(uint64(uint64(uint64(uint64(uint64(uint64(uint64(v)))))))))))
 }
+
+// Inline into FORs
+func func_with_cost_88() {
+	x := 200
+	for i := 0; i < x; i++ {
+		if i%2 == 0 {
+			runtime.GC()
+		} else {
+			i += 2
+			x += 1
+		}
+	}
+}
+
+func func_with_fors() {
+	func_with_cost_88()
+
+	for i := 0; i < 100; i++ {
+		func_with_cost_88() // ERROR "inlining call to func_with_cost_88"
+	}
+
+	func_with_cost_88()
+	func_with_cost_88()
+
+	for i := 0; i < 100; i++ {
+		for j := 0; j < 100; j++ {
+			func_with_cost_88() // ERROR "inlining call to func_with_cost_88"
+		}
+	}
+
+	for i := 0; i < 100; i++ {
+		for j := 0; j < 100; j++ {
+			func_with_cost_88() // ERROR "inlining call to func_with_cost_88"
+			func_with_cost_88() // ERROR "inlining call to func_with_cost_88"
+			func_with_cost_88() // ERROR "inlining call to func_with_cost_88"
+		}
+	}
+
+	for i := 0; i < 100; i++ {
+		for j := 0; j < 100; j++ {
+			// Calls can't be inlined, since the outer FOR is a "big" one.
+			func_with_cost_88()
+			func_with_cost_88()
+			func_with_cost_88()
+			func_with_cost_88()
+			for j := 0; j < 100; j++ {
+				func_with_cost_88()
+				func_with_cost_88()
+			}
+		}
+	}
+
+	func_with_cost_88()
+}
diff --git a/test/inline_for.go b/test/inline_for.go
new file mode 100644
index 0000000000000..67778feaf2af0
--- /dev/null
+++ b/test/inline_for.go
@@ -0,0 +1,39 @@
+// errorcheck -0 -m=2
+
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Test, using compiler diagnostic flags, that inlining is working.
+// Compiles but does not run.
+
+package foo
+
+import "runtime"
+
+func func_with() int { // ERROR "can inline func_with .*"
+	return 10
+}
+
+func func_with_cost_88() { // ERROR "can inline only into small FORs .*"
+	x := 200
+	for i := 0; i < x; i++ { // ERROR "add FOR to stack \[\{39\}\]"
+		if i%2 == 0 {
+			runtime.GC()
+		} else {
+			i += 2
+			x += 1
+		}
+	}
+}
+
+func func_with_fors() { // ERROR "can inline .*"
+	for { // ERROR "add FOR to stack \[\{22\}\]"
+		for { // ERROR "add FOR to stack \[\{22\} \{16\}\]"
+			func_with_cost_88() // ERROR "inlining call to func_with_cost_88" "add FOR to stack \[\{22\} \{16\} \{39\}\]"
+		}
+		for { // ERROR "add FOR to stack"
+			func_with() // ERROR "inlining call to func_with"
+		}
+	}
+}