diff --git a/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h b/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h index 50f9aae73dc53..0629a4789e59c 100644 --- a/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h +++ b/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h @@ -217,6 +217,9 @@ class InstCostVisitor : public InstVisitor { Cost estimateSwitchInst(SwitchInst &I); Cost estimateBranchInst(BranchInst &I); + void discoverTransitivelyIncomingValues(DenseSet &PhiNodes, + PHINode *PN, unsigned Depth); + Constant *visitInstruction(Instruction &I) { return nullptr; } Constant *visitPHINode(PHINode &I); Constant *visitFreezeInst(FreezeInst &I); diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp index b75ca7761a60b..2f8632e760a9b 100644 --- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp +++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp @@ -39,10 +39,15 @@ static cl::opt MaxClones( "The maximum number of clones allowed for a single function " "specialization")); +static cl::opt MaxDiscoveryDepth( + "funcspec-max-discovery-depth", cl::init(10), cl::Hidden, + cl::desc("The maximum recursion depth allowed when searching for strongly " + "connected phis")); + static cl::opt MaxIncomingPhiValues( - "funcspec-max-incoming-phi-values", cl::init(4), cl::Hidden, cl::desc( - "The maximum number of incoming values a PHI node can have to be " - "considered during the specialization bonus estimation")); + "funcspec-max-incoming-phi-values", cl::init(8), cl::Hidden, + cl::desc("The maximum number of incoming values a PHI node can have to be " + "considered during the specialization bonus estimation")); static cl::opt MaxBlockPredecessors( "funcspec-max-block-predecessors", cl::init(2), cl::Hidden, cl::desc( @@ -64,9 +69,9 @@ static cl::opt MinCodeSizeSavings( "much percent of the original function size")); static cl::opt MinLatencySavings( - "funcspec-min-latency-savings", 
cl::init(70), cl::Hidden, cl::desc( - "Reject specializations whose latency savings are less than this" - "much percent of the original function size")); + "funcspec-min-latency-savings", cl::init(40), cl::Hidden, + cl::desc("Reject specializations whose latency savings are less than this" + "much percent of the original function size")); static cl::opt MinInliningBonus( "funcspec-min-inlining-bonus", cl::init(300), cl::Hidden, cl::desc( @@ -262,30 +267,170 @@ Cost InstCostVisitor::estimateBranchInst(BranchInst &I) { return estimateBasicBlocks(WorkList); } +// This function finds candidates where a PHINode is part of a chain or graph +// of PHINodes that all link to each other. That means, if the original input to +// the chain is a constant, all the other values are also that constant. +// +// The caller of this function will later check that no other nodes are involved +// that are non-constant, and discard it from the possible conversions. +// +// For example: +// +// %a = load %0 +// %c = phi [%a, %d] +// %d = phi [%e, %c] +// %e = phi [%c, %f] +// %f = phi [%j, %h] +// %j = phi [%h, %j] +// %h = phi [%g, %c] +// +// This is only showing the PHINodes, not the branches that choose the +// different paths. +// +// A depth limit is used to avoid extreme recursion. +// A max number of incoming phi values ensures that expensive searches +// are avoided. +void InstCostVisitor::discoverTransitivelyIncomingValues( + DenseSet &PHINodes, PHINode *PN, unsigned Depth) { + if (Depth > MaxDiscoveryDepth) { + LLVM_DEBUG(dbgs() << "FnSpecialization: Discover PHI nodes too deep (" + << Depth << ">" << MaxDiscoveryDepth << ")\n"); + return; + } + + if (PN->getNumIncomingValues() > MaxIncomingPhiValues) { + LLVM_DEBUG( + dbgs() << "FnSpecialization: Discover PHI nodes has too many values (" + << PN->getNumIncomingValues() << ">" << MaxIncomingPhiValues + << ")\n"); + return; + } + + // Already seen this, no more processing needed. 
+ if (!PHINodes.insert(PN).second) + return; + + for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) { + Value *V = PN->getIncomingValue(I); + if (auto *Phi = dyn_cast(V)) { + if (Phi == PN || DeadBlocks.contains(PN->getIncomingBlock(I))) + continue; + discoverTransitivelyIncomingValues(PHINodes, Phi, Depth + 1); + } + } +} + Constant *InstCostVisitor::visitPHINode(PHINode &I) { if (I.getNumIncomingValues() > MaxIncomingPhiValues) return nullptr; + // PHI nodes + DenseSet TransitivePHIs; + bool Inserted = VisitedPHIs.insert(&I).second; - Constant *Const = nullptr; + SmallVector UnknownIncomingValues; + + auto canConstantFoldPhiTrivially = [&](PHINode *PN) -> Constant * { + Constant *Const = nullptr; + + UnknownIncomingValues.clear(); + for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) { + Value *V = PN->getIncomingValue(I); - for (unsigned Idx = 0, E = I.getNumIncomingValues(); Idx != E; ++Idx) { - Value *V = I.getIncomingValue(Idx); - if (auto *Inst = dyn_cast(V)) - if (Inst == &I || DeadBlocks.contains(I.getIncomingBlock(Idx))) + // Disregard self-references and dead incoming values. + if (auto *Inst = dyn_cast(V)) + if (Inst == PN || DeadBlocks.contains(PN->getIncomingBlock(I))) + continue; + + if (Constant *C = findConstantFor(V, KnownConstants)) { + if (!Const) + Const = C; + // Not all incoming values are the same constant. Bail immediately. + if (C != Const) + return nullptr; continue; - Constant *C = findConstantFor(V, KnownConstants); - if (!C) { - if (Inserted) - PendingPHIs.push_back(&I); + } + if (auto *Phi = dyn_cast(V)) { + UnknownIncomingValues.push_back(Phi); + continue; + } + + // We can't reason about anything else. return nullptr; } - if (!Const) - Const = C; - else if (C != Const) + return UnknownIncomingValues.empty() ? Const : nullptr; + }; + + if (Constant *Const = canConstantFoldPhiTrivially(&I)) + return Const; + + if (Inserted) { + // First time we are seeing this phi. 
We'll retry later, after all + // the constant arguments have been propagated. Bail for now. + PendingPHIs.push_back(&I); + return nullptr; + } + + // Try to see if we can collect a nest of transitive phis. + for (PHINode *Phi : UnknownIncomingValues) + discoverTransitivelyIncomingValues(TransitivePHIs, Phi, 1); + + // A nested set of PHINodes can be constantfolded if: + // - It has a constant input. + // - It is always the SAME constant. + // - All the nodes are part of the nest, or a constant. + // Later we will check that the constant is always the same one. + auto canConstantFoldNestedPhi = [&](PHINode *PN, Constant *&Const) -> bool { + for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) { + Value *V = PN->getIncomingValue(I); + // Disregard self-references and dead incoming values. + if (auto *Inst = dyn_cast(V)) + if (Inst == PN || DeadBlocks.contains(PN->getIncomingBlock(I))) + continue; + + if (Constant *C = findConstantFor(V, KnownConstants)) { + if (!Const) + Const = C; + + // Not all incoming values are the same constant. Bail immediately. + if (C != Const) + return false; + continue; + } + if (auto *Phi = dyn_cast(V)) { + // It's not a Transitive phi. Bail out. + if (!TransitivePHIs.contains(Phi)) + return false; + continue; + } + + // We can't reason about anything else. + return false; + } + return true; + }; + + // All TransitivePHIs have to be the SAME constant. + Constant *Retval = nullptr; + for (PHINode *Phi : TransitivePHIs) { + Constant *Const = nullptr; + if (canConstantFoldNestedPhi(Phi, Const)) { + if (Const) { + if (!Retval) { + Retval = Const; + continue; + } + // Found more than one constant, can't fold. + if (Retval != Const) + return nullptr; + } + } + // Found something "wrong", can't fold. 
+ else return nullptr; } - return Const; + + return Retval; } Constant *InstCostVisitor::visitFreezeInst(FreezeInst &I) { @@ -809,20 +954,41 @@ bool FunctionSpecializer::findSpecializations(Function *F, unsigned FuncSize, auto IsProfitable = [](Bonus &B, unsigned Score, unsigned FuncSize, unsigned FuncGrowth) -> bool { // No check required. - if (ForceSpecialization) + if (ForceSpecialization) { + LLVM_DEBUG(dbgs() << "FnSpecialization: Force is on\n"); return true; + } // Minimum inlining bonus. - if (Score > MinInliningBonus * FuncSize / 100) + if (Score > MinInliningBonus * FuncSize / 100) { + LLVM_DEBUG(dbgs() + << "FnSpecialization: Sufficient inlining bonus (" << Score + << " > " << MinInliningBonus * FuncSize / 100 << ")\n"); return true; + } // Minimum codesize savings. - if (B.CodeSize < MinCodeSizeSavings * FuncSize / 100) + if (B.CodeSize < MinCodeSizeSavings * FuncSize / 100) { + LLVM_DEBUG(dbgs() + << "FnSpecialization: Insufficient CodeSize Savings (" + << B.CodeSize << " < " + << MinCodeSizeSavings * FuncSize / 100 << ")\n"); return false; + } // Minimum latency savings. - if (B.Latency < MinLatencySavings * FuncSize / 100) + if (B.Latency < MinLatencySavings * FuncSize / 100) { + LLVM_DEBUG(dbgs() + << "FnSpecialization: Insufficient Latency Savings (" + << B.Latency << " < " << MinLatencySavings * FuncSize / 100 + << ")\n"); return false; + } // Maximum codesize growth. 
- if (FuncGrowth / FuncSize > MaxCodeSizeGrowth) + if (FuncGrowth / FuncSize > MaxCodeSizeGrowth) { + LLVM_DEBUG(dbgs() + << "FnSpecialization: Function Growth exceeds threshold (" + << FuncGrowth / FuncSize << " > " << MaxCodeSizeGrowth + << ")\n"); return false; + } return true; }; diff --git a/llvm/test/Transforms/FunctionSpecialization/discover-nested-phis.ll b/llvm/test/Transforms/FunctionSpecialization/discover-nested-phis.ll new file mode 100644 index 0000000000000..3463ddb6f066d --- /dev/null +++ b/llvm/test/Transforms/FunctionSpecialization/discover-nested-phis.ll @@ -0,0 +1,87 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; +; RUN: opt -passes="ipsccp" -funcspec-min-function-size=20 -funcspec-for-literal-constant -S < %s | FileCheck %s --check-prefix=FUNCSPEC +; RUN: opt -passes="ipsccp" -funcspec-min-function-size=20 -funcspec-for-literal-constant -funcspec-max-discovery-depth=5 -S < %s | FileCheck %s --check-prefix=NOFUNCSPEC + +define i64 @bar(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i1 %c5, i1 %c6, i1 %c7, i1 %c8, i1 %c9, i1 %c10) { +; FUNCSPEC-LABEL: define i64 @bar( +; FUNCSPEC-SAME: i1 [[C1:%.*]], i1 [[C2:%.*]], i1 [[C3:%.*]], i1 [[C4:%.*]], i1 [[C5:%.*]], i1 [[C6:%.*]], i1 [[C7:%.*]], i1 [[C8:%.*]], i1 [[C9:%.*]], i1 [[C10:%.*]]) { +; FUNCSPEC-NEXT: entry: +; FUNCSPEC-NEXT: [[F1:%.*]] = call i64 @foo.specialized.1(i64 3, i1 [[C1]], i1 [[C2]], i1 [[C3]], i1 [[C4]], i1 [[C5]], i1 [[C6]], i1 [[C7]], i1 [[C8]], i1 [[C9]], i1 [[C10]]), !range [[RNG0:![0-9]+]] +; FUNCSPEC-NEXT: [[F2:%.*]] = call i64 @foo.specialized.2(i64 4, i1 [[C1]], i1 [[C2]], i1 [[C3]], i1 [[C4]], i1 [[C5]], i1 [[C6]], i1 [[C7]], i1 [[C8]], i1 [[C9]], i1 [[C10]]), !range [[RNG1:![0-9]+]] +; FUNCSPEC-NEXT: [[ADD:%.*]] = add nuw nsw i64 [[F1]], [[F2]] +; FUNCSPEC-NEXT: ret i64 [[ADD]] +; +; NOFUNCSPEC-LABEL: define i64 @bar( +; NOFUNCSPEC-SAME: i1 [[C1:%.*]], i1 [[C2:%.*]], i1 [[C3:%.*]], i1 [[C4:%.*]], i1 [[C5:%.*]], i1 [[C6:%.*]], i1 [[C7:%.*]], i1 
[[C8:%.*]], i1 [[C9:%.*]], i1 [[C10:%.*]]) { +; NOFUNCSPEC-NEXT: entry: +; NOFUNCSPEC-NEXT: [[F1:%.*]] = call i64 @foo(i64 3, i1 [[C1]], i1 [[C2]], i1 [[C3]], i1 [[C4]], i1 [[C5]], i1 [[C6]], i1 [[C7]], i1 [[C8]], i1 [[C9]], i1 [[C10]]), !range [[RNG0:![0-9]+]] +; NOFUNCSPEC-NEXT: [[F2:%.*]] = call i64 @foo(i64 4, i1 [[C1]], i1 [[C2]], i1 [[C3]], i1 [[C4]], i1 [[C5]], i1 [[C6]], i1 [[C7]], i1 [[C8]], i1 [[C9]], i1 [[C10]]), !range [[RNG0]] +; NOFUNCSPEC-NEXT: [[ADD:%.*]] = add nuw nsw i64 [[F1]], [[F2]] +; NOFUNCSPEC-NEXT: ret i64 [[ADD]] +; +entry: + %f1 = call i64 @foo(i64 3, i1 %c1, i1 %c2, i1 %c3, i1 %c4, i1 %c5, i1 %c6, i1 %c7, i1 %c8, i1 %c9, i1 %c10) + %f2 = call i64 @foo(i64 4, i1 %c1, i1 %c2, i1 %c3, i1 %c4, i1 %c5, i1 %c6, i1 %c7, i1 %c8, i1 %c9, i1 %c10) + %add = add i64 %f1, %f2 + ret i64 %add +} + +define internal i64 @foo(i64 %n, i1 %c1, i1 %c2, i1 %c3, i1 %c4, i1 %c5, i1 %c6, i1 %c7, i1 %c8, i1 %c9, i1 %c10) { +entry: + br i1 %c1, label %l1, label %l9 + +l1: + %phi1 = phi i64 [ %n, %entry ], [ %phi2, %l2 ] + %add = add i64 %phi1, 1 + %div = sdiv i64 %add, 2 + br i1 %c2, label %l1_5, label %exit + +l1_5: + br i1 %c3, label %l1_75, label %l6 + +l1_75: + br i1 %c4, label %l2, label %l3 + +l2: + %phi2 = phi i64 [ %phi1, %l1_75 ], [ %phi3, %l3 ] + br label %l1 + +l3: + %phi3 = phi i64 [ %phi1, %l1_75 ], [ %phi4, %l4 ] + br label %l2 + +l4: + %phi4 = phi i64 [ %phi5, %l5 ], [ %phi6, %l6 ] + br i1 %c5, label %l3, label %l6 + +l5: + %phi5 = phi i64 [ %phi6, %l6_5 ], [ %phi7, %l7 ] + br label %l4 + +l6: + %phi6 = phi i64 [ %phi4, %l4 ], [ %phi1, %l1_5 ] + br i1 %c6, label %l4, label %l6_5 + +l6_5: + br i1 %c7, label %l5, label %l8 + +l7: + %phi7 = phi i64 [ %phi9, %l9 ], [ %phi8, %l8 ] + br i1 %c8, label %l5, label %l8 + +l8: + %phi8 = phi i64 [ %phi6, %l6_5 ], [ %phi7, %l7 ] + br i1 %c9, label %l7, label %l9 + +l9: + %phi9 = phi i64 [ %n, %entry ], [ %phi8, %l8 ] + %sub = sub i64 %phi9, 1 + %mul = mul i64 %sub, 2 + br i1 %c10, label %l7, label %exit + +exit: 
+ %res = phi i64 [ %div, %l1 ], [ %mul, %l9] + ret i64 %res +} +