diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 0a10b51f97c63..efd7580e10033 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -594,12 +594,13 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { // Check if suitable for a bit test if (N <= DL.getIndexSizeInBits(0u)) { - SmallPtrSet<const BasicBlock *, 4> Dests; - for (auto I : SI.cases()) - Dests.insert(I.getCaseSuccessor()); + DenseMap<const BasicBlock *, unsigned> DestMap; + for (auto I : SI.cases()) { + const BasicBlock *BB = I.getCaseSuccessor(); + DestMap[BB] = DestMap.count(BB) ? (DestMap[BB] + 1) : 1; + } - if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal, - DL)) + if (TLI->isSuitableForBitTests(DestMap, MinCaseVal, MaxCaseVal, DL)) return 1; } diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 438b6ff55c85f..4c94ed34123c7 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -1441,9 +1441,10 @@ class LLVM_ABI TargetLoweringBase { /// \p High as its lowest and highest case values, and expects \p NumCmps /// case value comparisons. Check if the number of destinations, comparison /// metric, and range are all suitable. - bool isSuitableForBitTests(unsigned NumDests, unsigned NumCmps, - const APInt &Low, const APInt &High, - const DataLayout &DL) const { + bool + isSuitableForBitTests(DenseMap<const BasicBlock *, unsigned> DestCmps, + const APInt &Low, const APInt &High, + const DataLayout &DL) const { // FIXME: I don't think NumCmps is the correct metric: a single case and a // range of cases both require only one branch to lower.
Just looking at the // number of clusters and destinations should be enough to decide whether to @@ -1454,6 +1455,20 @@ class LLVM_ABI TargetLoweringBase { if (!rangeFitsInWord(Low, High, DL)) return false; + unsigned NumDests = DestCmps.size(); + unsigned NumCmps = 0; + unsigned int MaxBitTestEntry = 0; + for (auto &DestCmp : DestCmps) { + NumCmps += DestCmp.second; + if (DestCmp.second > MaxBitTestEntry) + MaxBitTestEntry = DestCmp.second; + } + + // Comparisons might be cheaper for a small number of comparisons, which can + // be Arch Target specific. + if (MaxBitTestEntry < getMinimumBitTestCmps()) + return false; + // Decide whether it's profitable to lower this range with bit tests. Each // destination requires a bit test and branch, and there is an overall range // check branch. For a small number of clusters, separate comparisons might @@ -2063,6 +2078,9 @@ class LLVM_ABI TargetLoweringBase { virtual bool isJumpTableRelative() const; + /// Return the minimum of the largest number of comparisons in BitTest. + virtual unsigned getMinimumBitTestCmps() const; + /// If a physical register, this specifies the register that /// llvm.savestack/llvm.restorestack should save and restore. Register getStackPointerRegisterToSaveRestore() const { @@ -2579,6 +2597,9 @@ class LLVM_ABI TargetLoweringBase { /// Set to zero to generate unlimited jump tables. void setMaximumJumpTableSize(unsigned); + /// Set the minimum of the largest number of comparisons to generate BitTest. + void setMinimumBitTestCmps(unsigned Val); + /// If set to a physical register, this specifies the register that /// llvm.savestack/llvm.restorestack should save and restore.
void setStackPointerRegisterToSaveRestore(Register R) { diff --git a/llvm/lib/CodeGen/SwitchLoweringUtils.cpp b/llvm/lib/CodeGen/SwitchLoweringUtils.cpp index 038c499fe236e..8662f7801e2a6 100644 --- a/llvm/lib/CodeGen/SwitchLoweringUtils.cpp +++ b/llvm/lib/CodeGen/SwitchLoweringUtils.cpp @@ -206,12 +206,16 @@ bool SwitchCG::SwitchLowering::buildJumpTable(const CaseClusterVector &Clusters, for (unsigned I = First; I <= Last; ++I) JTProbs[Clusters[I].MBB] = BranchProbability::getZero(); + DenseMap<const BasicBlock *, unsigned> DestMap; for (unsigned I = First; I <= Last; ++I) { assert(Clusters[I].Kind == CC_Range); Prob += Clusters[I].Prob; const APInt &Low = Clusters[I].Low->getValue(); const APInt &High = Clusters[I].High->getValue(); - NumCmps += (Low == High) ? 1 : 2; + unsigned int NumCmp = (Low == High) ? 1 : 2; + const BasicBlock *BB = Clusters[I].MBB->getBasicBlock(); + DestMap[BB] = DestMap.count(BB) ? (DestMap[BB] + NumCmp) : NumCmp; + if (I != First) { // Fill the gap between this and the previous cluster. const APInt &PreviousHigh = Clusters[I - 1].High->getValue(); @@ -226,9 +230,7 @@ bool SwitchCG::SwitchLowering::buildJumpTable(const CaseClusterVector &Clusters, JTProbs[Clusters[I].MBB] += Clusters[I].Prob; } - unsigned NumDests = JTProbs.size(); - if (TLI->isSuitableForBitTests(NumDests, NumCmps, - Clusters[First].Low->getValue(), + if (TLI->isSuitableForBitTests(DestMap, Clusters[First].Low->getValue(), Clusters[Last].High->getValue(), *DL)) { // Clusters[First..Last] should be lowered as bit tests instead. return false; @@ -372,20 +374,19 @@ bool SwitchCG::SwitchLowering::buildBitTests(CaseClusterVector &Clusters, if (First == Last) return false; - BitVector Dests(FuncInfo.MF->getNumBlockIDs()); - unsigned NumCmps = 0; + DenseMap<const BasicBlock *, unsigned> DestMap; for (int64_t I = First; I <= Last; ++I) { assert(Clusters[I].Kind == CC_Range); - Dests.set(Clusters[I].MBB->getNumber()); - NumCmps += (Clusters[I].Low == Clusters[I].High) ? 1 : 2; + unsigned NumCmp = (Clusters[I].Low == Clusters[I].High) ?
1 : 2; + const BasicBlock *BB = Clusters[I].MBB->getBasicBlock(); + DestMap[BB] = DestMap.count(BB) ? (DestMap[BB] + NumCmp) : NumCmp; } - unsigned NumDests = Dests.count(); APInt Low = Clusters[First].Low->getValue(); APInt High = Clusters[Last].High->getValue(); assert(Low.slt(High)); - if (!TLI->isSuitableForBitTests(NumDests, NumCmps, Low, High, *DL)) + if (!TLI->isSuitableForBitTests(DestMap, Low, High, *DL)) return false; APInt LowBound; diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 0549a947600dc..eeafe0605d654 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" @@ -90,6 +91,11 @@ static cl::opt<unsigned> OptsizeJumpTableDensity( cl::desc("Minimum density for building a jump table in " "an optsize function")); +static cl::opt<unsigned> + MinimumBitTestCmps("min-bit-test-cmps", cl::init(2), cl::Hidden, + cl::desc("Set minimum of largest number of comparisons " + "to use bit test for switch.")); + // FIXME: This option is only to test if the strict fp operation processed // correctly by preventing mutating strict fp operation to normal fp operation during development.
When the backend supports strict float operation, this @@ -2120,6 +2126,14 @@ bool TargetLoweringBase::isJumpTableRelative() const { return getTargetMachine().isPositionIndependent(); } +unsigned TargetLoweringBase::getMinimumBitTestCmps() const { + return MinimumBitTestCmps; +} + +void TargetLoweringBase::setMinimumBitTestCmps(unsigned Val) { + MinimumBitTestCmps = Val; +} + Align TargetLoweringBase::getPrefLoopAlignment(MachineLoop *ML) const { if (TM.Options.LoopAlignment) return Align(TM.Options.LoopAlignment); diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 5039f5df7a128..0e83599f9c366 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -138,6 +138,11 @@ static cl::opt<unsigned> PPCMinimumJumpTableEntries( "ppc-min-jump-table-entries", cl::init(64), cl::Hidden, cl::desc("Set minimum number of entries to use a jump table on PPC")); +static cl::opt<unsigned> PPCMinimumBitTestCmps( + "ppc-min-bit-test-cmps", cl::init(3), cl::Hidden, + cl::desc("Set minimum of largest number of comparisons to use bit test for " + "switch on PPC.")); + static cl::opt<unsigned> PPCGatherAllAliasesMaxDepth( "ppc-gather-alias-max-depth", cl::init(18), cl::Hidden, cl::desc("max depth when checking alias info in GatherAllAliases()")); @@ -1438,6 +1443,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, // Re-evaluate this value on future HWs that can do better with mtctr. setMinimumJumpTableEntries(PPCMinimumJumpTableEntries); + // The default minimum of the largest number in a BitTest cluster is 3. + setMinimumBitTestCmps(PPCMinimumBitTestCmps); + setMinFunctionAlignment(Align(4)); setMinCmpXchgSizeInBits(Subtarget.hasPartwordAtomics() ?
8 : 32); diff --git a/llvm/test/CodeGen/PowerPC/bittest.ll b/llvm/test/CodeGen/PowerPC/bittest.ll new file mode 100644 index 0000000000000..cba56e3d5798f --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/bittest.ll @@ -0,0 +1,193 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -verify-machineinstrs < %s -O3 -mcpu=ppc -mtriple powerpc-ibm-aix \ +; RUN: -ppc-asm-full-reg-names | FileCheck %s + +define i32 @foo(i32 noundef signext %x) { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: mflr r0 +; CHECK-NEXT: stwu r1, -64(r1) +; CHECK-NEXT: stw r0, 72(r1) +; CHECK-NEXT: cmpwi r3, 8 +; CHECK-NEXT: stw r31, 60(r1) # 4-byte Folded Spill +; CHECK-NEXT: mr r31, r3 +; CHECK-NEXT: li r3, 0 +; CHECK-NEXT: ble cr0, L..BB0_4 +; CHECK-NEXT: # %bb.1: # %entry +; CHECK-NEXT: cmpwi r31, 11 +; CHECK-NEXT: bge cr0, L..BB0_7 +; CHECK-NEXT: # %bb.2: # %entry +; CHECK-NEXT: cmplwi r31, 9 +; CHECK-NEXT: beq cr0, L..BB0_9 +; CHECK-NEXT: # %bb.3: # %entry +; CHECK-NEXT: cmplwi r31, 10 +; CHECK-NEXT: beq cr0, L..BB0_11 +; CHECK-NEXT: b L..BB0_13 +; CHECK-NEXT: L..BB0_4: # %entry +; CHECK-NEXT: cmplwi r31, 4 +; CHECK-NEXT: beq cr0, L..BB0_12 +; CHECK-NEXT: # %bb.5: # %entry +; CHECK-NEXT: cmplwi r31, 7 +; CHECK-NEXT: beq cr0, L..BB0_11 +; CHECK-NEXT: # %bb.6: # %entry +; CHECK-NEXT: cmplwi r31, 8 +; CHECK-NEXT: beq cr0, L..BB0_10 +; CHECK-NEXT: b L..BB0_13 +; CHECK-NEXT: L..BB0_7: # %entry +; CHECK-NEXT: beq cr0, L..BB0_10 +; CHECK-NEXT: # %bb.8: # %entry +; CHECK-NEXT: cmplwi r31, 12 +; CHECK-NEXT: bne cr0, L..BB0_13 +; CHECK-NEXT: L..BB0_9: # %sw.bb2 +; CHECK-NEXT: mr r3, r31 +; CHECK-NEXT: bl .foo3[PR] +; CHECK-NEXT: nop +; CHECK-NEXT: mr r3, r31 +; CHECK-NEXT: b L..BB0_13 +; CHECK-NEXT: L..BB0_10: # %sw.bb1 +; CHECK-NEXT: mr r3, r31 +; CHECK-NEXT: bl .foo2[PR] +; CHECK-NEXT: nop +; CHECK-NEXT: mr r3, r31 +; CHECK-NEXT: b L..BB0_13 +; CHECK-NEXT: L..BB0_11: # %sw.bb +; CHECK-NEXT: mr r3, r31 +; CHECK-NEXT: bl 
.foo1[PR] +; CHECK-NEXT: nop +; CHECK-NEXT: mr r3, r31 +; CHECK-NEXT: b L..BB0_13 +; CHECK-NEXT: L..BB0_12: # %sw.bb3 +; CHECK-NEXT: li r3, 4 +; CHECK-NEXT: bl .foo4[PR] +; CHECK-NEXT: nop +; CHECK-NEXT: li r3, 4 +; CHECK-NEXT: L..BB0_13: # %return +; CHECK-NEXT: lwz r31, 60(r1) # 4-byte Folded Reload +; CHECK-NEXT: addi r1, r1, 64 +; CHECK-NEXT: lwz r0, 8(r1) +; CHECK-NEXT: mtlr r0 +; CHECK-NEXT: blr +entry: + switch i32 %x, label %return [ + i32 7, label %sw.bb + i32 10, label %sw.bb + i32 8, label %sw.bb1 + i32 11, label %sw.bb1 + i32 9, label %sw.bb2 + i32 12, label %sw.bb2 + i32 4, label %sw.bb3 + ] + +sw.bb: ; preds = %entry, %entry + tail call void @foo1(i32 noundef signext %x) + br label %return + +sw.bb1: ; preds = %entry, %entry + tail call void @foo2(i32 noundef signext %x) + br label %return + +sw.bb2: ; preds = %entry, %entry + tail call void @foo3(i32 noundef signext %x) + br label %return + +sw.bb3: ; preds = %entry + tail call void @foo4(i32 noundef signext 4) + br label %return + +return: ; preds = %sw.bb, %sw.bb1, %sw.bb2, %sw.bb3, %entry + %retval.0 = phi i32 [ 0, %entry ], [ 4, %sw.bb3 ], [ %x, %sw.bb2 ], [ %x, %sw.bb1 ], [ %x, %sw.bb ] + ret i32 %retval.0 +} + +define i32 @goo(i32 noundef signext %x) { +; CHECK-LABEL: goo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: mflr r0 +; CHECK-NEXT: stwu r1, -64(r1) +; CHECK-NEXT: stw r0, 72(r1) +; CHECK-NEXT: cmplwi r3, 12 +; CHECK-NEXT: stw r31, 60(r1) # 4-byte Folded Spill +; CHECK-NEXT: mr r31, r3 +; CHECK-NEXT: bgt cr0, L..BB1_7 +; CHECK-NEXT: # %bb.1: # %entry +; CHECK-NEXT: li r3, 1 +; CHECK-NEXT: slw r3, r3, r31 +; CHECK-NEXT: andi. r4, r3, 5632 +; CHECK-NEXT: bne cr0, L..BB1_4 +; CHECK-NEXT: # %bb.2: # %entry +; CHECK-NEXT: andi. 
r3, r3, 2304 +; CHECK-NEXT: beq cr0, L..BB1_5 +; CHECK-NEXT: # %bb.3: # %sw.bb1 +; CHECK-NEXT: mr r3, r31 +; CHECK-NEXT: bl .foo2[PR] +; CHECK-NEXT: nop +; CHECK-NEXT: b L..BB1_9 +; CHECK-NEXT: L..BB1_4: # %sw.bb2 +; CHECK-NEXT: mr r3, r31 +; CHECK-NEXT: bl .foo3[PR] +; CHECK-NEXT: nop +; CHECK-NEXT: b L..BB1_9 +; CHECK-NEXT: L..BB1_5: # %entry +; CHECK-NEXT: cmplwi r31, 7 +; CHECK-NEXT: bne cr0, L..BB1_7 +; CHECK-NEXT: # %bb.6: # %sw.bb +; CHECK-NEXT: li r3, 7 +; CHECK-NEXT: li r31, 7 +; CHECK-NEXT: bl .foo1[PR] +; CHECK-NEXT: nop +; CHECK-NEXT: b L..BB1_9 +; CHECK-NEXT: L..BB1_7: # %entry +; CHECK-NEXT: cmplwi r31, 4 +; CHECK-NEXT: li r31, 0 +; CHECK-NEXT: bne cr0, L..BB1_9 +; CHECK-NEXT: # %bb.8: # %sw.bb3 +; CHECK-NEXT: li r3, 4 +; CHECK-NEXT: li r31, 4 +; CHECK-NEXT: bl .foo4[PR] +; CHECK-NEXT: nop +; CHECK-NEXT: L..BB1_9: # %return +; CHECK-NEXT: mr r3, r31 +; CHECK-NEXT: lwz r31, 60(r1) # 4-byte Folded Reload +; CHECK-NEXT: addi r1, r1, 64 +; CHECK-NEXT: lwz r0, 8(r1) +; CHECK-NEXT: mtlr r0 +; CHECK-NEXT: blr +entry: + switch i32 %x, label %return [ + i32 7, label %sw.bb + i32 8, label %sw.bb1 + i32 11, label %sw.bb1 + i32 9, label %sw.bb2 + i32 10, label %sw.bb2 + i32 12, label %sw.bb2 + i32 4, label %sw.bb3 + ] + +sw.bb: ; preds = %entry + tail call void @foo1(i32 noundef signext 7) + br label %return + +sw.bb1: ; preds = %entry, %entry + tail call void @foo2(i32 noundef signext %x) + br label %return + +sw.bb2: ; preds = %entry, %entry, %entry + tail call void @foo3(i32 noundef signext %x) + br label %return + +sw.bb3: ; preds = %entry + tail call void @foo4(i32 noundef signext 4) + br label %return + +return: ; preds = %sw.bb, %sw.bb1, %sw.bb2, %sw.bb3, %entry + %retval.0 = phi i32 [ 0, %entry ], [ 4, %sw.bb3 ], [ %x, %sw.bb2 ], [ %x, %sw.bb1 ], [ 7, %sw.bb ] + ret i32 %retval.0 +} + +declare void @foo1(i32 noundef signext) + +declare void @foo2(i32 noundef signext) + +declare void @foo3(i32 noundef signext) + +declare void @foo4(i32 noundef signext)