
[X86] Set up the framework for optimization of CCMP/CTEST #84603

Closed
Wants to merge 8 commits.

Conversation

KanRobert (Contributor)

llvmbot commented Mar 9, 2024

@llvm/pr-subscribers-backend-x86

Author: Shengchen Kan (KanRobert)

Changes

SPEC: https://cdrdv2.intel.com/v1/dl/getContent/784266

Blog: https://kanrobert.github.io/rfc/All-about-APX-conditional-ISA


Patch is 126.78 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/84603.diff

8 Files Affected:

  • (modified) llvm/lib/Target/X86/CMakeLists.txt (+1)
  • (modified) llvm/lib/Target/X86/X86.h (+4)
  • (added) llvm/lib/Target/X86/X86ConditionalCompares.cpp (+887)
  • (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+3-1)
  • (modified) llvm/lib/Target/X86/X86TargetMachine.cpp (+2)
  • (added) llvm/test/CodeGen/X86/apx/ccmp.ll (+1116)
  • (added) llvm/test/CodeGen/X86/apx/ctest.ll (+1212)
  • (modified) llvm/test/CodeGen/X86/opt-pipeline.ll (+1)
diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt
index 610999f0cc3cf0..17fd0a004147aa 100644
--- a/llvm/lib/Target/X86/CMakeLists.txt
+++ b/llvm/lib/Target/X86/CMakeLists.txt
@@ -30,6 +30,7 @@ set(sources
   X86CallingConv.cpp
   X86CmovConversion.cpp
   X86CodeGenPassBuilder.cpp
+  X86ConditionalCompares.cpp
   X86DomainReassignment.cpp
   X86DiscriminateMemOps.cpp
   X86LowerTileCopy.cpp
diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h
index 21623a805f5568..042704355704ac 100644
--- a/llvm/lib/Target/X86/X86.h
+++ b/llvm/lib/Target/X86/X86.h
@@ -157,6 +157,9 @@ FunctionPass *createX86InsertX87waitPass();
 /// ways.
 FunctionPass *createX86PartialReductionPass();
 
+/// This pass performs CCMP optimization.
+FunctionPass *createX86ConditionalCompares();
+
 InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM,
                                                   X86Subtarget &,
                                                   X86RegisterBankInfo &);
@@ -194,6 +197,7 @@ void initializeX86LowerAMXTypeLegacyPassPass(PassRegistry &);
 void initializeX86LowerTileCopyPass(PassRegistry &);
 void initializeX86OptimizeLEAPassPass(PassRegistry &);
 void initializeX86PartialReductionPass(PassRegistry &);
+void initializeX86ConditionalComparesPass(PassRegistry &);
 void initializeX86PreTileConfigPass(PassRegistry &);
 void initializeX86ReturnThunksPass(PassRegistry &);
 void initializeX86SpeculativeExecutionSideEffectSuppressionPass(PassRegistry &);
diff --git a/llvm/lib/Target/X86/X86ConditionalCompares.cpp b/llvm/lib/Target/X86/X86ConditionalCompares.cpp
new file mode 100644
index 00000000000000..3e7c9d0e3d48f0
--- /dev/null
+++ b/llvm/lib/Target/X86/X86ConditionalCompares.cpp
@@ -0,0 +1,887 @@
+//==========-- X86ConditionalCompares.cpp --- CCMP formation for X86 -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the X86ConditionalCompares pass which reduces
+// branching by using the conditional compare instructions CCMP, CTEST.
+//
+// The CFG transformations for forming conditional compares are very similar to
+// if-conversion, and this pass should run immediately before the early
+// if-conversion pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineTraceMetrics.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-ccmp"
+
+// Absolute maximum number of instructions allowed per speculated block.
+// This bypasses all other heuristics, so it should be set fairly high.
+static cl::opt<unsigned> BlockInstrLimit(
+    "x86-ccmp-limit", cl::init(30), cl::Hidden,
+    cl::desc("Maximum number of instructions per speculated block."));
+
+STATISTIC(NumConsidered, "Number of ccmps considered");
+STATISTIC(NumPhiRejs, "Number of ccmps rejected (PHI)");
+STATISTIC(NumPhysRejs, "Number of ccmps rejected (Physregs)");
+STATISTIC(NumPhi2Rejs, "Number of ccmps rejected (PHI2)");
+STATISTIC(NumHeadBranchRejs, "Number of ccmps rejected (Head branch)");
+STATISTIC(NumCmpBranchRejs, "Number of ccmps rejected (CmpBB branch)");
+STATISTIC(NumCmpTermRejs, "Number of ccmps rejected (CmpBB is cbz...)");
+STATISTIC(NumMultEFLAGSUses, "Number of ccmps rejected (EFLAGS used)");
+STATISTIC(NumUnknEFLAGSDefs, "Number of ccmps rejected (EFLAGS def unknown)");
+STATISTIC(NumSpeculateRejs, "Number of ccmps rejected (Can't speculate)");
+STATISTIC(NumConverted, "Number of ccmp instructions created");
+
+//===----------------------------------------------------------------------===//
+//                                 SSACCmpConv
+//===----------------------------------------------------------------------===//
+//
+// The SSACCmpConv class performs ccmp-conversion on SSA form machine code
+// after determining if it is possible. The class contains no heuristics;
+// external code should be used to determine when ccmp-conversion is a good
+// idea.
+//
+// CCmp-formation works on a CFG representing chained conditions, typically
+// from C's short-circuit || and && operators:
+//
+//   From:         Head            To:         Head
+//                 / |                         CmpBB
+//                /  |                         / |
+//               |  CmpBB                     /  |
+//               |  / |                    Tail  |
+//               | /  |                      |   |
+//              Tail  |                      |   |
+//                |   |                      |   |
+//               ... ...                    ... ...
+//
+// The Head block is terminated by a br.cond instruction, and the CmpBB block
+// contains compare + br.cond. Tail must be a successor of both.
+//
+// The cmp-conversion turns the compare instruction in CmpBB into a conditional
+// compare, and merges CmpBB into Head, speculatively executing its
+// instructions. The X86 conditional compare instructions have an operand (dfv)
+// that specifies the flag values to set when the condition is false and the
+// compare isn't executed. This makes it possible to chain compares with
+// different condition codes.
+//
+// Example:
+//
+// void f(int a, int b) {
+//   if (a == 5 || b == 17)
+//     foo();
+// }
+//
+//    Head:
+//      cmpl  $5, %edi
+//      je Tail
+//    CmpBB:
+//      cmpl  $17, %esi
+//      je Tail
+//    ...
+//    Tail:
+//      call foo
+//
+//  Becomes:
+//
+//    Head:
+//      cmpl  $5, %edi
+//      ccmpnel {dfv=zf} $17, %esi
+//      je Tail
+//    ...
+//    Tail:
+//      call foo
+//
+// The ccmp condition code is the one that would cause the Head terminator to
+// branch to CmpBB.
+
+namespace {
+class SSACCmpConv {
+  MachineFunction *MF;
+  const X86Subtarget *STI;
+  const TargetInstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+  MachineRegisterInfo *MRI;
+  const MachineBranchProbabilityInfo *MBPI;
+
+public:
+  /// The first block containing a conditional branch, dominating everything
+  /// else.
+  MachineBasicBlock *Head;
+
+  /// The block containing cmp+br.cond with a successor shared with Head.
+  MachineBasicBlock *CmpBB;
+
+  /// The common successor for Head and CmpBB.
+  MachineBasicBlock *Tail;
+
+  /// The compare instruction in CmpBB that can be converted to a ccmp.
+  MachineInstr *CmpMI;
+
+private:
+  /// The branch condition in Head as determined by analyzeBranch.
+  SmallVector<MachineOperand, 4> HeadCond;
+
+  /// The condition code that makes Head branch to CmpBB.
+  X86::CondCode HeadCmpBBCC;
+
+  /// The branch condition in CmpBB.
+  SmallVector<MachineOperand, 4> CmpBBCond;
+
+  /// The condition code that makes CmpBB branch to Tail.
+  X86::CondCode CmpBBTailCC;
+
+  /// Check if the Tail PHIs are trivially convertible.
+  bool trivialTailPHIs();
+
+  /// Remove CmpBB from the Tail PHIs.
+  void updateTailPHIs();
+
+  /// Check if an operand defining DstReg is dead.
+  bool isDeadDef(unsigned DstReg);
+
+  /// Find the compare instruction in MBB that controls the conditional branch.
+  /// Return NULL if a convertible instruction can't be found.
+  MachineInstr *findConvertibleCompare(MachineBasicBlock *MBB);
+
+  /// Return true if all non-terminator instructions in MBB can be safely
+  /// speculated.
+  bool canSpeculateInstrs(MachineBasicBlock *MBB, const MachineInstr *CmpMI);
+
+public:
+  /// runOnMachineFunction - Initialize per-function data structures.
+  void runOnMachineFunction(MachineFunction &MF,
+                            const MachineBranchProbabilityInfo *MBPI) {
+    this->MF = &MF;
+    this->MBPI = MBPI;
+    STI = &MF.getSubtarget<X86Subtarget>();
+    TII = MF.getSubtarget().getInstrInfo();
+    TRI = MF.getSubtarget().getRegisterInfo();
+    MRI = &MF.getRegInfo();
+  }
+
+  /// If the sub-CFG headed by MBB can be cmp-converted, initialize the
+  /// internal state, and return true.
+  bool canConvert(MachineBasicBlock *MBB);
+
+  /// Cmp-convert the last block passed to canConvert(), assuming
+  /// it is possible. Add any erased blocks to RemovedBlocks.
+  void convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks);
+};
+} // end anonymous namespace
+
+// Check that all PHIs in Tail are selecting the same value from Head and CmpBB.
+// This means that no if-conversion is required when merging CmpBB into Head.
+bool SSACCmpConv::trivialTailPHIs() {
+  for (auto &I : *Tail) {
+    if (!I.isPHI())
+      break;
+    unsigned HeadReg = 0, CmpBBReg = 0;
+    // PHI operands come in (VReg, MBB) pairs.
+    for (unsigned oi = 1, oe = I.getNumOperands(); oi != oe; oi += 2) {
+      MachineBasicBlock *MBB = I.getOperand(oi + 1).getMBB();
+      Register Reg = I.getOperand(oi).getReg();
+      if (MBB == Head) {
+        assert((!HeadReg || HeadReg == Reg) && "Inconsistent PHI operands");
+        HeadReg = Reg;
+      }
+      if (MBB == CmpBB) {
+        assert((!CmpBBReg || CmpBBReg == Reg) && "Inconsistent PHI operands");
+        CmpBBReg = Reg;
+      }
+    }
+    if (HeadReg != CmpBBReg)
+      return false;
+  }
+  return true;
+}
+
+// Assuming that trivialTailPHIs() is true, update the Tail PHIs by simply
+// removing the CmpBB operands. The Head operands will be identical.
+void SSACCmpConv::updateTailPHIs() {
+  for (auto &I : *Tail) {
+    if (!I.isPHI())
+      break;
+    // I is a PHI. It can have multiple entries for CmpBB.
+    for (unsigned Idx = I.getNumOperands(); Idx > 2; Idx -= 2) {
+      // PHI operands are (Reg, MBB) at (Idx-2, Idx-1).
+      if (I.getOperand(Idx - 1).getMBB() == CmpBB) {
+        I.removeOperand(Idx - 1);
+        I.removeOperand(Idx - 2);
+      }
+    }
+  }
+}
+
+bool SSACCmpConv::isDeadDef(unsigned DstReg) {
+  if (!Register::isVirtualRegister(DstReg))
+    return false;
+  // A virtual register def without any uses will be marked dead later and
+  // eventually removed.
+  return MRI->use_nodbg_empty(DstReg);
+}
+
+MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) {
+  MachineBasicBlock::iterator I = MBB->getFirstTerminator();
+  if (I == MBB->end())
+    return nullptr;
+  // The terminator must be controlled by the flags.
+  if (!I->readsRegister(X86::EFLAGS)) {
+    ++NumCmpTermRejs;
+    LLVM_DEBUG(dbgs() << "Flags not used by terminator: " << *I);
+    return nullptr;
+  }
+
+  // Now find the instruction controlling the terminator.
+  for (MachineBasicBlock::iterator B = MBB->begin(); I != B;) {
+    I = prev_nodbg(I, MBB->begin());
+    assert(!I->isTerminator() && "Spurious terminator");
+
+    switch (I->getOpcode()) {
+    // This pass runs before peephole optimization, so the SUB has not been
+    // optimized to CMP yet.
+    case X86::SUB8rr:
+    case X86::SUB16rr:
+    case X86::SUB32rr:
+    case X86::SUB64rr:
+    case X86::SUB8ri:
+    case X86::SUB16ri:
+    case X86::SUB32ri:
+    case X86::SUB64ri32:
+    case X86::SUB8rr_ND:
+    case X86::SUB16rr_ND:
+    case X86::SUB32rr_ND:
+    case X86::SUB64rr_ND:
+    case X86::SUB8ri_ND:
+    case X86::SUB16ri_ND:
+    case X86::SUB32ri_ND:
+    case X86::SUB64ri32_ND: {
+      if (!isDeadDef(I->getOperand(0).getReg()))
+        return nullptr;
+      return STI->hasCCMP() ? &*I : nullptr;
+    }
+    case X86::CMP8rr:
+    case X86::CMP16rr:
+    case X86::CMP32rr:
+    case X86::CMP64rr:
+    case X86::CMP8ri:
+    case X86::CMP16ri:
+    case X86::CMP32ri:
+    case X86::CMP64ri32:
+    case X86::TEST8rr:
+    case X86::TEST16rr:
+    case X86::TEST32rr:
+    case X86::TEST64rr:
+    case X86::TEST8ri:
+    case X86::TEST16ri:
+    case X86::TEST32ri:
+    case X86::TEST64ri32:
+      return STI->hasCCMP() ? &*I : nullptr;
+    default:
+      break;
+    }
+
+    // Check for flag reads and clobbers.
+    PhysRegInfo PRI = AnalyzePhysRegInBundle(*I, X86::EFLAGS, TRI);
+
+    if (PRI.Read) {
+      // The ccmp doesn't produce exactly the same flags as the original
+      // compare, so reject the transform if there are uses of the flags
+      // besides the terminators.
+      LLVM_DEBUG(dbgs() << "Can't create ccmp with multiple uses: " << *I);
+      ++NumMultEFLAGSUses;
+      return nullptr;
+    }
+
+    if (PRI.Defined || PRI.Clobbered) {
+      LLVM_DEBUG(dbgs() << "Not convertible compare: " << *I);
+      ++NumUnknEFLAGSDefs;
+      return nullptr;
+    }
+  }
+  LLVM_DEBUG(dbgs() << "Flags not defined in " << printMBBReference(*MBB)
+                    << '\n');
+  return nullptr;
+}
+
+/// Determine if all the instructions in MBB can safely
+/// be speculated. The terminators are not considered.
+///
+/// Only CmpMI is allowed to clobber the flags.
+///
+bool SSACCmpConv::canSpeculateInstrs(MachineBasicBlock *MBB,
+                                     const MachineInstr *CmpMI) {
+  // Reject any live-in physregs. It's very hard to get right.
+  if (!MBB->livein_empty()) {
+    LLVM_DEBUG(dbgs() << printMBBReference(*MBB) << " has live-ins.\n");
+    return false;
+  }
+
+  unsigned InstrCount = 0;
+
+  // Check all instructions, except the terminators. It is assumed that
+  // terminators never have side effects or define any used register values.
+  for (auto &I : make_range(MBB->begin(), MBB->getFirstTerminator())) {
+    if (I.isDebugInstr())
+      continue;
+
+    if (++InstrCount > BlockInstrLimit) {
+      LLVM_DEBUG(dbgs() << printMBBReference(*MBB) << " has more than "
+                        << BlockInstrLimit << " instructions.\n");
+      return false;
+    }
+
+    // There shouldn't normally be any phis in a single-predecessor block.
+    if (I.isPHI()) {
+      LLVM_DEBUG(dbgs() << "Can't hoist: " << I);
+      return false;
+    }
+
+    // Don't speculate loads. Note that it may be possible and desirable to
+    // speculate GOT or constant pool loads that are guaranteed not to trap,
+    // but we don't support that for now.
+    if (I.mayLoad()) {
+      LLVM_DEBUG(dbgs() << "Won't speculate load: " << I);
+      return false;
+    }
+
+    // We never speculate stores, so an AA pointer isn't necessary.
+    bool DontMoveAcrossStore = true;
+    if (!I.isSafeToMove(nullptr, DontMoveAcrossStore)) {
+      LLVM_DEBUG(dbgs() << "Can't speculate: " << I);
+      return false;
+    }
+
+    // Only CmpMI is allowed to clobber the flags.
+    if (&I != CmpMI && I.modifiesRegister(X86::EFLAGS, TRI)) {
+      LLVM_DEBUG(dbgs() << "Clobbers flags: " << I);
+      return false;
+    }
+  }
+  return true;
+}
+
+// Parse a condition code returned by analyzeBranch, and compute the CondCode
+// corresponding to TBB.
+static bool parseCond(ArrayRef<MachineOperand> Cond, X86::CondCode &CC,
+                      ArrayRef<X86::CondCode> UnsupportedCCs = {}) {
+  if (Cond.size() != 1)
+    return false;
+
+  CC = static_cast<X86::CondCode>(Cond[0].getImm());
+
+  for (const auto &UnsupportedCC : UnsupportedCCs) {
+    if (CC == UnsupportedCC)
+      return false;
+  }
+
+  return CC != X86::COND_INVALID;
+}
+
+static unsigned getNumOfJcc(const MachineBasicBlock *MBB) {
+  unsigned NumOfJcc = 0;
+  for (auto It = MBB->rbegin(); It != MBB->rend(); ++It) {
+    if (!It->isTerminator())
+      return NumOfJcc;
+    if (It->getOpcode() == X86::JCC_1)
+      ++NumOfJcc;
+  }
+  return NumOfJcc;
+}
+
+/// Analyze the sub-cfg rooted in MBB, and return true if it is a potential
+/// candidate for cmp-conversion. Fill out the internal state.
+///
+bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) {
+  Head = MBB;
+  Tail = CmpBB = nullptr;
+
+  if (Head->succ_size() != 2)
+    return false;
+  MachineBasicBlock *Succ0 = Head->succ_begin()[0];
+  MachineBasicBlock *Succ1 = Head->succ_begin()[1];
+
+  // CmpBB can only have a single predecessor. Tail is allowed many.
+  if (Succ0->pred_size() != 1)
+    std::swap(Succ0, Succ1);
+
+  // Succ0 is our candidate for CmpBB.
+  if (Succ0->pred_size() != 1 || Succ0->succ_size() != 2)
+    return false;
+
+  CmpBB = Succ0;
+  Tail = Succ1;
+
+  if (!CmpBB->isSuccessor(Tail))
+    return false;
+
+  // The CFG topology checks out.
+  LLVM_DEBUG(dbgs() << "\nTriangle: " << printMBBReference(*Head) << " -> "
+                    << printMBBReference(*CmpBB) << " -> "
+                    << printMBBReference(*Tail) << '\n');
+  ++NumConsidered;
+
+  // Tail is allowed to have many predecessors, but we can't handle PHIs yet.
+  //
+  // FIXME: Real PHIs could be if-converted as long as the CmpBB values are
+  // defined before the CmpBB cmp clobbers the flags. Alternatively, it should
+  // always be safe to sink the ccmp down to immediately before the CmpBB
+  // terminators.
+  if (!trivialTailPHIs()) {
+    LLVM_DEBUG(dbgs() << "Can't handle phis in Tail.\n");
+    ++NumPhiRejs;
+    return false;
+  }
+
+  if (!Tail->livein_empty()) {
+    LLVM_DEBUG(dbgs() << "Can't handle live-in physregs in Tail.\n");
+    ++NumPhysRejs;
+    return false;
+  }
+
+  // CmpBB should never have PHIs since Head is its only predecessor.
+  // FIXME: Clean them up if it happens.
+  if (!CmpBB->empty() && CmpBB->front().isPHI()) {
+    LLVM_DEBUG(dbgs() << "Can't handle phis in CmpBB.\n");
+    ++NumPhi2Rejs;
+    return false;
+  }
+
+  if (!CmpBB->livein_empty()) {
+    LLVM_DEBUG(dbgs() << "Can't handle live-in physregs in CmpBB.\n");
+    ++NumPhysRejs;
+    return false;
+  }
+
+  // The branch we're looking to eliminate must be analyzable.
+  HeadCond.clear();
+  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+  if (TII->analyzeBranch(*Head, TBB, FBB, HeadCond)) {
+    LLVM_DEBUG(dbgs() << "Head branch not analyzable.\n");
+    ++NumHeadBranchRejs;
+    return false;
+  }
+
+  // CCMP/CTEST resets all the bits of EFLAGS
+  unsigned NumOfJcc = getNumOfJcc(Head);
+  if (NumOfJcc > 1) {
+    LLVM_DEBUG(dbgs() << "More than one Jcc in Head.\n");
+    ++NumHeadBranchRejs;
+    return false;
+  }
+
+  // This is weird, probably some sort of degenerate CFG, or an edge to a
+  // landing pad.
+  if (!TBB || HeadCond.empty()) {
+    LLVM_DEBUG(
+        dbgs() << "analyzeBranch didn't find conditional branch in Head.\n");
+    ++NumHeadBranchRejs;
+    return false;
+  }
+
+  if (!parseCond(HeadCond, HeadCmpBBCC, {X86::COND_P, X86::COND_NP})) {
+    LLVM_DEBUG(dbgs() << "Unsupported branch type on Head\n");
+    ++NumHeadBranchRejs;
+    return false;
+  }
+
+  // Make sure the branch direction is right.
+  if (TBB != CmpBB) {
+    assert(TBB == Tail && "Unexpected TBB");
+    HeadCmpBBCC = X86::GetOppositeBranchCondition(HeadCmpBBCC);
+  }
+
+  CmpBBCond.clear();
+  TBB = FBB = nullptr;
+  if (TII->analyzeBranch(*CmpBB, TBB, FBB, CmpBBCond)) {
+    LLVM_DEBUG(dbgs() << "CmpBB branch not analyzable.\n");
+    ++NumCmpBranchRejs;
+    return false;
+  }
+
+  if (!TBB || CmpBBCond.empty()) {
+    LLVM_DEBUG(
+        dbgs() << "analyzeBranch didn't find conditional branch in CmpBB.\n");
+    ++NumCmpBranchRejs;
+    return false;
+  }
+
+  if (!parseCond(CmpBBCond, CmpBBTailCC)) {
+    LLVM_DEBUG(dbgs() << "Unsupported branch type on CmpBB\n");
+    ++NumCmpBranchRejs;
+    return false;
+  }
+
+  if (TBB != Tail)
+    CmpBBTailCC = X86::GetOppositeBranchCondition(CmpBBTailCC);
+
+  CmpMI = findConvertibleCompare(CmpBB);
+  if (!CmpMI)
+    return false;
+
+  if (!canSpeculateInstrs(CmpBB, CmpMI)) {
+    ++NumSpeculateRejs;
+    return false;
+  }
+
+  return true;
+}
+
+static int getCondFlagsFromCondCode(X86::CondCode CC) {
+  // CCMP/CTEST has two conditional operands:
+  // - SCC: source conditional code (same a...
[truncated]


github-actions bot commented Mar 9, 2024

✅ With the latest revision this PR passed the C/C++ code formatter.

KanRobert requested a review from topperc on March 9, 2024 at 05:09.
-  int BaseCost = BrMergingBaseCostThresh.getValue();
+  // Disable condition merging when CCMP is available b/c we can eliminate
+  // branches in a more efficient way.
+  int BaseCost = Subtarget.hasCCMP() ? -1 : BrMergingBaseCostThresh.getValue();
Contributor:

Would have figured it would be easier to just lower (and/or setcc, setcc) as ccmp than essentially restitching the BBs at the MBB level (which seems to have some extra restrictions, if I understand canConvert correctly).

KanRobert (Contributor Author):

Thinking

Contributor:

Also note, if we are going to merge the BBs anyways, it is probably best to do so as soon as possible so we have more context during DAG lowering.

KanRobert (Contributor Author) commented on May 8, 2024:

@goldsteinn We can't lower (and/or setcc, setcc) as ccmp b/c CCMP has a CMP semantic. Instead, what we need to do is something like

  1. Combine the DAG:
  // sub(and(setcc(cc1, ...), setcc(cc2, sub (X, Y))), 1)
  // brcond ne
  //
  //  ->
  //
  //  ccmp(X, Y, cc1, cc2)
  //  brcond cc2
  //
  //  if only the flag has users.
  2. Define a fragment for ccmp.
  3. Add a pattern for CCMP using the fragment.

I drafted a patch locally and found that it's not simpler than the MIR pass.

IIUC, some checks in canConvert are conservative and not real restrictions. It should not have more restrictions than the DAGCombine + pattern match approach. The function is borrowed from AArch64ConditionalCompares.cpp.

probably best to do as soon as possible so we have more context during DAG lowering

Which context you think we need? AArch64ConditionalCompares.cpp was designed to run
immediately before the early if-conversion pass.

KanRobert (Contributor Author):

I will upload the DAGCombine + pattern match solution too for comparison.

Contributor:

@goldsteinn We can't lower (and/or setcc, setcc) as ccmp b/c CCMP has a CMP semantic. Instead, what we need to do is something like

  1. Combine the DAG:
  // sub(and(setcc(cc1, ...), setcc(cc2, sub (X, Y))), 1)
  // brcond ne
  //
  //  ->
  //
  //  ccmp(X, Y, cc1, cc2)
  //  brcond cc2
  //
  //  if only the flag has users.
  2. Define a fragment for ccmp.
  3. Add a pattern for CCMP using the fragment.

I drafted a patch locally and found that it's not simpler than the MIR pass.

Is that #91747?
If so, I disagree that it is not simpler... re-patching BBs at the MachineInstr level is definitely a large source of the complexity of this patch.

IIUC, some checks in canConvert are conservative and not real restrictions. It should not have more restriction than DAGCombine + pattern match approach. The function is borrowed from AArch64ConditionalCompares.cpp,

probably best to do as soon as possible so we have more context during DAG lowering

Which context you think we need? AArch64ConditionalCompares.cpp was designed to run immediately before the early if-conversion pass.

What I mean is that the current DAGCombiner runs on BBs, so if we are going to be merging BBs, doing so earlier will improve the rest of lowering.

Based on the two missing cases in #91747, can they not be fixed in simplifyCFG?

KanRobert (Contributor Author):

Is that #91747?
Yes.

Based on the two missing cases in #91747, can they not be fixed on in simplifyCFG?

For the case where the two compares are not in the same BB, maybe we can fix it in simplifyCFG.
For the case where DAGCombine only revisits the direct users after simplifying a node, we can only wait for #91772 or similar PRs. TBH, I don't know when that can be ready.


KanRobert commented May 10, 2024

Compared with this implementation, the DAGCombine + pattern match approach cannot handle at least two cases. See comments at #91747.

I believe we should add this MIR pass, and handle any missed cases we find in the real world with the DAGCombine + pattern match approach.

KanRobert (Contributor Author):

Ping @goldsteinn @phoebewang @FreddyLeaf @e-kud ?

Three resolved review threads on llvm/lib/Target/X86/X86ConditionalCompares.cpp (now outdated).
KanRobert (Contributor Author):

Based on the reviewer's opinion, #91747 is preferred.

KanRobert closed this on May 12, 2024.
Projects: none yet. Linked issues: none yet. 5 participants.