[AMDGPU] Use V_FMAC_F64 in "if (cond) a -= c" #168710
base: main
Conversation
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu

Author: Valery Pykhtin (vpykhtin)

Changes

This converts:

  result = a - (cond ? c : 0.0)
  result = a + (cond ? -c : 0.0)
  result = a + (-(cond ? c : 0.0))

to

  result = fma((cond ? -1.0 : 0.0), c, a)

saving one v_cndmask instruction per pattern. This only fires on pre-gfx12, as gfx12+ has dual-issued v_cndmask.

This gives a 5% improvement in SPEC ACCEL 450.md on MI350. Originally proposed by Michael Selehov.

In assembly, the transformation replaces two v_cndmask_b32 and a negating v_add_f64 with two v_mov_b32 constant loads, one v_cndmask_b32, and a v_fmac_f64.

Since the resulting pattern has one more instruction and requires 7 VGPRs instead of 6, it is only beneficial when the constants can be shared with other code or hoisted out of a loop. For this reason a reverting pass has been implemented: it runs after the scheduler's rematerializer, when it becomes clear whether those constants are shared with other code.

The reverting pass seems a bit of an overkill to me: since it won't be exercised most of the time, it's an excellent place for errors. We could probably do this transform only in loops and not revert at all.

Patch is 160.10 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/168710.diff

8 Files Affected:
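As a quick sanity check of the identity this combine relies on, here is a minimal host-side C++ sketch (not part of the patch; the values and the main() harness are invented for illustration). Because one multiplicand is exactly -1.0 or 0.0, the fused form computes the same value as the conditional subtraction:

  #include <cassert>
  #include <cmath>

  // Illustration only: a - (cond ? c : 0.0) == fma(cond ? -1.0 : 0.0, c, a)
  // for ordinary finite doubles, which is the identity the combine exploits.
  int main() {
    const double a = 3.5, c = 1.25;
    const bool conds[] = {false, true};
    for (bool cond : conds) {
      double plain = a - (cond ? c : 0.0);
      double fused = std::fma(cond ? -1.0 : 0.0, c, a);
      assert(plain == fused);
    }
    return 0;
  }
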
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index 7f00eadbf3f3f..db7668f98dc30 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -130,6 +130,20 @@ def sign_extension_in_reg : GICombineRule<
[{ return matchCombineSignExtendInReg(*${sign_inreg}, ${matchinfo}); }]),
(apply [{ applyCombineSignExtendInReg(*${sign_inreg}, ${matchinfo}); }])>;
+def cond_sub_to_fma_matchdata : GIDefMatchData<"ConditionalSubToFMAMatchInfo">;
+
+// Optimize conditional subtraction patterns to FMA:
+// result = a - (cond ? c : 0.0) -> fma(select(cond, -1.0, 0.0), c, a)
+// result = a + (cond ? -c : 0.0) -> fma(select(cond, -1.0, 0.0), c, a)
+// result = a + (-(cond ? c : 0.0)) -> fma(select(cond, -1.0, 0.0), c, a)
+//
+// Only enabled for f64 when shouldUseConditionalSubToFMAF64() is true.
+def cond_sub_to_fma : GICombineRule<
+ (defs root:$fsub_or_fadd, cond_sub_to_fma_matchdata:$matchinfo),
+ (match (wip_match_opcode G_FSUB, G_FADD):$fsub_or_fadd,
+ [{ return matchConditionalSubToFMA(*${fsub_or_fadd}, ${matchinfo}); }]),
+ (apply [{ applyConditionalSubToFMA(*${fsub_or_fadd}, ${matchinfo}); }])>;
+
// Do the following combines :
// fmul x, select(y, A, B) -> fldexp (x, select i32 (y, a, b))
// fmul x, select(y, -A, -B) -> fldexp ((fneg x), select i32 (y, a, b))
@@ -228,7 +242,7 @@ def AMDGPUPostLegalizerCombiner: GICombiner<
[all_combines, gfx6gfx7_combines, gfx8_combines, combine_fmul_with_select_to_fldexp,
uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg,
rcp_sqrt_to_rsq, fdiv_by_sqrt_to_rsq_f16, sign_extension_in_reg, smulu64,
- binop_s64_with_s32_mask_combines, combine_or_s64_s32]> {
+ cond_sub_to_fma, binop_s64_with_s32_mask_combines, combine_or_s64_s32]> {
let CombineAllMethodName = "tryCombineAllImpl";
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index e86b4738bed18..96cea2a08b5c8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -24,6 +24,7 @@
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Target/TargetMachine.h"
@@ -47,6 +48,7 @@ class AMDGPUPostLegalizerCombinerImpl : public Combiner {
const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig;
const GCNSubtarget &STI;
const SIInstrInfo &TII;
+ const MachineLoopInfo *MLI;
// TODO: Make CombinerHelper methods const.
mutable AMDGPUCombinerHelper Helper;
@@ -56,7 +58,7 @@ class AMDGPUPostLegalizerCombinerImpl : public Combiner {
GISelValueTracking &VT, GISelCSEInfo *CSEInfo,
const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig,
const GCNSubtarget &STI, MachineDominatorTree *MDT,
- const LegalizerInfo *LI);
+ const MachineLoopInfo *MLI, const LegalizerInfo *LI);
static const char *getName() { return "AMDGPUPostLegalizerCombinerImpl"; }
@@ -113,6 +115,19 @@ class AMDGPUPostLegalizerCombinerImpl : public Combiner {
// bits are zero extended.
bool matchCombine_s_mul_u64(MachineInstr &MI, unsigned &NewOpcode) const;
+ // Match conditional subtraction patterns for FMA optimization
+ struct ConditionalSubToFMAMatchInfo {
+ Register Cond;
+ Register C;
+ Register A;
+ };
+
+ bool matchConditionalSubToFMA(MachineInstr &MI,
+ ConditionalSubToFMAMatchInfo &MatchInfo) const;
+ void
+ applyConditionalSubToFMA(MachineInstr &MI,
+ const ConditionalSubToFMAMatchInfo &MatchInfo) const;
+
private:
#define GET_GICOMBINER_CLASS_MEMBERS
#define AMDGPUSubtarget GCNSubtarget
@@ -131,9 +146,10 @@ AMDGPUPostLegalizerCombinerImpl::AMDGPUPostLegalizerCombinerImpl(
MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
GISelValueTracking &VT, GISelCSEInfo *CSEInfo,
const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig,
- const GCNSubtarget &STI, MachineDominatorTree *MDT, const LegalizerInfo *LI)
+ const GCNSubtarget &STI, MachineDominatorTree *MDT,
+ const MachineLoopInfo *MLI, const LegalizerInfo *LI)
: Combiner(MF, CInfo, TPC, &VT, CSEInfo), RuleConfig(RuleConfig), STI(STI),
- TII(*STI.getInstrInfo()),
+ TII(*STI.getInstrInfo()), MLI(MLI),
Helper(Observer, B, /*IsPreLegalize*/ false, &VT, MDT, LI, STI),
#define GET_GICOMBINER_CONSTRUCTOR_INITS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
@@ -435,6 +451,112 @@ bool AMDGPUPostLegalizerCombinerImpl::matchCombine_s_mul_u64(
return false;
}
+// Match conditional subtraction patterns for FMA optimization:
+// result = a - (cond ? c : 0.0)
+// result = a + (cond ? -c : 0.0)
+// result = a + (-(cond ? c : 0.0))
+// to convert to:
+// result = fma((cond ? -1.0 : 0.0), c, a)
+//
+// This saves one v_cndmask per pattern on pre-gfx12.
+// The optimization doesn't make sense on gfx12 due to dual-issued v_cndmask.
+bool AMDGPUPostLegalizerCombinerImpl::matchConditionalSubToFMA(
+ MachineInstr &MI, ConditionalSubToFMAMatchInfo &MatchInfo) const {
+ if (!MLI || !STI.shouldUseConditionalSubToFMAF64())
+ return false;
+
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT Ty = MRI.getType(DstReg);
+ if (Ty != LLT::scalar(64))
+ return false;
+
+ Register A = MI.getOperand(1).getReg();
+ Register RHS = MI.getOperand(2).getReg();
+ MachineInstr *RHSMI = MRI.getVRegDef(RHS);
+ if (!RHSMI)
+ return false;
+
+ // Returns true if SelMI is a valid select with false value = 0.0.
+ auto matchSelectWithZero = [this, &MI](MachineInstr *SelMI, Register &Cond,
+ Register &TrueVal) -> bool {
+ if (!SelMI || SelMI->getOpcode() != TargetOpcode::G_SELECT)
+ return false;
+
+ Register FalseVal = SelMI->getOperand(3).getReg();
+ auto FalseConst = getFConstantVRegValWithLookThrough(FalseVal, MRI);
+ if (!FalseConst || !FalseConst->Value.isExactlyValue(0.0))
+ return false;
+
+ // Check if TrueVal is not constant.
+ // FIXME: This should work but currently generates incorrect code.
+ auto TempTrueVal = SelMI->getOperand(2).getReg();
+ auto TrueConst = getAnyConstantVRegValWithLookThrough(TempTrueVal, MRI);
+ if (TrueConst)
+ return false;
+
+ // Check if select and the add/sub are in the same loop context.
+ // Skipping optimization for loop-invariant select because it has no benefit
+  // in the loop: f_add is cheaper and requires fewer registers.
+ if (MLI->getLoopFor(MI.getParent()) != MLI->getLoopFor(SelMI->getParent()))
+ return false;
+
+ TrueVal = TempTrueVal;
+ Cond = SelMI->getOperand(1).getReg();
+ return true;
+ };
+
+ Register Cond, C;
+ if (MI.getOpcode() == TargetOpcode::G_FSUB) {
+ // Pattern: fsub a, (select cond, c, 0.0)
+ if (matchSelectWithZero(RHSMI, Cond, C)) {
+ MatchInfo = {Cond, C, A};
+ return true;
+ }
+ } else if (MI.getOpcode() == TargetOpcode::G_FADD) {
+ // Pattern 1: fadd a, (fneg (select cond, c, 0.0))
+ if (RHSMI->getOpcode() == TargetOpcode::G_FNEG) {
+ Register SelReg = RHSMI->getOperand(1).getReg();
+ MachineInstr *SelMI = MRI.getVRegDef(SelReg);
+ if (matchSelectWithZero(SelMI, Cond, C)) {
+ MatchInfo = {Cond, C, A};
+ return true;
+ }
+ }
+
+ // Pattern 2: fadd a, (select cond, (fneg c), 0.0)
+ if (matchSelectWithZero(RHSMI, Cond, C)) {
+ // Check if C is fneg
+ MachineInstr *CMI = MRI.getVRegDef(C);
+ if (CMI && CMI->getOpcode() == TargetOpcode::G_FNEG) {
+ C = CMI->getOperand(1).getReg();
+ MatchInfo = {Cond, C, A};
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+void AMDGPUPostLegalizerCombinerImpl::applyConditionalSubToFMA(
+ MachineInstr &MI, const ConditionalSubToFMAMatchInfo &MatchInfo) const {
+ Register Dst = MI.getOperand(0).getReg();
+ LLT Ty = MRI.getType(Dst);
+
+ // Build: correction = select cond, -1.0, 0.0
+ APFloat MinusOne = APFloat(-1.0);
+ APFloat Zero = APFloat(0.0);
+
+ Register MinusOneReg = B.buildFConstant(Ty, MinusOne).getReg(0);
+ Register ZeroReg = B.buildFConstant(Ty, Zero).getReg(0);
+ Register Correction =
+ B.buildSelect(Ty, MatchInfo.Cond, MinusOneReg, ZeroReg).getReg(0);
+
+ // Build: result = fma(correction, c, a)
+ B.buildFMA(Dst, Correction, MatchInfo.C, MatchInfo.A, MI.getFlags());
+
+ MI.eraseFromParent();
+}
+
// Pass boilerplate
// ================
@@ -467,6 +589,8 @@ void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
if (!IsOptNone) {
AU.addRequired<MachineDominatorTreeWrapperPass>();
AU.addPreserved<MachineDominatorTreeWrapperPass>();
+ AU.addRequired<MachineLoopInfoWrapperPass>();
+ AU.addPreserved<MachineLoopInfoWrapperPass>();
}
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -494,6 +618,8 @@ bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
MachineDominatorTree *MDT =
IsOptNone ? nullptr
: &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
+ MachineLoopInfo *MLI =
+ IsOptNone ? nullptr : &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
CombinerInfo CInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
LI, EnableOpt, F.hasOptSize(), F.hasMinSize());
@@ -503,7 +629,7 @@ bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
// Legalizer performs DCE, so a full DCE pass is unnecessary.
CInfo.EnableFullDCE = false;
AMDGPUPostLegalizerCombinerImpl Impl(MF, CInfo, TPC, *VT, /*CSEInfo*/ nullptr,
- RuleConfig, ST, MDT, LI);
+ RuleConfig, ST, MDT, MLI, LI);
return Impl.combineMachineInstrs();
}
@@ -513,6 +639,7 @@ INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelValueTrackingAnalysisLegacy)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
"Combine AMDGPU machine instrs after legalization", false,
false)
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index 62172a0bb89db..d00d9a49f5a09 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -37,6 +37,7 @@
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/InitializePasses.h"
using namespace llvm;
@@ -45,17 +46,33 @@ using namespace llvm;
namespace {
+static bool isImmConstant(const MachineOperand &Op, int64_t Val) {
+ return Op.isImm() && Op.getImm() == Val;
+}
+
class GCNPreRAOptimizationsImpl {
private:
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
MachineRegisterInfo *MRI;
LiveIntervals *LIS;
+ MachineLoopInfo *MLI;
bool processReg(Register Reg);
+ bool isSingleUseVReg(Register Reg) const {
+ return Reg.isVirtual() && MRI->hasOneUse(Reg);
+ }
+
+ bool isConstMove(MachineInstr &MI, int64_t C) const {
+ return TII->isFoldableCopy(MI) && isImmConstant(MI.getOperand(1), C);
+ }
+
+ bool revertConditionalFMAPattern(MachineInstr &FMAInstr);
+
public:
- GCNPreRAOptimizationsImpl(LiveIntervals *LS) : LIS(LS) {}
+ GCNPreRAOptimizationsImpl(LiveIntervals *LS, MachineLoopInfo *MLI)
+ : LIS(LS), MLI(MLI) {}
bool run(MachineFunction &MF);
};
@@ -75,6 +92,7 @@ class GCNPreRAOptimizationsLegacy : public MachineFunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<LiveIntervalsWrapperPass>();
+ AU.addRequired<MachineLoopInfoWrapperPass>();
AU.setPreservesAll();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -84,6 +102,7 @@ class GCNPreRAOptimizationsLegacy : public MachineFunctionPass {
INITIALIZE_PASS_BEGIN(GCNPreRAOptimizationsLegacy, DEBUG_TYPE,
"AMDGPU Pre-RA optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
INITIALIZE_PASS_END(GCNPreRAOptimizationsLegacy, DEBUG_TYPE,
"Pre-RA optimizations", false, false)
@@ -229,14 +248,16 @@ bool GCNPreRAOptimizationsLegacy::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
- return GCNPreRAOptimizationsImpl(LIS).run(MF);
+ MachineLoopInfo *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
+ return GCNPreRAOptimizationsImpl(LIS, MLI).run(MF);
}
PreservedAnalyses
GCNPreRAOptimizationsPass::run(MachineFunction &MF,
MachineFunctionAnalysisManager &MFAM) {
LiveIntervals *LIS = &MFAM.getResult<LiveIntervalsAnalysis>(MF);
- GCNPreRAOptimizationsImpl(LIS).run(MF);
+ MachineLoopInfo *MLI = &MFAM.getResult<MachineLoopAnalysis>(MF);
+ GCNPreRAOptimizationsImpl(LIS, MLI).run(MF);
return PreservedAnalyses::all();
}
@@ -260,6 +281,15 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
Changed |= processReg(Reg);
}
+ if (ST.shouldUseConditionalSubToFMAF64()) {
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : make_early_inc_range(MBB)) {
+ if (MI.getOpcode() == AMDGPU::V_FMAC_F64_e32)
+ Changed |= revertConditionalFMAPattern(MI);
+ }
+ }
+ }
+
if (!ST.useRealTrue16Insts())
return Changed;
@@ -295,3 +325,178 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
return Changed;
}
+
+/// Revert the conditional-subtraction-to-FMA optimization performed earlier in
+/// the selector. The FMA form uses more instructions and registers to hold
+/// constants than the original pattern, and only after the rematerializer does
+/// it become clear whether those constants are shared with other code.
+///
+/// Detects a pattern where an FMA is used to conditionally subtract a value:
+/// FMA(dst, cond ? -1.0 : 0.0, value, accum) -> accum - (cond ? value : 0)
+///
+/// Pattern detected:
+/// v_mov_b32_e32 vNegOneHi, 0xbff00000 ; -1.0 high bits (single use)
+/// v_mov_b32_e32 vMul.lo, 0 ; (single use)
+/// v_cndmask_b32_e64 vMul.hi, 0, vNegOneHi, vCondReg ; (single use)
+/// v_fmac_f64_e32 vDst[0:1], vMul[0:1], vValue[0:1], vAccum[0:1]
+///
+/// Transformed to (3 instructions instead of 4, lower register pressure):
+/// v_cndmask_b32_e64 vCondValue.lo, 0, vValue.lo, vCondReg
+/// v_cndmask_b32_e64 vCondValue.hi, 0, vValue.hi, vCondReg
+/// v_add_f64_e64 vDst[0:1], vAccum[0:1], -vCondValue[0:1]
+///
+/// For loops: if both constants are initialized before the loop containing the
+/// v_fmac, we keep the FMA pattern. We ignore the case where the v_fmac and the
+/// v_cndmask are in different loop contexts, since the selector does not
+/// generate the pattern when the v_cndmask is loop invariant.
+bool GCNPreRAOptimizationsImpl::revertConditionalFMAPattern(
+ MachineInstr &FMAInstr) {
+ assert(FMAInstr.getOpcode() == AMDGPU::V_FMAC_F64_e32);
+
+ MachineOperand *MulOp = TII->getNamedOperand(FMAInstr, AMDGPU::OpName::src0);
+ assert(MulOp);
+ if (!MulOp->isReg() || !isSingleUseVReg(MulOp->getReg()))
+ return false;
+
+ // Find subregister definitions for the 64-bit multiplicand register
+ MachineInstr *MulLoDefMI = nullptr;
+ MachineInstr *MulHiDefMI = nullptr;
+
+ for (auto &DefMI : MRI->def_instructions(MulOp->getReg())) {
+ if (DefMI.getOperand(0).getSubReg() == AMDGPU::sub0) {
+ MulLoDefMI = &DefMI;
+ } else if (DefMI.getOperand(0).getSubReg() == AMDGPU::sub1) {
+ MulHiDefMI = &DefMI;
+ }
+ }
+
+ if (!MulLoDefMI || !isConstMove(*MulLoDefMI, 0))
+ return false;
+
+ if (!MulHiDefMI || MulHiDefMI->getOpcode() != AMDGPU::V_CNDMASK_B32_e64)
+ return false;
+
+ MachineInstr *CndMaskMI = MulHiDefMI;
+ MachineOperand *CndMaskFalseOp =
+ TII->getNamedOperand(*CndMaskMI, AMDGPU::OpName::src0);
+ assert(CndMaskFalseOp);
+ if (!isImmConstant(*CndMaskFalseOp, 0))
+ return false;
+
+ MachineOperand *CndMaskTrueOp =
+ TII->getNamedOperand(*CndMaskMI, AMDGPU::OpName::src1);
+ assert(CndMaskTrueOp);
+ if (!isSingleUseVReg(CndMaskTrueOp->getReg()))
+ return false;
+
+ // Check that the true operand is -1.0's high 32 bits (0xbff00000)
+ MachineOperand *NegOneHiDef = MRI->getOneDef(CndMaskTrueOp->getReg());
+ if (!NegOneHiDef ||
+ !isConstMove(*NegOneHiDef->getParent(), -1074790400 /* 0xbff00000 */))
+ return false;
+
+ MachineInstr *NegOneHiMovMI = NegOneHiDef->getParent();
+
+ if (MachineLoop *L = MLI->getLoopFor(FMAInstr.getParent())) {
+ // The selector skips optimization if 'select' is loop invariant, so this is
+ // more like an assert.
+ if (MLI->getLoopFor(CndMaskMI->getParent()) != L)
+ return false;
+
+ // If both constants are initialized before the loop it's still beneficial
+ // to keep the pattern.
+ if (MLI->getLoopFor(NegOneHiMovMI->getParent()) != L &&
+ MLI->getLoopFor(MulLoDefMI->getParent()) != L)
+ return false;
+ }
+
+ // Perform the revert
+ auto *DstOpnd = TII->getNamedOperand(FMAInstr, AMDGPU::OpName::vdst);
+ auto *ValueOpnd = TII->getNamedOperand(FMAInstr, AMDGPU::OpName::src1);
+ auto *AccumOpnd = TII->getNamedOperand(FMAInstr, AMDGPU::OpName::src2);
+ auto *CondOpnd = TII->getNamedOperand(*CndMaskMI, AMDGPU::OpName::src2);
+ assert(DstOpnd && ValueOpnd && AccumOpnd && CondOpnd);
+
+ Register DstReg = DstOpnd->getReg();
+ Register ValueReg = ValueOpnd->getReg();
+ Register AccumReg = AccumOpnd->getReg();
+ Register CondReg = CondOpnd->getReg();
+
+ // Create a new 64-bit register for the conditional value
+ Register CondValueReg =
+ MRI->createVirtualRegister(MRI->getRegClass(ValueReg));
+
+ MachineBasicBlock::iterator InsertPt = FMAInstr.getIterator();
+ DebugLoc DL = FMAInstr.getDebugLoc();
+
+ // Build: vCondValue.lo = condition ? vValue.lo : 0
+ MachineBasicBlock *MBB = FMAInstr.getParent();
+ MachineInstr *SelLo =
+ BuildMI(*MBB, InsertPt, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64))
+ .addReg(CondValueReg, RegState::DefineNoRead, AMDGPU::sub0)
+ .addImm(0) // src0_modifiers
+ .addImm(0) // src0 (false value = 0)
+ .addImm(0) // src1_modifiers
+ .addReg(ValueReg, 0, AMDGPU::sub0) // src1 (true value = vValue.lo)
+ .addReg(CondReg) // condition
+ .getInstr();
+
+ // Build: vCondValue.hi = condition ? vValue.hi : 0
+ MachineInstr *SelHi =
+ BuildMI(*MBB, InsertPt, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64))
+ .addReg(CondValueReg, RegState::Define, AMDGPU::sub1)
+ .addImm(0) // src0_modifiers
+ .addImm(0) // src0 (false value = 0)
+ .addImm(0) // src1_modifiers
+ .addReg(ValueReg, 0, AMDGPU::sub1) // src1 (true value = vValue.hi)
+ .addReg(CondReg) // condition
+ .getInstr();
+
+ // Build: vDst = vAccum - vCondValue (negation via src1_modifiers bit)
+ MachineInstr *Sub =
+ BuildMI(*MBB, InsertPt, DL, TII->get(AMDGPU::V_ADD_F64_e64))
+ .addReg(DstReg, RegState::Define)
+ .addImm(0) // src0_modifiers
+ .addReg(AccumReg) // src0 (accumulator)
+ .addImm(1) // src1_modifiers (negation bit)
+ .addReg(CondValueReg) // src1 (negated conditional value)
+ .addImm(0) // clamp
+ .addImm(0) // omod
+ .getInstr();
+
+ // Delete the old instructions
+ for (MachineInstr *MI : {&FMAInstr, MulLoDefMI, CndMaskMI, NegOneHiMovMI}) {
+ LIS->RemoveMachineInstrFromMaps(*MI);
+ ...
[truncated]
You can test this locally with the following command:

  git diff -U0 --pickaxe-regex -S '([^a-zA-Z0-9#_-]undef([^a-zA-Z0-9_-]|$)|UndefValue::get)' 'HEAD~1' HEAD llvm/test/CodeGen/AMDGPU/GlobalISel/fma-cond-sub.ll llvm/test/CodeGen/AMDGPU/fma-cond-sub.ll llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp llvm/lib/Target/AMDGPU/GCNSubtarget.h llvm/lib/Target/AMDGPU/SIISelLowering.cpp

The following files introduce new uses of undef:

Undef is now deprecated and should only be used in the rare cases where no replacement is possible. For example, a load of uninitialized memory yields undef.

In tests, avoid using undef. For example, this is considered a bad practice:

  define void @fn() {
    ...
    br i1 undef, ...
  }

Please use the following instead:

  define void @fn(i1 %cond) {
    ...
    br i1 %cond, ...
  }

Please refer to the Undefined Behavior Manual for more information.
🐧 Linux x64 Test Results
This converts:

  result = a - (cond ? c : 0.0)
  result = a + (cond ? -c : 0.0)
  result = a + (-(cond ? c : 0.0))

to

  result = fma((cond ? -1.0 : 0.0), c, a)

saving one v_cndmask instruction per pattern. This only fires on pre-gfx12, as gfx12+ has dual-issued v_cndmask.

This gives a 5% improvement in SPEC ACCEL 450.md on MI350. Originally proposed by Michael Selehov.

In assembly, the transformation replaces two v_cndmask_b32 and a negating v_add_f64 with two v_mov_b32 constant loads, one v_cndmask_b32, and a v_fmac_f64.

Since the resulting pattern has one more instruction and requires 7 VGPRs instead of 6, it is only beneficial when the constants can be shared with other code or hoisted out of a loop. For this reason a reverting pass has been implemented: it runs after the scheduler's rematerializer, when it becomes clear whether those constants are shared with other code.

The reverting pass seems a bit of an overkill to me: since it won't be exercised most of the time, it's an excellent place for errors. We could probably do this transform only in loops and not revert at all.
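To make the loop case concrete, here is a hypothetical source-level loop in the shape the PR title describes (the function name, signature, and data layout are invented for illustration, not taken from the patch). Inside such a loop the -1.0/0.0 constant halves can be hoisted out and shared across iterations, so per iteration the transform trades two v_cndmask_b32 plus a v_add_f64 for one v_cndmask_b32 plus a v_fmac_f64:

  // Hypothetical f64 loop containing the pattern from the PR title,
  // "if (cond) a -= c". After if-conversion the body becomes
  //   acc = acc - (cond[i] ? c[i] : 0.0)
  // which the combine rewrites (per the description above) to
  //   acc = fma(cond[i] ? -1.0 : 0.0, c[i], acc)
  void conditional_accumulate(double *out, const double *c,
                              const bool *cond, int n) {
    double acc = 0.0;
    for (int i = 0; i < n; ++i) {
      if (cond[i])
        acc -= c[i];
      out[i] = acc;
    }
  }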