[AMDGPU][GISel] Fold 'min(min(x,y),z)' and 'max(max(x,y),z)' into min3 and max3 #200410
[AMDGPU][GISel] Fold 'min(min(x,y),z)' and 'max(max(x,y),z)' into min3 and max3 #200410 — xiongzile wants to merge 2 commits into
Conversation
|
@llvm/pr-subscribers-llvm-regalloc @llvm/pr-subscribers-backend-amdgpu Author: Elio (xiongzile). Changes: Original PR: #124263. Do the optimization in the pre-legalizer phase. Patch is 23.43 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/200410.diff 9 Files Affected:
diff --git a/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp b/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp
index bc6b5df99d2e7..fb1bff27f7e8b 100644
--- a/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp
@@ -455,7 +455,6 @@ RegBankSelect::MappingCost RegBankSelect::computeMapping(
if (!InstrMapping.isValid())
return MappingCost::ImpossibleCost();
-
// If mapped with InstrMapping, MI will have the recorded cost.
MappingCost Cost(MBFI ? MBFI->getBlockFreq(MI.getParent())
: BlockFrequency(1));
@@ -597,12 +596,12 @@ bool RegBankSelect::applyMapping(
SmallVectorImpl<RegBankSelect::RepairingPlacement> &RepairPts) {
// OpdMapper will hold all the information needed for the rewriting.
std::optional<RegisterBankInfo::OperandsMapper> OpdMapper;
-
// First, place the repairing code.
for (RepairingPlacement &RepairPt : RepairPts) {
if (!RepairPt.canMaterialize() ||
RepairPt.getKind() == RepairingPlacement::Impossible)
return false;
+
assert(RepairPt.getKind() != RepairingPlacement::None &&
"This should not make its way in the list");
unsigned OpIdx = RepairPt.getOpIdx();
@@ -642,7 +641,6 @@ bool RegBankSelect::applyMapping(
LLVM_DEBUG(dbgs() << "Actual mapping of the operands: " << *OpdMapper
<< '\n');
RBI->applyMapping(MIRBuilder, *OpdMapper);
-
return true;
}
diff --git a/llvm/lib/CodeGen/RegisterBankInfo.cpp b/llvm/lib/CodeGen/RegisterBankInfo.cpp
index 1049aa979ce81..29afe727b6bf1 100644
--- a/llvm/lib/CodeGen/RegisterBankInfo.cpp
+++ b/llvm/lib/CodeGen/RegisterBankInfo.cpp
@@ -407,6 +407,7 @@ RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
if (Mapping.isValid())
return Mapping;
+
llvm_unreachable("The target must implement this");
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index a2e6e6f448e8f..dea1f417e4149 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -157,6 +157,23 @@ def zext_of_shift_amount_combines : GICombineGroup<[
canonicalize_zext_lshr, canonicalize_zext_ashr, canonicalize_zext_shl
]>;
+def minmax3_matchdata : GIDefMatchData<"VOP3MatchInfo">;
+
+// Combine rule template, instantiated once per min/max opcode.
+// The root is any instruction with opcode `minmaxOpcode`; the C++ predicate
+// matchMinMaxToMinMax3 decides whether the rule fires and fills $matchinfo,
+// and applyVOP3 performs the rewrite from that match data.
+class minmax_to_minmax3_opcodes<Instruction minmaxOpcode> : GICombineRule<
+ (defs root:$min_or_max, minmax3_matchdata:$matchinfo),
+ (match (minmaxOpcode $dst, $lhs, $rhs):$min_or_max,
+ [{ return matchMinMaxToMinMax3(*${min_or_max}, ${matchinfo}); }]),
+ (apply [{ applyVOP3(*${min_or_max}, ${matchinfo}); }])>;
+
+// One rule per supported root opcode: signed/unsigned integer min/max and
+// the plain and _IEEE floating-point minnum/maxnum variants.
+def smax_to_minmax3 : minmax_to_minmax3_opcodes<G_SMAX>;
+def smin_to_minmax3 : minmax_to_minmax3_opcodes<G_SMIN>;
+def umax_to_minmax3 : minmax_to_minmax3_opcodes<G_UMAX>;
+def umin_to_minmax3 : minmax_to_minmax3_opcodes<G_UMIN>;
+def fmax_to_minmax3 : minmax_to_minmax3_opcodes<G_FMAXNUM>;
+def fmin_to_minmax3 : minmax_to_minmax3_opcodes<G_FMINNUM>;
+def fmax_ieee_to_minmax3 : minmax_to_minmax3_opcodes<G_FMAXNUM_IEEE>;
+def fmin_ieee_to_minmax3 : minmax_to_minmax3_opcodes<G_FMINNUM_IEEE>;
+
// (and/or i64:x, i64:y) -> i64:(merge (and/or lo_32(x), lo_32(y)), (and/or hi_32(x), hi_32(y)))
// when either x or y is all ones in low or high parts
class combine_binop_s64_with_s32_mask<Instruction opcode> : GICombineRule<
@@ -219,7 +236,9 @@ def AMDGPUPreLegalizerCombiner: GICombiner<
"AMDGPUPreLegalizerCombinerImpl",
[all_combines, combine_fmul_with_select_to_fldexp, clamp_i64_to_i16,
foldable_fneg, combine_shuffle_vector, combine_shuffle_vector_to_build_vector,
- binop_s64_with_s32_mask_combines, combine_or_s64_s32]> {
+ binop_s64_with_s32_mask_combines, combine_or_s64_s32, smax_to_minmax3,
+ smin_to_minmax3, umax_to_minmax3, umin_to_minmax3, fmax_to_minmax3,
+ fmin_to_minmax3, fmax_ieee_to_minmax3, fmin_ieee_to_minmax3]> {
let CombineAllMethodName = "tryCombineAllImpl";
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 51a8a476bbf7e..97957f5100067 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -271,6 +271,12 @@ def : GINodeEquiv<G_AMDGPU_CVT_PK_I16_I32, AMDGPUpk_i16_i32_impl>;
def : GINodeEquiv<G_AMDGPU_SMED3, AMDGPUsmed3>;
def : GINodeEquiv<G_AMDGPU_UMED3, AMDGPUumed3>;
def : GINodeEquiv<G_AMDGPU_FMED3, AMDGPUfmed3_impl>;
+def : GINodeEquiv<G_AMDGPU_SMAX3, AMDGPUsmax3>;
+def : GINodeEquiv<G_AMDGPU_UMAX3, AMDGPUumax3>;
+def : GINodeEquiv<G_AMDGPU_FMAX3, AMDGPUfmax3>;
+def : GINodeEquiv<G_AMDGPU_SMIN3, AMDGPUsmin3>;
+def : GINodeEquiv<G_AMDGPU_UMIN3, AMDGPUumin3>;
+def : GINodeEquiv<G_AMDGPU_FMIN3, AMDGPUfmin3>;
def : GINodeEquiv<G_AMDGPU_CLAMP, AMDGPUclamp>;
def : GINodeEquiv<G_AMDGPU_ATOMIC_CMPXCHG, AMDGPUatomic_cmp_swap>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
index 67c0bdd35f367..70aeb80bbeace 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
@@ -16,6 +16,7 @@
#include "AMDGPULegalizerInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
@@ -23,8 +24,12 @@
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/Register.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGenTypes/LowLevelType.h"
#include "llvm/Target/TargetMachine.h"
#define GET_GICOMBINER_DEPS
@@ -66,6 +71,11 @@ class AMDGPUPreLegalizerCombinerImpl : public Combiner {
Register Origin;
};
+  /// Match data handed from matchMinMaxToMinMax3 to applyVOP3: the opcode of
+  /// the three-operand instruction to build and its three source registers.
+  struct VOP3MatchInfo {
+    unsigned Opc;
+    Register Val0;
+    Register Val1;
+    Register Val2;
+  };
+
bool matchClampI64ToI16(MachineInstr &MI, const MachineRegisterInfo &MRI,
const MachineFunction &MF,
ClampI64ToI16MatchInfo &MatchInfo) const;
@@ -73,6 +83,10 @@ class AMDGPUPreLegalizerCombinerImpl : public Combiner {
void applyClampI64ToI16(MachineInstr &MI,
const ClampI64ToI16MatchInfo &MatchInfo) const;
+ bool matchMinMaxToMinMax3(MachineInstr &MI, VOP3MatchInfo &MatchInfo) const;
+
+ void applyVOP3(MachineInstr &MI, VOP3MatchInfo &MatchInfo) const;
+
private:
#define GET_GICOMBINER_CLASS_MEMBERS
#define AMDGPUSubtarget GCNSubtarget
@@ -106,6 +120,66 @@ bool AMDGPUPreLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const {
return false;
}
+/// Match \p MI as a two-level chain of the binary opcode \p op and bind the
+/// three leaf registers to \p r0, \p r1 and \p r2.
+///
+/// Two shapes are tried in order, each requiring the nested instruction to
+/// satisfy m_OneNonDBGUse:
+///   op(op(r0, r1), r2)
+///   op(r0, op(r1, r2))
+///
+/// \returns true if either shape matched.
+static bool matchVOP3(MachineInstr &MI, MachineRegisterInfo &MRI, unsigned op,
+                      Register &r0, Register &r1, Register &r2) {
+  // Nested operation on the left-hand side.
+  if (mi_match(MI, MRI,
+               m_BinOp(op, m_OneNonDBGUse(m_BinOp(op, m_Reg(r0), m_Reg(r1))),
+                       m_Reg(r2))))
+    return true;
+
+  // Otherwise, nested operation on the right-hand side.
+  return mi_match(
+      MI, MRI,
+      m_BinOp(op, m_Reg(r0),
+              m_OneNonDBGUse(m_BinOp(op, m_Reg(r1), m_Reg(r2)))));
+}
+
+/// Translate a two-operand min/max opcode into the matching AMDGPU
+/// three-operand opcode. The plain and _IEEE floating-point variants share
+/// one three-operand opcode each. Any other opcode is a programming error.
+static unsigned getMinMax3(unsigned Opc) {
+  switch (Opc) {
+  // Integer minimum.
+  case AMDGPU::G_SMIN:
+    return AMDGPU::G_AMDGPU_SMIN3;
+  case AMDGPU::G_UMIN:
+    return AMDGPU::G_AMDGPU_UMIN3;
+  // Integer maximum.
+  case AMDGPU::G_SMAX:
+    return AMDGPU::G_AMDGPU_SMAX3;
+  case AMDGPU::G_UMAX:
+    return AMDGPU::G_AMDGPU_UMAX3;
+  // Floating-point minimum.
+  case AMDGPU::G_FMINNUM:
+  case AMDGPU::G_FMINNUM_IEEE:
+    return AMDGPU::G_AMDGPU_FMIN3;
+  // Floating-point maximum.
+  case AMDGPU::G_FMAXNUM:
+  case AMDGPU::G_FMAXNUM_IEEE:
+    return AMDGPU::G_AMDGPU_FMAX3;
+  default:
+    break;
+  }
+  llvm_unreachable("Unsupported opcode");
+}
+
+/// Replace \p MI with the three-operand instruction described by
+/// \p MatchInfo. The new instruction defines MI's operand 0 and carries MI's
+/// flags; MI is erased afterwards.
+void AMDGPUPreLegalizerCombinerImpl::applyVOP3(MachineInstr &MI,
+                                               VOP3MatchInfo &MatchInfo) const {
+  // Read everything needed from MI before it is erased.
+  const auto Flags = MI.getFlags();
+  B.buildInstr(MatchInfo.Opc, {MI.getOperand(0)},
+               {MatchInfo.Val0, MatchInfo.Val1, MatchInfo.Val2}, Flags);
+  MI.eraseFromParent();
+}
+
+/// Decide whether the min/max instruction \p MI can be rewritten as a
+/// three-operand min3/max3, filling \p MatchInfo on success.
+///
+/// \returns true when the result type is s32, or s16 on a subtarget where
+/// hasMin3Max3_16() holds, and matchVOP3 finds a nested operand with the same
+/// opcode. \p MatchInfo is left untouched when false is returned.
+bool AMDGPUPreLegalizerCombinerImpl::matchMinMaxToMinMax3(
+    MachineInstr &MI, VOP3MatchInfo &MatchInfo) const {
+  const LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+  const bool Is16Bit = Ty == LLT::scalar(16);
+  // 16-bit forms depend on subtarget support; every other type must be s32.
+  if (Is16Bit ? !STI.hasMin3Max3_16() : Ty != LLT::scalar(32))
+    return false;
+
+  const unsigned Opc = MI.getOpcode();
+  Register Src0, Src1, Src2;
+  if (!matchVOP3(MI, MRI, Opc, Src0, Src1, Src2))
+    return false;
+
+  MatchInfo = {getMinMax3(Opc), Src0, Src1, Src2};
+  return true;
+}
+
bool AMDGPUPreLegalizerCombinerImpl::matchClampI64ToI16(
MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineFunction &MF,
ClampI64ToI16MatchInfo &MatchInfo) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index a24df782cf28a..e4a037dc9af83 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -75,6 +75,7 @@
#include "AMDGPUInstrInfo.h"
#include "AMDGPULaneMaskUtils.h"
#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
@@ -83,6 +84,7 @@
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/RegisterBank.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include <cassert>
#define GET_TARGET_REGBANK_IMPL
#include "AMDGPUGenRegisterBank.inc"
@@ -3959,6 +3961,28 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
switch (MI.getOpcode()) {
default:
return getInvalidInstructionMapping();
+  // Register-bank mapping for the three-operand min/max pseudos. Each source
+  // operand keeps the bank it already has; the result uses the bank of the
+  // first source.
+  case AMDGPU::G_AMDGPU_SMAX3:
+  case AMDGPU::G_AMDGPU_SMIN3:
+  case AMDGPU::G_AMDGPU_UMAX3:
+  case AMDGPU::G_AMDGPU_UMIN3:
+  case AMDGPU::G_AMDGPU_FMAX3:
+  case AMDGPU::G_AMDGPU_FMIN3: {
+    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+    // The pre-legalizer combine creates these for s32, and for s16 when the
+    // subtarget reports hasMin3Max3_16(), so both widths must be accepted.
+    assert((Size == 16 || Size == 32) && "unexpected min3/max3 operand size");
+    // NOTE(review): VCCRegBankID is an unusual fallback bank for a 16/32-bit
+    // value; confirm whether the generic VALU mapping used by other
+    // three-operand pseudos should be returned here instead.
+    auto op1 = getRegBankID(MI.getOperand(1).getReg(), MRI,
+                            AMDGPU::VCCRegBankID);
+    auto op2 = getRegBankID(MI.getOperand(2).getReg(), MRI,
+                            AMDGPU::VCCRegBankID);
+    auto op3 = getRegBankID(MI.getOperand(3).getReg(), MRI,
+                            AMDGPU::VCCRegBankID);
+    // No unconditional MI.dump() and no bank-equality assertion here: each
+    // source operand is mapped independently below, so mixed banks are valid
+    // input to this query.
+    OpdsMapping[0] = AMDGPU::getValueMapping(op1, Size);
+    OpdsMapping[1] = AMDGPU::getValueMapping(op1, Size);
+    OpdsMapping[2] = AMDGPU::getValueMapping(op2, Size);
+    OpdsMapping[3] = AMDGPU::getValueMapping(op3, Size);
+    break;
+  }
case AMDGPU::G_AND:
case AMDGPU::G_OR:
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 148f15014b823..f24c2196d0af1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -4702,6 +4702,42 @@ def G_AMDGPU_FMED3 : AMDGPUGenericInstruction {
let hasSideEffects = 0;
}
+// Three-operand min/max generic pseudos. Each takes three sources and
+// produces one result, all of the same type (type0), and has no side
+// effects.
+
+// Signed integer minimum of three values.
+def G_AMDGPU_SMIN3 : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2);
+ let hasSideEffects = 0;
+}
+
+// Unsigned integer minimum of three values.
+def G_AMDGPU_UMIN3 : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2);
+ let hasSideEffects = 0;
+}
+
+// Floating-point minimum of three values.
+def G_AMDGPU_FMIN3 : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2);
+ let hasSideEffects = 0;
+}
+
+// Signed integer maximum of three values.
+def G_AMDGPU_SMAX3 : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2);
+ let hasSideEffects = 0;
+}
+
+// Unsigned integer maximum of three values.
+def G_AMDGPU_UMAX3 : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2);
+ let hasSideEffects = 0;
+}
+
+// Floating-point maximum of three values.
+def G_AMDGPU_FMAX3 : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2);
+ let hasSideEffects = 0;
+}
+
def G_AMDGPU_CLAMP : AMDGPUGenericInstruction {
let OutOperandList = (outs type0:$dst);
let InOperandList = (ins type0:$src);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmin3-fmax3-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmin3-fmax3-combine.ll
new file mode 100644
index 0000000000000..68752a1c48fa0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmin3-fmax3-combine.ll
@@ -0,0 +1,103 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
+
+define float @test_fmin3(float %a, float %b, float %c) {
+; GFX10-LABEL: test_fmin3:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX10-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT: v_min3_f32 v0, v0, v1, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %min1 = call float @llvm.minnum.f32(float %a, float %b)
+ %min2 = call float @llvm.minnum.f32(float %min1, float %c)
+ ret float %min2
+}
+
+define float @test_fmin3_nnan(float %a, float %b, float %c) {
+; GFX10-LABEL: test_fmin3_nnan:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_min3_f32 v0, v0, v1, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %min1 = call nnan float @llvm.minnum.f32(float %a, float %b)
+ %min2 = call nnan float @llvm.minnum.f32(float %min1, float %c)
+ ret float %min2
+}
+
+define float @test_fmin3_with_constants_nnan(float %a, float %b) {
+; GFX10-LABEL: test_fmin3_with_constants_nnan:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_min3_f32 v0, v0, v1, 0x40e00000
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %min1 = call nnan float @llvm.minnum.f32(float %a, float %b)
+ %min2 = call nnan float @llvm.minnum.f32(float %min1, float 7.0)
+ ret float %min2
+}
+
+define <2 x float> @test_fmin3_v2f32_nnan(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; GFX10-LABEL: test_fmin3_v2f32_nnan:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_min3_f32 v0, v0, v2, v4
+; GFX10-NEXT: v_min3_f32 v1, v1, v3, v5
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %min1 = call nnan <2 x float> @llvm.minnum.v2f32(<2 x float> %a, <2 x float> %b)
+ %min2 = call nnan <2 x float> @llvm.minnum.v2f32(<2 x float> %min1, <2 x float> %c)
+ ret <2 x float> %min2
+}
+
+define float @test_fmax3(float %a, float %b, float %c) {
+; GFX10-LABEL: test_fmax3:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX10-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT: v_max3_f32 v0, v0, v1, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %max1 = call float @llvm.maxnum.f32(float %a, float %b)
+ %max2 = call float @llvm.maxnum.f32(float %max1, float %c)
+ ret float %max2
+}
+
+define float @test_fmax3_nnan(float %a, float %b, float %c) {
+; GFX10-LABEL: test_fmax3_nnan:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_max3_f32 v0, v0, v1, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %max1 = call nnan float @llvm.maxnum.f32(float %a, float %b)
+ %max2 = call nnan float @llvm.maxnum.f32(float %max1, float %c)
+ ret float %max2
+}
+
+define float @test_fmax3_with_constants_nnan(float %a, float %b) {
+; GFX10-LABEL: test_fmax3_with_constants_nnan:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_max3_f32 v0, v0, v1, 0x40e00000
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %max1 = call nnan float @llvm.maxnum.f32(float %a, float %b)
+ %max2 = call nnan float @llvm.maxnum.f32(float %max1, float 7.0)
+ ret float %max2
+}
+
+define <2 x float> @test_fmax3_v2f32_nnan(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; GFX10-LABEL: test_fmax3_v2f32_nnan:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_max3_f32 v0, v0, v2, v4
+; GFX10-NEXT: v_max3_f32 v1, v1, v3, v5
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %min1 = call nnan <2 x float> @llvm.maxnum.v2f32(<2 x float> %a, <2 x float> %b)
+ %min2 = call nnan <2 x float> @llvm.maxnum.v2f32(<2 x float> %min1, <2 x float> %c)
+ ret <2 x float> %min2
+}
+
+declare float @llvm.minnum.f32(float, float)
+declare float @llvm.maxnum.f32(float, float)
+declare <2 x float> @llvm.minnum.v2f32(<2 x float> %a, <2 x float> %b)
+declare <2 x float> @llvm.maxnum.v2f32(<2 x float> %a, <2 x float> %b)
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/min3-max3-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/min3-max3-combine.ll
new file mode 100644
index 0000000000000..e4165c090c399
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/min3-max3-combine.ll
@@ -0,0 +1,171 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
+
+define i32 @test_smin3(i32 %a, i32 %b, i32 %c) {
+; GFX10-LABEL: test_smin3:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_min3_i32 v0, v0, v1, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %min1 = call i32 @llvm.smin.i32(i32 %a, i32 %b)
+ %min2 = call i32 @llvm.smin.i32(i32 %min1, i32 %c)
+ ret i32 %min2
+}
+
+define i32 @test_smin3_with_constants(i32 %a, i32 %b) {
+; GFX10-LABEL: test_smin3_with_constants:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_min3_i32 v0, v0, v1, 7
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %min1 = call i32 @llvm.smin.i32(i32 %a, i32 %b)
+ %min2 = call i32 @llvm.smin.i32(i32 %min1, i32 7)
+ ret i32 %min2
+}
+
+define i32 @test_smin3_smin_umin(i32 %a, i32 %b) {
+; GFX10-LABEL: test_smin3_smin_umin:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_min_i32_e32 v0, v0, v1
+; GFX10-NEXT: v_min_u32_e32 v0, 7, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %min1 = call i32 @llvm.smin.i32(i32 %a, i32 %b)
+ %min2 = call i32 @llvm.umin.i32(i32 %min1, i32 7)
+ ret i32 %min2
+}
+
+define <2 x i16> @test_smin3_v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) {
+; GFX10-LABEL: test_smin3_v2i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_pk_min_i16 v0, v0, v1
+; GFX10-NEXT: v_pk_min_i16 v0, v0, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %min1 = call <2 x i16> @llvm.smin.v2i16(<2 x i16> %a, <2 x i16> %b)
+ %min2 = call <2 x i16> @llvm.smin.v2i16(<2 x i16> %min1, <2 x i16> %c)
+ ret <2 x i16> %min2
+}
+
+define i32 @test_smax3(i32 %a, i32 %b, i32 %c) {
+; GFX10-LABEL: test_smax3:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_max3_i32 v0, v0, v1, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %max1 = call i32 @llvm.smax.i32(i32 %a, i32 %b)
+ %max2 = call i32 @llvm.smax.i32(i32 %max1, i32 %c)
+ ret i32 %max2
+}
+
+define i32 @test_smax3_with_constants(i32 %a, i32 %b) {
+; GFX10-LABEL: test_smax3_with_constants:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_max3_i32 v0, v0, v1, 7
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %min1 = call i32 @llvm.smax.i32(i32 %a, i32 %b)
+ %min2 = call i32 @llvm.smax.i32(i32 %min1, i32 7)
+ ret i32 %min2
+}
+
+define i32 @test_smin3_smax_umax(i32 %a, i32 %b) {
+; GFX10-LABEL: test_smin3_smax_umax:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_max_i32_e32 v0, v0, v1
+; GFX10-NEXT: v_max_u32_e32 v0, 7, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %min1 = call i32 @llvm.smax...
[truncated]
|
You can test this locally with the following command: git-clang-format --diff origin/main HEAD --extensions cpp -- llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp --diff_from_common_commit
View the diff from clang-format here.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
index 8fd0af7fd..060dccf85 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
@@ -74,7 +74,6 @@ public:
void applyClampI64ToI16(MachineInstr &MI,
const ClampI64ToI16MatchInfo &MatchInfo) const;
-
private:
#define GET_GICOMBINER_CLASS_MEMBERS
#define AMDGPUSubtarget GCNSubtarget
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index 291397965..3c2be68e2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -98,8 +98,10 @@ public:
bool applyD16Load(unsigned D16Opc, MachineInstr &DstMI,
MachineInstr *SmallLoad, Register ToOverwriteD16) const;
- bool matchMinMaxToMinMax3(MachineInstr &MI, MinMaxToMinMax3MatchInfo &MatchInfo) const;
- void applyMinMaxToMinMax3(MachineInstr &MI, MinMaxToMinMax3MatchInfo &MatchInfo) const;
+ bool matchMinMaxToMinMax3(MachineInstr &MI,
+ MinMaxToMinMax3MatchInfo &MatchInfo) const;
+ void applyMinMaxToMinMax3(MachineInstr &MI,
+ MinMaxToMinMax3MatchInfo &MatchInfo) const;
private:
SIModeRegisterDefaults getMode() const;
@@ -517,8 +519,9 @@ bool AMDGPURegBankCombinerImpl::matchMinMaxToMinMax3(
Register R0, R1, R2;
unsigned opc = MI.getOpcode();
- auto matchMinOrMax3 = [&](MachineInstr &MI, MachineRegisterInfo &MRI, unsigned op,
- Register &r0, Register &r1, Register &r2) {
+ auto matchMinOrMax3 = [&](MachineInstr &MI, MachineRegisterInfo &MRI,
+ unsigned op, Register &r0, Register &r1,
+ Register &r2) {
auto p1 = m_BinOp(op, m_OneNonDBGUse(m_BinOp(op, m_Reg(r0), m_Reg(r1))),
m_Reg(r2));
auto p2 = m_BinOp(op, m_Reg(r0),
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 2b4674736..1c7add192 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3967,12 +3967,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_AMDGPU_FMIN3: {
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
assert(Size == 32);
- auto op1 = getRegBankID(MI.getOperand(1).getReg(), MRI,
- AMDGPU::VCCRegBankID);
- auto op2 = getRegBankID(MI.getOperand(2).getReg(), MRI,
- AMDGPU::VCCRegBankID);
- auto op3 = getRegBankID(MI.getOperand(3).getReg(), MRI,
- AMDGPU::VCCRegBankID);
+ auto op1 =
+ getRegBankID(MI.getOperand(1).getReg(), MRI, AMDGPU::VCCRegBankID);
+ auto op2 =
+ getRegBankID(MI.getOperand(2).getReg(), MRI, AMDGPU::VCCRegBankID);
+ auto op3 =
+ getRegBankID(MI.getOperand(3).getReg(), MRI, AMDGPU::VCCRegBankID);
OpdsMapping[0] = AMDGPU::getValueMapping(op1, Size);
OpdsMapping[1] = AMDGPU::getValueMapping(op1, Size);
OpdsMapping[2] = AMDGPU::getValueMapping(op2, Size);
|
9f49607 to
8ff7b88
Compare
| } | ||
| } | ||
|
|
||
| void AMDGPUPreLegalizerCombinerImpl::applyVOP3(MachineInstr &MI, |
There was a problem hiding this comment.
I would hope we can write this directly inline, without the C++. I also wouldn't use the VOP3 name; it is overloaded and refers to an only indirectly related encoding.
There was a problem hiding this comment.
Good point. I'll change it to pattern form when it's ready to merge.
🐧 Linux x64 Test Results
Failed Tests (click on a test name to see its output): LLVM — LLVM.CodeGen/AMDGPU/GlobalISel/fmin3-fmax3-combine.ll, LLVM.CodeGen/AMDGPU/ctlz.ll, LLVM.CodeGen/AMDGPU/cttz.ll, LLVM.CodeGen/AMDGPU/vector-reduce-fmax.ll, LLVM.CodeGen/AMDGPU/vector-reduce-fmin.ll, LLVM.CodeGen/AMDGPU/vector-reduce-smax.ll, LLVM.CodeGen/AMDGPU/vector-reduce-smin.ll, LLVM.CodeGen/AMDGPU/vector-reduce-umax.ll, LLVM.CodeGen/AMDGPU/vector-reduce-umin.ll. If these failures are unrelated to your changes (for example tests are broken or flaky at HEAD), please open an issue at https://github.com/llvm/llvm-project/issues and add the |
🪟 Windows x64 Test Results
Failed Tests (click on a test name to see its output): LLVM — LLVM.CodeGen/AMDGPU/GlobalISel/fmin3-fmax3-combine.ll, LLVM.CodeGen/AMDGPU/ctlz.ll, LLVM.CodeGen/AMDGPU/cttz.ll, LLVM.CodeGen/AMDGPU/vector-reduce-fmax.ll, LLVM.CodeGen/AMDGPU/vector-reduce-fmin.ll, LLVM.CodeGen/AMDGPU/vector-reduce-smax.ll, LLVM.CodeGen/AMDGPU/vector-reduce-smin.ll, LLVM.CodeGen/AMDGPU/vector-reduce-umax.ll, LLVM.CodeGen/AMDGPU/vector-reduce-umin.ll. If these failures are unrelated to your changes (for example tests are broken or flaky at HEAD), please open an issue at https://github.com/llvm/llvm-project/issues and add the |
|
Consider adding a test with uniform inputs: newer GPUs will want to keep this on the SALU and select S_MIN_NUM_F32. There may also be complications with canonicalization, the variant of min/max used, and the different IEEE formats. I assume it will not be as easy as in this patch. |
a9d849a to
45b0066
Compare
Original PR: #124263
Fixes: #123079
Do the optimization in the pre-legalizer phase.