Skip to content

Commit

Permalink
AMDGPU/GlobalISel: Add floating point med3 combine
Browse files Browse the repository at this point in the history
Add floating point version of med3 combine.
Source is fminnum(fmaxnum(Val, K0), K1) or fmaxnum(fminnum(Val, K1), K0)
where K0 and K1 are constants and K0 <= K1.

Differential Revision: https://reviews.llvm.org/D90051
  • Loading branch information
petar-avramovic committed Dec 3, 2021
1 parent ab01f4d commit ec54867
Show file tree
Hide file tree
Showing 6 changed files with 912 additions and 2 deletions.
13 changes: 12 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUCombine.td
Expand Up @@ -64,6 +64,15 @@ def int_minmax_to_med3 : GICombineRule<
[{ return RegBankHelper.matchIntMinMaxToMed3(*${min_or_max}, ${matchinfo}); }]),
(apply [{ RegBankHelper.applyMed3(*${min_or_max}, ${matchinfo}); }])>;

def fp_minmax_to_med3 : GICombineRule<
(defs root:$min_or_max, med3_matchdata:$matchinfo),
(match (wip_match_opcode G_FMAXNUM,
G_FMINNUM,
G_FMAXNUM_IEEE,
G_FMINNUM_IEEE):$min_or_max,
[{ return RegBankHelper.matchFPMinMaxToMed3(*${min_or_max}, ${matchinfo}); }]),
(apply [{ RegBankHelper.applyMed3(*${min_or_max}, ${matchinfo}); }])>;

def remove_fcanonicalize_matchinfo : GIDefMatchData<"Register">;

def remove_fcanonicalize : GICombineRule<
Expand Down Expand Up @@ -102,7 +111,9 @@ def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper<
}

def AMDGPURegBankCombinerHelper : GICombinerHelper<
"AMDGPUGenRegBankCombinerHelper", [zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain]> {
"AMDGPUGenRegBankCombinerHelper",
[zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain,
fp_minmax_to_med3]> {
let DisableRuleOption = "amdgpuregbankcombiner-disable-rule";
let StateClass = "AMDGPURegBankCombinerHelperState";
let AdditionalArguments = [];
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUGISel.td
Expand Up @@ -172,6 +172,7 @@ def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE3, AMDGPUcvt_f32_ubyte3>;
def : GINodeEquiv<G_AMDGPU_CVT_PK_I16_I32, AMDGPUpk_i16_i32_impl>;
def : GINodeEquiv<G_AMDGPU_SMED3, AMDGPUsmed3>;
def : GINodeEquiv<G_AMDGPU_UMED3, AMDGPUumed3>;
def : GINodeEquiv<G_AMDGPU_FMED3, AMDGPUfmed3_impl>;

def : GINodeEquiv<G_AMDGPU_ATOMIC_CMPXCHG, AMDGPUatomic_cmp_swap>;
def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD, SIbuffer_load>;
Expand Down
81 changes: 80 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
Expand Up @@ -16,6 +16,7 @@
#include "AMDGPURegisterBankInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
Expand All @@ -36,13 +37,15 @@ class AMDGPURegBankCombinerHelper {
MachineRegisterInfo &MRI;
const RegisterBankInfo &RBI;
const TargetRegisterInfo &TRI;
const SIInstrInfo &TII;
CombinerHelper &Helper;

public:
AMDGPURegBankCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
: B(B), MF(B.getMF()), MRI(*B.getMRI()),
RBI(*MF.getSubtarget().getRegBankInfo()),
TRI(*MF.getSubtarget().getRegisterInfo()), Helper(Helper){};
TRI(*MF.getSubtarget().getRegisterInfo()),
TII(*MF.getSubtarget<GCNSubtarget>().getInstrInfo()), Helper(Helper){};

bool isVgprRegBank(Register Reg);
Register getAsVgpr(Register Reg);
Expand All @@ -63,7 +66,13 @@ class AMDGPURegBankCombinerHelper {
Register &Val, CstTy &K0, CstTy &K1);

bool matchIntMinMaxToMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo);
bool matchFPMinMaxToMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo);
void applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo);

private:
AMDGPU::SIModeRegisterDefaults getMode();
bool getIEEE();
bool isFminnumIeee(const MachineInstr &MI);
};

bool AMDGPURegBankCombinerHelper::isVgprRegBank(Register Reg) {
Expand Down Expand Up @@ -98,6 +107,13 @@ AMDGPURegBankCombinerHelper::getMinMaxPair(unsigned Opc) {
case AMDGPU::G_UMAX:
case AMDGPU::G_UMIN:
return {AMDGPU::G_UMIN, AMDGPU::G_UMAX, AMDGPU::G_AMDGPU_UMED3};
case AMDGPU::G_FMAXNUM:
case AMDGPU::G_FMINNUM:
return {AMDGPU::G_FMINNUM, AMDGPU::G_FMAXNUM, AMDGPU::G_AMDGPU_FMED3};
case AMDGPU::G_FMAXNUM_IEEE:
case AMDGPU::G_FMINNUM_IEEE:
return {AMDGPU::G_FMINNUM_IEEE, AMDGPU::G_FMAXNUM_IEEE,
AMDGPU::G_AMDGPU_FMED3};
}
}

Expand Down Expand Up @@ -148,6 +164,59 @@ bool AMDGPURegBankCombinerHelper::matchIntMinMaxToMed3(
return true;
}

// fmed3(NaN, K0, K1) = min(min(NaN, K0), K1)
// ieee = true : min/max(SNaN, K) = QNaN, min/max(QNaN, K) = K
// ieee = false : min/max(NaN, K) = K
// Consider values of min(max(Val, K0), K1) and max(min(Val, K1), K0) as input.
// Other operand commutes (see matchMed) give same result since min and max are
// commutative.

// Try to replace fp min(max(Val, K0), K1) or max(min(Val, K1), K0), KO<=K1
// with fmed3(Val, K0, K1).
// Val = SNaN only for ieee = true
// fmed3(SNaN, K0, K1) = min(min(SNaN, K0), K1) = min(QNaN, K1) = K1
// min(max(SNaN, K0), K1) = min(QNaN, K1) = K1
// max(min(SNaN, K1), K0) = max(K1, K0) = K1
// Val = NaN,ieee = false or Val = QNaN,ieee = true
// fmed3(NaN, K0, K1) = min(min(NaN, K0), K1) = min(K0, K1) = K0
// min(max(NaN, K0), K1) = min(K0, K1) = K0
// max(min(NaN, K1), K0) = max(K1, K0) = K1 != K0
bool AMDGPURegBankCombinerHelper::matchFPMinMaxToMed3(
MachineInstr &MI, Med3MatchInfo &MatchInfo) {
Register Dst = MI.getOperand(0).getReg();
LLT Ty = MRI.getType(Dst);
if (Ty != LLT::scalar(16) && Ty != LLT::scalar(32))
return false;

auto OpcodeTriple = getMinMaxPair(MI.getOpcode());

Register Val;
Optional<FPValueAndVReg> K0, K1;
// Match min(max(Val, K0), K1) or max(min(Val, K1), K0). Then see if K0 <= K1.
if (!matchMed<GFCstAndRegMatch>(MI, MRI, OpcodeTriple, Val, K0, K1))
return false;

if (K0->Value > K1->Value)
return false;

// For IEEE=false perform combine only when it's safe to assume that there are
// no NaN inputs. Most often MI is marked with nnan fast math flag.
// For IEEE=true consider NaN inputs. fmed3(NaN, K0, K1) is equivalent to
// min(min(NaN, K0), K1). Safe to fold for min(max(Val, K0), K1) since inner
// nodes(max/min) have same behavior when one input is NaN and other isn't.
// Don't consider max(min(SNaN, K1), K0) since there is no isKnownNeverQNaN,
// also post-legalizer inputs to min/max are fcanonicalized (never SNaN).
if ((getIEEE() && isFminnumIeee(MI)) || isKnownNeverNaN(Dst, MRI)) {
// Don't fold single use constant that can't be inlined.
if ((!MRI.hasOneNonDBGUse(K0->VReg) || TII.isInlineConstant(K0->Value)) &&
(!MRI.hasOneNonDBGUse(K1->VReg) || TII.isInlineConstant(K1->Value))) {
MatchInfo = {OpcodeTriple.Med, Val, K0->VReg, K1->VReg};
return true;
}
}

return false;
}
void AMDGPURegBankCombinerHelper::applyMed3(MachineInstr &MI,
Med3MatchInfo &MatchInfo) {
B.setInstrAndDebugLoc(MI);
Expand All @@ -158,6 +227,16 @@ void AMDGPURegBankCombinerHelper::applyMed3(MachineInstr &MI,
MI.eraseFromParent();
}

AMDGPU::SIModeRegisterDefaults AMDGPURegBankCombinerHelper::getMode() {
return MF.getInfo<SIMachineFunctionInfo>()->getMode();
}

bool AMDGPURegBankCombinerHelper::getIEEE() { return getMode().IEEE; }

bool AMDGPURegBankCombinerHelper::isFminnumIeee(const MachineInstr &MI) {
return MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE;
}

class AMDGPURegBankCombinerHelperState {
protected:
CombinerHelper &Helper;
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
Expand Up @@ -2866,6 +2866,12 @@ def G_AMDGPU_UMED3 : AMDGPUGenericInstruction {
let hasSideEffects = 0;
}

def G_AMDGPU_FMED3 : AMDGPUGenericInstruction {
let OutOperandList = (outs type0:$dst);
let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2);
let hasSideEffects = 0;
}

// Atomic cmpxchg. $cmpval ad $newval are packed in a single vector
// operand Expects a MachineMemOperand in addition to explicit
// operands.
Expand Down

0 comments on commit ec54867

Please sign in to comment.