Skip to content

Commit

Permalink
[AMDGPU][GlobalISel] Support mad/fma_mix selection
Browse files Browse the repository at this point in the history
Adds support for selecting the following instructions using GlobalISel:
- v_mad_mix/v_fma_mix
- v_mad_mixhi/v_fma_mixhi
- v_mad_mixlo/v_fma_mixlo

To select those instructions properly, some additional changes were
needed which impacted other tests as well.

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D134354
  • Loading branch information
Pierre-vh committed Nov 8, 2022
1 parent d1f90b6 commit 767999f
Show file tree
Hide file tree
Showing 11 changed files with 3,375 additions and 1,717 deletions.
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUGISel.td
Expand Up @@ -153,6 +153,10 @@ def gi_smrd_buffer_sgpr_imm :
GIComplexOperandMatcher<s64, "selectSMRDBufferSgprImm">,
GIComplexPatternEquiv<SMRDBufferSgprImm>;

// GlobalISel counterpart of the VOP3PMadMixMods complex pattern: operands that
// use that pattern are matched by the C++ selector routine named
// "selectVOP3PMadMixMods".
def gi_vop3_mad_mix_mods :
GIComplexOperandMatcher<s64, "selectVOP3PMadMixMods">,
GIComplexPatternEquiv<VOP3PMadMixMods>;

// Separate load nodes are defined to glue m0 initialization in
// SelectionDAG. The GISel selector can just insert m0 initialization
// directly before selecting a glue-less load, so hide this
Expand Down
190 changes: 190 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
Expand Up @@ -522,6 +522,60 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
return true;
}

/// Manually select a 32-bit G_FMA / G_FMAD as V_FMA_MIX_F32 / V_MAD_MIX_F32.
///
/// Returns true (and erases \p I) when the mix instruction was emitted.
/// Returns false when the result is not s32, the subtarget lacks the matching
/// mix instructions, none of the three sources was matched by
/// selectVOP3PMadMixModsImpl, or register constraining failed.
bool AMDGPUInstructionSelector::selectG_FMA_FMAD(MachineInstr &I) const {
  const unsigned GenericOpc = I.getOpcode();
  assert(GenericOpc == AMDGPU::G_FMA || GenericOpc == AMDGPU::G_FMAD);

  const bool UseFMA = GenericOpc == AMDGPU::G_FMA;
  Register DstReg = I.getOperand(0).getReg();

  // Only an s32 result can be produced by the mix instructions handled here.
  if (MRI->getType(DstReg) != LLT::scalar(32))
    return false;

  // The subtarget must provide the mix flavour matching the generic opcode.
  const bool HasMixInsts =
      UseFMA ? Subtarget->hasFmaMixInsts() : Subtarget->hasMadMixInsts();
  if (!HasMixInsts)
    return false;

  // Compute the (register, modifiers) pair for every source and remember
  // which of them actually matched; using the mix form is pointless when no
  // operand benefits from the f16 conversion.
  bool Folded0, Folded1, Folded2;
  const std::pair<Register, unsigned> Op0 =
      selectVOP3PMadMixModsImpl(I.getOperand(1), Folded0);
  const std::pair<Register, unsigned> Op1 =
      selectVOP3PMadMixModsImpl(I.getOperand(2), Folded1);
  const std::pair<Register, unsigned> Op2 =
      selectVOP3PMadMixModsImpl(I.getOperand(3), Folded2);

#ifndef NDEBUG
  // Debug-only sanity check on the function's FP mode. It is evaluated before
  // the "nothing matched" bail-out below.
  AMDGPU::SIModeRegisterDefaults FPMode =
      I.getMF()->getInfo<SIMachineFunctionInfo>()->getMode();
  assert((UseFMA || !FPMode.allFP32Denormals()) &&
         "fmad selected with denormals enabled");
#endif

  // TODO: We can select this with f32 denormals enabled if all the sources are
  // converted from f16 (in which case fmad isn't legal).
  const bool AnyFolded = Folded0 || Folded1 || Folded2;
  if (!AnyFolded)
    return false;

  const unsigned MixOpc =
      UseFMA ? AMDGPU::V_FMA_MIX_F32 : AMDGPU::V_MAD_MIX_F32;

  // Operand layout: a modifiers immediate precedes each source register,
  // followed by three trailing immediates that are all left at zero.
  MachineInstr *Mix =
      BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(MixOpc), DstReg)
          .addImm(Op0.second)
          .addReg(Op0.first)
          .addImm(Op1.second)
          .addReg(Op1.first)
          .addImm(Op2.second)
          .addReg(Op2.first)
          .addImm(0)
          .addImm(0)
          .addImm(0);

  if (!constrainSelectedInstRegOperands(*Mix, TII, TRI, RBI))
    return false;

  // The generic instruction is fully replaced by the mix instruction.
  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
MachineBasicBlock *BB = MI.getParent();
Register DstReg = MI.getOperand(0).getReg();
Expand Down Expand Up @@ -3228,6 +3282,11 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
return selectG_FABS(I);
case TargetOpcode::G_EXTRACT:
return selectG_EXTRACT(I);
case TargetOpcode::G_FMA:
case TargetOpcode::G_FMAD:
if (selectG_FMA_FMAD(I))
return true;
return selectImpl(I, *CoverageInfo);
case TargetOpcode::G_MERGE_VALUES:
case TargetOpcode::G_CONCAT_VECTORS:
return selectG_MERGE_VALUES(I);
Expand Down Expand Up @@ -4679,6 +4738,137 @@ AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
[=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
}

// MachineInstr flavour of stripBitCast: when MI is a G_BITCAST, return the
// instruction defining its source register (looking through copies);
// otherwise return MI itself.
static MachineInstr *stripBitCast(MachineInstr *MI, MachineRegisterInfo &MRI) {
  const bool IsBitCast = MI->getOpcode() == AMDGPU::G_BITCAST;
  return IsBitCast ? getDefIgnoringCopies(MI->getOperand(1).getReg(), MRI)
                   : MI;
}

// Decide whether Inst really extracts the high 16 bits of a dword.
// On success, returns the instruction defining the dword (with one bitcast
// stripped); returns nullptr when neither recognised form matches.
static MachineInstr *isExtractHiElt(MachineInstr *Inst,
                                    MachineRegisterInfo &MRI) {
  // The extract must be a G_TRUNC, optionally wrapped in a bitcast.
  MachineInstr *Trunc = stripBitCast(Inst, MRI);
  if (Trunc->getOpcode() != AMDGPU::G_TRUNC)
    return nullptr;

  // Value being truncated, again looking through copies and one bitcast.
  MachineInstr *Wide = stripBitCast(
      getDefIgnoringCopies(Trunc->getOperand(1).getReg(), MRI), MRI);

  // Form 1: G_LSHR x, (G_CONSTANT i32 16)
  if (Wide->getOpcode() == AMDGPU::G_LSHR) {
    auto ShiftAmt = getIConstantVRegValWithLookThrough(
        Wide->getOperand(2).getReg(), MRI);
    const bool ShiftsBy16 = ShiftAmt && ShiftAmt->Value.getZExtValue() == 16;
    if (!ShiftsBy16)
      return nullptr;

    return stripBitCast(
        getDefIgnoringCopies(Wide->getOperand(1).getReg(), MRI), MRI);
  }

  // Form 2: G_SHUFFLE_VECTOR x, y, shufflemask(1, 1|0)
  //   (1, 0) swaps the low/high 16 bits.
  //   (1, 1) copies the high 16 bits into both halves.
  // Either way element 0 of the result is the high element of x.
  if (Wide->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
    return nullptr;

  assert(MRI.getType(Wide->getOperand(0).getReg()) ==
         LLT::fixed_vector(2, 16));

  ArrayRef<int> ShufMask = Wide->getOperand(3).getShuffleMask();
  assert(ShufMask.size() == 2);

  if (ShufMask[0] != 1 || ShufMask[1] > 1)
    return nullptr;

  return stripBitCast(
      getDefIgnoringCopies(Wide->getOperand(1).getReg(), MRI), MRI);
}

// Compute the source register and source-modifier bits for one mad_mix/fma_mix
// operand.
//
// Root is the operand to analyse. Matched is an out-parameter: it is cleared
// on entry and set to true only when the operand (after selectVOP3ModsImpl has
// processed Root) is defined by a G_FPEXT, i.e. when the conversion was
// folded into the returned modifiers.
//
// Returns {Src, Mods}. When nothing was folded this is simply the result of
// selectVOP3ModsImpl(Root).
std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
bool &Matched) const {
Matched = false;

// Start from the register/modifiers selectVOP3ModsImpl computes for Root.
Register Src;
unsigned Mods;
std::tie(Src, Mods) = selectVOP3ModsImpl(Root);

MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
if (MI->getOpcode() == AMDGPU::G_FPEXT) {
// Step to the value being extended; MO/Src/MI are kept in sync from here on.
MachineOperand *MO = &MI->getOperand(1);
Src = MO->getReg();
MI = getDefIgnoringCopies(Src, *MRI);

assert(MRI->getType(Src) == LLT::scalar(16));

// See through bitcasts.
// FIXME: Would be nice to use stripBitCast here.
if (MI->getOpcode() == AMDGPU::G_BITCAST) {
MO = &MI->getOperand(1);
Src = MO->getReg();
MI = getDefIgnoringCopies(Src, *MRI);
}

// Merges modifiers found on the current MO into Mods and updates Src/MI.
// Captures by reference, so it always acts on the latest MO/Src/MI/Mods.
const auto CheckAbsNeg = [&]() {
// Be careful about folding modifiers if we already have an abs. fneg is
// applied last, so we don't want to apply an earlier fneg.
if ((Mods & SISrcMods::ABS) == 0) {
unsigned ModsTmp;
std::tie(Src, ModsTmp) = selectVOP3ModsImpl(*MO);
MI = getDefIgnoringCopies(Src, *MRI);

// An inner neg toggles the NEG bit already accumulated in Mods.
if ((ModsTmp & SISrcMods::NEG) != 0)
Mods ^= SISrcMods::NEG;

if ((ModsTmp & SISrcMods::ABS) != 0)
Mods |= SISrcMods::ABS;
}
};

CheckAbsNeg();

// op_sel/op_sel_hi decide the source type and source.
// If the source's op_sel_hi is set, it indicates to do a conversion from
// fp16. If the source's op_sel is set, it picks the high half of the
// source register.

Mods |= SISrcMods::OP_SEL_1;

// If the f16 value is the high half of a dword, use the dword as the source
// and set OP_SEL_0.
if (MachineInstr *ExtractHiEltMI = isExtractHiElt(MI, *MRI)) {
Mods |= SISrcMods::OP_SEL_0;
MI = ExtractHiEltMI;
MO = &MI->getOperand(0);
Src = MO->getReg();

// Fold any modifiers found on the new source as well.
CheckAbsNeg();
}

Matched = true;
}

return {Src, Mods};
}

// Complex-operand matcher wrapper around selectVOP3PMadMixModsImpl.
// Always produces renderers: the "matched" flag reported by the helper is
// deliberately not consulted here, so operands without a folded conversion
// are rendered with whatever register/modifiers the helper returned.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
  bool Matched;
  const std::pair<Register, unsigned> SrcAndMods =
      selectVOP3PMadMixModsImpl(Root, Matched);
  const Register SrcReg = SrcAndMods.first;
  const unsigned SrcMods = SrcAndMods.second;

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(SrcReg); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(SrcMods); } // src_mods
  }};
}

void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
const MachineInstr &MI,
int OpIdx) const {
Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
Expand Up @@ -97,6 +97,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
bool selectG_UADDO_USUBO_UADDE_USUBE(MachineInstr &I) const;
bool selectG_AMDGPU_MAD_64_32(MachineInstr &I) const;
bool selectG_EXTRACT(MachineInstr &I) const;
bool selectG_FMA_FMAD(MachineInstr &I) const;
bool selectG_MERGE_VALUES(MachineInstr &I) const;
bool selectG_UNMERGE_VALUES(MachineInstr &I) const;
bool selectG_BUILD_VECTOR(MachineInstr &I) const;
Expand Down Expand Up @@ -293,6 +294,10 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
ComplexRendererFns selectSMRDBufferImm32(MachineOperand &Root) const;
ComplexRendererFns selectSMRDBufferSgprImm(MachineOperand &Root) const;

// Returns the {source register, source modifiers} pair to use for a
// mad_mix/fma_mix operand. Matched is an out-parameter that is set to true
// only when a G_FPEXT feeding the operand was folded into the modifiers.
std::pair<Register, unsigned> selectVOP3PMadMixModsImpl(MachineOperand &Root,
bool &Matched) const;
// Complex-operand matcher built on selectVOP3PMadMixModsImpl; renders the
// source register followed by the source-modifiers immediate.
ComplexRendererFns selectVOP3PMadMixMods(MachineOperand &Root) const;

void renderTruncImm32(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx = -1) const;

Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/Target/AMDGPU/VOP3PInstructions.td
Expand Up @@ -168,7 +168,7 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
$src1_modifiers, $src1,
$src2_modifiers, $src2,
DSTCLAMP.NONE,
$elt0))
VGPR_32:$elt0))
>;

def : GCNPat <
Expand All @@ -181,7 +181,7 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
$src1_modifiers, $src1,
$src2_modifiers, $src2,
DSTCLAMP.ENABLE,
$elt0))
VGPR_32:$elt0))
>;

def : GCNPat <
Expand Down

0 comments on commit 767999f

Please sign in to comment.