[AMDGPU] gfx1010: use fmac instructions
Differential Revision: https://reviews.llvm.org/D61527

llvm-svn: 359959
rampitec committed May 4, 2019
1 parent 37be336 commit 28a1936
Showing 11 changed files with 1,004 additions and 229 deletions.
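
A note on the instruction families involved, added here for orientation; the semantics below are a hand-written sketch, not text from the patch. The mac/fmac opcodes accumulate into a destination tied to src2, and the madmk/madak and fmamk/fmaak VOP2 forms carry a 32-bit literal K in a fixed operand slot; the fma family is fused (no intermediate rounding) and, unlike v_mad_f32 (see the comment in SIISelLowering.cpp below), is usable when denormals matter.

#include <cmath>
// Scalar sketch of the literal-carrying forms (names mirror the pseudos):
// v_fmamk_*: K occupies the src1 slot -> d = s0 * K + s1
float fmamk(float s0, float K, float s1) { return std::fmaf(s0, K, s1); }
// v_fmaak_*: K occupies the src2 slot -> d = s0 * s1 + K
float fmaak(float s0, float s1, float K) { return std::fmaf(s0, s1, K); }
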
8 changes: 5 additions & 3 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -521,7 +521,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 
   // F16 - VOP3 Actions.
   setOperationAction(ISD::FMA, MVT::f16, Legal);
-  if (!Subtarget->hasFP16Denormals())
+  if (!Subtarget->hasFP16Denormals() && STI.hasMadF16())
     setOperationAction(ISD::FMAD, MVT::f16, Legal);
 
   for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
@@ -8723,8 +8723,10 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
 
   // Only do this if we are not trying to support denormals. v_mad_f32 does not
   // support denormals ever.
-  if ((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
-      (VT == MVT::f16 && !Subtarget->hasFP16Denormals()))
+  if (((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
+       (VT == MVT::f16 && !Subtarget->hasFP16Denormals() &&
+        getSubtarget()->hasMadF16())) &&
+      isOperationLegal(ISD::FMAD, VT))
     return ISD::FMAD;
 
   const TargetOptions &Options = DAG.getTarget().Options;
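
Net effect of the two hunks above: ISD::FMAD, which selects to v_mad_f16/v_mad_f32, is now produced only when the subtarget still has a mad instruction for the type and denormals are off; gfx1010 has no v_mad_f16, hence the hasMadF16() gates. A condensed sketch of the decision, assuming denormal mode and instruction availability are the only inputs (the real code also consults TargetOptions, visible at the end of the hunk):

// Hand-written distillation; not the patch's code.
bool canUseFMAD(bool IsF16, bool HasFPDenormals, bool HasMadF16,
                bool FmadLegalForVT) {
  if (HasFPDenormals)       // v_mad_* flushes denormals, so it is never safe here
    return false;
  if (IsF16 && !HasMadF16)  // e.g. gfx1010, which lacks v_mad_f16
    return false;
  return FmadLegalForVT;    // the new isOperationLegal(ISD::FMAD, VT) condition
}
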
106 changes: 75 additions & 31 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2071,7 +2071,9 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
   }
 
   if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
-      Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) {
+      Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64 ||
+      Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
+      Opc == AMDGPU::V_FMA_F16 || Opc == AMDGPU::V_FMAC_F16_e64) {
     // Don't fold if we are using source or output modifiers. The new VOP2
     // instructions don't have them.
     if (hasAnyModifiersSet(UseMI))
@@ -2086,7 +2088,10 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
     if (isInlineConstant(UseMI, *Src0, *ImmOp))
       return false;
 
-    bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64;
+    bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
+                 Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64;
+    bool IsFMA = Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
+                 Opc == AMDGPU::V_FMA_F16 || Opc == AMDGPU::V_FMAC_F16_e64;
     MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
     MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
 
@@ -2099,6 +2104,12 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
       if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
         return false;
 
+      unsigned NewOpc =
+          IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32 : AMDGPU::V_FMAMK_F16)
+                : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
+      if (pseudoToMCOpcode(NewOpc) == -1)
+        return false;
+
       // We need to swap operands 0 and 1 since madmk constant is at operand 1.
 
       const int64_t Imm = ImmOp->getImm();
@@ -2119,14 +2130,16 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
       Src0->setIsKill(Src1->isKill());
 
       if (Opc == AMDGPU::V_MAC_F32_e64 ||
-          Opc == AMDGPU::V_MAC_F16_e64)
+          Opc == AMDGPU::V_MAC_F16_e64 ||
+          Opc == AMDGPU::V_FMAC_F32_e64 ||
+          Opc == AMDGPU::V_FMAC_F16_e64)
         UseMI.untieRegOperand(
             AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
 
       Src1->ChangeToImmediate(Imm);
 
       removeModOperands(UseMI);
-      UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16));
+      UseMI.setDesc(get(NewOpc));
 
       bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
       if (DeleteDef)
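
The operand swap mentioned in the comment above can be pictured as follows (illustrative pseudo-MIR written as comments, not taken from the patch's tests):

// Before: the multiplicand k is defined by an immediate move.
//   %k = V_MOV_B32_e32 0x41200000        ; 10.0f
//   %d = V_FMAC_F32_e64 %a, %k, %d       ; d = a * k + d
// After: the register that was src1 moves to src0 so the literal can take the
// fixed K slot of the VOP2 encoding.
//   %d = V_FMAMK_F32 %a, 0x41200000, %d
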
@@ -2176,6 +2189,12 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
         // VGPR is okay as Src1 - fallthrough
       }
 
+      unsigned NewOpc =
+          IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32 : AMDGPU::V_FMAAK_F16)
+                : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
+      if (pseudoToMCOpcode(NewOpc) == -1)
+        return false;
+
       const int64_t Imm = ImmOp->getImm();
 
       // FIXME: This would be a lot easier if we could return a new instruction
@@ -2188,7 +2207,9 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
 
       if (Opc == AMDGPU::V_MAC_F32_e64 ||
-          Opc == AMDGPU::V_MAC_F16_e64)
+          Opc == AMDGPU::V_MAC_F16_e64 ||
+          Opc == AMDGPU::V_FMAC_F32_e64 ||
+          Opc == AMDGPU::V_FMAC_F16_e64)
         UseMI.untieRegOperand(
             AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
 
@@ -2197,7 +2218,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
 
       // These come before src2.
       removeModOperands(UseMI);
-      UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16));
+      UseMI.setDesc(get(NewOpc));
 
       bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
       if (DeleteDef)
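
Both the fmamk and fmaak folds are now gated on pseudoToMCOpcode(), which is what makes them subtarget-safe: a pseudo opcode only lowers to a real MC encoding on targets that define the instruction. A minimal sketch of the idea, with the helper name invented for illustration:

// Hypothetical helper mirroring the checks added above; pseudoToMCOpcode()
// returns -1 when this subtarget has no encoding for the pseudo (for example,
// the v_fmamk/v_fmaak forms on pre-gfx10 targets, as assumed here).
bool foldTargetExists(const SIInstrInfo &TII, unsigned PseudoOpc) {
  return TII.pseudoToMCOpcode(PseudoOpc) != -1;
}
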
@@ -2310,18 +2331,21 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
                                                  LiveVariables *LV) const {
   unsigned Opc = MI.getOpcode();
   bool IsF16 = false;
-  bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64;
+  bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
+               Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64;
 
   switch (Opc) {
   default:
     return nullptr;
   case AMDGPU::V_MAC_F16_e64:
+  case AMDGPU::V_FMAC_F16_e64:
     IsF16 = true;
     LLVM_FALLTHROUGH;
   case AMDGPU::V_MAC_F32_e64:
+  case AMDGPU::V_FMAC_F32_e64:
     break;
   case AMDGPU::V_MAC_F16_e32:
+  case AMDGPU::V_FMAC_F16_e32:
     IsF16 = true;
     LLVM_FALLTHROUGH;
   case AMDGPU::V_MAC_F32_e32:
@@ -2350,42 +2374,50 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
   const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
   const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
 
-  if (!IsFMA && !Src0Mods && !Src1Mods && !Clamp && !Omod &&
+  if (!Src0Mods && !Src1Mods && !Clamp && !Omod &&
       // If we have an SGPR input, we will violate the constant bus restriction.
       (ST.getConstantBusLimit(Opc) > 1 ||
        !Src0->isReg() ||
       !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) {
     if (auto Imm = getFoldableImm(Src2)) {
-      return BuildMI(*MBB, MI, MI.getDebugLoc(),
-                     get(IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32))
-          .add(*Dst)
-          .add(*Src0)
-          .add(*Src1)
-          .addImm(Imm);
+      unsigned NewOpc =
+          IsFMA ? (IsF16 ? AMDGPU::V_FMAAK_F16 : AMDGPU::V_FMAAK_F32)
+                : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
+      if (pseudoToMCOpcode(NewOpc) != -1)
+        return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
+            .add(*Dst)
+            .add(*Src0)
+            .add(*Src1)
+            .addImm(Imm);
     }
+    unsigned NewOpc =
+        IsFMA ? (IsF16 ? AMDGPU::V_FMAMK_F16 : AMDGPU::V_FMAMK_F32)
+              : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
     if (auto Imm = getFoldableImm(Src1)) {
-      return BuildMI(*MBB, MI, MI.getDebugLoc(),
-                     get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32))
-          .add(*Dst)
-          .add(*Src0)
-          .addImm(Imm)
-          .add(*Src2);
+      if (pseudoToMCOpcode(NewOpc) != -1)
+        return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
+            .add(*Dst)
+            .add(*Src0)
+            .addImm(Imm)
+            .add(*Src2);
     }
     if (auto Imm = getFoldableImm(Src0)) {
-      if (isOperandLegal(MI, AMDGPU::getNamedOperandIdx(AMDGPU::V_MADMK_F32,
+      if (pseudoToMCOpcode(NewOpc) != -1 &&
+          isOperandLegal(MI, AMDGPU::getNamedOperandIdx(NewOpc,
                            AMDGPU::OpName::src0), Src1))
-        return BuildMI(*MBB, MI, MI.getDebugLoc(),
-                       get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32))
+        return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
             .add(*Dst)
             .add(*Src1)
             .addImm(Imm)
             .add(*Src2);
     }
   }
 
-  assert((!IsFMA || !IsF16) && "fmac only expected with f32");
-  unsigned NewOpc = IsFMA ? AMDGPU::V_FMA_F32 :
-      (IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32);
+  unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16 : AMDGPU::V_FMA_F32)
+                          : (IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32);
+  if (pseudoToMCOpcode(NewOpc) == -1)
+    return nullptr;
 
   return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
       .add(*Dst)
       .addImm(Src0Mods ? Src0Mods->getImm() : 0)
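
The deleted assert marks the point of this hunk: f16 fmac is now an expected input. The conversion rewrites the two-address accumulate form, whose addend is tied to the destination, into a three-address one. An illustrative before/after with operand lists simplified (not from the patch):

//   %acc2 = V_FMAC_F16_e64 %a, %b, %acc1   ; addend tied to the destination
// becomes
//   %acc2 = V_FMA_F16 %a, %b, %acc1        ; untied, so the register allocator
//                                          ; may place %acc2 anywhere
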
@@ -2678,6 +2710,7 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI,
   case AMDGPU::V_MAC_F32_e64:
   case AMDGPU::V_MAC_F16_e64:
   case AMDGPU::V_FMAC_F32_e64:
+  case AMDGPU::V_FMAC_F16_e64:
     if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
         hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
       return false;
@@ -3410,13 +3443,16 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
   MachineBasicBlock *MBB = MI.getParent();
   MachineOperand &MO = MI.getOperand(OpIdx);
   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+  const SIRegisterInfo *TRI =
+      static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
   unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass;
   const TargetRegisterClass *RC = RI.getRegClass(RCID);
-  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
+  unsigned Size = TRI->getRegSizeInBits(*RC);
+  unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32;
   if (MO.isReg())
     Opcode = AMDGPU::COPY;
   else if (RI.isSGPRClass(RC))
-    Opcode = AMDGPU::S_MOV_B32;
+    Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
 
   const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
   if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
@@ -5332,6 +5368,12 @@ MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
 }
 
 uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
+  if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
+    return (16ULL << 44) | // IMG_FORMAT_32_FLOAT
+           (1ULL << 56) |  // RESOURCE_LEVEL = 1
+           (3ULL << 60);   // OOB_SELECT = 3
+  }
+
   uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
   if (ST.isAmdHsaOS()) {
     // Set ATC = 1. GFX9 doesn't have this bit.
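
The three fields ORed together above can be checked by spelling out the arithmetic; the combined constant below was computed for this note as an illustration, not quoted from the patch:

#include <cstdint>
constexpr uint64_t Format        = 16ULL << 44; // IMG_FORMAT_32_FLOAT
constexpr uint64_t ResourceLevel = 1ULL << 56;  // RESOURCE_LEVEL = 1
constexpr uint64_t OobSelect     = 3ULL << 60;  // OOB_SELECT = 3
static_assert((Format | ResourceLevel | OobSelect) == 0x3101000000000000ULL,
              "combined gfx10 default rsrc data format");
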
@@ -5358,12 +5400,14 @@ uint64_t SIInstrInfo::getScratchRsrcWords23() const {
     Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
   }
 
-  // IndexStride = 64.
-  Rsrc23 |= UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
+  // IndexStride = 64 / 32.
+  uint64_t IndexStride = ST.getGeneration() <= AMDGPUSubtarget::GFX9 ? 3 : 2;
+  Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
 
   // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
   // Clear them unless we want a huge stride.
-  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
+      ST.getGeneration() <= AMDGPUSubtarget::GFX9)
     Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
 
   return Rsrc23;
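
On the scratch-descriptor change just above: the paired comment edit implies INDEX_STRIDE is an encoded field, with code 3 meaning a stride of 64 and code 2 a stride of 32; presumably the smaller stride tracks gfx10's wave32 mode (an inference from the comment, not something the patch states). A sketch:

// Encoded INDEX_STRIDE values implied by the change (assumption: 2 -> 32, 3 -> 64).
uint64_t indexStrideCode(bool IsGFX10OrLater) {
  return IsGFX10OrLater ? 2 /* stride 32 */ : 3 /* stride 64 */;
}
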
10 changes: 9 additions & 1 deletion llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1462,7 +1462,7 @@ def : GCNPat<
 
 def : GCNPat<
   (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
-  (V_PK_MUL_F16 0, (i32 CONST.V2FP16_ONE), $src_mods, $src, DSTCLAMP.NONE)
+  (V_PK_MUL_F16 0, (i32 CONST.FP16_ONE), $src_mods, $src, DSTCLAMP.NONE)
 >;
 }
 
@@ -1523,6 +1523,14 @@ def : GCNPat <
 >;
 } // End OtherPredicates = [HasDLInsts]
 
+let SubtargetPredicate = isGFX10Plus in
+def : GCNPat <
+  (fma (f16 (VOP3Mods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
+       (f16 (VOP3Mods f32:$src1, i32:$src1_modifiers)),
+       (f16 (VOP3NoMods f32:$src2))),
+  (V_FMAC_F16_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
+                  SRCMODS.NONE, $src2, $clamp, $omod)
+>;
 
 // Allow integer inputs
 class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : GCNPat<
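
A reading of the new GCNPat (this interpretation is not stated in the commit): in the fmac form the addend is the tied accumulator and is emitted with SRCMODS.NONE, so only an fma whose third operand carries no source modifiers can use it; a modified addend has to select the fully modified VOP3 v_fma_f16 instead.

// Sketch of the split the pattern creates:
//   fma(a, b, c)  with a plain addend  -> v_fmac_f16 a, b, c  (c tied to dst)
//   fma(a, b, -c) needs src2 modifiers -> v_fma_f16  a, b, -c
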
20 changes: 16 additions & 4 deletions llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -418,7 +418,9 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
   }
   assert(Src && Src->isReg());
 
-  if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
+  if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
+       MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
+       MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
        MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
       !isSameReg(*Src, *getReplacedOperand())) {
     // In case of v_mac_f16/32_sdwa this pass can try to apply src operand to
@@ -460,7 +462,9 @@ MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) {
 bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
   // Replace vdst operand in MI with target operand. Set dst_sel and dst_unused
 
-  if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
+  if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
+       MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
+       MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
        MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
       getDstSel() != AMDGPU::SDWA::DWORD) {
     // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD
@@ -964,10 +968,16 @@ bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,
       return false;
   }
 
-  if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_MAC_F16_e32 ||
+  if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 ||
+                           Opc == AMDGPU::V_FMAC_F32_e32 ||
+                           Opc == AMDGPU::V_MAC_F16_e32 ||
                            Opc == AMDGPU::V_MAC_F32_e32))
     return false;
 
+  // Check if target supports this SDWA opcode
+  if (TII->pseudoToMCOpcode(Opc) == -1)
+    return false;
+
   // FIXME: has SDWA but require handling of implicit VCC use
   if (Opc == AMDGPU::V_CNDMASK_B32_e32)
     return false;
@@ -1038,7 +1048,9 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
     SDWAInst.add(*Src1);
   }
 
-  if (SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
+  if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa ||
+      SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa ||
+      SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
       SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
     // v_mac_f16/32 has additional src2 operand tied to vdst
     MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
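
Summary of the SDWA changes (the sketch below is hand-written and its operand indices are illustrative): v_fmac_*_sdwa now receives the same special-casing as v_mac_*_sdwa, because its accumulator rides as src2 tied to vdst; in addition, isConvertibleToSDWA now refuses opcodes whose SDWA pseudo has no encoding on the current subtarget.

// Compressed sketch of the tie-up done when building the SDWA form:
//   SDWAInst.add(*Src2);                  // carry the tied accumulator over
//   SDWAInst->tieOperands(0, NumOps - 1); // keep vdst and src2 in one register
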
