Skip to content

Commit

Permalink
[AMDGPU] SDWA: add support for GFX9 in peephole pass
Browse files Browse the repository at this point in the history
Summary:
Added support based on the merged SDWA pseudo instructions. The peephole pass now allows one scalar operand, plus the omod and clamp modifiers.
Added several subtarget features for GFX9 SDWA.
This diff also contains changes from D34026.
Depends on D34026

Reviewers: vpykhtin, rampitec, arsenm

Subscribers: kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye

Differential Revision: https://reviews.llvm.org/D34241

llvm-svn: 305986
  • Loading branch information
SamWot committed Jun 22, 2017
1 parent 71e2c16 commit 3c4933f
Show file tree
Hide file tree
Showing 12 changed files with 347 additions and 116 deletions.
37 changes: 34 additions & 3 deletions llvm/lib/Target/AMDGPU/AMDGPU.td
Expand Up @@ -238,6 +238,36 @@ def FeatureSDWA : SubtargetFeature<"sdwa",
"Support SDWA (Sub-DWORD Addressing) extension"
>;

// Allows the omod output modifier on SDWA instructions
// (enabled for GFX9 via FeatureGFX9 below).
def FeatureSDWAOmod : SubtargetFeature<"sdwa-omod",
"HasSDWAOmod",
"true",
"Support OMod with SDWA (Sub-DWORD Addressing) extension"
>;

// Allows a scalar (SGPR) register operand in SDWA instructions
// (enabled for GFX9 via FeatureGFX9 below).
def FeatureSDWAScalar : SubtargetFeature<"sdwa-scalar",
"HasSDWAScalar",
"true",
"Support scalar register with SDWA (Sub-DWORD Addressing) extension"
>;

// Allows an SGPR (rather than only VCC) as the scalar destination of a
// VOPC SDWA instruction (enabled for GFX9 via FeatureGFX9 below).
def FeatureSDWASdst : SubtargetFeature<"sdwa-sdst",
"HasSDWASdst",
"true",
"Support scalar dst for VOPC with SDWA (Sub-DWORD Addressing) extension"
>;

// Allows v_mac_f32/f16 to be used in SDWA form
// (enabled for VOLCANIC_ISLANDS via FeatureVolcanicIslands below).
// FIX: the command-line feature string was "sdwa-mav", a typo; it is
// corrected to "sdwa-mac" so it matches the record name FeatureSDWAMac,
// the HasSDWAMac subtarget field, and the description.
def FeatureSDWAMac : SubtargetFeature<"sdwa-mac",
"HasSDWAMac",
"true",
"Support v_mac_f32/f16 with SDWA (Sub-DWORD Addressing) extension"
>;

// Allows the clamp modifier on VOPC SDWA instructions
// (enabled for VOLCANIC_ISLANDS via FeatureVolcanicIslands below;
// absent on GFX9).
def FeatureSDWAClampVOPC : SubtargetFeature<"sdwa-clamp-vopc",
"HasSDWAClampVOPC",
"true",
"Support clamp for VOPC with SDWA (Sub-DWORD Addressing) extension"
>;

def FeatureDPP : SubtargetFeature<"dpp",
"HasDPP",
"true",
Expand Down Expand Up @@ -421,8 +451,8 @@ def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN,
FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
FeatureSMemRealTime, FeatureVGPRIndexMode, FeatureMovrel,
FeatureScalarStores, FeatureInv2PiInlineImm, FeatureSDWA,
FeatureDPP
FeatureScalarStores, FeatureInv2PiInlineImm,
FeatureSDWA, FeatureSDWAClampVOPC, FeatureSDWAMac, FeatureDPP
]
>;

Expand All @@ -432,7 +462,8 @@ def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9",
FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm,
FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode,
FeatureFastFMAF32, FeatureSDWA, FeatureDPP,
FeatureFastFMAF32, FeatureDPP,
FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst,
FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts
]
>;
Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
Expand Up @@ -124,6 +124,11 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
HasScalarStores(false),
HasInv2PiInlineImm(false),
HasSDWA(false),
HasSDWAOmod(false),
HasSDWAScalar(false),
HasSDWASdst(false),
HasSDWAMac(false),
HasSDWAClampVOPC(false),
HasDPP(false),
FlatAddressSpace(false),
FlatInstOffsets(false),
Expand Down
25 changes: 25 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
Expand Up @@ -149,6 +149,11 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
bool HasScalarStores;
bool HasInv2PiInlineImm;
bool HasSDWA;
bool HasSDWAOmod;
bool HasSDWAScalar;
bool HasSDWASdst;
bool HasSDWAMac;
bool HasSDWAClampVOPC;
bool HasDPP;
bool FlatAddressSpace;
bool FlatInstOffsets;
Expand Down Expand Up @@ -431,6 +436,26 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
return HasSDWA;
}

bool hasSDWAOmod() const {
return HasSDWAOmod;
}

bool hasSDWAScalar() const {
return HasSDWAScalar;
}

bool hasSDWASdst() const {
return HasSDWASdst;
}

bool hasSDWAMac() const {
return HasSDWAMac;
}

bool hasSDWAClampVOPC() const {
return HasSDWAClampVOPC;
}

/// \brief Returns the offset in bytes from the start of the input buffer
/// of the first explicit kernel argument.
unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const {
Expand Down
8 changes: 4 additions & 4 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Expand Up @@ -2454,7 +2454,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
continue;
const MachineOperand &MO = MI.getOperand(OpIdx);

if (AMDGPU::isVI(ST)) {
if (!ST.hasSDWAScalar()) {
// Only VGPRS on VI
if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
Expand All @@ -2469,7 +2469,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}

if (AMDGPU::isVI(ST)) {
if (!ST.hasSDWAOmod()) {
// No omod allowed on VI
const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
if (OMod != nullptr &&
Expand All @@ -2481,14 +2481,14 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,

uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
if (isVOPC(BasicOpcode)) {
if (AMDGPU::isVI(ST) && DstIdx != -1) {
if (!ST.hasSDWASdst() && DstIdx != -1) {
// Only vcc allowed as dst on VI for VOPC
const MachineOperand &Dst = MI.getOperand(DstIdx);
if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
return false;
}
} else if (AMDGPU::isGFX9(ST)) {
} else if (!ST.hasSDWAClampVOPC()) {
// No clamp allowed on GFX9 for VOPC
const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
if (Clamp != nullptr &&
Expand Down
87 changes: 59 additions & 28 deletions llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
Expand Up @@ -67,9 +67,9 @@ class SIPeepholeSDWA : public MachineFunctionPass {

bool runOnMachineFunction(MachineFunction &MF) override;
void matchSDWAOperands(MachineFunction &MF);
bool isConvertibleToSDWA(const MachineInstr &MI) const;
bool isConvertibleToSDWA(const MachineInstr &MI, const SISubtarget &ST) const;
bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
void legalizeScalarOperands(MachineInstr &MI) const;
void legalizeScalarOperands(MachineInstr &MI, const SISubtarget &ST) const;

StringRef getPassName() const override { return "SI Peephole SDWA"; }

Expand Down Expand Up @@ -607,24 +607,38 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
}
}

bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI) const {
bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI,
const SISubtarget &ST) const {
// Check if this instruction has opcode that supports SDWA
unsigned Opc = MI.getOpcode();
if (AMDGPU::getSDWAOp(Opc) != -1)
return true;
int Opc32 = AMDGPU::getVOPe32(Opc);
if (Opc32 != -1 && AMDGPU::getSDWAOp(Opc32) != -1) {
if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
return false;
int Opc = MI.getOpcode();
if (AMDGPU::getSDWAOp(Opc) == -1)
Opc = AMDGPU::getVOPe32(Opc);

if (Opc == -1 || AMDGPU::getSDWAOp(Opc) == -1)
return false;

if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
return false;

if (TII->isVOPC(Opc)) {
if (TII->isVOPC(Opc)) {
if (!ST.hasSDWASdst()) {
const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
return SDst && SDst->getReg() == AMDGPU::VCC;
} else {
return !TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
if (SDst && SDst->getReg() != AMDGPU::VCC)
return false;
}

if (!ST.hasSDWAClampVOPC() && TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
return false;

} else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
return false;
}
return false;

if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_MAC_F16_e32 ||
Opc == AMDGPU::V_MAC_F32_e32))
return false;

return true;
}

bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
Expand Down Expand Up @@ -690,13 +704,23 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
SDWAInst.add(*Src2);
}

// Initialize clamp.
if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1)
// Copy clamp if present, initialize otherwise
assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1);
MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
if (Clamp) {
SDWAInst.add(*Clamp);
} else {
SDWAInst.addImm(0);
}

// Initialize omod.
if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1)
// Copy omod if present, initialize otherwise if needed
MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
if (OMod) {
assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1);
SDWAInst.add(*OMod);
} else if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1) {
SDWAInst.addImm(0);
}

// Initialize dst_sel and dst_unused if present
if (Dst) {
Expand Down Expand Up @@ -750,16 +774,25 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
}

// If an instruction was converted to SDWA it should not have immediates or SGPR
// operands. Copy its scalar operands into VGPRs.
void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI) const {
// operands (allowed one SGPR on GFX9). Copy its scalar operands into VGPRs.
void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, const SISubtarget &ST) const {
const MCInstrDesc &Desc = TII->get(MI.getOpcode());
for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
MachineOperand &Op = MI.getOperand(I);
unsigned ConstantBusCount = 0;
for (MachineOperand &Op: MI.explicit_uses()) {
if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
continue;

unsigned I = MI.getOperandNo(&Op);
if (Desc.OpInfo[I].RegClass == -1 ||
!TRI->hasVGPRs(TRI->getRegClass(Desc.OpInfo[I].RegClass)))
continue;

if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
TRI->isSGPRReg(*MRI, Op.getReg())) {
++ConstantBusCount;
continue;
}

unsigned VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
Expand All @@ -775,10 +808,8 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI) const {
bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();

if (!ST.hasSDWA() ||
!AMDGPU::isVI(ST)) { // TODO: Add support for SDWA on gfx9
if (!ST.hasSDWA())
return false;
}

MRI = &MF.getRegInfo();
TRI = ST.getRegisterInfo();
Expand All @@ -790,7 +821,7 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
for (const auto &OperandPair : SDWAOperands) {
const auto &Operand = OperandPair.second;
MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
if (PotentialMI && isConvertibleToSDWA(*PotentialMI)) {
if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
PotentialMatches[PotentialMI].push_back(Operand.get());
}
}
Expand All @@ -805,7 +836,7 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {

bool Ret = !ConvertedInstructions.empty();
while (!ConvertedInstructions.empty())
legalizeScalarOperands(*ConvertedInstructions.pop_back_val());
legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);

return Ret;
}
4 changes: 0 additions & 4 deletions llvm/lib/Target/AMDGPU/VOPInstructions.td
Expand Up @@ -401,10 +401,6 @@ class VOP_SDWA_Real <VOP_SDWA_Pseudo ps> :
let Constraints = ps.Constraints;
let DisableEncoding = ps.DisableEncoding;

// string Mnemonic = ps.Mnemonic;
// string AsmOperands = ps.AsmOperands;
// string AsmOperands9 = ps.AsmOperands9;

// Copy relevant pseudo op flags
let SubtargetPredicate = ps.SubtargetPredicate;
let AssemblerPredicate = ps.AssemblerPredicate;
Expand Down
7 changes: 3 additions & 4 deletions llvm/test/CodeGen/AMDGPU/fneg.f16.ll
Expand Up @@ -134,11 +134,10 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x
; CI-DAG: v_mul_f32_e32 v{{[0-9]+}}, -4.0, v{{[0-9]+}}
; CI-DAG: v_sub_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}

; GFX9: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[VAL]]
; GFX89-DAG: v_mul_f16_e32 v{{[0-9]+}}, -4.0, [[VAL]]
; GFX9-DAG: v_sub_f16_e32 v{{[0-9]+}}, 2.0, [[ELT1]]
; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000
; VI-DAG: v_sub_f16_sdwa v{{[0-9]+}}, [[CONST2]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX89-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000
; GFX89-DAG: v_sub_f16_sdwa v{{[0-9]+}}, [[CONST2]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1

define amdgpu_kernel void @v_extract_fneg_fold_v2f16(<2 x half> addrspace(1)* %in) #0 {
%val = load <2 x half>, <2 x half> addrspace(1)* %in
%fneg = fsub <2 x half> <half -0.0, half -0.0>, %val
Expand Down
15 changes: 7 additions & 8 deletions llvm/test/CodeGen/AMDGPU/fpext.f16.ll
@@ -1,6 +1,6 @@
; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI -check-prefix=SIGFX9 %s
; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s
; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=SIGFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s

; GCN-LABEL: {{^}}fpext_f16_to_f32
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
Expand Down Expand Up @@ -35,11 +35,10 @@ entry:

; GCN-LABEL: {{^}}fpext_v2f16_to_v2f32
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; GFX9-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; GCN-DAG: v_cvt_f32_f16_e32 v[[R_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SIGFX9: v_cvt_f32_f16_e32 v[[R_F32_1:[0-9]+]], v[[A_F16_1]]
; VI: v_cvt_f32_f16_sdwa v[[R_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; SI: v_cvt_f32_f16_e32 v[[R_F32_1:[0-9]+]], v[[A_F16_1]]
; GFX89: v_cvt_f32_f16_sdwa v[[R_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GCN: buffer_store_dwordx2 v{{\[}}[[R_F32_0]]:[[R_F32_1]]{{\]}}
; GCN: s_endpgm

Expand All @@ -55,9 +54,9 @@ entry:

; GCN-LABEL: {{^}}fpext_v2f16_to_v2f64
; GCN: buffer_load_dword
; SIGFX9-DAG: v_lshrrev_b32_e32
; SIGFX9-DAG: v_cvt_f32_f16_e32
; VI: v_cvt_f32_f16_sdwa
; SI-DAG: v_lshrrev_b32_e32
; SI-DAG: v_cvt_f32_f16_e32
; GFX89: v_cvt_f32_f16_sdwa
; GCN: v_cvt_f32_f16_e32

; GCN: v_cvt_f64_f32_e32
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
Expand Up @@ -42,13 +42,13 @@ entry:
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]

; GFX9: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
; GFX9: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; GFX9: v_rndne_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]]
; GFX9: v_rndne_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
; GFX9: v_lshl_or_b32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], 16, v[[R_F16_LO]]

; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm

define amdgpu_kernel void @rint_v2f16(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a) {
Expand Down

0 comments on commit 3c4933f

Please sign in to comment.