162 changes: 130 additions & 32 deletions llvm/lib/Target/AMDGPU/MIMGInstructions.td

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp
@@ -77,7 +77,7 @@ bool SIAddIMGInit::runOnMachineFunction(MachineFunction &MF) {
if (!TFE && !LWE) // intersect_ray
continue;

unsigned TFEVal = TFE->getImm();
unsigned TFEVal = TFE ? TFE->getImm() : 0;
unsigned LWEVal = LWE->getImm();
unsigned D16Val = D16 ? D16->getImm() : 0;

19 changes: 14 additions & 5 deletions llvm/lib/Target/AMDGPU/SIDefines.h
@@ -142,6 +142,8 @@ namespace AMDGPU {
OPERAND_REG_IMM_FP16,
OPERAND_REG_IMM_V2FP16,
OPERAND_REG_IMM_V2INT16,
OPERAND_REG_IMM_V2INT32,
OPERAND_REG_IMM_V2FP32,

/// Operands with register or inline constant
OPERAND_REG_INLINE_C_INT16,
@@ -150,25 +152,30 @@
OPERAND_REG_INLINE_C_FP16,
OPERAND_REG_INLINE_C_FP32,
OPERAND_REG_INLINE_C_FP64,
OPERAND_REG_INLINE_C_V2FP16,
OPERAND_REG_INLINE_C_V2INT16,
OPERAND_REG_INLINE_C_V2FP16,
OPERAND_REG_INLINE_C_V2INT32,
OPERAND_REG_INLINE_C_V2FP32,

/// Operands with an AccVGPR register or inline constant
OPERAND_REG_INLINE_AC_INT16,
OPERAND_REG_INLINE_AC_INT32,
OPERAND_REG_INLINE_AC_FP16,
OPERAND_REG_INLINE_AC_FP32,
OPERAND_REG_INLINE_AC_V2FP16,
OPERAND_REG_INLINE_AC_FP64,
OPERAND_REG_INLINE_AC_V2INT16,
OPERAND_REG_INLINE_AC_V2FP16,
OPERAND_REG_INLINE_AC_V2INT32,
OPERAND_REG_INLINE_AC_V2FP32,

OPERAND_REG_IMM_FIRST = OPERAND_REG_IMM_INT32,
OPERAND_REG_IMM_LAST = OPERAND_REG_IMM_V2INT16,
OPERAND_REG_IMM_LAST = OPERAND_REG_IMM_V2FP32,

OPERAND_REG_INLINE_C_FIRST = OPERAND_REG_INLINE_C_INT16,
OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_AC_V2INT16,
OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_AC_V2FP32,

OPERAND_REG_INLINE_AC_FIRST = OPERAND_REG_INLINE_AC_INT16,
OPERAND_REG_INLINE_AC_LAST = OPERAND_REG_INLINE_AC_V2INT16,
OPERAND_REG_INLINE_AC_LAST = OPERAND_REG_INLINE_AC_V2FP32,

OPERAND_SRC_FIRST = OPERAND_REG_IMM_INT32,
OPERAND_SRC_LAST = OPERAND_REG_INLINE_C_LAST,
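The *_LAST sentinels are retargeted to the new V2FP32 enumerators so that existing range checks automatically cover the additions. A minimal standalone sketch of that idiom, mirroring the widened test that SIFoldOperands::tryToFoldACImm performs later in this patch (the numeric values below are illustrative stand-ins, not the real enumerators):

```cpp
#include <cassert>
#include <cstdint>

// Stand-in values for illustration only; the real ones live in SIDefines.h.
constexpr uint8_t OPERAND_REG_INLINE_C_FIRST = 8;
constexpr uint8_t OPERAND_REG_INLINE_C_LAST = 25;
constexpr uint8_t OPERAND_REG_INLINE_AC_FIRST = 16;
constexpr uint8_t OPERAND_REG_INLINE_AC_LAST = 25;

// Accept both the AccVGPR (AC) range and the plain inline-constant (C) range.
static bool acceptsInlineConstant(uint8_t OpTy) {
  return (OpTy >= OPERAND_REG_INLINE_AC_FIRST &&
          OpTy <= OPERAND_REG_INLINE_AC_LAST) ||
         (OpTy >= OPERAND_REG_INLINE_C_FIRST &&
          OpTy <= OPERAND_REG_INLINE_C_LAST);
}

int main() {
  assert(acceptsInlineConstant(OPERAND_REG_INLINE_C_FIRST));
  assert(!acceptsInlineConstant(0)); // e.g. an OPERAND_REG_IMM_* type
  return 0;
}
```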
@@ -680,6 +687,8 @@ enum DppCtrl : unsigned {
BCAST31 = 0x143,
DPP_UNUSED8_FIRST = 0x144,
DPP_UNUSED8_LAST = 0x14F,
ROW_NEWBCAST_FIRST= 0x150,
ROW_NEWBCAST_LAST = 0x15F,
ROW_SHARE_FIRST = 0x150,
ROW_SHARE_LAST = 0x15F,
ROW_XMASK_FIRST = 0x160,
225 changes: 223 additions & 2 deletions llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -97,6 +97,9 @@ class SIFoldOperands : public MachineFunctionPass {

std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
bool tryFoldOMod(MachineInstr &MI);
bool tryFoldRegSequence(MachineInstr &MI);
bool tryFoldLCSSAPhi(MachineInstr &MI);
bool tryFoldLoad(MachineInstr &MI);

public:
SIFoldOperands() : MachineFunctionPass(ID) {
@@ -135,6 +138,8 @@ static unsigned macToMad(unsigned Opc) {
return AMDGPU::V_FMA_F16_gfx9_e64;
case AMDGPU::V_FMAC_LEGACY_F32_e64:
return AMDGPU::V_FMA_LEGACY_F32_e64;
case AMDGPU::V_FMAC_F64_e64:
return AMDGPU::V_FMA_F64_e64;
}
return AMDGPU::INSTRUCTION_LIST_END;
}
@@ -531,8 +536,10 @@ static bool tryToFoldACImm(const SIInstrInfo *TII,
return false;

uint8_t OpTy = OpInfo[UseOpIdx].OperandType;
if (OpTy < AMDGPU::OPERAND_REG_INLINE_AC_FIRST ||
OpTy > AMDGPU::OPERAND_REG_INLINE_AC_LAST)
if ((OpTy < AMDGPU::OPERAND_REG_INLINE_AC_FIRST ||
OpTy > AMDGPU::OPERAND_REG_INLINE_AC_LAST) &&
(OpTy < AMDGPU::OPERAND_REG_INLINE_C_FIRST ||
OpTy > AMDGPU::OPERAND_REG_INLINE_C_LAST))
return false;

if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy) &&
@@ -554,6 +561,19 @@
return false;

MachineRegisterInfo &MRI = UseMI->getParent()->getParent()->getRegInfo();

// Maybe it is just a COPY of an immediate itself.
MachineInstr *Def = MRI.getUniqueVRegDef(UseReg);
MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
if (!UseOp.getSubReg() && Def && TII->isFoldableCopy(*Def)) {
MachineOperand &DefOp = Def->getOperand(1);
if (DefOp.isImm() && TII->isInlineConstant(DefOp, OpTy) &&
TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {
UseMI->getOperand(UseOpIdx).ChangeToImmediate(DefOp.getImm());
return true;
}
}

SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
if (!getRegSeqInit(Defs, UseReg, OpTy, TII, MRI))
return false;
@@ -825,6 +845,10 @@ void SIFoldOperands::foldOperand(
else if (TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
TRI->isAGPR(*MRI, UseMI->getOperand(1).getReg()))
UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64));
else if (ST->hasGFX90AInsts() &&
TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
TRI->isAGPR(*MRI, UseMI->getOperand(1).getReg()))
UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_MOV_B32));
return;
}

@@ -1502,6 +1526,194 @@ bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
return true;
}

// Try to fold a reg_sequence with vgpr output and agpr inputs into an
// instruction which can take an agpr. So far that means a store.
bool SIFoldOperands::tryFoldRegSequence(MachineInstr &MI) {
assert(MI.isRegSequence());
auto Reg = MI.getOperand(0).getReg();

if (!ST->hasGFX90AInsts() || !TRI->isVGPR(*MRI, Reg) ||
!MRI->hasOneNonDBGUse(Reg))
return false;

SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
if (!getRegSeqInit(Defs, Reg, MCOI::OPERAND_REGISTER, TII, *MRI))
return false;

for (auto &Def : Defs) {
const auto *Op = Def.first;
if (!Op->isReg())
return false;
if (TRI->isAGPR(*MRI, Op->getReg()))
continue;
// Maybe this is a COPY from AREG
const MachineInstr *SubDef = MRI->getUniqueVRegDef(Op->getReg());
if (!SubDef || !SubDef->isCopy() || SubDef->getOperand(1).getSubReg())
return false;
if (!TRI->isAGPR(*MRI, SubDef->getOperand(1).getReg()))
return false;
}

MachineOperand *Op = &*MRI->use_nodbg_begin(Reg);
MachineInstr *UseMI = Op->getParent();
while (UseMI->isCopy() && !Op->getSubReg()) {
Reg = UseMI->getOperand(0).getReg();
if (!TRI->isVGPR(*MRI, Reg) || !MRI->hasOneNonDBGUse(Reg))
return false;
Op = &*MRI->use_nodbg_begin(Reg);
UseMI = Op->getParent();
}

if (Op->getSubReg())
return false;

unsigned OpIdx = Op - &UseMI->getOperand(0);
const MCInstrDesc &InstDesc = UseMI->getDesc();
const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
switch (OpInfo.RegClass) {
case AMDGPU::AV_32RegClassID: LLVM_FALLTHROUGH;
case AMDGPU::AV_64RegClassID: LLVM_FALLTHROUGH;
case AMDGPU::AV_96RegClassID: LLVM_FALLTHROUGH;
case AMDGPU::AV_128RegClassID: LLVM_FALLTHROUGH;
case AMDGPU::AV_160RegClassID:
break;
default:
return false;
}

const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
auto Dst = MRI->createVirtualRegister(NewDstRC);
auto RS = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
TII->get(AMDGPU::REG_SEQUENCE), Dst);

for (unsigned I = 0; I < Defs.size(); ++I) {
MachineOperand *Def = Defs[I].first;
Def->setIsKill(false);
if (TRI->isAGPR(*MRI, Def->getReg())) {
RS.add(*Def);
} else { // This is a copy
MachineInstr *SubDef = MRI->getUniqueVRegDef(Def->getReg());
SubDef->getOperand(1).setIsKill(false);
RS.addReg(SubDef->getOperand(1).getReg(), 0, Def->getSubReg());
}
RS.addImm(Defs[I].second);
}

Op->setReg(Dst);
if (!TII->isOperandLegal(*UseMI, OpIdx, Op)) {
Op->setReg(Reg);
RS->eraseFromParent();
return false;
}

LLVM_DEBUG(dbgs() << "Folded " << *RS << " into " << *UseMI << '\n');

return true;
}

// Try to hoist an AGPR to VGPR copy out of the loop across a LCSSA PHI.
// This should allow folding of an AGPR into a consumer which may support it.
// I.e.:
//
// loop: // loop:
// %1:vreg = COPY %0:areg // exit:
// exit: => // %1:areg = PHI %0:areg, %loop
// %2:vreg = PHI %1:vreg, %loop // %2:vreg = COPY %1:areg
bool SIFoldOperands::tryFoldLCSSAPhi(MachineInstr &PHI) {
assert(PHI.isPHI());

if (PHI.getNumExplicitOperands() != 3) // Single input LCSSA PHI
return false;

Register PhiIn = PHI.getOperand(1).getReg();
Register PhiOut = PHI.getOperand(0).getReg();
if (PHI.getOperand(1).getSubReg() ||
!TRI->isVGPR(*MRI, PhiIn) || !TRI->isVGPR(*MRI, PhiOut))
return false;

// A single use should not matter for correctness, but if it has another use
// inside the loop we may end up performing the copy twice in the worst case.
if (!MRI->hasOneNonDBGUse(PhiIn))
return false;

MachineInstr *Copy = MRI->getUniqueVRegDef(PhiIn);
if (!Copy || !Copy->isCopy())
return false;

Register CopyIn = Copy->getOperand(1).getReg();
if (!TRI->isAGPR(*MRI, CopyIn) || Copy->getOperand(1).getSubReg())
return false;

const TargetRegisterClass *ARC = MRI->getRegClass(CopyIn);
Register NewReg = MRI->createVirtualRegister(ARC);
PHI.getOperand(1).setReg(CopyIn);
PHI.getOperand(0).setReg(NewReg);

MachineBasicBlock *MBB = PHI.getParent();
BuildMI(*MBB, MBB->getFirstNonPHI(), Copy->getDebugLoc(),
TII->get(AMDGPU::COPY), PhiOut)
.addReg(NewReg, RegState::Kill);
Copy->eraseFromParent(); // We know this copy had a single use.

LLVM_DEBUG(dbgs() << "Folded " << PHI << '\n');

return true;
}
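A scalar analogy of the rewrite sketched in the comment above, in ordinary C++ rather than MIR (illustrative only): instead of converting the value inside the loop and carrying the converted copy out through the single-input phi, carry the raw value out and convert it once past the loop exit.

```cpp
#include <cassert>

// "Before": the conversion (the stand-in for the AGPR->VGPR COPY) happens
// inside the loop and its result is what leaves the loop.
static double before(const int *V, int N) {
  double Out = 0.0;
  for (int I = 0; I < N; ++I)
    Out = static_cast<double>(V[I]);
  return Out;
}

// "After": the raw value leaves the loop and is converted exactly once.
static double after(const int *V, int N) {
  int Raw = 0;
  for (int I = 0; I < N; ++I)
    Raw = V[I];
  return static_cast<double>(Raw);
}

int main() {
  int V[3] = {1, 2, 3};
  assert(before(V, 3) == after(V, 3));
  return 0;
}
```

The single-use check above plays the same role as having exactly one conversion here: with extra uses inside the loop, the copy could end up being executed twice.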

// Attempt to convert VGPR load to an AGPR load.
bool SIFoldOperands::tryFoldLoad(MachineInstr &MI) {
assert(MI.mayLoad());
if (!ST->hasGFX90AInsts() || !MI.getNumOperands())
return false;

MachineOperand &Def = MI.getOperand(0);
if (!Def.isDef())
return false;

Register DefReg = Def.getReg();

if (DefReg.isPhysical() || !TRI->isVGPR(*MRI, DefReg))
return false;

SmallVector<const MachineInstr*, 8> Users;
SmallVector<Register, 8> MoveRegs;
for (const MachineInstr &I : MRI->use_nodbg_instructions(DefReg)) {
Users.push_back(&I);
}
if (Users.empty())
return false;

// Check that all uses are a copy to an agpr or a reg_sequence producing an agpr.
while (!Users.empty()) {
const MachineInstr *I = Users.pop_back_val();
if (!I->isCopy() && !I->isRegSequence())
return false;
Register DstReg = I->getOperand(0).getReg();
if (TRI->isAGPR(*MRI, DstReg))
continue;
MoveRegs.push_back(DstReg);
for (const MachineInstr &U : MRI->use_nodbg_instructions(DstReg)) {
Users.push_back(&U);
}
}

const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC));
if (!TII->isOperandLegal(MI, 0, &Def)) {
MRI->setRegClass(DefReg, RC);
return false;
}

while (!MoveRegs.empty()) {
Register Reg = MoveRegs.pop_back_val();
MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)));
}

LLVM_DEBUG(dbgs() << "Folded " << MI << '\n');

return true;
}

bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
@@ -1529,6 +1741,15 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {

tryFoldInst(TII, &MI);

if (MI.isRegSequence() && tryFoldRegSequence(MI))
continue;

if (MI.isPHI() && tryFoldLCSSAPhi(MI))
continue;

if (MI.mayLoad() && tryFoldLoad(MI))
continue;

if (!TII->isFoldableCopy(MI)) {
// Saw an unknown clobber of m0, so we no longer know what it is.
if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
15 changes: 14 additions & 1 deletion llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
@@ -188,11 +188,24 @@ void SIFormMemoryClauses::forAllLanes(Register Reg, LaneBitmask LaneMask,
return MaskA.getHighestLane() > MaskB.getHighestLane();
});

MCRegister RepReg;
for (MCRegister R : *MRI->getRegClass(Reg)) {
if (!MRI->isReserved(R)) {
RepReg = R;
break;
}
}
if (!RepReg)
llvm_unreachable("Failed to find required allocatable register");

for (unsigned Idx : CoveringSubregs) {
LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(Idx);
if ((SubRegMask & ~LaneMask).any() || (SubRegMask & LaneMask).none())
continue;

if (MRI->isReserved(TRI->getSubReg(RepReg, Idx)))
continue;

Func(Idx);
LaneMask &= ~SubRegMask;
if (LaneMask.none())
@@ -261,7 +274,7 @@ bool SIFormMemoryClauses::checkPressure(const MachineInstr &MI,
// tracking does not account for the alignment requirements for SGPRs, or the
// fragmentation of registers the allocator will need to satisfy.
if (Occupancy >= MFI->getMinAllowedOccupancy() &&
MaxPressure.getVGPRNum() <= MaxVGPRs / 2 &&
MaxPressure.getVGPRNum(ST->hasGFX90AInsts()) <= MaxVGPRs / 2 &&
MaxPressure.getSGPRNum() <= MaxSGPRs / 2) {
LastRecordedOccupancy = Occupancy;
return true;
19 changes: 17 additions & 2 deletions llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -138,6 +138,7 @@ static void buildPrologSpill(const GCNSubtarget &ST, LivePhysRegs &LiveRegs,
.addImm(0) // glc
.addImm(0) // slc
.addImm(0) // dlc
.addImm(0) // scc
.addMemOperand(MMO);
return;
}
@@ -152,6 +153,7 @@
.addImm(0) // tfe
.addImm(0) // dlc
.addImm(0) // swz
.addImm(0) // scc
.addMemOperand(MMO);
return;
}
@@ -181,6 +183,7 @@ static void buildPrologSpill(const GCNSubtarget &ST, LivePhysRegs &LiveRegs,
.addImm(0) // glc
.addImm(0) // slc
.addImm(0) // dlc
.addImm(0) // scc
.addMemOperand(MMO);

if (!HasOffsetReg) {
@@ -207,6 +210,7 @@
.addImm(0) // tfe
.addImm(0) // dlc
.addImm(0) // swz
.addImm(0) // scc
.addMemOperand(MMO);
} else {
// No free register, use stack pointer and restore afterwards.
@@ -224,6 +228,7 @@
.addImm(0) // tfe
.addImm(0) // dlc
.addImm(0) // swz
.addImm(0) // scc
.addMemOperand(MMO);

BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_SUB_U32), SPReg)
@@ -257,6 +262,7 @@ static void buildEpilogReload(const GCNSubtarget &ST, LivePhysRegs &LiveRegs,
.addImm(0) // glc
.addImm(0) // slc
.addImm(0) // dlc
.addImm(0) // scc
.addMemOperand(MMO);
return;
}
@@ -275,6 +281,7 @@
.addImm(0) // glc
.addImm(0) // slc
.addImm(0) // dlc
.addImm(0) // scc
.addMemOperand(MMO);
return;
}
@@ -290,6 +297,7 @@
.addImm(0) // tfe
.addImm(0) // dlc
.addImm(0) // swz
.addImm(0) // scc
.addMemOperand(MMO);
return;
}
@@ -313,6 +321,7 @@
.addImm(0) // tfe
.addImm(0) // dlc
.addImm(0) // swz
.addImm(0) // scc
.addMemOperand(MMO);
}

@@ -1311,7 +1320,13 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
const SIRegisterInfo *TRI = ST.getRegisterInfo();

// Ignore the SGPRs the default implementation found.
SavedVGPRs.clearBitsNotInMask(TRI->getAllVGPRRegMask());
SavedVGPRs.clearBitsNotInMask(TRI->getAllVectorRegMask());

// Do not save AGPRs prior to GFX90A because there was no easy way to do so.
// In gfx908 there were no AGPR loads and stores, so spilling an AGPR
// also requires a temporary VGPR.
if (!ST.hasGFX90AInsts())
SavedVGPRs.clearBitsInMask(TRI->getAllAGPRRegMask());

// hasFP only knows about stack objects that already exist. We're now
// determining the stack slots that will be created, so we have to predict
@@ -1366,7 +1381,7 @@ void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
SavedRegs.reset(MFI->getStackPtrOffsetReg());

const BitVector AllSavedRegs = SavedRegs;
SavedRegs.clearBitsInMask(TRI->getAllVGPRRegMask());
SavedRegs.clearBitsInMask(TRI->getAllVectorRegMask());

// If clearing VGPRs changed the mask, we will have some CSR VGPR spills.
const bool HaveAnyCSRVGPR = SavedRegs != AllSavedRegs;
203 changes: 151 additions & 52 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -718,6 +718,19 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
setOperationAction(ISD::SELECT, MVT::v4i16, Custom);
setOperationAction(ISD::SELECT, MVT::v4f16, Custom);

if (Subtarget->hasPackedFP32Ops()) {
setOperationAction(ISD::FADD, MVT::v2f32, Legal);
setOperationAction(ISD::FMUL, MVT::v2f32, Legal);
setOperationAction(ISD::FMA, MVT::v2f32, Legal);
setOperationAction(ISD::FNEG, MVT::v2f32, Legal);

for (MVT VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32 }) {
setOperationAction(ISD::FADD, VT, Custom);
setOperationAction(ISD::FMUL, VT, Custom);
setOperationAction(ISD::FMA, VT, Custom);
}
}
}
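Only v2f32 is natively legal here; the wider f32 vector types are marked Custom so that splitBinaryVectorOp/splitTernaryVectorOp (extended later in this patch) can halve them until the 2-wide form remains. A rough host-side restatement of that splitting strategy, not the actual lowering code:

```cpp
#include <cassert>
#include <cstddef>

// The "legal" 2-wide operation, standing in for the packed v2f32 instruction.
static void addV2(const float *A, const float *B, float *R) {
  R[0] = A[0] + B[0];
  R[1] = A[1] + B[1];
}

// Recursively split an N-wide add into halves, mirroring how the Custom
// lowering splits v4/v8/v16/v32f32 operands.
static void addByHalves(const float *A, const float *B, float *R,
                        std::size_t N) {
  if (N == 2) {
    addV2(A, B, R);
    return;
  }
  std::size_t Half = N / 2;
  addByHalves(A, B, R, Half);
  addByHalves(A + Half, B + Half, R + Half, Half);
}

int main() {
  float A[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  float B[8] = {8, 7, 6, 5, 4, 3, 2, 1};
  float R[8];
  addByHalves(A, B, R, 8);
  for (float V : R)
    assert(V == 9.0f);
  return 0;
}
```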

setOperationAction(ISD::FNEG, MVT::v4f16, Custom);
@@ -1128,17 +1141,6 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
MachineMemOperand::MOVolatile;
return true;
}
case Intrinsic::amdgcn_global_atomic_fadd: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(CI.getType());
Info.ptrVal = CI.getOperand(0);
Info.align.reset();
Info.flags = MachineMemOperand::MOLoad |
MachineMemOperand::MOStore |
MachineMemOperand::MODereferenceable |
MachineMemOperand::MOVolatile;
return true;
}
case Intrinsic::amdgcn_image_bvh_intersect_ray: {
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
Info.opc = ISD::INTRINSIC_W_CHAIN;
@@ -1150,6 +1152,22 @@
MachineMemOperand::MODereferenceable;
return true;
}
case Intrinsic::amdgcn_global_atomic_fadd:
case Intrinsic::amdgcn_global_atomic_fmin:
case Intrinsic::amdgcn_global_atomic_fmax:
case Intrinsic::amdgcn_flat_atomic_fadd:
case Intrinsic::amdgcn_flat_atomic_fmin:
case Intrinsic::amdgcn_flat_atomic_fmax: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(CI.getType());
Info.ptrVal = CI.getOperand(0);
Info.align.reset();
Info.flags = MachineMemOperand::MOLoad |
MachineMemOperand::MOStore |
MachineMemOperand::MODereferenceable |
MachineMemOperand::MOVolatile;
return true;
}
case Intrinsic::amdgcn_ds_gws_init:
case Intrinsic::amdgcn_ds_gws_barrier:
case Intrinsic::amdgcn_ds_gws_sema_v:
@@ -1191,6 +1209,9 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax:
case Intrinsic::amdgcn_global_atomic_fadd:
case Intrinsic::amdgcn_flat_atomic_fadd:
case Intrinsic::amdgcn_flat_atomic_fmin:
case Intrinsic::amdgcn_flat_atomic_fmax:
case Intrinsic::amdgcn_global_atomic_csub: {
Value *Ptr = II->getArgOperand(0);
AccessTy = II->getType();
@@ -1799,23 +1820,37 @@ void SITargetLowering::allocateSpecialEntryInputVGPRs(CCState &CCInfo,
MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);

CCInfo.AllocateReg(Reg);
Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));
unsigned Mask = (Subtarget->hasPackedTID() &&
Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
}

if (Info.hasWorkItemIDY()) {
Register Reg = AMDGPU::VGPR1;
MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
assert(Info.hasWorkItemIDX());
if (Subtarget->hasPackedTID()) {
Info.setWorkItemIDY(ArgDescriptor::createRegister(AMDGPU::VGPR0,
0x3ff << 10));
} else {
unsigned Reg = AMDGPU::VGPR1;
MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);

CCInfo.AllocateReg(Reg);
Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
CCInfo.AllocateReg(Reg);
Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
}
}

if (Info.hasWorkItemIDZ()) {
Register Reg = AMDGPU::VGPR2;
MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
if (Subtarget->hasPackedTID()) {
Info.setWorkItemIDZ(ArgDescriptor::createRegister(AMDGPU::VGPR0,
0x3ff << 20));
} else {
unsigned Reg = AMDGPU::VGPR2;
MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);

CCInfo.AllocateReg(Reg);
Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
CCInfo.AllocateReg(Reg);
Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
}
}
}
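With packed TIDs all three workitem IDs arrive in VGPR0, and the ArgDescriptor masks above (0x3ff, 0x3ff << 10, 0x3ff << 20) simply select the relevant bit field. A standalone sketch of that layout; the helper below is illustrative and not part of the backend:

```cpp
#include <cassert>
#include <cstdint>

struct WorkItemId {
  uint32_t X, Y, Z;
};

// Decode the packed-TID layout implied by the masks above.
static WorkItemId unpackTID(uint32_t VGPR0) {
  return {VGPR0 & 0x3ffu,           // bits [9:0]   -> workitem id x
          (VGPR0 >> 10) & 0x3ffu,   // bits [19:10] -> workitem id y
          (VGPR0 >> 20) & 0x3ffu};  // bits [29:20] -> workitem id z
}

int main() {
  uint32_t Packed = 5u | (7u << 10) | (3u << 20);
  WorkItemId Id = unpackTID(Packed);
  assert(Id.X == 5 && Id.Y == 7 && Id.Z == 3);
  return 0;
}
```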

@@ -4380,7 +4415,8 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
SelectionDAG &DAG) const {
unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();
assert(VT == MVT::v4i16 || VT == MVT::v4f16);
assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32);

SDValue Lo0, Hi0;
std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
@@ -4401,7 +4437,8 @@ SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
SelectionDAG &DAG) const {
unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();
assert(VT == MVT::v4i16 || VT == MVT::v4f16);
assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32);

SDValue Lo0, Hi0;
std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
@@ -6168,6 +6205,8 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
if (IsGFX10Plus)
Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
Ops.push_back(Unorm);
if (!IsGFX10Plus)
Ops.push_back(DAG.getTargetConstant(0, SDLoc(), MVT::i1));
if (IsGFX10Plus)
Ops.push_back(DLC);
Ops.push_back(GLC);
@@ -6176,8 +6215,12 @@
ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
if (IsGFX10Plus)
Ops.push_back(IsA16 ? True : False);
Ops.push_back(TFE);
Ops.push_back(LWE);
if (!Subtarget->hasGFX90AInsts()) {
Ops.push_back(TFE); //tfe
} else if (cast<ConstantSDNode>(TFE)->getZExtValue()) {
report_fatal_error("TFE is not supported on this GPU");
}
Ops.push_back(LWE); // lwe
if (!IsGFX10Plus)
Ops.push_back(DimInfo->DA ? True : False);
if (BaseOpcode->HasD16)
@@ -6195,7 +6238,15 @@
: AMDGPU::MIMGEncGfx10Default,
NumVDataDwords, NumVAddrDwords);
} else {
if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
if (Subtarget->hasGFX90AInsts()) {
Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
NumVDataDwords, NumVAddrDwords);
if (Opcode == -1)
report_fatal_error(
"requested image instruction is not supported on this GPU");
}
if (Opcode == -1 &&
Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
NumVDataDwords, NumVAddrDwords);
if (Opcode == -1)
@@ -7062,7 +7113,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
break;
case Intrinsic::amdgcn_buffer_atomic_fadd:
if (!Op.getValue(0).use_empty()) {
if (!Op.getValue(0).use_empty() && !Subtarget->hasGFX90AInsts()) {
DiagnosticInfoUnsupported
NoFpRet(DAG.getMachineFunction().getFunction(),
"return versions of fp atomics not supported",
@@ -7083,6 +7134,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
case Intrinsic::amdgcn_raw_buffer_atomic_swap:
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
case Intrinsic::amdgcn_raw_buffer_atomic_add:
@@ -7208,27 +7267,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
Op->getVTList(), Ops, VT, M->getMemOperand());
}
case Intrinsic::amdgcn_global_atomic_fadd: {
if (!Op.getValue(0).use_empty()) {
DiagnosticInfoUnsupported
NoFpRet(DAG.getMachineFunction().getFunction(),
"return versions of fp atomics not supported",
DL.getDebugLoc(), DS_Error);
DAG.getContext()->diagnose(NoFpRet);
return SDValue();
}
MemSDNode *M = cast<MemSDNode>(Op);
SDValue Ops[] = {
M->getOperand(0), // Chain
M->getOperand(2), // Ptr
M->getOperand(3) // Value
};

EVT VT = Op.getOperand(3).getValueType();
return DAG.getAtomic(ISD::ATOMIC_LOAD_FADD, DL, VT,
DAG.getVTList(VT, MVT::Other), Ops,
M->getMemOperand());
}
case Intrinsic::amdgcn_image_bvh_intersect_ray: {
SDLoc DL(Op);
MemSDNode *M = cast<MemSDNode>(Op);
@@ -7299,7 +7337,55 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
DAG.setNodeMemRefs(NewNode, {MemRef});
return SDValue(NewNode, 0);
}
case Intrinsic::amdgcn_global_atomic_fadd:
if (!Op.getValue(0).use_empty() && !Subtarget->hasGFX90AInsts()) {
DiagnosticInfoUnsupported
NoFpRet(DAG.getMachineFunction().getFunction(),
"return versions of fp atomics not supported",
DL.getDebugLoc(), DS_Error);
DAG.getContext()->diagnose(NoFpRet);
return SDValue();
}
LLVM_FALLTHROUGH;
case Intrinsic::amdgcn_global_atomic_fmin:
case Intrinsic::amdgcn_global_atomic_fmax:
case Intrinsic::amdgcn_flat_atomic_fadd:
case Intrinsic::amdgcn_flat_atomic_fmin:
case Intrinsic::amdgcn_flat_atomic_fmax: {
MemSDNode *M = cast<MemSDNode>(Op);
SDValue Ops[] = {
M->getOperand(0), // Chain
M->getOperand(2), // Ptr
M->getOperand(3) // Value
};
unsigned Opcode = 0;
switch (IntrID) {
case Intrinsic::amdgcn_global_atomic_fadd:
case Intrinsic::amdgcn_flat_atomic_fadd: {
EVT VT = Op.getOperand(3).getValueType();
return DAG.getAtomic(ISD::ATOMIC_LOAD_FADD, DL, VT,
DAG.getVTList(VT, MVT::Other), Ops,
M->getMemOperand());
}
case Intrinsic::amdgcn_global_atomic_fmin:
case Intrinsic::amdgcn_flat_atomic_fmin: {
Opcode = AMDGPUISD::ATOMIC_LOAD_FMIN;
break;
}
case Intrinsic::amdgcn_global_atomic_fmax:
case Intrinsic::amdgcn_flat_atomic_fmax: {
Opcode = AMDGPUISD::ATOMIC_LOAD_FMAX;
break;
}
default:
llvm_unreachable("unhandled atomic opcode");
}
return DAG.getMemIntrinsicNode(Opcode, SDLoc(Op),
M->getVTList(), Ops, M->getMemoryVT(),
M->getMemOperand());
}
default:

if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrID))
return lowerImage(Op, ImageDimIntr, DAG, true);
Expand Down Expand Up @@ -10813,7 +10899,7 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
unsigned NewDmask = 0;
unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
bool UsesTFC = (Node->getConstantOperandVal(TFEIdx) ||
bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
Node->getConstantOperandVal(LWEIdx)) ? 1 : 0;
unsigned TFCLane = 0;
bool HasChain = Node->getNumValues() > 1;
@@ -11768,26 +11854,39 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
if (Ty->isHalfTy())
return AtomicExpansionKind::None;

if (!Ty->isFloatTy())
if (!Ty->isFloatTy() && (!Subtarget->hasGFX90AInsts() || !Ty->isDoubleTy()))
return AtomicExpansionKind::CmpXChg;

// TODO: Do have these for flat. Older targets also had them for buffers.
unsigned AS = RMW->getPointerAddressSpace();

if (AS == AMDGPUAS::GLOBAL_ADDRESS && Subtarget->hasAtomicFaddInsts()) {
if ((AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) &&
Subtarget->hasAtomicFaddInsts()) {
if (!fpModeMatchesGlobalFPAtomicMode(RMW) ||
RMW->getFunction()->getFnAttribute("amdgpu-unsafe-fp-atomics")
.getValueAsString() != "true")
return AtomicExpansionKind::CmpXChg;

if (Subtarget->hasGFX90AInsts())
return (Ty->isFloatTy() && AS == AMDGPUAS::FLAT_ADDRESS) ?
AtomicExpansionKind::CmpXChg : AtomicExpansionKind::None;

if (!Subtarget->hasGFX90AInsts() && AS != AMDGPUAS::GLOBAL_ADDRESS)
return AtomicExpansionKind::CmpXChg;

return RMW->use_empty() ? AtomicExpansionKind::None :
AtomicExpansionKind::CmpXChg;
}

// DS FP atomics do respect the denormal mode, but the rounding mode is fixed
// to round-to-nearest-even.
return (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) ?
AtomicExpansionKind::None : AtomicExpansionKind::CmpXChg;
// The only exception is DS_ADD_F64 which never flushes regardless of mode.
if (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) {
return (Ty->isDoubleTy() && !fpModeMatchesGlobalFPAtomicMode(RMW)) ?
AtomicExpansionKind::CmpXChg : AtomicExpansionKind::None;
}

return AtomicExpansionKind::CmpXChg;
}
default:
break;
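Whenever shouldExpandAtomicRMWInIR answers CmpXChg above, the generic AtomicExpand pass rewrites the FP atomicrmw into a compare-exchange retry loop. A host-side C++ analogue of that expansion (illustrative; std::atomic stands in for the generated IR):

```cpp
#include <atomic>
#include <cassert>

// Perform an atomic FP add by retrying a compare-exchange until no other
// thread has intervened; returns the previous value, like atomicrmw fadd.
static float atomicFAddViaCAS(std::atomic<float> &Obj, float Operand) {
  float Old = Obj.load();
  while (!Obj.compare_exchange_weak(Old, Old + Operand)) {
    // On failure, Old has been refreshed with the current value; retry.
  }
  return Old;
}

int main() {
  std::atomic<float> A{1.0f};
  float Prev = atomicFAddViaCAS(A, 2.5f);
  assert(Prev == 1.0f && A.load() == 3.5f);
  return 0;
}
```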
24 changes: 16 additions & 8 deletions llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -131,7 +131,8 @@ static const unsigned WaitEventMaskForInst[NUM_INST_CNTS] = {
// We reserve a fixed number of VGPR slots in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets.
SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
AGPR_OFFSET = 256, // Maximum programmable ArchVGPRs across all targets.
SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
NUM_EXTRA_VGPRS = 1, // A reserved slot for DS.
EXTRA_VGPR_LDS = 0, // This is a placeholder the Shader algorithm uses.
@@ -451,8 +452,7 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
const SIRegisterInfo *TRI,
unsigned OpNo) const {
const MachineOperand &Op = MI->getOperand(OpNo);
assert(Op.isReg());
if (!TRI->isInAllocatableClass(Op.getReg()) || TRI->isAGPR(*MRI, Op.getReg()))
if (!TRI->isInAllocatableClass(Op.getReg()))
return {-1, -1};

// A use via a PW operand does not need a waitcnt.
@@ -463,9 +463,11 @@

unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST));

if (TRI->isVGPR(*MRI, Op.getReg())) {
if (TRI->isVectorRegister(*MRI, Op.getReg())) {
assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
Result.first = Reg - RegisterEncoding.VGPR0;
if (TRI->isAGPR(*MRI, Op.getReg()))
Result.first += AGPR_OFFSET;
assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
} else if (TRI->isSGPRReg(*MRI, Op.getReg())) {
assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
@@ -491,7 +493,7 @@
const MachineRegisterInfo *MRI, unsigned OpNo,
unsigned Val) {
RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo);
assert(TRI->isVGPR(*MRI, MI->getOperand(OpNo).getReg()));
assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg()));
for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
setRegScore(RegNo, EXP_CNT, Val);
}
@@ -549,7 +551,8 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
const MachineOperand &Op = Inst.getOperand(I);
if (Op.isReg() && !Op.isDef() && TRI->isVGPR(*MRI, Op.getReg())) {
if (Op.isReg() && !Op.isDef() &&
TRI->isVectorRegister(*MRI, Op.getReg())) {
setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
}
}
@@ -606,7 +609,8 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
}
for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
MachineOperand &MO = Inst.getOperand(I);
if (MO.isReg() && !MO.isDef() && TRI->isVGPR(*MRI, MO.getReg())) {
if (MO.isReg() && !MO.isDef() &&
TRI->isVectorRegister(*MRI, MO.getReg())) {
setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
}
}
@@ -1003,7 +1007,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
RegInterval Interval =
ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I);

const bool IsVGPR = TRI->isVGPR(*MRI, Op.getReg());
const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
if (IsVGPR) {
// RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
@@ -1208,6 +1212,10 @@ bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
if (!TII->usesLGKM_CNT(MI))
return false;

// If in tgsplit mode then there can be no use of LDS.
if (ST->isTgSplitEnabled())
return false;

// If there are no memory operands then conservatively assume the flat
// operation may access LDS.
if (MI.memoperands_empty())
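The doubled SQ_MAX_PGM_VGPRS and the new AGPR_OFFSET give ArchVGPRs and AccVGPRs disjoint halves of one scoreboard, which is what the `Result.first += AGPR_OFFSET` adjustment in getRegInterval relies on. A small illustrative restatement of that indexing (constants copied from the enum above; everything else is a sketch):

```cpp
#include <cassert>

constexpr int SQ_MAX_PGM_VGPRS = 512;
constexpr int AGPR_OFFSET = 256;

// Map a vector register to its waitcnt scoreboard slot: ArchVGPRs occupy the
// low half, AccVGPRs are shifted into the high half.
static int scoreboardSlot(int EncodingMinusVGPR0, bool IsAGPR) {
  int Slot = EncodingMinusVGPR0 + (IsAGPR ? AGPR_OFFSET : 0);
  assert(Slot >= 0 && Slot < SQ_MAX_PGM_VGPRS);
  return Slot;
}

int main() {
  assert(scoreboardSlot(3, /*IsAGPR=*/false) == 3);   // v3
  assert(scoreboardSlot(3, /*IsAGPR=*/true) == 259);  // a3
  return 0;
}
```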
22 changes: 19 additions & 3 deletions llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -278,7 +278,7 @@ class VINTRPe <bits<2> op> : Enc32 {
}

class MIMGe : Enc64 {
bits<8> vdata;
bits<10> vdata;
bits<4> dmask;
bits<1> unorm;
bits<1> glc;
@@ -294,11 +294,10 @@
let Inst{12} = unorm;
let Inst{13} = glc;
let Inst{15} = r128;
let Inst{16} = tfe;
let Inst{17} = lwe;
let Inst{25} = slc;
let Inst{31-26} = 0x3c;
let Inst{47-40} = vdata;
let Inst{47-40} = vdata{7-0};
let Inst{52-48} = srsrc{6-2};
let Inst{57-53} = ssamp{6-2};
let Inst{63} = d16;
@@ -307,9 +306,25 @@
class MIMGe_gfx6789 <bits<8> op> : MIMGe {
bits<8> vaddr;
bits<1> da;
bits<1> sccb;

let Inst{0} = op{7};
let Inst{7} = sccb;
let Inst{14} = da;
let Inst{16} = tfe;
let Inst{24-18} = op{6-0};
let Inst{39-32} = vaddr;
}

class MIMGe_gfx90a <bits<8> op> : MIMGe {
bits<8> vaddr;
bits<1> da;
bits<1> sccb;

let Inst{0} = op{7};
let Inst{7} = sccb;
let Inst{14} = da;
let Inst{16} = vdata{9}; // ACC bit
let Inst{24-18} = op{6-0};
let Inst{39-32} = vaddr;
}
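Relative to MIMGe_gfx6789, the gfx90a encoding gives up the TFE bit: Inst{16} now carries vdata{9}, the flag that marks the data register as an AccVGPR (which is why the lowerImage change earlier in this diff reports a fatal error when TFE is requested on gfx90a). A hedged sketch of just these low-order fields, not the real encoder:

```cpp
#include <cassert>
#include <cstdint>

// Pack the gfx90a-specific MIMG fields shown in MIMGe_gfx90a above.
// Field positions follow the `let Inst{...}` assignments; the function
// itself is illustrative only.
static uint64_t encodeMIMGGfx90aLow(uint8_t Op, bool Sccb, bool Da,
                                    uint16_t VData /*10 bits*/,
                                    uint8_t VAddr) {
  uint64_t Inst = 0;
  Inst |= uint64_t(Op >> 7) << 0;           // Inst{0}     = op{7}
  Inst |= uint64_t(Sccb) << 7;              // Inst{7}     = sccb
  Inst |= uint64_t(Da) << 14;               // Inst{14}    = da
  Inst |= uint64_t((VData >> 9) & 1) << 16; // Inst{16}    = vdata{9} (ACC)
  Inst |= uint64_t(Op & 0x7f) << 18;        // Inst{24-18} = op{6-0}
  Inst |= uint64_t(VAddr) << 32;            // Inst{39-32} = vaddr
  Inst |= uint64_t(VData & 0xff) << 40;     // Inst{47-40} = vdata{7-0}
  return Inst;
}

int main() {
  // vdata = 0x205 has bit 9 set, so the encoded ACC bit (Inst{16}) is 1.
  uint64_t Enc = encodeMIMGGfx90aLow(0x22, false, false, 0x205, 4);
  assert((Enc >> 16) & 1);
  return 0;
}
```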
@@ -325,6 +340,7 @@ class MIMGe_gfx10 <bits<8> op> : MIMGe {
let Inst{2-1} = nsa;
let Inst{5-3} = dim;
let Inst{7} = dlc;
let Inst{16} = tfe;
let Inst{24-18} = op{6-0};
let Inst{39-32} = vaddr0;
let Inst{62} = a16;
323 changes: 289 additions & 34 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Large diffs are not rendered by default.

10 changes: 5 additions & 5 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -171,6 +171,10 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
return RI;
}

const GCNSubtarget &getSubtarget() const {
return ST;
}

bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
AAResults *AA) const override;

@@ -1085,11 +1089,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
const TargetRegisterClass *getRegClass(const MCInstrDesc &TID, unsigned OpNum,
const TargetRegisterInfo *TRI,
const MachineFunction &MF)
const override {
if (OpNum >= TID.getNumOperands())
return nullptr;
return RI.getRegClass(TID.OpInfo[OpNum].RegClass);
}
const override;

void fixImplicitOperands(MachineInstr &MI) const;

119 changes: 101 additions & 18 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -28,6 +28,7 @@ def SIEncodingFamily {
int GFX9 = 5;
int GFX10 = 6;
int SDWA10 = 7;
int GFX90A = 8;
}

//===----------------------------------------------------------------------===//
@@ -186,6 +187,8 @@ def SIbuffer_atomic_inc : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_INC">;
def SIbuffer_atomic_dec : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_DEC">;
def SIbuffer_atomic_csub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_CSUB">;
def SIbuffer_atomic_fadd : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FADD">;
def SIbuffer_atomic_fmin : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMIN">;
def SIbuffer_atomic_fmax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMAX">;

def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP",
SDTypeProfile<1, 9,
@@ -265,21 +268,25 @@ class isFloatType<ValueType SrcVT> {
!eq(SrcVT.Value, v2f16.Value),
!eq(SrcVT.Value, v4f16.Value),
!eq(SrcVT.Value, v2f32.Value),
!eq(SrcVT.Value, v2f64.Value));
!eq(SrcVT.Value, v2f64.Value),
!eq(SrcVT.Value, v4f64.Value));
}

class isIntType<ValueType SrcVT> {
bit ret = !or(!eq(SrcVT.Value, i16.Value),
!eq(SrcVT.Value, i32.Value),
!eq(SrcVT.Value, i64.Value));
!eq(SrcVT.Value, i64.Value),
!eq(SrcVT.Value, v2i32.Value));
}

class isPackedType<ValueType SrcVT> {
bit ret = !or(!eq(SrcVT.Value, v2i16.Value),
!eq(SrcVT.Value, v2f16.Value),
!eq(SrcVT.Value, v4f16.Value));
!eq(SrcVT.Value, v4f16.Value),
!eq(SrcVT.Value, v2f32.Value));
}


//===----------------------------------------------------------------------===//
// PatFrags for global memory operations
//===----------------------------------------------------------------------===//
@@ -822,6 +829,10 @@ def extract_swz : SDNodeXForm<timm, [{
return CurDAG->getTargetConstant((N->getZExtValue() >> 3) & 1, SDLoc(N), MVT::i8);
}]>;

def extract_sccb : SDNodeXForm<timm, [{
return CurDAG->getTargetConstant((N->getZExtValue() >> 4) & 1, SDLoc(N), MVT::i8);
}]>;
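Like extract_swz just above it, extract_sccb peels a single bit out of the combined cache-policy immediate: swz is bit 3 and the new scc bit is bit 4 (the remaining policy bits are handled by analogous transforms not shown in this hunk). Equivalent host-side helpers, for illustration only:

```cpp
#include <cassert>
#include <cstdint>

static uint8_t extractSwz(uint64_t CachePolicy) { return (CachePolicy >> 3) & 1; }
static uint8_t extractSccb(uint64_t CachePolicy) { return (CachePolicy >> 4) & 1; }

int main() {
  uint64_t Policy = (1u << 4) | (1u << 3); // swz and scc both set
  assert(extractSwz(Policy) == 1 && extractSccb(Policy) == 1);
  assert(extractSccb(1u << 3) == 0);
  return 0;
}
```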

//===----------------------------------------------------------------------===//
// Custom Operands
//===----------------------------------------------------------------------===//
@@ -1097,6 +1108,9 @@ def clampmod : NamedOperandBit<"ClampSI", NamedMatchClass<"ClampSI">>;
def clampmod0 : NamedOperandBit_0<"ClampSI", NamedMatchClass<"ClampSI">>;
def highmod : NamedOperandBit<"High", NamedMatchClass<"High">>;

def SCCB : NamedOperandBit<"SCCB", NamedMatchClass<"SCCB">>;
def SCCB_0 : NamedOperandBit_0<"SCCB", NamedMatchClass<"SCCB">>;

def DLC : NamedOperandBit<"DLC", NamedMatchClass<"DLC">>;
def DLC_0 : NamedOperandBit_0<"DLC", NamedMatchClass<"DLC">>;

@@ -1243,7 +1257,7 @@ def FP32SDWAInputMods : FPSDWAInputMods<FP32SDWAInputModsMatchClass>;
def FPVRegInputModsMatchClass : AsmOperandClass {
let Name = "VRegWithFPInputMods";
let ParserMethod = "parseRegWithFPInputMods";
let PredicateMethod = "isVReg32";
let PredicateMethod = "isVRegWithInputMods";
}

def FPVRegInputMods : InputMods <FPVRegInputModsMatchClass> {
@@ -1270,7 +1284,7 @@ def Int32SDWAInputMods : IntSDWAInputMods<Int32SDWAInputModsMatchClass>;
def IntVRegInputModsMatchClass : AsmOperandClass {
let Name = "VRegWithIntInputMods";
let ParserMethod = "parseRegWithIntInputMods";
let PredicateMethod = "isVReg32";
let PredicateMethod = "isVRegWithInputMods";
}

def IntVRegInputMods : InputMods <IntVRegInputModsMatchClass> {
@@ -1507,8 +1521,12 @@ class getVOP3SrcForVT<ValueType VT> {
VSrc_128,
!if(!eq(VT.Size, 64),
!if(isFP,
VSrc_f64,
VSrc_b64),
!if(!eq(VT.Value, v2f32.Value),
VSrc_v2f32,
VSrc_f64),
!if(!eq(VT.Value, v2i32.Value),
VSrc_v2b32,
VSrc_b64)),
!if(!eq(VT.Value, i1.Value),
SSrc_i1,
!if(isFP,
@@ -1541,7 +1559,9 @@ class isModifierType<ValueType SrcVT> {
!eq(SrcVT.Value, f32.Value),
!eq(SrcVT.Value, f64.Value),
!eq(SrcVT.Value, v2f16.Value),
!eq(SrcVT.Value, v2i16.Value));
!eq(SrcVT.Value, v2i16.Value),
!eq(SrcVT.Value, v2f32.Value),
!eq(SrcVT.Value, v2i32.Value));
}

// Return type of input modifiers operand for specified input operand
@@ -1972,14 +1992,29 @@ class getAsmSDWA9 <bit HasDst, bit HasOMod, int NumSrcArgs,
string ret = dst#args#sdwa;
}

class getHas64BitOps <int NumSrcArgs, ValueType DstVT, ValueType Src0VT,
ValueType Src1VT> {
bit ret = !if(!eq(NumSrcArgs, 3),
0,
!if(!eq(DstVT.Size, 64),
1,
!if(!eq(Src0VT.Size, 64),
1,
!if(!eq(Src1VT.Size, 64),
1,
0
)
)
)
);
}
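Unrolled, the nested !ifs in getHas64BitOps reduce to a simple predicate: any 64-bit destination or source operand counts, except for three-source (VOP3) instructions. The same logic as plain C++, an illustrative restatement rather than generated code:

```cpp
#include <cassert>

static bool has64BitOps(int NumSrcArgs, unsigned DstSize, unsigned Src0Size,
                        unsigned Src1Size) {
  if (NumSrcArgs == 3)
    return false;
  return DstSize == 64 || Src0Size == 64 || Src1Size == 64;
}

int main() {
  assert(has64BitOps(2, 64, 32, 32));  // 64-bit dst qualifies
  assert(!has64BitOps(3, 64, 64, 64)); // but never for three-source ops
  return 0;
}
```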

// Function that checks if instruction supports DPP and SDWA
class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
ValueType Src1VT = i32> {
class getHasSDWA <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
ValueType Src1VT = i32> {
bit ret = !if(!eq(NumSrcArgs, 3),
0, // NumSrcArgs == 3 - No DPP or SDWA for VOP3
0, // NumSrcArgs == 3 - No SDWA for VOP3
!if(!eq(DstVT.Size, 64),
0, // 64-bit dst - No DPP or SDWA for 64-bit operands
0, // 64-bit dst - No SDWA for 64-bit operands
!if(!eq(Src0VT.Size, 64),
0, // 64-bit src0
!if(!eq(Src1VT.Size, 64),
Expand All @@ -1993,8 +2028,42 @@ class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,

class getHasDPP <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
ValueType Src1VT = i32> {
bit ret = !if(!eq(NumSrcArgs, 0), 0,
getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret);
bit ret = !if(!eq(NumSrcArgs, 3),
0, // NumSrcArgs == 3 - No DPP for VOP3
1);
}

class getHasExt64BitDPP <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
ValueType Src1VT = i32> {
bit ret = !and(getHasDPP<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret,
getHas64BitOps<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret);
}

// Function that checks if instruction supports DPP and SDWA
class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
ValueType Src1VT = i32> {
bit ret = !or(getHasDPP<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret,
getHasSDWA<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret);
}

// Return an AGPR+VGPR operand class for the given VGPR register class.
class getLdStRegisterOperand<RegisterClass RC> {
RegisterOperand ret =
!if(!eq(RC.Size, 32), AVLdSt_32,
!if(!eq(RC.Size, 64), AVLdSt_64,
!if(!eq(RC.Size, 96), AVLdSt_96,
!if(!eq(RC.Size, 128), AVLdSt_128,
!if(!eq(RC.Size, 160), AVLdSt_160,
RegisterOperand<VReg_1> // invalid register
)))));
}

class BitOr<bit a, bit b> {
bit ret = !if(a, 1, !if(b, 1, 0));
}

class BitAnd<bit a, bit b> {
bit ret = !if(a, !if(b, 1, 0), 0);
}

def PatGenMode {
@@ -2077,8 +2146,9 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0,

field bit HasExt = getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
field bit HasExtDPP = getHasDPP<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
field bit HasExtSDWA = HasExt;
field bit HasExtSDWA9 = HasExt;
field bit HasExt64BitDPP = getHasExt64BitDPP<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
field bit HasExtSDWA = getHasSDWA<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
field bit HasExtSDWA9 = HasExtSDWA;
field int NeedPatGen = PatGenMode.NoPattern;

field bit IsMAI = 0;
@@ -2144,6 +2214,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0,
class VOP_NO_EXT <VOPProfile p> : VOPProfile <p.ArgVT> {
let HasExt = 0;
let HasExtDPP = 0;
let HasExt64BitDPP = 0;
let HasExtSDWA = 0;
let HasExtSDWA9 = 0;
}
@@ -2191,6 +2262,7 @@ def VOP_I32_F64 : VOPProfile <[i32, f64, untyped, untyped]>;
def VOP_I32_I32 : VOPProfile <[i32, i32, untyped, untyped]>;
def VOP_F16_F32 : VOPProfile <[f16, f32, untyped, untyped]>;
def VOP_F32_F16 : VOPProfile <[f32, f16, untyped, untyped]>;
def VOP_I64_I64 : VOPProfile <[i64, i64, untyped, untyped]>;

def VOP_F32_F32_F16 : VOPProfile <[f32, f32, f16, untyped]>;
def VOP_F32_F32_F32 : VOPProfile <[f32, f32, f32, untyped]>;
@@ -2234,6 +2306,16 @@ def VOP_V4I32_I32_I32_V4I32 : VOPProfile <[v4i32, i32, i32, v4i32]>;
def VOP_V16I32_I32_I32_V16I32 : VOPProfile <[v16i32, i32, i32, v16i32]>;
def VOP_V32I32_I32_I32_V32I32 : VOPProfile <[v32i32, i32, i32, v32i32]>;

def VOP_V4F64_F64_F64_V4F64 : VOPProfile <[v4f64, f64, f64, v4f64]>;
def VOP_V1F64_F64_F64_V1F64 : VOPProfile <[v1f64, f64, f64, v1f64]>;

def VOP_V2F32_V2F32_V2F32_V2F32 : VOPProfile <[v2f32, v2f32, v2f32, v2f32]>;
def VOP_V2F32_V2F32_V2F32 : VOPProfile <[v2f32, v2f32, v2f32, untyped]>;
def VOP_V2I32_V2I32_V2I32 : VOPProfile <[v2i32, v2i32, v2i32, untyped]>;
def VOP_V4F32_V4I16_V4I16_V4F32 : VOPProfile <[v4f32, v4i16, v4i16, v4f32]>;
def VOP_V16F32_V4I16_V4I16_V16F32 : VOPProfile <[v16f32, v4i16, v4i16, v16f32]>;
def VOP_V32F32_V4I16_V4I16_V32F32 : VOPProfile <[v32f32, v4i16, v4i16, v32f32]>;

class Commutable_REV <string revOp, bit isOrig> {
string RevOp = revOp;
bit IsOrig = isOrig;
@@ -2372,7 +2454,8 @@ def getMCOpcodeGen : InstrMapping {
[!cast<string>(SIEncodingFamily.GFX80)],
[!cast<string>(SIEncodingFamily.GFX9)],
[!cast<string>(SIEncodingFamily.GFX10)],
[!cast<string>(SIEncodingFamily.SDWA10)]];
[!cast<string>(SIEncodingFamily.SDWA10)],
[!cast<string>(SIEncodingFamily.GFX90A)]];
}

// Get equivalent SOPK instruction.
62 changes: 52 additions & 10 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -41,18 +41,21 @@ multiclass V_INTERP_P1_F32_m : VINTRP_m <
(i32 timm:$attrchan), (i32 timm:$attr), M0))]
>;

let OtherPredicates = [has32BankLDS] in {
let OtherPredicates = [has32BankLDS, isNotGFX90APlus] in {

defm V_INTERP_P1_F32 : V_INTERP_P1_F32_m;

} // End OtherPredicates = [has32BankLDS]
} // End OtherPredicates = [has32BankLDS, isNotGFX90APlus]

let OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $vdst", isAsmParserOnly=1 in {
let OtherPredicates = [has16BankLDS, isNotGFX90APlus],
Constraints = "@earlyclobber $vdst", isAsmParserOnly=1 in {

defm V_INTERP_P1_F32_16bank : V_INTERP_P1_F32_m;

} // End OtherPredicates = [has32BankLDS], Constraints = "@earlyclobber $vdst", isAsmParserOnly=1
} // End OtherPredicates = [has32BankLDS, isNotGFX90APlus],
// Constraints = "@earlyclobber $vdst", isAsmParserOnly=1

let OtherPredicates = [isNotGFX90APlus] in {
let DisableEncoding = "$src0", Constraints = "$src0 = $vdst" in {

defm V_INTERP_P2_F32 : VINTRP_m <
@@ -73,6 +76,8 @@ defm V_INTERP_MOV_F32 : VINTRP_m <
[(set f32:$vdst, (int_amdgcn_interp_mov (i32 timm:$vsrc),
(i32 timm:$attrchan), (i32 timm:$attr), M0))]>;

} // End OtherPredicates = [isNotGFX90APlus]

} // End Uses = [MODE, M0, EXEC]

//===----------------------------------------------------------------------===//
Expand All @@ -86,11 +91,6 @@ def ATOMIC_FENCE : SPseudoInstSI<
let maybeAtomic = 1;
}

def VOP_I64_I64_DPP : VOPProfile <[i64, i64, untyped, untyped]> {
let HasExt = 1;
let HasExtDPP = 1;
}

let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {

// For use in patterns
@@ -107,7 +107,7 @@
(ins VSrc_b64:$src0)>;

// 64-bit vector move with dpp. Expanded post-RA.
def V_MOV_B64_DPP_PSEUDO : VOP_DPP_Pseudo <"v_mov_b64_dpp", VOP_I64_I64_DPP> {
def V_MOV_B64_DPP_PSEUDO : VOP_DPP_Pseudo <"v_mov_b64_dpp", VOP_I64_I64> {
let Size = 16; // Requires two 8-byte v_mov_b32_dpp to complete.
}

@@ -1373,6 +1373,19 @@ def : GCNPat <
// sub1)
// >;

// COPY_TO_REGCLASS is needed to avoid using SCC from S_XOR_B32 instead
// of the real value.
def : GCNPat <
(fneg (v2f32 SReg_64:$src)),
(v2f32 (REG_SEQUENCE SReg_64,
(f32 (COPY_TO_REGCLASS (S_XOR_B32 (i32 (EXTRACT_SUBREG $src, sub0)),
(i32 (S_MOV_B32 (i32 0x80000000)))),
SReg_32)), sub0,
(f32 (COPY_TO_REGCLASS (S_XOR_B32 (i32 (EXTRACT_SUBREG $src, sub1)),
(i32 (S_MOV_B32 (i32 0x80000000)))),
SReg_32)), sub1))
>;

} // End let AddedComplexity = 1
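The pattern works because negating an IEEE-754 single is exactly a flip of bit 31, so an fneg of a v2f32 value held in an SGPR pair becomes two independent S_XOR_B32s with 0x80000000. A quick host-side check of that identity (illustrative only):

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>

// Flip the sign bit of a float via its bit pattern.
static float xorSignBit(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  Bits ^= 0x80000000u;
  float R;
  std::memcpy(&R, &Bits, sizeof(R));
  return R;
}

int main() {
  assert(xorSignBit(1.5f) == -1.5f);
  assert(xorSignBit(-0.0f) == 0.0f);
  return 0;
}
```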

def : GCNPat <
@@ -1437,6 +1450,15 @@ def : GCNPat <
sub1)
>;

def : GCNPat <
(getDivergentFrag<fneg>.ret (v2f32 VReg_64:$src)),
(V_PK_ADD_F32 11 /* OP_SEL_1 | NEG_LO | NEG_HI */, VReg_64:$src,
11 /* OP_SEL_1 | NEG_LO | NEG_HI */, 0,
0, 0, 0, 0, 0)
> {
let SubtargetPredicate = HasPackedFP32Ops;
}

def : GCNPat <
(fcopysign f16:$src0, f16:$src1),
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1)
@@ -1556,9 +1578,16 @@ def : GCNPat <
/********** Intrinsic Patterns **********/
/********** ================== **********/

let OtherPredicates = [isNotGFX90APlus] in
// FIXME: Should use _e64 and select source modifiers.
def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_LEGACY_F32_e32>;

let OtherPredicates = [isGFX90APlus] in
def : GCNPat <
(fpow f32:$src0, f32:$src1),
(V_EXP_F32_e32 (V_MUL_LEGACY_F32_e64 0, f32:$src1, SRCMODS.NONE, (V_LOG_F32_e32 f32:$src0), 0, 0))
>;
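The gfx90a pattern spells out the usual identity pow(x, y) = exp2(y * log2(x)), with the legacy multiply chosen so that 0 times anything still yields 0. A host-side sanity check of the identity itself, ignoring that legacy-multiply corner case (illustrative only):

```cpp
#include <cassert>
#include <cmath>

static float powViaExpLog(float X, float Y) {
  return std::exp2(Y * std::log2(X));
}

int main() {
  assert(std::fabs(powViaExpLog(2.0f, 10.0f) - 1024.0f) < 1e-3f);
  assert(std::fabs(powViaExpLog(3.0f, 2.0f) - 9.0f) < 1e-3f);
  return 0;
}
```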

def : GCNPat <
(i32 (sext i1:$src0)),
(V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
@@ -2167,6 +2196,17 @@ def : GCNPat <
SRCMODS.NONE, $src2)
>;

let SubtargetPredicate = isGFX90APlus in
def : GCNPat <
(fma (f64 (VOP3Mods0 f64:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
(f64 (VOP3Mods f64:$src1, i32:$src1_modifiers)),
(f64 (VOP3NoMods f64:$src2))),
(V_FMAC_F64_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
SRCMODS.NONE, $src2, $clamp, $omod)
>;

// The COPY works around a TableGen bug with multiple outputs: S_LSHL_B32
// has an extra output from its implicit scc def.
def : GCNPat <
(v2i16 (build_vector (i16 0), (i16 SReg_32:$src1))),
(S_LSHL_B32 SReg_32:$src1, (i16 16))
@@ -2652,6 +2692,8 @@ def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_DEC : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_FADD : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_FMIN : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_FMAX : BufferAtomicGenericInstruction;

def G_AMDGPU_BUFFER_ATOMIC_CMPSWAP : AMDGPUGenericInstruction {
let OutOperandList = (outs type0:$dst);
90 changes: 76 additions & 14 deletions llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -107,6 +107,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
bool GLC;
bool SLC;
bool DLC;
bool SCCB; // vmem only.
bool UseST64;
int AddrIdx[MaxAddressRegs];
const MachineOperand *AddrReg[MaxAddressRegs];
@@ -199,6 +200,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
const CombineInfo &Paired);
const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
const CombineInfo &Paired);
const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;

bool checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired,
SmallVectorImpl<MachineInstr *> &InstsToMove);
@@ -304,6 +306,16 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
return 2;
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
return 4;
case AMDGPU::DS_READ_B32: LLVM_FALLTHROUGH;
case AMDGPU::DS_READ_B32_gfx9: LLVM_FALLTHROUGH;
case AMDGPU::DS_WRITE_B32: LLVM_FALLTHROUGH;
case AMDGPU::DS_WRITE_B32_gfx9:
return 1;
case AMDGPU::DS_READ_B64: LLVM_FALLTHROUGH;
case AMDGPU::DS_READ_B64_gfx9: LLVM_FALLTHROUGH;
case AMDGPU::DS_WRITE_B64: LLVM_FALLTHROUGH;
case AMDGPU::DS_WRITE_B64_gfx9:
return 2;
default:
return 0;
}
@@ -526,6 +538,9 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
SLC = TII.getNamedOperand(*I, AMDGPU::OpName::slc)->getImm();
}
DLC = TII.getNamedOperand(*I, AMDGPU::OpName::dlc)->getImm();
if (InstClass != S_BUFFER_LOAD_IMM) {
SCCB = TII.getNamedOperand(*I, AMDGPU::OpName::sccb)->getImm();
}
}

AddressRegs Regs = getRegs(Opc, TII);
@@ -784,7 +799,8 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
return (EltOffset0 + CI.Width == EltOffset1 ||
EltOffset1 + Paired.Width == EltOffset0) &&
CI.GLC == Paired.GLC && CI.DLC == Paired.DLC &&
(CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC == Paired.SLC);
(CI.InstClass == S_BUFFER_LOAD_IMM ||
(CI.SLC == Paired.SLC && CI.SCCB == Paired.SCCB));
}

// If the offset in elements doesn't fit in 8-bits, we might be able to use
@@ -864,6 +880,26 @@ bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
}
}

const TargetRegisterClass *
SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
return TRI->getRegClassForReg(*MRI, Dst->getReg());
}
if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
return TRI->getRegClassForReg(*MRI, Src->getReg());
}
if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
return TRI->getRegClassForReg(*MRI, Src->getReg());
}
if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
return TRI->getRegClassForReg(*MRI, Dst->getReg());
}
if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
return TRI->getRegClassForReg(*MRI, Src->getReg());
}
return nullptr;
}

/// This function assumes that CI comes before Paired in a basic block.
bool SILoadStoreOptimizer::checkAndPrepareMerge(
CombineInfo &CI, CombineInfo &Paired,
@@ -896,6 +932,9 @@ bool SILoadStoreOptimizer::checkAndPrepareMerge(
DenseSet<Register> PhysRegUsesToMove;
addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);

const TargetRegisterClass *DataRC = getDataRegClass(*CI.I);
bool IsAGPR = TRI->hasAGPRs(DataRC);

MachineBasicBlock::iterator E = std::next(Paired.I);
MachineBasicBlock::iterator MBBI = std::next(CI.I);
MachineBasicBlock::iterator MBBE = CI.I->getParent()->end();
@@ -964,6 +1003,17 @@ bool SILoadStoreOptimizer::checkAndPrepareMerge(
continue;

if (&*MBBI == &*Paired.I) {
if (TRI->hasAGPRs(getDataRegClass(*MBBI)) != IsAGPR)
return false;
// FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
// operands. However, we report that ds_write2 shall have only VGPR
// data so that machine copy propagation does not create an illegal
// instruction with VGPR and AGPR sources. Consequently, if we created
// such an instruction the verifier would complain.
if (IsAGPR && CI.InstClass == DS_WRITE)
return false;

// We need to go through the list of instructions that we plan to
// move and make sure they are all safe to move down past the merged
// instruction.
@@ -1037,8 +1087,7 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,

const MCInstrDesc &Read2Desc = TII->get(Opc);

const TargetRegisterClass *SuperRC =
(CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
Register DestReg = MRI->createVirtualRegister(SuperRC);

DebugLoc DL = CI.I->getDebugLoc();
@@ -1317,6 +1366,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
.addImm(0) // tfe
.addImm(CI.DLC) // dlc
.addImm(0) // swz
.addImm(CI.SCCB) // scc
.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
@@ -1384,6 +1434,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
.addImm(0) // tfe
.addImm(CI.DLC) // dlc
.addImm(0) // swz
.addImm(CI.SCCB) // scc
.addMemOperand(
combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

@@ -1464,6 +1515,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
.addImm(0) // tfe
.addImm(CI.DLC) // dlc
.addImm(0) // swz
.addImm(CI.SCCB) // scc
.addMemOperand(
combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

@@ -1559,18 +1611,27 @@
case 16:
return &AMDGPU::SGPR_512RegClass;
}
} else {
switch (CI.Width + Paired.Width) {
default:
return nullptr;
case 2:
return &AMDGPU::VReg_64RegClass;
case 3:
return &AMDGPU::VReg_96RegClass;
case 4:
return &AMDGPU::VReg_128RegClass;
}
}
const TargetRegisterClass *RC = nullptr;

switch (CI.Width + Paired.Width) {
default:
return nullptr;
case 2:
RC = &AMDGPU::VReg_64RegClass;
break;
case 3:
RC = &AMDGPU::VReg_96RegClass;
break;
case 4:
RC = &AMDGPU::VReg_128RegClass;
break;
}

if (TRI->hasAGPRs(getDataRegClass(*CI.I)))
return TRI->getEquivalentAGPRClass(RC);

return RC;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
@@ -1624,6 +1685,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
.addImm(0) // tfe
.addImm(CI.DLC) // dlc
.addImm(0) // swz
.addImm(CI.SCCB) // scc
.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

moveInstsAfter(MIB, InstsToMove);
409 changes: 409 additions & 0 deletions llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/SIProgramInfo.h
@@ -41,10 +41,13 @@ struct SIProgramInfo {
uint32_t ScratchBlocks = 0;

uint64_t ComputePGMRSrc2 = 0;
uint64_t ComputePGMRSrc3GFX90A = 0;

uint32_t NumVGPR = 0;
uint32_t NumArchVGPR = 0;
uint32_t NumAccVGPR = 0;
uint32_t AccumOffset = 0;
uint32_t TgSplit = 0;
uint32_t NumSGPR = 0;
uint32_t LDSSize = 0;
bool FlatUsed = false;
41 changes: 38 additions & 3 deletions llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -122,7 +122,9 @@ const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
case CallingConv::Fast:
case CallingConv::Cold:
case CallingConv::AMDGPU_Gfx:
return CSR_AMDGPU_HighRegs_SaveList;
return MF->getSubtarget<GCNSubtarget>().hasGFX90AInsts()
? CSR_AMDGPU_HighRegs_With_AGPRs_SaveList
: CSR_AMDGPU_HighRegs_SaveList;
default: {
// Dummy to not crash RegisterClassInfo.
static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
Expand All @@ -143,7 +145,9 @@ const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
case CallingConv::Fast:
case CallingConv::Cold:
case CallingConv::AMDGPU_Gfx:
return CSR_AMDGPU_HighRegs_RegMask;
return MF.getSubtarget<GCNSubtarget>().hasGFX90AInsts()
? CSR_AMDGPU_HighRegs_With_AGPRs_RegMask
: CSR_AMDGPU_HighRegs_RegMask;
default:
return nullptr;
}
Expand Down Expand Up @@ -181,6 +185,14 @@ const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const {
return CSR_AMDGPU_AllVGPRs_RegMask;
}

const uint32_t *SIRegisterInfo::getAllAGPRRegMask() const {
return CSR_AMDGPU_AllAGPRs_RegMask;
}

const uint32_t *SIRegisterInfo::getAllVectorRegMask() const {
return CSR_AMDGPU_AllVectorRegs_RegMask;
}

const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const {
return CSR_AMDGPU_AllAllocatableSRegs_RegMask;
}
Expand Down Expand Up @@ -263,6 +275,12 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
}

unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF);
// TODO: In an entry function that makes no calls and uses no AGPRs, it is
// possible to use the whole register budget for VGPRs. It should even be
// possible to estimate the maximum AGPR/VGPR pressure and split the
// register file accordingly.
if (ST.hasGFX90AInsts())
MaxNumVGPRs /= 2;
unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) {
unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
Expand Down Expand Up @@ -327,6 +345,13 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
reserveRegisterTuples(Reserved, Reg);
}

if (ST.hasGFX90AInsts())
for (const TargetRegisterClass *RC : this->regclasses())
if (getRegSizeInBits(*RC) > 32 && hasVectorRegisters(RC))
for (unsigned Reg : *RC)
if (getEncodingValue(Reg) & 1)
Reserved.set(Reg);

// FIXME: Stop using reserved registers for this.
for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
reserveRegisterTuples(Reserved, Reg);
Expand Down Expand Up @@ -730,6 +755,7 @@ static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST,
.addImm(0) // tfe
.addImm(0) // dlc
.addImm(0) // swz
.addImm(0) // scc
.cloneMemRefs(*MI);

const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
Expand Down Expand Up @@ -798,7 +824,8 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
MCRegister SOffset = ScratchOffsetReg;

const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
const bool IsAGPR = hasAGPRs(RC);
// On gfx90a+, an AGPR is a regular VGPR acceptable for loads and stores.
const bool IsAGPR = !ST.hasGFX90AInsts() && hasAGPRs(RC);
const unsigned RegWidth = AMDGPU::getRegBitWidth(RC->getID()) / 8;

// Always use 4 byte operations for AGPRs because we need to scavenge
Expand Down Expand Up @@ -996,6 +1023,7 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
if (!IsFlat)
MIB.addImm(0) // dlc
.addImm(0); // swz
MIB.addImm(0); // scc
MIB.addMemOperand(NewMMO);

if (!IsAGPR && NeedSuperRegDef)
Expand Down Expand Up @@ -2055,6 +2083,13 @@ bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
unsigned DstSize = getRegSizeInBits(*DstRC);
unsigned NewSize = getRegSizeInBits(*NewRC);

// Do not allow coalescing between odd and even lanes, as it would
// result in misaligned tuple accesses.
if (ST.hasGFX90AInsts() && !isSGPRClass(NewRC) &&
(getChannelFromSubReg(DstSubReg) & 1) !=
(getChannelFromSubReg(SubReg) & 1))
return false;

// Do not increase size of registers beyond dword, we would need to allocate
// adjacent registers and constraint regalloc more than needed.

Expand Down
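
An illustrative aside, not part of the patch: both gfx90a hunks above enforce the same parity rule — wide VGPR/AGPR tuples must start on an even lane, so odd-encoded registers of wide vector classes are reserved and the coalescer refuses to mix odd and even sub-register channels. A minimal sketch of that check, with plain integers standing in for getChannelFromSubReg() results:

#include <cassert>

// DstChannel/SrcChannel stand for getChannelFromSubReg(DstSubReg) and
// getChannelFromSubReg(SubReg): the dword index of a sub-register within
// its tuple.
bool canCoalesceChannels(bool IsGFX90A, bool IsSGPR, unsigned DstChannel,
                         unsigned SrcChannel) {
  if (!IsGFX90A || IsSGPR)
    return true; // the restriction only applies to gfx90a vector registers
  // Mixing odd and even channels would leave the merged tuple misaligned.
  return (DstChannel & 1) == (SrcChannel & 1);
}

int main() {
  assert(canCoalesceChannels(true, false, 0, 2));  // even -> even: allowed
  assert(!canCoalesceChannels(true, false, 0, 1)); // even -> odd: rejected
  assert(canCoalesceChannels(false, false, 0, 1)); // pre-gfx90a: unrestricted
  return 0;
}
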
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/SIRegisterInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,8 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
LiveIntervals *LIS) const;

const uint32_t *getAllVGPRRegMask() const;
const uint32_t *getAllAGPRRegMask() const;
const uint32_t *getAllVectorRegMask() const;
const uint32_t *getAllAllocatableSRegMask() const;

// \returns number of 32 bit registers covered by a \p LM
Expand Down
91 changes: 84 additions & 7 deletions llvm/lib/Target/AMDGPU/SIRegisterInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -847,21 +847,36 @@ def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
let isAllocatable = 0;
}

def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 32, (add VReg_64, SReg_64)> {
def VS_64 : RegisterClass<"AMDGPU", [i64, f64, v2f32], 32, (add VReg_64, SReg_64)> {
let isAllocatable = 0;
}

def AV_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
def AV_32 : RegisterClass<"AMDGPU", VGPR_32.RegTypes, 32,
(add AGPR_32, VGPR_32)> {
let isAllocatable = 0;
}

def AV_64 : RegisterClass<"AMDGPU", [i64, f64, v4f16], 32,
def AV_64 : RegisterClass<"AMDGPU", VReg_64.RegTypes, 32,
(add AReg_64, VReg_64)> {
let isAllocatable = 0;
}
} // End GeneratePressureSet = 0

def AV_96 : RegisterClass<"AMDGPU", VReg_96.RegTypes, 32,
(add AReg_96, VReg_96)> {
let isAllocatable = 0;
}

def AV_128 : RegisterClass<"AMDGPU", VReg_128.RegTypes, 32,
(add AReg_128, VReg_128)> {
let isAllocatable = 0;
}

def AV_160 : RegisterClass<"AMDGPU", VReg_160.RegTypes, 32,
(add AReg_160, VReg_160)> {
let isAllocatable = 0;
}

//===----------------------------------------------------------------------===//
// Register operands
//===----------------------------------------------------------------------===//
Expand Down Expand Up @@ -912,21 +927,38 @@ multiclass SIRegOperand32 <string rc, string MatchName, string opType,
}
}

multiclass SIRegOperand <string rc, string MatchName, string opType> :
SIRegOperand32<rc, MatchName, opType> {
multiclass SIRegOperand64 <string rc, string MatchName, string opType,
string rc_suffix = "_64", bit Vectors = 1> {
let OperandNamespace = "AMDGPU" in {
def _b64 : RegisterOperand<!cast<RegisterClass>(rc#"_64")> {
def _b64 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> {
let OperandType = opType#"_INT64";
let ParserMatchClass = RegImmMatcher<MatchName#"B64">;
}

def _f64 : RegisterOperand<!cast<RegisterClass>(rc#"_64")> {
def _f64 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> {
let OperandType = opType#"_FP64";
let ParserMatchClass = RegImmMatcher<MatchName#"F64">;
}

foreach _ = BoolToList<Vectors>.ret in
def _v2f32 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> {
let OperandType = opType#"_V2FP32";
let ParserMatchClass = RegImmMatcher<MatchName#"V2FP32">;
let DecoderMethod = "decodeOperand_VSrcV232";
}
foreach _ = BoolToList<Vectors>.ret in
def _v2b32 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> {
let OperandType = opType#"_V2INT32";
let ParserMatchClass = RegImmMatcher<MatchName#"V2INT32">;
let DecoderMethod = "decodeOperand_VSrcV232";
}
}
}

multiclass SIRegOperand <string rc, string MatchName, string opType> :
SIRegOperand32<rc, MatchName, opType>,
SIRegOperand64<rc, MatchName, opType>;

// FIXME: 64-bit sources can sometimes use 32-bit constants.
multiclass RegImmOperand <string rc, string MatchName>
: SIRegOperand<rc, MatchName, "OPERAND_REG_IMM">;
Expand All @@ -938,10 +970,18 @@ multiclass RegInlineOperand32 <string rc, string MatchName,
string rc_suffix = "_32">
: SIRegOperand32<rc, MatchName, "OPERAND_REG_INLINE_C", rc_suffix>;

multiclass RegInlineOperand64 <string rc, string MatchName,
string rc_suffix = "_64">
: SIRegOperand64<rc, MatchName, "OPERAND_REG_INLINE_C", rc_suffix>;

multiclass RegInlineOperandAC <string rc, string MatchName,
string rc_suffix = "_32">
: SIRegOperand32<rc, MatchName, "OPERAND_REG_INLINE_AC", rc_suffix>;

multiclass RegInlineOperandAC64 <string rc, string MatchName,
string rc_suffix = "_64">
: SIRegOperand64<rc, MatchName, "OPERAND_REG_INLINE_AC", rc_suffix, 0>;

//===----------------------------------------------------------------------===//
// SSrc_* Operands with an SGPR or a 32-bit immediate
//===----------------------------------------------------------------------===//
Expand Down Expand Up @@ -1001,6 +1041,13 @@ defm VCSrc : RegInlineOperand<"VS", "VCSrc">;
//===----------------------------------------------------------------------===//

defm VISrc : RegInlineOperand32<"VGPR", "VISrc">;
let DecoderMethod = "decodeOperand_VReg_64" in
defm VISrc_64 : RegInlineOperand64<"VReg", "VISrc_64", "_64">;
defm VISrc_128 : RegInlineOperandAC<"VReg", "VISrc_128", "_128">;
let DecoderMethod = "decodeOperand_VReg_256" in
defm VISrc_256 : RegInlineOperand64<"VReg", "VISrc_256", "_256">;
defm VISrc_512 : RegInlineOperandAC<"VReg", "VISrc_512", "_512">;
defm VISrc_1024 : RegInlineOperandAC<"VReg", "VISrc_1024", "_1024">;

//===----------------------------------------------------------------------===//
// AVSrc_* Operands with an AGPR or VGPR
Expand All @@ -1016,6 +1063,31 @@ def AVSrc_64 : RegisterOperand<AV_64> {
let EncoderMethod = "getAVOperandEncoding";
}

def AVLdSt_32 : RegisterOperand<AV_32> {
let DecoderMethod = "DecodeAVLdSt_32RegisterClass";
let EncoderMethod = "getAVOperandEncoding";
}

def AVLdSt_64 : RegisterOperand<AV_64> {
let DecoderMethod = "DecodeAVLdSt_64RegisterClass";
let EncoderMethod = "getAVOperandEncoding";
}

def AVLdSt_96 : RegisterOperand<AV_96> {
let DecoderMethod = "DecodeAVLdSt_96RegisterClass";
let EncoderMethod = "getAVOperandEncoding";
}

def AVLdSt_128 : RegisterOperand<AV_128> {
let DecoderMethod = "DecodeAVLdSt_128RegisterClass";
let EncoderMethod = "getAVOperandEncoding";
}

def AVLdSt_160 : RegisterOperand<AV_160> {
let DecoderMethod = "DecodeAVLdSt_160RegisterClass";
let EncoderMethod = "getAVOperandEncoding";
}

//===----------------------------------------------------------------------===//
// ACSrc_* Operands with an AGPR or an inline constant
//===----------------------------------------------------------------------===//
Expand All @@ -1024,3 +1096,8 @@ defm AISrc : RegInlineOperandAC<"AGPR", "AISrc">;
defm AISrc_128 : RegInlineOperandAC<"AReg", "AISrc_128", "_128">;
defm AISrc_512 : RegInlineOperandAC<"AReg", "AISrc_512", "_512">;
defm AISrc_1024 : RegInlineOperandAC<"AReg", "AISrc_1024", "_1024">;

let DecoderMethod = "decodeOperand_AReg_64" in
defm AISrc_64 : RegInlineOperandAC64<"AReg", "AISrc_64", "_64">;
let DecoderMethod = "decodeOperand_AReg_256" in
defm AISrc_256 : RegInlineOperandAC64<"AReg", "AISrc_256", "_256">;
66 changes: 50 additions & 16 deletions llvm/lib/Target/AMDGPU/SISchedule.td
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,15 @@ def WriteTrans64 : SchedWrite;
// Half rate 64-bit instructions.
def Write64Bit : SchedWrite;

// Integer multiplications.
def WriteIntMul : SchedWrite;

// mAI multipass instructions.
def Write2PassMAI : SchedWrite;
def Write8PassMAI : SchedWrite;
def Write16PassMAI : SchedWrite;
def Write4PassDGEMM : SchedWrite;
def Write8PassDGEMM : SchedWrite;

// FIXME: Should there be a class for instructions which are VALU
// instructions and have VALU rates, but write to the SALU (i.e. VOPC
Expand All @@ -80,6 +85,7 @@ class SISchedMachineModel : SchedMachineModel {

def SIFullSpeedModel : SISchedMachineModel;
def SIQuarterSpeedModel : SISchedMachineModel;
def SIDPFullSpeedModel : SISchedMachineModel;
def GFX10SpeedModel : SISchedMachineModel;

// XXX: Are the resource counts correct?
Expand Down Expand Up @@ -137,11 +143,13 @@ multiclass SICommonWriteRes {
def : HWWriteRes<WriteBarrier, [HWBranch], 500>; // XXX: Guessed ???

def : HWVALUWriteRes<Write32Bit, 1>;
def : HWVALUWriteRes<Write64Bit, 2>;
def : HWVALUWriteRes<WriteFloatCvt, 4>;
def : HWVALUWriteRes<WriteTrans32, 4>;
def : HWVALUWriteRes<WriteQuarterRate32, 4>;

def : HWVALUWriteRes<Write4PassDGEMM, 4>;
def : HWVALUWriteRes<Write8PassDGEMM, 16>;

let ResourceCycles = [2] in
def : HWWriteRes<Write2PassMAI, [HWXDL], 2>;
let ResourceCycles = [8] in
Expand All @@ -150,7 +158,6 @@ multiclass SICommonWriteRes {
def : HWWriteRes<Write16PassMAI, [HWXDL], 16>;

def : ReadAdvance<MIVGPRRead, -2>;
def : InstRW<[Write64Bit, MIReadVGPR], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>;

// Technically mfma reads can be from 0 to 4 cycles but that does not make
// sense to model because its register setup is huge. In particular if we
Expand All @@ -159,10 +166,6 @@ multiclass SICommonWriteRes {
// need to consume 2 or 4 more vgprs to be initialized before the acc
// write sequence. Just assume worst case here.
def : ReadAdvance<MIMFMARead, -4>;

def : InstRW<[Write2PassMAI, MIMFMARead], (instregex "^V_MFMA_..._4X4X")>;
def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_..._16X16X")>;
def : InstRW<[Write16PassMAI, MIMFMARead], (instregex "^V_MFMA_..._32X32X")>;
}

def PredIsVGPR32Copy : SchedPredicate<[{TII->isVGPRCopy(*MI) && TII->getOpSize(*MI, 0) <= 32}]>;
Expand All @@ -176,11 +179,13 @@ let SchedModel = SIFullSpeedModel in {

defm : SICommonWriteRes;

def : HWVALUWriteRes<WriteFloatFMA, 1>;
def : HWVALUWriteRes<WriteDouble, 4>;
def : HWVALUWriteRes<WriteDoubleAdd, 2>;
def : HWVALUWriteRes<WriteDoubleCvt, 4>;
def : HWVALUWriteRes<WriteTrans64, 4>;
def : HWVALUWriteRes<Write64Bit, 2>;
def : HWVALUWriteRes<WriteIntMul, 4>;
def : HWVALUWriteRes<WriteFloatFMA, 1>;
def : HWVALUWriteRes<WriteDouble, 4>;
def : HWVALUWriteRes<WriteDoubleAdd, 2>;
def : HWVALUWriteRes<WriteDoubleCvt, 4>;
def : HWVALUWriteRes<WriteTrans64, 4>;

def : InstRW<[WriteCopy], (instrs COPY)>;

Expand All @@ -190,16 +195,44 @@ let SchedModel = SIQuarterSpeedModel in {

defm : SICommonWriteRes;

def : HWVALUWriteRes<WriteFloatFMA, 16>;
def : HWVALUWriteRes<WriteDouble, 16>;
def : HWVALUWriteRes<WriteDoubleAdd, 8>;
def : HWVALUWriteRes<WriteDoubleCvt, 4>;
def : HWVALUWriteRes<WriteTrans64, 16>;
def : HWVALUWriteRes<Write64Bit, 2>;
def : HWVALUWriteRes<WriteIntMul, 4>;
def : HWVALUWriteRes<WriteFloatFMA, 16>;
def : HWVALUWriteRes<WriteDouble, 16>;
def : HWVALUWriteRes<WriteDoubleAdd, 8>;
def : HWVALUWriteRes<WriteDoubleCvt, 4>;
def : HWVALUWriteRes<WriteTrans64, 16>;

def : InstRW<[WriteCopy], (instrs COPY)>;
def : InstRW<[Write64Bit, MIReadVGPR], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>;
def : InstRW<[Write2PassMAI, MIMFMARead], (instregex "^V_MFMA_..._4X4X")>;
def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_..._16X16X")>;
def : InstRW<[Write16PassMAI, MIMFMARead], (instregex "^V_MFMA_..._32X32X")>;

} // End SchedModel = SIQuarterSpeedModel

let SchedModel = SIDPFullSpeedModel in {

defm : SICommonWriteRes;

def : HWVALUWriteRes<WriteFloatFMA, 1>;
def : HWVALUWriteRes<WriteDouble, 1>;
def : HWVALUWriteRes<WriteDoubleAdd, 1>;
def : HWVALUWriteRes<WriteDoubleCvt, 1>;
def : HWVALUWriteRes<WriteTrans64, 4>;
def : HWVALUWriteRes<WriteIntMul, 1>;
def : HWVALUWriteRes<Write64Bit, 1>;

def : InstRW<[WriteCopy], (instrs COPY)>;
def : InstRW<[Write64Bit], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>;
def : InstRW<[Write2PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_4X4X")>;
def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X")>;
def : InstRW<[Write16PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X")>;
def : InstRW<[Write4PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_4X4X")>;
def : InstRW<[Write8PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_16X16X")>;

} // End SchedModel = SIDPFullSpeedModel

let SchedModel = GFX10SpeedModel in {

// The latency values are 1 / (operations / cycle).
Expand All @@ -213,6 +246,7 @@ def : HWWriteRes<WriteFloatFMA, [HWVALU, HWRC], 5>;
def : HWWriteRes<WriteDouble, [HWVALU, HWRC], 22>;
def : HWWriteRes<WriteDoubleAdd, [HWVALU, HWRC], 22>;
def : HWWriteRes<WriteDoubleCvt, [HWVALU, HWRC], 22>;
def : HWWriteRes<WriteIntMul, [HWVALU, HWRC], 8>;
def : HWWriteRes<WriteTrans64, [HWVALU, HWRC], 24>;

def : HWWriteRes<WriteBranch, [HWBranch], 32>;
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -407,6 +407,9 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
char Flags = 0;

if (TII->isWQM(Opcode)) {
// If LOD is not supported, WQM is not needed.
if (!ST->hasExtendedImageInsts())
continue;
// Sampling instructions don't need to produce results for all pixels
// in a quad, they just require all inputs of a quad to have been
// computed for derivatives.
Expand Down
28 changes: 27 additions & 1 deletion llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -345,7 +345,7 @@ void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream) {
<< "gfx"
<< Version.Major
<< Version.Minor
<< Version.Stepping;
<< hexdigit(Version.Stepping, true);

if (hasXNACK(*STI))
Stream << "+xnack";
Expand Down Expand Up @@ -402,6 +402,8 @@ unsigned getMinWavesPerEU(const MCSubtargetInfo *STI) {

unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI) {
// FIXME: Need to take scratch memory into account.
if (isGFX90A(*STI))
return 8;
if (!isGFX10Plus(*STI))
return 10;
return hasGFX10_3Insts(*STI) ? 16 : 20;
Expand Down Expand Up @@ -531,6 +533,9 @@ unsigned getNumSGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs) {

unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI,
Optional<bool> EnableWavefrontSize32) {
if (STI->getFeatureBits().test(FeatureGFX90AInsts))
return 8;

bool IsWave32 = EnableWavefrontSize32 ?
*EnableWavefrontSize32 :
STI->getFeatureBits().test(FeatureWavefrontSize32);
Expand All @@ -543,6 +548,8 @@ unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI,

unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI,
Optional<bool> EnableWavefrontSize32) {
if (STI->getFeatureBits().test(FeatureGFX90AInsts))
return 8;

bool IsWave32 = EnableWavefrontSize32 ?
*EnableWavefrontSize32 :
Expand All @@ -552,12 +559,16 @@ unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI,
}

unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) {
if (STI->getFeatureBits().test(FeatureGFX90AInsts))
return 512;
if (!isGFX10Plus(*STI))
return 256;
return STI->getFeatureBits().test(FeatureWavefrontSize32) ? 1024 : 512;
}

unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) {
if (STI->getFeatureBits().test(FeatureGFX90AInsts))
return 512;
return 256;
}

Expand Down Expand Up @@ -653,6 +664,11 @@ amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor(
AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
amdhsa::COMPUTE_PGM_RSRC1_MEM_ORDERED, 1);
}
if (AMDGPU::isGFX90A(*STI)) {
AMDHSA_BITS_SET(KD.compute_pgm_rsrc3,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
STI->getFeatureBits().test(FeatureTgSplit) ? 1 : 0);
}
return KD;
}

Expand Down Expand Up @@ -1267,6 +1283,10 @@ bool hasGFX10_3Insts(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureGFX10_3Insts];
}

bool isGFX90A(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts];
}

bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) {
const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID);
const unsigned FirstSubReg = TRI->getSubReg(Reg, AMDGPU::sub0);
Expand Down Expand Up @@ -1374,6 +1394,9 @@ bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) {
case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
case AMDGPU::OPERAND_REG_IMM_V2FP32:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
return true;
default:
return false;
Expand Down Expand Up @@ -1418,16 +1441,19 @@ unsigned getRegBitWidth(unsigned RCID) {
case AMDGPU::SReg_96RegClassID:
case AMDGPU::VReg_96RegClassID:
case AMDGPU::AReg_96RegClassID:
case AMDGPU::AV_96RegClassID:
return 96;
case AMDGPU::SGPR_128RegClassID:
case AMDGPU::SReg_128RegClassID:
case AMDGPU::VReg_128RegClassID:
case AMDGPU::AReg_128RegClassID:
case AMDGPU::AV_128RegClassID:
return 128;
case AMDGPU::SGPR_160RegClassID:
case AMDGPU::SReg_160RegClassID:
case AMDGPU::VReg_160RegClassID:
case AMDGPU::AReg_160RegClassID:
case AMDGPU::AV_160RegClassID:
return 160;
case AMDGPU::SGPR_192RegClassID:
case AMDGPU::SReg_192RegClassID:
Expand Down
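
An illustrative aside, not part of the patch: the hunks above give gfx90a a VGPR allocation/encoding granule of 8, a register file of 512 addressable VGPRs (arch plus acc), and at most 8 waves per EU. The sketch below folds those constants into a rough occupancy bound; the rounding-to-granule formula is an assumption for illustration, not the exact computation the backend performs.

#include <algorithm>
#include <cstdio>

struct GFX90ALimits {
  unsigned VGPRAllocGranule = 8; // getVGPRAllocGranule()
  unsigned TotalVGPRs = 512;     // getTotalNumVGPRs() / getAddressableNumVGPRs()
  unsigned MaxWavesPerEU = 8;    // getMaxWavesPerEU()
};

unsigned wavesLimitedByVGPRs(unsigned NumVGPRs, const GFX90ALimits &L) {
  // Round the per-wave request up to the allocation granule.
  unsigned Granulated =
      ((NumVGPRs + L.VGPRAllocGranule - 1) / L.VGPRAllocGranule) *
      L.VGPRAllocGranule;
  unsigned Waves = L.TotalVGPRs / std::max(1u, Granulated);
  return std::min(Waves, L.MaxWavesPerEU);
}

int main() {
  GFX90ALimits L;
  std::printf("128 VGPRs/wave -> at most %u waves per EU\n",
              wavesLimitedByVGPRs(128, L));
}
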
11 changes: 11 additions & 0 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -703,6 +703,7 @@ bool isGFX10Plus(const MCSubtargetInfo &STI);
bool isGCN3Encoding(const MCSubtargetInfo &STI);
bool isGFX10_BEncoding(const MCSubtargetInfo &STI);
bool hasGFX10_3Insts(const MCSubtargetInfo &STI);
bool isGFX90A(const MCSubtargetInfo &STI);

/// Is Reg - scalar register
bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI);
Expand Down Expand Up @@ -746,12 +747,17 @@ inline unsigned getOperandSize(const MCOperandInfo &OpInfo) {
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
case AMDGPU::OPERAND_REG_IMM_V2INT32:
case AMDGPU::OPERAND_REG_IMM_V2FP32:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
return 4;

case AMDGPU::OPERAND_REG_IMM_INT64:
case AMDGPU::OPERAND_REG_IMM_FP64:
case AMDGPU::OPERAND_REG_INLINE_C_INT64:
case AMDGPU::OPERAND_REG_INLINE_C_FP64:
case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
return 8;

case AMDGPU::OPERAND_REG_IMM_INT16:
Expand Down Expand Up @@ -847,6 +853,11 @@ bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
const GCNSubtarget *Subtarget,
Align Alignment = Align(4));

LLVM_READNONE
inline bool isLegal64BitDPPControl(unsigned DC) {
return DC >= DPP::ROW_NEWBCAST_FIRST && DC <= DPP::ROW_NEWBCAST_LAST;
}

/// \returns true if the intrinsic is divergent
bool isIntrinsicSourceOfDivergence(unsigned IntrID);

Expand Down
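
An illustrative aside, not part of the patch: isLegal64BitDPPControl() added above lets callers reject dpp_ctrl values that 64-bit DPP cannot encode. A minimal usage sketch follows; the two constants are stand-ins mirroring the DPP::ROW_NEWBCAST_* range and should be treated as assumptions rather than a copy of the real enum.

#include <cstdio>

namespace DPP {
constexpr unsigned ROW_NEWBCAST_FIRST = 0x150; // assumed encoding range
constexpr unsigned ROW_NEWBCAST_LAST = 0x15F;
} // namespace DPP

bool isLegal64BitDPPControl(unsigned DC) {
  return DC >= DPP::ROW_NEWBCAST_FIRST && DC <= DPP::ROW_NEWBCAST_LAST;
}

int main() {
  unsigned Ctrl = 0x152; // e.g. a row_newbcast control immediate
  if (isLegal64BitDPPControl(Ctrl))
    std::printf("ctrl 0x%x may be used with a 64-bit DPP op\n", Ctrl);
  else
    std::printf("ctrl 0x%x is not legal for 64-bit DPP\n", Ctrl);
  return 0;
}
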
18 changes: 16 additions & 2 deletions llvm/lib/Target/AMDGPU/VOP1Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -351,12 +351,12 @@ let SubtargetPredicate = isGFX6GFX7 in {
VOP1Inst<"v_rsq_legacy_f32", VOP_F32_F32, int_amdgcn_rsq_legacy>;
} // End TRANS = 1, SchedRW = [WriteTrans32]

let SchedRW = [WriteDouble] in {
let SchedRW = [WriteTrans64] in {
defm V_RCP_CLAMP_F64 :
VOP1Inst<"v_rcp_clamp_f64", VOP_F64_F64>;
defm V_RSQ_CLAMP_F64 :
VOP1Inst<"v_rsq_clamp_f64", VOP_F64_F64, AMDGPUrsq_clamp>;
} // End SchedRW = [WriteDouble]
} // End SchedRW = [WriteTrans64]
} // End SubtargetPredicate = isGFX6GFX7

let SubtargetPredicate = isGFX7GFX8GFX9 in {
Expand Down Expand Up @@ -461,6 +461,18 @@ let SubtargetPredicate = isGFX10Plus in {
} // End Uses = [M0]
} // End SubtargetPredicate = isGFX10Plus

def VOPProfileAccMov : VOP_NO_EXT<VOP_I32_I32> {
let DstRC = RegisterOperand<AGPR_32>;
let Src0RC32 = RegisterOperand<AGPR_32>;
let Asm32 = " $vdst, $src0";
}

def V_ACCVGPR_MOV_B32 : VOP1_Pseudo<"v_accvgpr_mov_b32", VOPProfileAccMov, [], 1> {
let SubtargetPredicate = isGFX90APlus;
let isReMaterializable = 1;
let isAsCheapAsAMove = 1;
}

//===----------------------------------------------------------------------===//
// Target-specific instruction encodings.
//===----------------------------------------------------------------------===//
Expand Down Expand Up @@ -823,6 +835,8 @@ defm V_SAT_PK_U8_I16 : VOP1_Real_vi<0x4f>;
defm V_CVT_NORM_I16_F16 : VOP1_Real_vi<0x4d>;
defm V_CVT_NORM_U16_F16 : VOP1_Real_vi<0x4e>;

defm V_ACCVGPR_MOV_B32 : VOP1Only_Real_vi<0x52>;

// Copy of v_mov_b32 with $vdst as a use operand for use with VGPR
// indexing mode. vdst can't be treated as a def for codegen purposes,
// and an implicit use and def of the super register should be added.
Expand Down
63 changes: 56 additions & 7 deletions llvm/lib/Target/AMDGPU/VOP2Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -289,28 +289,30 @@ class VOP_MADMK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
def VOP_MADMK_F16 : VOP_MADMK <f16>;
def VOP_MADMK_F32 : VOP_MADMK <f32>;

class getRegisterOperandForVT<ValueType VT> {
RegisterOperand ret = RegisterOperand<getVregSrcForVT<VT>.ret>;
}

// FIXME: Remove src2_modifiers. It isn't used, so it wastes memory
// and processing time, but it makes it easier to convert to mad.
class VOP_MAC <ValueType vt0, ValueType vt1=vt0> : VOPProfile <[vt0, vt1, vt1, vt0]> {
let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2);
let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3,
let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, getVregSrcForVT<Src2VT>.ret:$src2);
let Ins64 = getIns64<Src0RC64, Src1RC64, getRegisterOperandForVT<Src2VT>.ret, 3,
0, HasModifiers, HasModifiers, HasOMod,
Src0Mod, Src1Mod, Src2Mod>.ret;
let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
VGPR_32:$src2, // stub argument
getVregSrcForVT<Src2VT>.ret:$src2, // stub argument
dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
let InsDPP16 = !con(InsDPP, (ins FI:$fi));

let InsDPP8 = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
VGPR_32:$src2, // stub argument
getVregSrcForVT<Src2VT>.ret:$src2, // stub argument
dpp8:$dpp8, FI:$fi);

let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
VGPR_32:$src2, // stub argument
getVregSrcForVT<Src2VT>.ret:$src2, // stub argument
clampmod:$clamp, omod:$omod,
dst_sel:$dst_sel, dst_unused:$dst_unused,
src0_sel:$src0_sel, src1_sel:$src1_sel);
Expand All @@ -335,6 +337,8 @@ def VOP_MAC_F16 : VOP_MAC <f16>;
def VOP_MAC_F32 : VOP_MAC <f32>;
let HasExtDPP = 0 in
def VOP_MAC_LEGACY_F32 : VOP_MAC <f32>;
let HasExtSDWA = 0, HasExt64BitDPP = 1 in
def VOP_MAC_F64 : VOP_MAC <f64>;

class VOP_DOT_ACC<ValueType vt0, ValueType vt1> : VOP_MAC<vt0, vt1> {
let HasClamp = 0;
Expand Down Expand Up @@ -448,6 +452,7 @@ def VOP_READLANE : VOPProfile<[i32, i32, i32]> {

let HasExt = 0;
let HasExtDPP = 0;
let HasExt64BitDPP = 0;
let HasExtSDWA = 0;
let HasExtSDWA9 = 0;
}
Expand All @@ -464,6 +469,7 @@ def VOP_WRITELANE : VOPProfile<[i32, i32, i32, i32]> {

let HasExt = 0;
let HasExtDPP = 0;
let HasExt64BitDPP = 0;
let HasExtSDWA = 0;
let HasExtSDWA9 = 0;
}
Expand Down Expand Up @@ -692,6 +698,14 @@ defm V_FMAC_LEGACY_F32 : VOP2Inst <"v_fmac_legacy_f32", VOP_MAC_LEGACY_F32>;

} // End SubtargetPredicate = HasFmaLegacy32

let SubtargetPredicate = isGFX90APlus,
Constraints = "$vdst = $src2",
DisableEncoding="$src2",
isConvertibleToThreeAddress = 1,
isCommutable = 1,
SchedRW = [WriteDoubleAdd] in
defm V_FMAC_F64 : VOP2Inst <"v_fmac_f64", VOP_MAC_F64>;

let Constraints = "$vdst = $src2",
DisableEncoding="$src2",
isConvertibleToThreeAddress = 1,
Expand Down Expand Up @@ -1525,6 +1539,7 @@ defm V_CNDMASK_B32 : VOP2_Real_e32e64_vi <0x0>;
defm V_ADD_F32 : VOP2_Real_e32e64_vi <0x1>;
defm V_SUB_F32 : VOP2_Real_e32e64_vi <0x2>;
defm V_SUBREV_F32 : VOP2_Real_e32e64_vi <0x3>;
let AssemblerPredicate = isGCN3ExcludingGFX90A in
defm V_MUL_LEGACY_F32 : VOP2_Real_e32e64_vi <0x4>;
defm V_MUL_F32 : VOP2_Real_e32e64_vi <0x5>;
defm V_MUL_I32_I24 : VOP2_Real_e32e64_vi <0x6>;
Expand Down Expand Up @@ -1641,6 +1656,40 @@ defm V_XNOR_B32 : VOP2_Real_e32e64_vi <0x3d>;

} // End SubtargetPredicate = HasDLInsts

let AssemblerPredicate = isGFX90APlus, DecoderNamespace = "GFX90A" in {
multiclass VOP2_Real_e32_gfx90a <bits<6> op> {
def _e32_gfx90a :
VOP2_Real<!cast<VOP2_Pseudo>(NAME#"_e32"), SIEncodingFamily.GFX90A>,
VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME#"_e32").Pfl>;
}

multiclass VOP2_Real_e64_gfx90a <bits<10> op> {
def _e64_gfx90a :
VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX90A>,
VOP3e_vi <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
}

multiclass Base_VOP2_Real_e32e64_gfx90a <bits<6> op> :
VOP2_Real_e32_gfx90a<op>,
VOP2_Real_e64_gfx90a<{0, 1, 0, 0, op{5-0}}>;

multiclass VOP2_Real_e32e64_gfx90a <bits<6> op> :
Base_VOP2_Real_e32e64_gfx90a<op> {

foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
def _dpp_gfx90a :
VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX90A>,
VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")> {
let DecoderNamespace = "SDWA9";
}
}
} // End AssemblerPredicate = isGFX90APlus, DecoderNamespace = "GFX90A"

let SubtargetPredicate = isGFX90APlus in {
defm V_FMAC_F64 : VOP2_Real_e32e64_gfx90a <0x4>;
defm V_MUL_LEGACY_F32 : VOP2_Real_e64_gfx90a <0x2a1>;
} // End SubtargetPredicate = isGFX90APlus

multiclass VOP2_Real_DOT_ACC_gfx9<bits<6> op> : VOP2_Real_e32_vi<op> {
def _dpp_vi : VOP2_DPP<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")>;
}
Expand Down
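
An illustrative aside, not part of the patch: V_FMAC_F64 added above is a two-address fused multiply-accumulate whose destination is tied to $src2, which is why it is marked convertible to a three-address FMA form. Per lane it computes dst = src0 * src1 + dst, as in this scalar sketch:

#include <cmath>
#include <cstdio>

// One lane of v_fmac_f64: the accumulator is both read and written.
void v_fmac_f64_lane(double src0, double src1, double &dst) {
  dst = std::fma(src0, src1, dst);
}

int main() {
  double acc = 1.0;
  v_fmac_f64_lane(2.0, 3.0, acc); // acc = 2 * 3 + 1 = 7
  std::printf("acc = %f\n", acc);
}
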
31 changes: 16 additions & 15 deletions llvm/lib/Target/AMDGPU/VOP3Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -325,12 +325,12 @@ defm V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile<VOP_F64_F64_F64>, fminnum_l
defm V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum_like, 1>;
} // End SchedRW = [WriteDoubleAdd]

let SchedRW = [WriteQuarterRate32] in {
let SchedRW = [WriteIntMul] in {
defm V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", VOP3_Profile<VOP_I32_I32_I32>, mul>;
defm V_MUL_HI_U32 : VOP3Inst <"v_mul_hi_u32", VOP3_Profile<VOP_I32_I32_I32>, mulhu>;
defm V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", VOP3_Profile<VOP_I32_I32_I32>>;
defm V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", VOP3_Profile<VOP_I32_I32_I32>, mulhs>;
} // End SchedRW = [WriteQuarterRate32]
} // End SchedRW = [WriteIntMul]

let Uses = [MODE, VCC, EXEC] in {
// v_div_fmas_f32:
Expand Down Expand Up @@ -447,10 +447,10 @@ defm V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOP3_Profile<VOP_V4I32_I64_I32
} // End Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32]

let isCommutable = 1 in {
let SchedRW = [WriteQuarterRate32, WriteSALU] in {
let SchedRW = [WriteIntMul, WriteSALU] in {
defm V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>;
defm V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>;
} // End SchedRW = [WriteQuarterRate32, WriteSALU]
} // End SchedRW = [WriteIntMul, WriteSALU]
} // End isCommutable = 1

} // End SubtargetPredicate = isGFX7Plus
Expand All @@ -476,6 +476,7 @@ let renamedInGFX9 = 1 in {
let FPDPRounding = 1 in {
defm V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>;
let Uses = [MODE, M0, EXEC] in {
let OtherPredicates = [isNotGFX90APlus] in
// For some reason the intrinsic operands are in a different order
// from the instruction operands.
def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>,
Expand All @@ -497,24 +498,24 @@ let SubtargetPredicate = isGFX9Only, FPDPRounding = 1 in {
let SubtargetPredicate = isGFX9Plus in {
defm V_MAD_U16_gfx9 : VOP3Inst <"v_mad_u16_gfx9", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>>;
defm V_MAD_I16_gfx9 : VOP3Inst <"v_mad_i16_gfx9", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>>;
let OtherPredicates = [isNotGFX90APlus] in
def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f16, f32, i32, f32]>>;
} // End SubtargetPredicate = isGFX9Plus

let Uses = [MODE, M0, EXEC], FPDPRounding = 1 in {
// This predicate should only apply to the selection pattern. The
// instruction still exists and should decode on subtargets with
// other bank counts.
let OtherPredicates = [isNotGFX90APlus, has32BankLDS], Uses = [MODE, M0, EXEC], FPDPRounding = 1 in {
def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>,
[(set f32:$vdst, (int_amdgcn_interp_p1_f16 (VOP3Mods f32:$src0, i32:$src0_modifiers),
(i32 timm:$attrchan),
(i32 timm:$attr),
(i1 timm:$high), M0))]> {
// This predicate should only apply to the selection pattern. The
// instruction still exists and should decode on subtargets with
// other bank counts.
let OtherPredicates = [has32BankLDS];
}

(i1 timm:$high), M0))]>;
} // End OtherPredicates = [isNotGFX90APlus, has32BankLDS], Uses = [MODE, M0, EXEC], FPDPRounding = 1

let OtherPredicates = [isNotGFX90APlus], Uses = [MODE, M0, EXEC], FPDPRounding = 1 in {
def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>>;
} // End Uses = [MODE, M0, EXEC], FPDPRounding = 1
} // End OtherPredicates = [isNotGFX90APlus], Uses = [MODE, M0, EXEC], FPDPRounding = 1

} // End SubtargetPredicate = Has16BitInsts, isCommutable = 1

Expand All @@ -527,11 +528,11 @@ def : GCNPat<
), VGPR_32)), sub1)
>;

let SubtargetPredicate = isGFX8Plus, Uses = [MODE, M0, EXEC] in {
let SubtargetPredicate = isGFX8Plus, Uses = [MODE, M0, EXEC], OtherPredicates = [isNotGFX90APlus] in {
def V_INTERP_P1_F32_e64 : VOP3Interp <"v_interp_p1_f32", VOP3_INTERP>;
def V_INTERP_P2_F32_e64 : VOP3Interp <"v_interp_p2_f32", VOP3_INTERP>;
def V_INTERP_MOV_F32_e64 : VOP3Interp <"v_interp_mov_f32", VOP3_INTERP_MOV>;
} // End SubtargetPredicate = isGFX8Plus, Uses = [MODE, M0, EXEC]
} // End SubtargetPredicate = isGFX8Plus, Uses = [MODE, M0, EXEC], OtherPredicates = [isNotGFX90APlus]

let Predicates = [Has16BitInsts, isGFX6GFX7GFX8GFX9] in {

Expand Down