Skip to content
This repository has been archived by the owner. It is now read-only.
Permalink
Browse files
[AMDGPU] gfx908 mfma support
Differential Revision: https://reviews.llvm.org/D64584

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@365824 91177308-0d34-0410-b5e6-96231b3b80d8
  • Loading branch information
rampitec committed Jul 11, 2019
1 parent fba8fc2 commit 6644a1885fccc43708cf4486b7f31a9168826ca4
@@ -647,6 +647,8 @@ static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {
return AMDGPU::SReg_256RegClassID;
case 16:
return AMDGPU::SReg_512RegClassID;
case 32:
return AMDGPU::SReg_1024RegClassID;
}

llvm_unreachable("invalid vector size");
@@ -665,12 +667,12 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
return;
}

assert(NumVectorElts <= 16 && "Vectors with more than 16 elements not "
assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
"supported yet");
// 16 = Max Num Vector Elements
// 32 = Max Num Vector Elements
// 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
// 1 = Vector Register Class
SmallVector<SDValue, 16 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);

RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
bool IsRegSeq = true;
@@ -355,6 +355,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5i32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i32, Custom);

setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
@@ -103,5 +103,27 @@ def : SourceOfDivergence<int_amdgcn_mov_dpp>;
def : SourceOfDivergence<int_amdgcn_mov_dpp8>;
def : SourceOfDivergence<int_amdgcn_update_dpp>;

def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x1f32>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x1f32>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x4f16>;
def : SourceOfDivergence<int_amdgcn_mfma_i32_4x4x4i8>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x2bf16>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x1f32>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x4f32>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x4f16>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x16f16>;
def : SourceOfDivergence<int_amdgcn_mfma_i32_16x16x4i8>;
def : SourceOfDivergence<int_amdgcn_mfma_i32_16x16x16i8>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x2bf16>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x8bf16>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x1f32>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x2f32>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4f16>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x8f16>;
def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x4i8>;
def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x8i8>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x2bf16>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4bf16>;

foreach intr = AMDGPUImageDimAtomicIntrinsics in
def : SourceOfDivergence<intr>;
@@ -365,6 +365,9 @@ unsigned GCNRegBankReassign::analyzeInst(const MachineInstr& MI,
continue;

unsigned R = Op.getReg();
if (TRI->hasAGPRs(TRI->getRegClassForReg(*MRI, R)))
continue;

unsigned ShiftedBank = Bank;

if (Bank != -1 && R == Reg && Op.getSubReg()) {
@@ -89,7 +89,9 @@ unsigned GCNRegPressure::getRegKind(unsigned Reg,
auto STI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
return STI->isSGPRClass(RC) ?
(STI->getRegSizeInBits(*RC) == 32 ? SGPR32 : SGPR_TUPLE) :
(STI->getRegSizeInBits(*RC) == 32 ? VGPR32 : VGPR_TUPLE);
STI->hasAGPRs(RC) ?
(STI->getRegSizeInBits(*RC) == 32 ? AGPR32 : AGPR_TUPLE) :
(STI->getRegSizeInBits(*RC) == 32 ? VGPR32 : VGPR_TUPLE);
}

void GCNRegPressure::inc(unsigned Reg,
@@ -110,16 +112,18 @@ void GCNRegPressure::inc(unsigned Reg,
switch (auto Kind = getRegKind(Reg, MRI)) {
case SGPR32:
case VGPR32:
case AGPR32:
assert(PrevMask.none() && NewMask == MaxMask);
Value[Kind] += Sign;
break;

case SGPR_TUPLE:
case VGPR_TUPLE:
case AGPR_TUPLE:
assert(NewMask < MaxMask || NewMask == MaxMask);
assert(PrevMask < NewMask);

Value[Kind == SGPR_TUPLE ? SGPR32 : VGPR32] +=
Value[Kind == SGPR_TUPLE ? SGPR32 : Kind == AGPR_TUPLE ? AGPR32 : VGPR32] +=
Sign * (~PrevMask & NewMask).getNumLanes();

if (PrevMask.none()) {
@@ -31,6 +31,8 @@ struct GCNRegPressure {
SGPR_TUPLE,
VGPR32,
VGPR_TUPLE,
AGPR32,
AGPR_TUPLE,
TOTAL_KINDS
};

@@ -43,9 +45,10 @@ struct GCNRegPressure {
void clear() { std::fill(&Value[0], &Value[TOTAL_KINDS], 0); }

unsigned getSGPRNum() const { return Value[SGPR32]; }
unsigned getVGPRNum() const { return Value[VGPR32]; }
unsigned getVGPRNum() const { return std::max(Value[VGPR32], Value[AGPR32]); }

unsigned getVGPRTuplesWeight() const { return Value[VGPR_TUPLE]; }
unsigned getVGPRTuplesWeight() const { return std::max(Value[VGPR_TUPLE],
Value[AGPR_TUPLE]); }
unsigned getSGPRTuplesWeight() const { return Value[SGPR_TUPLE]; }

unsigned getOccupancy(const GCNSubtarget &ST) const {
@@ -143,14 +143,15 @@ FunctionPass *llvm::createSIFixSGPRCopiesPass() {
return new SIFixSGPRCopies();
}

static bool hasVGPROperands(const MachineInstr &MI, const SIRegisterInfo *TRI) {
static bool hasVectorOperands(const MachineInstr &MI,
const SIRegisterInfo *TRI) {
const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
if (!MI.getOperand(i).isReg() ||
!TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
continue;

if (TRI->hasVGPRs(MRI.getRegClass(MI.getOperand(i).getReg())))
if (TRI->hasVectorRegisters(MRI.getRegClass(MI.getOperand(i).getReg())))
return true;
}
return false;
@@ -183,14 +184,14 @@ static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC,
const TargetRegisterClass *DstRC,
const SIRegisterInfo &TRI) {
return SrcRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(DstRC) &&
TRI.hasVGPRs(SrcRC);
TRI.hasVectorRegisters(SrcRC);
}

static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
const TargetRegisterClass *DstRC,
const SIRegisterInfo &TRI) {
return DstRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(SrcRC) &&
TRI.hasVGPRs(DstRC);
TRI.hasVectorRegisters(DstRC);
}

static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI,
@@ -277,6 +278,7 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
// VGPRz = REG_SEQUENCE VGPRx, sub0

MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg());
bool IsAGPR = TRI->hasAGPRs(DstRC);

for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
unsigned SrcReg = MI.getOperand(I).getReg();
@@ -295,6 +297,17 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
TmpReg)
.add(MI.getOperand(I));

if (IsAGPR) {
const TargetRegisterClass *NewSrcRC = TRI->getEquivalentAGPRClass(SrcRC);
unsigned TmpAReg = MRI.createVirtualRegister(NewSrcRC);
unsigned Opc = NewSrcRC == &AMDGPU::AGPR_32RegClass ?
AMDGPU::V_ACCVGPR_WRITE_B32 : AMDGPU::COPY;
BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(Opc),
TmpAReg)
.addReg(TmpReg, RegState::Kill);
TmpReg = TmpAReg;
}

MI.getOperand(I).setReg(TmpReg);
}

@@ -682,8 +695,8 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
break;
}
case AMDGPU::REG_SEQUENCE:
if (TRI->hasVGPRs(TII->getOpRegClass(MI, 0)) ||
!hasVGPROperands(MI, TRI)) {
if (TRI->hasVectorRegisters(TII->getOpRegClass(MI, 0)) ||
!hasVectorOperands(MI, TRI)) {
foldVGPRCopyIntoRegSequence(MI, TRI, TII, MRI);
continue;
}
@@ -698,7 +711,8 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
Src0RC = MRI.getRegClass(MI.getOperand(1).getReg());
Src1RC = MRI.getRegClass(MI.getOperand(2).getReg());
if (TRI->isSGPRClass(DstRC) &&
(TRI->hasVGPRs(Src0RC) || TRI->hasVGPRs(Src1RC))) {
(TRI->hasVectorRegisters(Src0RC) ||
TRI->hasVectorRegisters(Src1RC))) {
LLVM_DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI);
TII->moveToVALU(MI, MDT);
}
@@ -187,6 +187,7 @@ static bool updateOperand(FoldCandidate &Fold,

if (Fold.isImm()) {
if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked &&
!(MI->getDesc().TSFlags & SIInstrFlags::IsMAI) &&
AMDGPU::isInlinableLiteralV216(static_cast<uint16_t>(Fold.ImmToFold),
ST.hasInv2PiInlineImm())) {
// Set op_sel/op_sel_hi on this operand or bail out if op_sel is
@@ -419,6 +420,71 @@ static bool isUseSafeToFold(const SIInstrInfo *TII,
//return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
}

static bool tryToFoldACImm(const SIInstrInfo *TII,
const MachineOperand &OpToFold,
MachineInstr *UseMI,
unsigned UseOpIdx,
SmallVectorImpl<FoldCandidate> &FoldList) {
const MCInstrDesc &Desc = UseMI->getDesc();
const MCOperandInfo *OpInfo = Desc.OpInfo;
if (!OpInfo || UseOpIdx >= Desc.getNumOperands())
return false;

uint8_t OpTy = OpInfo[UseOpIdx].OperandType;
if (OpTy < AMDGPU::OPERAND_REG_INLINE_AC_FIRST ||
OpTy > AMDGPU::OPERAND_REG_INLINE_AC_LAST)
return false;

if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy)) {
UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm());
return true;
}

if (!OpToFold.isReg())
return false;

unsigned UseReg = OpToFold.getReg();
if (!TargetRegisterInfo::isVirtualRegister(UseReg))
return false;

if (llvm::find_if(FoldList, [UseMI](const FoldCandidate &FC) {
return FC.UseMI == UseMI; }) != FoldList.end())
return false;

MachineRegisterInfo &MRI = UseMI->getParent()->getParent()->getRegInfo();
const MachineInstr *Def = MRI.getUniqueVRegDef(UseReg);
if (!Def || !Def->isRegSequence())
return false;

int64_t Imm;
MachineOperand *Op;
for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
const MachineOperand &Sub = Def->getOperand(I);
if (!Sub.isReg() || Sub.getSubReg())
return false;
MachineInstr *SubDef = MRI.getUniqueVRegDef(Sub.getReg());
while (SubDef && !SubDef->isMoveImmediate() &&
!SubDef->getOperand(1).isImm() && TII->isFoldableCopy(*SubDef))
SubDef = MRI.getUniqueVRegDef(SubDef->getOperand(1).getReg());
if (!SubDef || !SubDef->isMoveImmediate() || !SubDef->getOperand(1).isImm())
return false;
Op = &SubDef->getOperand(1);
auto SubImm = Op->getImm();
if (I == 1) {
if (!TII->isInlineConstant(SubDef->getOperand(1), OpTy))
return false;

Imm = SubImm;
continue;
}
if (Imm != SubImm)
return false; // Can only fold splat constants
}

FoldList.push_back(FoldCandidate(UseMI, UseOpIdx, Op));
return true;
}

void SIFoldOperands::foldOperand(
MachineOperand &OpToFold,
MachineInstr *UseMI,
@@ -462,6 +528,11 @@ void SIFoldOperands::foldOperand(
Next = std::next(RSUse);

MachineInstr *RSUseMI = RSUse->getParent();

if (tryToFoldACImm(TII, UseMI->getOperand(0), RSUseMI,
RSUse.getOperandNo(), FoldList))
continue;

if (RSUse->getSubReg() != RegSeqDstSubReg)
continue;

@@ -472,6 +543,9 @@ void SIFoldOperands::foldOperand(
return;
}

if (tryToFoldACImm(TII, OpToFold, UseMI, UseOpIdx, FoldList))
return;

if (frameIndexMayFold(TII, *UseMI, UseOpIdx, OpToFold)) {
// Sanity check that this is a stack access.
// FIXME: Should probably use stack pseudos before frame lowering.
@@ -505,7 +579,7 @@ void SIFoldOperands::foldOperand(
if (TargetRegisterInfo::isVirtualRegister(DestReg) &&
TargetRegisterInfo::isVirtualRegister(SrcReg)) {
const TargetRegisterClass * SrcRC = MRI->getRegClass(SrcReg);
if (TRI->isSGPRClass(SrcRC) && TRI->hasVGPRs(DestRC)) {
if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) {
MachineRegisterInfo::use_iterator NextUse;
SmallVector<FoldCandidate, 4> CopyUses;
for (MachineRegisterInfo::use_iterator
@@ -523,6 +597,14 @@ void SIFoldOperands::foldOperand(
}
}

if (DestRC == &AMDGPU::AGPR_32RegClass &&
TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32));
UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
CopiesToReplace.push_back(UseMI);
return;
}

// In order to fold immediates into copies, we need to change the
// copy to a MOV.

@@ -535,14 +617,23 @@ void SIFoldOperands::foldOperand(
} else {
if (UseMI->isCopy() && OpToFold.isReg() &&
TargetRegisterInfo::isVirtualRegister(UseMI->getOperand(0).getReg()) &&
TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
TRI->isVGPR(*MRI, UseMI->getOperand(1).getReg()) &&
TRI->isVectorRegister(*MRI, UseMI->getOperand(0).getReg()) &&
TRI->isVectorRegister(*MRI, UseMI->getOperand(1).getReg()) &&
!UseMI->getOperand(1).getSubReg()) {
unsigned Size = TII->getOpSize(*UseMI, 1);
UseMI->getOperand(1).setReg(OpToFold.getReg());
UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
UseMI->getOperand(1).setIsKill(false);
CopiesToReplace.push_back(UseMI);
OpToFold.setIsKill(false);
if (Size != 4)
return;
if (TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
TRI->isVGPR(*MRI, UseMI->getOperand(1).getReg()))
UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32));
else if (TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
TRI->isAGPR(*MRI, UseMI->getOperand(1).getReg()))
UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32));
return;
}

0 comments on commit 6644a18

Please sign in to comment.