[AMDGPU][GFX11] Legalize and select partial NSA MIMG instructions
If more registers are needed for VAddr than the NSA format allows, the
final register can act as a contiguous set of the remaining addresses.
Update the legalizer to pack registers for this new format and allow
instruction selection to use the NSA encoding when the number of addresses
exceeds the maximum NSA size. Also update SIShrinkInstructions to handle
partial NSA.

Differential Revision: https://reviews.llvm.org/D144034
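
To make the layout concrete, here is a minimal standalone sketch (not part of the commit, and not LLVM code) of how address dwords map to VAddr registers; the NSAMaxSize value of 5 is an assumed example limit.

#include <cstdio>
#include <vector>

// Returns the size, in dwords, of each VAddr register for a given address count.
static std::vector<unsigned> vaddrLayout(unsigned NumAddrDwords, unsigned NSAMaxSize) {
  std::vector<unsigned> RegSizes;
  if (NumAddrDwords <= NSAMaxSize) {
    // Full NSA: every address dword gets its own register.
    RegSizes.assign(NumAddrDwords, 1);
  } else {
    // Partial NSA: the first NSAMaxSize - 1 addresses stay separate and the
    // final register holds the remaining addresses contiguously.
    RegSizes.assign(NSAMaxSize - 1, 1);
    RegSizes.push_back(NumAddrDwords - (NSAMaxSize - 1));
  }
  return RegSizes;
}

int main() {
  for (unsigned Dwords : {4u, 5u, 7u, 9u}) {
    std::printf("%u addr dwords -> VAddr register sizes:", Dwords);
    for (unsigned Size : vaddrLayout(Dwords, /*NSAMaxSize=*/5))
      std::printf(" %u", Size);
    std::printf("\n");
  }
  return 0;
}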
mbrkusanin committed Feb 23, 2023
1 parent b3dc0e6 commit 926746d
Showing 17 changed files with 2,364 additions and 100 deletions.
5 changes: 4 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1859,7 +1859,10 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
// The legalizer preprocessed the intrinsic arguments. If we aren't using
// NSA, these should have been packed into a single value in the first
// address register
const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs;
const bool UseNSA =
NumVAddrRegs != 1 &&
(STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
: NumVAddrDwords == NumVAddrRegs);
if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
return false;
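Restated outside of LLVM, the selector's new NSA check amounts to the following sketch (illustrative names, not the actual code): with partial NSA the last register may cover several address dwords, so the dword count is allowed to exceed the register count.

// Minimal sketch of the UseNSA predicate above.
static bool shouldUseNSA(unsigned NumVAddrRegs, unsigned NumVAddrDwords,
                         bool HasPartialNSAEncoding) {
  if (NumVAddrRegs == 1)
    return false; // already packed into a single address register
  // Full NSA requires one register per dword; partial NSA also accepts a
  // trailing register that holds the extra dwords contiguously.
  return HasPartialNSAEncoding ? NumVAddrDwords >= NumVAddrRegs
                               : NumVAddrDwords == NumVAddrRegs;
}
// Example: 7 address dwords held in 5 registers (4 singles plus one 3-dword
// register) selects the NSA form only when partial NSA is available.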
37 changes: 28 additions & 9 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4997,6 +4997,9 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
return false;
}

const unsigned NSAMaxSize = ST.getNSAMaxSize();
const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();

if (IsA16 || IsG16) {
if (Intr->NumVAddrs > 1) {
SmallVector<Register, 4> PackedRegs;
@@ -5007,9 +5010,19 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
// See also below in the non-a16 branch
const bool UseNSA = ST.hasNSAEncoding() &&
PackedRegs.size() >= ST.getNSAThreshold(MF) &&
PackedRegs.size() <= ST.getNSAMaxSize();

if (!UseNSA && PackedRegs.size() > 1) {
(PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
const bool UsePartialNSA =
UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;

if (UsePartialNSA) {
// Pack registers that would go over NSAMaxSize into last VAddr register
LLT PackedAddrTy =
LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
auto Concat = B.buildConcatVectors(
PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
PackedRegs.resize(NSAMaxSize);
} else if (!UseNSA && PackedRegs.size() > 1) {
LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
PackedRegs[0] = Concat.getReg(0);
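
The A16/G16 branch above concatenates only the overflow registers. A worked sketch of the arithmetic, with assumed values (7 packed registers, NSAMaxSize of 5, each packed register holding two 16-bit coordinates):

#include <cassert>

int main() {
  unsigned NumPacked = 7;  // packed 2x16-bit address registers (assumed)
  unsigned NSAMaxSize = 5; // assumed hardware limit
  // Registers at indices NSAMaxSize-1 .. NumPacked-1 are concatenated into
  // the final VAddr register.
  unsigned FoldedRegs = NumPacked - NSAMaxSize + 1; // 3 registers fold together
  unsigned VecElts = 2 * FoldedRegs;                // a 6 x s16 concat result
  assert(FoldedRegs == 3 && VecElts == 6);
  // PackedRegs is then resized to NSAMaxSize: four 2x16-bit registers plus one
  // 6x16-bit register carrying the remaining addresses.
  return 0;
}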
@@ -5045,16 +5058,22 @@
// SIShrinkInstructions will convert NSA encodings to non-NSA after register
// allocation when possible.
//
// TODO: we can actually allow partial NSA where the final register is a
// contiguous set of the remaining addresses.
// This could help where there are more addresses than supported.
// Partial NSA is allowed on GFX11 where the final register is a contiguous
// set of the remaining addresses.
const bool UseNSA = ST.hasNSAEncoding() &&
CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
CorrectedNumVAddrs <= ST.getNSAMaxSize();

if (!UseNSA && Intr->NumVAddrs > 1)
(CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
const bool UsePartialNSA =
UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;

if (UsePartialNSA) {
convertImageAddrToPacked(B, MI,
ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
Intr->NumVAddrs - NSAMaxSize + 1);
} else if (!UseNSA && Intr->NumVAddrs > 1) {
convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
Intr->NumVAddrs);
}
}

int Flags = 0;
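In the non-A16 branch, convertImageAddrToPacked is pointed at the tail of the address list. A small worked example of the operand indexing, with an assumed 8 address operands and NSAMaxSize of 5:

#include <cassert>

int main() {
  unsigned NumVAddrs = 8, NSAMaxSize = 5; // assumed counts
  // Packing starts at address index NSAMaxSize - 1 and covers everything from
  // there to the end of the address list.
  unsigned FirstPackedAddr = NSAMaxSize - 1;            // address 4
  unsigned NumPackedAddrs = NumVAddrs - NSAMaxSize + 1; // addresses 4..7
  assert(FirstPackedAddr == 4 && NumPackedAddrs == 4);
  // Result: 4 single-dword VAddr operands plus one 4-dword VAddr operand,
  // i.e. exactly NSAMaxSize register operands in total.
  assert(FirstPackedAddr + 1 == NSAMaxSize);
  return 0;
}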
29 changes: 21 additions & 8 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6581,15 +6581,24 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
// SIShrinkInstructions will convert NSA encodings to non-NSA after register
// allocation when possible.
//
// TODO: we can actually allow partial NSA where the final register is a
// contiguous set of the remaining addresses.
// This could help where there are more addresses than supported.
bool UseNSA = ST->hasFeature(AMDGPU::FeatureNSAEncoding) &&
VAddrs.size() >= (unsigned)ST->getNSAThreshold(MF) &&
VAddrs.size() <= (unsigned)ST->getNSAMaxSize();
// Partial NSA is allowed on GFX11 where the final register is a contiguous
// set of the remaining addresses.
const unsigned NSAMaxSize = ST->getNSAMaxSize();
const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
const bool UseNSA = ST->hasNSAEncoding() &&
VAddrs.size() >= ST->getNSAThreshold(MF) &&
(VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
const bool UsePartialNSA =
UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;

SDValue VAddr;
if (!UseNSA)
if (UsePartialNSA) {
VAddr = getBuildDwordsVector(DAG, DL,
ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
}
else if (!UseNSA) {
VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
}

SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
@@ -6657,7 +6666,11 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
SmallVector<SDValue, 26> Ops;
if (BaseOpcode->Store || BaseOpcode->Atomic)
Ops.push_back(VData); // vdata
if (UseNSA)
if (UsePartialNSA) {
append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
Ops.push_back(VAddr);
}
else if (UseNSA)
append_range(Ops, VAddrs);
else
Ops.push_back(VAddr);
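The SelectionDAG path above applies the same split when it assembles the MIMG operand list. A sketch with plain integers standing in for SDValue registers (assumed shape, not the actual lowering code):

#include <vector>

// Builds the address portion of the operand list: separate operands for the
// first NSAMaxSize - 1 addresses, then one packed register for the rest.
static std::vector<int> buildAddrOps(const std::vector<int> &VAddrs, int PackedVAddr,
                                     unsigned NSAMaxSize, bool UseNSA,
                                     bool UsePartialNSA) {
  std::vector<int> Ops;
  if (UsePartialNSA) {
    Ops.assign(VAddrs.begin(), VAddrs.begin() + (NSAMaxSize - 1));
    Ops.push_back(PackedVAddr); // remaining addresses merged into one register
  } else if (UseNSA) {
    Ops = VAddrs;               // full NSA: one operand per address
  } else {
    Ops.push_back(PackedVAddr); // non-NSA: a single packed address register
  }
  return Ops;
}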
7 changes: 5 additions & 2 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4636,9 +4636,12 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
unsigned VAddrWords;
if (IsNSA) {
VAddrWords = SRsrcIdx - VAddr0Idx;
if (ST.hasPartialNSAEncoding() && AddrWords > ST.getNSAMaxSize()) {
unsigned LastVAddrIdx = SRsrcIdx - 1;
VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
}
} else {
const TargetRegisterClass *RC = getOpRegClass(MI, VAddr0Idx);
VAddrWords = MRI.getTargetRegisterInfo()->getRegSizeInBits(*RC) / 32;
VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
if (AddrWords > 12)
AddrWords = 16;
}
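The verifier change above counts the extra dwords carried by the final, contiguous VAddr operand. A worked example with assumed sizes (5 NSA VAddr operands, the last one 96 bits wide):

#include <cassert>

int main() {
  unsigned NumVAddrOperands = 5; // SRsrcIdx - VAddr0Idx in the real check
  unsigned LastOpSizeBytes = 12; // assumed 96-bit final VAddr operand
  unsigned VAddrWords = NumVAddrOperands;
  VAddrWords += LastOpSizeBytes / 4 - 1; // the 3-dword operand adds 2 words
  assert(VAddrWords == 7); // total address dwords the instruction provides
  return 0;
}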
9 changes: 6 additions & 3 deletions llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -308,7 +308,10 @@ void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const {
unsigned NextVgpr = 0;
bool IsUndef = true;
bool IsKill = NewAddrDwords == Info->VAddrDwords;
for (unsigned Idx = 0; Idx < Info->VAddrOperands; ++Idx) {
const unsigned NSAMaxSize = ST->getNSAMaxSize();
const bool IsPartialNSA = NewAddrDwords > NSAMaxSize;
const unsigned EndVAddr = IsPartialNSA ? NSAMaxSize : Info->VAddrOperands;
for (unsigned Idx = 0; Idx < EndVAddr; ++Idx) {
const MachineOperand &Op = MI.getOperand(VAddr0Idx + Idx);
unsigned Vgpr = TRI->getHWRegIndex(Op.getReg());
unsigned Dwords = TRI->getRegSizeInBits(Op.getReg(), *MRI) / 32;
@@ -361,13 +364,13 @@ void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const {
MI.getOperand(VAddr0Idx).setIsUndef(IsUndef);
MI.getOperand(VAddr0Idx).setIsKill(IsKill);

for (int i = 1; i < Info->VAddrOperands; ++i)
for (unsigned i = 1; i < EndVAddr; ++i)
MI.removeOperand(VAddr0Idx + 1);

if (ToUntie >= 0) {
MI.tieOperands(
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata),
ToUntie - (Info->VAddrOperands - 1));
ToUntie - (EndVAddr - 1));
}
}

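The loop-bound change in SIShrinkInstructions boils down to the following sketch (illustrative names, not the pass itself): a partial-NSA instruction only carries NSAMaxSize VAddr operands, the last spanning several dwords, so the operand walk and removal stop there.

// Sketch of the EndVAddr computation used above.
static unsigned computeEndVAddr(unsigned NewAddrDwords, unsigned VAddrOperands,
                                unsigned NSAMaxSize) {
  // If the shrunken address size still exceeds the NSA limit, the instruction
  // stays in partial-NSA form with NSAMaxSize VAddr operands; otherwise every
  // address dword had its own operand and all of them are folded into VAddr0.
  const bool IsPartialNSA = NewAddrDwords > NSAMaxSize;
  return IsPartialNSA ? NSAMaxSize : VAddrOperands;
}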
