Skip to content

Commit

Permalink
[AMDGPU][True16] Fix ISel for A16 Image Instructions
Browse files Browse the repository at this point in the history
The 16-bit VAddr arguments to A16 image instructions are packed into
legal VGPR_32 operands in AMDGPULegalizerInfo::legalizeImageIntrinsic on
all subtargets. With True16, we also need to pack when the number of VAddrs
is one, because VGPR_16 is not a legal argument to those image instructions.

No change to the emitted code is intended on subtargets before GFX11, and none
on GFX11 until True16 is active.

Reviewed By: foad

Differential Revision: https://reviews.llvm.org/D157426
  • Loading branch information
Sisyph committed Aug 11, 2023
1 parent 5820c92 commit 2fb4bfa
Show file tree
Hide file tree
Showing 4 changed files with 265 additions and 114 deletions.
75 changes: 37 additions & 38 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5891,49 +5891,48 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();

if (IsA16 || IsG16) {
if (Intr->NumVAddrs > 1) {
SmallVector<Register, 4> PackedRegs;
// Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
// instructions expect VGPR_32
SmallVector<Register, 4> PackedRegs;

packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
IsG16);

// See also below in the non-a16 branch
const bool UseNSA = ST.hasNSAEncoding() &&
PackedRegs.size() >= ST.getNSAThreshold(MF) &&
(PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
const bool UsePartialNSA =
UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;

if (UsePartialNSA) {
// Pack registers that would go over NSAMaxSize into last VAddr register
LLT PackedAddrTy =
LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
auto Concat = B.buildConcatVectors(
PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
PackedRegs.resize(NSAMaxSize);
} else if (!UseNSA && PackedRegs.size() > 1) {
LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
PackedRegs[0] = Concat.getReg(0);
PackedRegs.resize(1);
}
packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16);

const unsigned NumPacked = PackedRegs.size();
for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
if (!SrcOp.isReg()) {
assert(SrcOp.isImm() && SrcOp.getImm() == 0);
continue;
}
// See also below in the non-a16 branch
const bool UseNSA = ST.hasNSAEncoding() &&
PackedRegs.size() >= ST.getNSAThreshold(MF) &&
(PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
const bool UsePartialNSA =
UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;

assert(SrcOp.getReg() != AMDGPU::NoRegister);
if (UsePartialNSA) {
// Pack registers that would go over NSAMaxSize into last VAddr register
LLT PackedAddrTy =
LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
auto Concat = B.buildConcatVectors(
PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
PackedRegs.resize(NSAMaxSize);
} else if (!UseNSA && PackedRegs.size() > 1) {
LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
PackedRegs[0] = Concat.getReg(0);
PackedRegs.resize(1);
}

if (I - Intr->VAddrStart < NumPacked)
SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
else
SrcOp.setReg(AMDGPU::NoRegister);
const unsigned NumPacked = PackedRegs.size();
for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
if (!SrcOp.isReg()) {
assert(SrcOp.isImm() && SrcOp.getImm() == 0);
continue;
}

assert(SrcOp.getReg() != AMDGPU::NoRegister);

if (I - Intr->VAddrStart < NumPacked)
SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
else
SrcOp.setReg(AMDGPU::NoRegister);
}
} else {
// If the register allocator cannot place the address registers contiguously
Expand Down

0 comments on commit 2fb4bfa

Please sign in to comment.