AMDGPU/GlobalISel: Fix asserting on gather4 intrinsics
arsenm committed Mar 17, 2020
1 parent a060a17 commit 039c917
Showing 4 changed files with 2,917 additions and 28 deletions.
67 changes: 39 additions & 28 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -3376,30 +3376,36 @@ bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
 
 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized
 /// vector with s16 typed elements.
-static void packImageA16AddressToDwords(MachineIRBuilder &B,
-                                        MachineInstr &MI,
+static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
                                         SmallVectorImpl<Register> &PackedAddrs,
-                                        int DimIdx,
-                                        int NumVAddrs) {
+                                        int AddrIdx, int DimIdx, int NumVAddrs,
+                                        int NumGradients) {
   const LLT S16 = LLT::scalar(16);
   const LLT V2S16 = LLT::vector(2, 16);
 
-  SmallVector<Register, 8> A16AddrRegs;
-  A16AddrRegs.resize(NumVAddrs);
+  for (int I = AddrIdx; I < AddrIdx + NumVAddrs; ++I) {
+    Register AddrReg = MI.getOperand(I).getReg();
 
-  for (int I = 0; I != NumVAddrs; ++I) {
-    A16AddrRegs[I] = MI.getOperand(DimIdx + I).getReg();
-    assert(B.getMRI()->getType(A16AddrRegs[I]) == S16);
-  }
-
-  // Round to dword.
-  if (NumVAddrs % 2 != 0)
-    A16AddrRegs.push_back(B.buildUndef(S16).getReg(0));
-
-  PackedAddrs.resize(A16AddrRegs.size() / 2);
-  for (int I = 0, E = PackedAddrs.size(); I != E; ++I) {
-    PackedAddrs[I] = B.buildBuildVector(
-      V2S16, {A16AddrRegs[2 * I], A16AddrRegs[2 * I + 1]}).getReg(0);
+    if (I < DimIdx) {
+      AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
+      PackedAddrs.push_back(AddrReg);
+    } else {
+      // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
+      // derivatives dx/dh and dx/dv are packed with undef.
+      if (((I + 1) >= (AddrIdx + NumVAddrs)) ||
+          ((NumGradients / 2) % 2 == 1 &&
+           (I == DimIdx + (NumGradients / 2) - 1 ||
+            I == DimIdx + NumGradients - 1))) {
+        PackedAddrs.push_back(
+            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
+                .getReg(0));
+      } else {
+        PackedAddrs.push_back(
+            B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
+                .getReg(0));
+        ++I;
+      }
+    }
   }
 }

@@ -3419,15 +3425,18 @@ static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
     MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
 }
 
-static int getImageNumVAddr(const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
-                            const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode) {
+/// Return number of address arguments, and the number of gradients
+static std::pair<int, int>
+getImageNumVAddr(const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
+                 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode) {
   const AMDGPU::MIMGDimInfo *DimInfo
     = AMDGPU::getMIMGDimInfo(ImageDimIntr->Dim);
 
   int NumGradients = BaseOpcode->Gradients ? DimInfo->NumGradients : 0;
   int NumCoords = BaseOpcode->Coordinates ? DimInfo->NumCoords : 0;
   int NumLCM = BaseOpcode->LodOrClampOrMip ? 1 : 0;
-  return BaseOpcode->NumExtraArgs + NumGradients + NumCoords + NumLCM;
+  int NumVAddr = BaseOpcode->NumExtraArgs + NumGradients + NumCoords + NumLCM;
+  return {NumVAddr, NumGradients};
 }
 
 static int getDMaskIdx(const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode,
@@ -3495,7 +3504,8 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx).getReg());
   const bool IsA16 = AddrTy == S16;
 
-  const int NumVAddrs = getImageNumVAddr(ImageDimIntr, BaseOpcode);
+  int NumVAddrs, NumGradients;
+  std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
 
   // If the register allocator cannot place the address registers contiguously
   // without introducing moves, then using the non-sequential address encoding
@@ -3522,7 +3532,8 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
 
   if (NumVAddrs > 1) {
     SmallVector<Register, 4> PackedRegs;
-    packImageA16AddressToDwords(B, MI, PackedRegs, DimIdx, NumVAddrs);
+    packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, NumVAddrs,
+                                NumGradients);
 
     if (!UseNSA && PackedRegs.size() > 1) {
       LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
@@ -3533,16 +3544,16 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
 
       const int NumPacked = PackedRegs.size();
       for (int I = 0; I != NumVAddrs; ++I) {
-        assert(MI.getOperand(DimIdx + I).getReg() != AMDGPU::NoRegister);
+        assert(MI.getOperand(AddrIdx + I).getReg() != AMDGPU::NoRegister);
 
         if (I < NumPacked)
-          MI.getOperand(DimIdx + I).setReg(PackedRegs[I]);
+          MI.getOperand(AddrIdx + I).setReg(PackedRegs[I]);
         else
-          MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
+          MI.getOperand(AddrIdx + I).setReg(AMDGPU::NoRegister);
       }
     }
   } else if (!UseNSA && NumVAddrs > 1) {
-    convertImageAddrToPacked(B, MI, DimIdx, NumVAddrs);
+    convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
   }
 
   int DMaskLanes = 0;
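For reference, the pairing rule introduced in packImageA16AddressToDwords can be illustrated outside of LLVM. The sketch below is a hypothetical, standalone model in plain C++ (not the MachineIRBuilder API): registers are stood in for by strings, undef by a literal, and AddrIdx == DimIdx == 0 is assumed, i.e. no pre-dim operands (which the real code bitcasts instead of pairing). It reproduces only the patch's condition for when an s16 component is padded with undef rather than paired with its successor.

// pack_a16_sketch.cpp -- hypothetical illustration, not LLVM code.
#include <cstdio>
#include <string>
#include <vector>

// Pack s16 address components into v2s16 dwords. Addrs starts at the
// gradient block (AddrIdx == DimIdx == 0 assumed); the first NumGradients
// entries are the two derivative halves, the rest are coordinates.
static std::vector<std::string> packA16(const std::vector<std::string> &Addrs,
                                        int NumGradients) {
  std::vector<std::string> Packed;
  const int NumVAddrs = static_cast<int>(Addrs.size());
  const int Half = NumGradients / 2;
  for (int I = 0; I != NumVAddrs; ++I) {
    // Same test as the patch: the last component, and the last element of an
    // odd-sized gradient half, are padded with undef instead of paired.
    if ((I + 1) >= NumVAddrs ||
        (Half % 2 == 1 && (I == Half - 1 || I == NumGradients - 1))) {
      Packed.push_back("{" + Addrs[I] + ", undef}");
    } else {
      Packed.push_back("{" + Addrs[I] + ", " + Addrs[I + 1] + "}");
      ++I; // the pair consumed two components
    }
  }
  return Packed;
}

int main() {
  // 1D sample_d: both derivatives and the lone coordinate are padded, so no
  // value is mispaired across the dh/dv/coordinate boundaries.
  for (const std::string &Dword : packA16({"dsdh", "dsdv", "s"}, 2))
    std::printf("%s\n", Dword.c_str());
}

Run instead on a 3D sample's nine components, packA16({"dsdh", "dtdh", "drdh", "dsdv", "dtdv", "drdv", "s", "t", "r"}, 6) yields {dsdh, dtdh}, {drdh, undef}, {dsdv, dtdv}, {drdv, undef}, {s, t}, {r, undef}, matching the comment in the patch: dz/dh, dz/dv and the last odd coordinate are packed with undef.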
