109 changes: 81 additions & 28 deletions llvm/lib/Target/AMDGPU/MIMGInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -235,22 +235,47 @@ def getMIMGInfo : SearchIndex {
let Key = ["Opcode"];
}

class NSAHelper {
dag AddrIns;
string AddrAsm;
int NSA;
}

// This class used to use !foldl to memoize the AddrAsmNames list.
// It turned out that that was much slower than using !filter.
class MIMGNSAHelper<int num_addrs,
list<RegisterClass> addr_types=!listsplat(VGPR_32, num_addrs)> {
list<RegisterClass> addr_types=!listsplat(VGPR_32, num_addrs)>
: NSAHelper<> {
list<string> AddrAsmNames =
!foreach(i, !filter(i, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
!lt(i, num_addrs)), "vaddr" # i);
dag AddrIns = !dag(ins, addr_types, AddrAsmNames);
string AddrAsm = "[$" # !interleave(AddrAsmNames, ", $") # "]";
let AddrIns = !dag(ins, addr_types, AddrAsmNames);
let AddrAsm = "[$" # !interleave(AddrAsmNames, ", $") # "]";

int NSA = !if(!le(num_addrs, 1), ?,
let NSA = !if(!le(num_addrs, 1), ?,
!if(!le(num_addrs, 5), 1,
!if(!le(num_addrs, 9), 2,
!if(!le(num_addrs, 13), 3, ?))));
}

class PartialNSAHelper<int num_addrs, int max_addr, RegisterClass LastAddrRC>
: NSAHelper<> {

list<RegisterClass> addr_types =
!if(!ge(num_addrs, max_addr),
!listconcat(!listsplat(VGPR_32, !sub(max_addr, 1)), [LastAddrRC]),
!listsplat(VGPR_32, num_addrs));

int VAddrCount = !if(!gt(num_addrs, max_addr), max_addr, num_addrs);
list<string> AddrAsmNames =
!foreach(i, !filter(i, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
!lt(i, VAddrCount)), "vaddr" # i);

let AddrIns = !dag(ins, addr_types, AddrAsmNames);
let AddrAsm = "[$" # !interleave(AddrAsmNames, ", $") # "]";
let NSA = 1;
}

// Base class of all pre-gfx10 MIMG instructions.
class MIMG_gfx6789<bits<8> op, dag outs, string dns = "">
: MIMG<outs, dns>, MIMGe_gfx6789<op> {
Expand Down Expand Up @@ -321,17 +346,18 @@ class MIMG_gfx11<int op, dag outs, string dns = "">
// Base class for all NSA MIMG instructions.
// Note that 1-dword addresses always use non-NSA variants.
class MIMG_nsa_gfx11<int op, dag outs, int num_addrs, string dns="",
list<RegisterClass> addr_types=[]>
list<RegisterClass> addr_types=[],
RegisterClass LastAddrRC = VGPR_32>
: MIMG<outs, dns>, MIMGe_gfx11<op> {
let SubtargetPredicate = isGFX11Plus;
let AssemblerPredicate = isGFX11Plus;

let MIMGEncoding = MIMGEncGfx11NSA;
let VAddrOperands = num_addrs;

MIMGNSAHelper nsah = !if(!empty(addr_types),
MIMGNSAHelper<num_addrs>,
MIMGNSAHelper<num_addrs, addr_types>);
NSAHelper nsah = !if(!empty(addr_types),
PartialNSAHelper<num_addrs, 5, LastAddrRC>,
MIMGNSAHelper<num_addrs, addr_types>);
dag AddrIns = nsah.AddrIns;
string AddrAsm = nsah.AddrAsm;

Expand Down Expand Up @@ -934,8 +960,9 @@ class MIMG_Sampler_gfx11<mimgopc op, string opcode,

class MIMG_Sampler_nsa_gfx11<mimgopc op, string opcode,
RegisterClass DataRC, int num_addrs,
string dns="">
: MIMG_nsa_gfx11<op.GFX11, (outs DataRC:$vdata), num_addrs, dns> {
RegisterClass LastVAddrSize, string dns="">
: MIMG_nsa_gfx11<op.GFX11, (outs DataRC:$vdata), num_addrs, dns, [],
LastVAddrSize> {
let InOperandList = !con(AddrIns,
(ins SReg_256:$srsrc, SReg_128:$ssamp, DMask:$dmask,
Dim:$dim, UNorm:$unorm, CPol:$cpol,
Expand All @@ -946,29 +973,34 @@ class MIMG_Sampler_nsa_gfx11<mimgopc op, string opcode,
#!if(BaseOpcode.HasD16, "$d16", "");
}

class MIMGAddrSize<int dw, bit enable_disasm> {
class MIMGAddrSize<int dw, bit enable_disasm, int AddrDW = dw> {
int NumWords = dw;

RegisterClass RegClass = !if(!le(NumWords, 0), ?,
!if(!eq(NumWords, 1), VGPR_32,
!if(!eq(NumWords, 2), VReg_64,
!if(!eq(NumWords, 3), VReg_96,
!if(!eq(NumWords, 4), VReg_128,
!if(!eq(NumWords, 5), VReg_160,
!if(!eq(NumWords, 6), VReg_192,
!if(!eq(NumWords, 7), VReg_224,
!if(!le(NumWords, 8), VReg_256,
!if(!le(NumWords, 9), VReg_288,
!if(!le(NumWords, 10), VReg_320,
!if(!le(NumWords, 11), VReg_352,
!if(!le(NumWords, 12), VReg_384,
!if(!le(NumWords, 16), VReg_512, ?))))))))))))));
RegisterClass RegClass = !if(!le(AddrDW, 0), ?,
!if(!eq(AddrDW, 1), VGPR_32,
!if(!eq(AddrDW, 2), VReg_64,
!if(!eq(AddrDW, 3), VReg_96,
!if(!eq(AddrDW, 4), VReg_128,
!if(!eq(AddrDW, 5), VReg_160,
!if(!eq(AddrDW, 6), VReg_192,
!if(!eq(AddrDW, 7), VReg_224,
!if(!eq(AddrDW, 8), VReg_256,
!if(!eq(AddrDW, 9), VReg_288,
!if(!eq(AddrDW, 10), VReg_320,
!if(!eq(AddrDW, 11), VReg_352,
!if(!eq(AddrDW, 12), VReg_384,
!if(!le(AddrDW, 16), VReg_512, ?))))))))))))));

// Whether the instruction variant with this vaddr size should be enabled for
// the auto-generated disassembler.
bit Disassemble = enable_disasm;
}

// Returns the MIMGAddrSize with the size of last VAddr for partial NSA
class LastVAddrSize <int dw, int max_idx, bit enable_disasm>
: MIMGAddrSize<dw, enable_disasm,
!if(!gt(dw, max_idx), !sub(dw, max_idx), 0)>;

// Return whether x is in lst.
class isIntInList<int x, list<int> lst> {
bit ret = !foldl(0, lst, lhs, y, !or(lhs, !eq(x, y)));
Expand All @@ -985,7 +1017,8 @@ class MIMGAddrSizes_dw_range<list<int> range> {
int Max = !if(!empty(!tail(range)), Min, !head(!tail(range)));
}

class MIMG_Sampler_AddrSizes<AMDGPUSampleVariant sample, bit isG16> {
class MIMG_Sampler_AddrSizes<AMDGPUSampleVariant sample, bit isG16,
int nsa_max_addr = 5> {
// List of all possible numbers of address words, taking all combinations of
// A16 and image dimension into account (note: no MSAA, since this is for
// sample/gather ops).
Expand Down Expand Up @@ -1031,6 +1064,21 @@ class MIMG_Sampler_AddrSizes<AMDGPUSampleVariant sample, bit isG16> {
!if(isIntInList<dw, AllNumAddrWords>.ret,
!listconcat(lhs, [MIMGAddrSize<dw, !empty(lhs)>]),
lhs))));

// In NSA format if there is a requirement for more VGPRs than the format
// supports, then the rest are sequential after the last one. Generate
// machine instructions for all possible number of words. The disassembler
// defaults to the largest number of arguments but no larger than max nsa
// size. List is generated with the register class needed for last vaddr since
// it is the only one that could have a register other than VGPR32.
int EnableDisasmNum = !foldl(!head(AllNumAddrWords), !tail(AllNumAddrWords),
acc, var, !if(!le(var, nsa_max_addr), var, acc));
list<LastVAddrSize> PartialNSAInstrs =
!foldl([]<LastVAddrSize>, [12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2], lhs, dw,
!if(isIntInList<dw, AllNumAddrWords>.ret,
!listconcat(lhs, [LastVAddrSize<dw, !sub(nsa_max_addr, 1),
!eq(dw, EnableDisasmNum)>]),
lhs));
}

multiclass MIMG_Sampler_Src_Helper <mimgopc op, string asm,
Expand Down Expand Up @@ -1066,9 +1114,14 @@ multiclass MIMG_Sampler_Src_Helper <mimgopc op, string asm,
: MIMG_Sampler_nsa_gfx10<op, asm, dst_rc, addr.NumWords,
!if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>;
}
if !and(op.HAS_GFX11, !le(addr.NumWords, 5)) then {
}
}

foreach addr = MIMG_Sampler_AddrSizes<sample, isG16, 5/*MaxNSASize*/>.PartialNSAInstrs in {
let VAddrDwords = addr.NumWords in {
if op.HAS_GFX11 then {
def _V # addr.NumWords # _nsa_gfx11
: MIMG_Sampler_nsa_gfx11<op, asm, dst_rc, addr.NumWords,
: MIMG_Sampler_nsa_gfx11<op, asm, dst_rc, addr.NumWords, addr.RegClass,
!if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>;
}
}
Expand Down
29 changes: 21 additions & 8 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6581,15 +6581,24 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
// SIShrinkInstructions will convert NSA encodings to non-NSA after register
// allocation when possible.
//
// TODO: we can actually allow partial NSA where the final register is a
// contiguous set of the remaining addresses.
// This could help where there are more addresses than supported.
bool UseNSA = ST->hasFeature(AMDGPU::FeatureNSAEncoding) &&
VAddrs.size() >= (unsigned)ST->getNSAThreshold(MF) &&
VAddrs.size() <= (unsigned)ST->getNSAMaxSize();
// Partial NSA is allowed on GFX11 where the final register is a contiguous
// set of the remaining addresses.
const unsigned NSAMaxSize = ST->getNSAMaxSize();
const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
const bool UseNSA = ST->hasNSAEncoding() &&
VAddrs.size() >= ST->getNSAThreshold(MF) &&
(VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
const bool UsePartialNSA =
UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;

SDValue VAddr;
if (!UseNSA)
if (UsePartialNSA) {
VAddr = getBuildDwordsVector(DAG, DL,
ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
}
else if (!UseNSA) {
VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
}

SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
Expand Down Expand Up @@ -6657,7 +6666,11 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
SmallVector<SDValue, 26> Ops;
if (BaseOpcode->Store || BaseOpcode->Atomic)
Ops.push_back(VData); // vdata
if (UseNSA)
if (UsePartialNSA) {
append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
Ops.push_back(VAddr);
}
else if (UseNSA)
append_range(Ops, VAddrs);
else
Ops.push_back(VAddr);
Expand Down
7 changes: 5 additions & 2 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4636,9 +4636,12 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
unsigned VAddrWords;
if (IsNSA) {
VAddrWords = SRsrcIdx - VAddr0Idx;
if (ST.hasPartialNSAEncoding() && AddrWords > ST.getNSAMaxSize()) {
unsigned LastVAddrIdx = SRsrcIdx - 1;
VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
}
} else {
const TargetRegisterClass *RC = getOpRegClass(MI, VAddr0Idx);
VAddrWords = MRI.getTargetRegisterInfo()->getRegSizeInBits(*RC) / 32;
VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
if (AddrWords > 12)
AddrWords = 16;
}
Expand Down
9 changes: 6 additions & 3 deletions llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -308,7 +308,10 @@ void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const {
unsigned NextVgpr = 0;
bool IsUndef = true;
bool IsKill = NewAddrDwords == Info->VAddrDwords;
for (unsigned Idx = 0; Idx < Info->VAddrOperands; ++Idx) {
const unsigned NSAMaxSize = ST->getNSAMaxSize();
const bool IsPartialNSA = NewAddrDwords > NSAMaxSize;
const unsigned EndVAddr = IsPartialNSA ? NSAMaxSize : Info->VAddrOperands;
for (unsigned Idx = 0; Idx < EndVAddr; ++Idx) {
const MachineOperand &Op = MI.getOperand(VAddr0Idx + Idx);
unsigned Vgpr = TRI->getHWRegIndex(Op.getReg());
unsigned Dwords = TRI->getRegSizeInBits(Op.getReg(), *MRI) / 32;
Expand Down Expand Up @@ -361,13 +364,13 @@ void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const {
MI.getOperand(VAddr0Idx).setIsUndef(IsUndef);
MI.getOperand(VAddr0Idx).setIsKill(IsKill);

for (int i = 1; i < Info->VAddrOperands; ++i)
for (unsigned i = 1; i < EndVAddr; ++i)
MI.removeOperand(VAddr0Idx + 1);

if (ToUntie >= 0) {
MI.tieOperands(
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata),
ToUntie - (Info->VAddrOperands - 1));
ToUntie - (EndVAddr - 1));
}
}

Expand Down
9 changes: 9 additions & 0 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1945,6 +1945,15 @@ bool hasPackedD16(const MCSubtargetInfo &STI) {
!isSI(STI);
}

unsigned getNSAMaxSize(const MCSubtargetInfo &STI) {
auto Version = getIsaVersion(STI.getCPU());
if (Version.Major == 10)
return Version.Minor >= 3 ? 13 : 5;
if (Version.Major == 11)
return 5;
return 0;
}

bool isSI(const MCSubtargetInfo &STI) {
return STI.hasFeature(AMDGPU::FeatureSouthernIslands);
}
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -1118,6 +1118,7 @@ bool hasMIMG_R128(const MCSubtargetInfo &STI);
bool hasA16(const MCSubtargetInfo &STI);
bool hasG16(const MCSubtargetInfo &STI);
bool hasPackedD16(const MCSubtargetInfo &STI);
unsigned getNSAMaxSize(const MCSubtargetInfo &STI);

bool isSI(const MCSubtargetInfo &STI);
bool isCI(const MCSubtargetInfo &STI);
Expand Down
1,717 changes: 1,716 additions & 1 deletion llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,8 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7
; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr8
; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<9 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[BUILD_VECTOR2]](<9 x s32>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 7)
; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[BUILD_VECTOR2]](<5 x s32>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 7)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
Expand Down Expand Up @@ -143,8 +143,8 @@ define amdgpu_ps <4 x float> @sample_c_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7
; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr8
; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr9
; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<10 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.3d), 15, [[BUILD_VECTOR2]](<10 x s32>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 7)
; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<6 x s32>) = G_BUILD_VECTOR [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.3d), 15, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[BUILD_VECTOR2]](<6 x s32>), $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 7)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
Expand Down Expand Up @@ -223,8 +223,8 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_3d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr8
; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr9
; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr10
; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<11 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.3d), 15, [[BUILD_VECTOR2]](<11 x s32>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 7)
; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<7 x s32>) = G_BUILD_VECTOR [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.3d), 15, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[BUILD_VECTOR2]](<7 x s32>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 7)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
Expand Down Expand Up @@ -305,8 +305,8 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_o_3d(<8 x i32> inreg %rsrc, <4 x i32
; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr9
; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr10
; GFX11-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr11
; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<12 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.o.3d), 15, [[BUILD_VECTOR2]](<12 x s32>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 7)
; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.o.3d), 15, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[BUILD_VECTOR2]](<8 x s32>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 7)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -259,8 +259,8 @@ define amdgpu_ps <4 x float> @sample_d_3d_g16_a16(<8 x i32> inreg %rsrc, <4 x i3
; GFX11-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC5]](s16), [[DEF]](s16)
; GFX11-NEXT: [[BUILD_VECTOR6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[TRUNC7]](s16)
; GFX11-NEXT: [[BUILD_VECTOR7:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC8]](s16), [[DEF]](s16)
; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), [[BUILD_VECTOR6]](<2 x s16>), [[BUILD_VECTOR7]](<2 x s16>)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 7)
; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR6]](<2 x s16>), [[BUILD_VECTOR7]](<2 x s16>)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 7)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -251,8 +251,8 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
; GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY19]](s32)
; GFX11-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY20]](s32)
; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<14 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[CONCAT_VECTORS]](<14 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 7)
; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 7)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
Expand Down Expand Up @@ -759,8 +759,8 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32)
; GFX11-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
; GFX11-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY19]](s32)
; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.2d), 15, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 7)
; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 7)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
Expand Down Expand Up @@ -1425,8 +1425,8 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32)
; GFX11-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
; GFX11-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY19]](s32)
; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.2d), 15, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 7)
; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 7)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
Expand Down Expand Up @@ -1519,8 +1519,8 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
; GFX11-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
; GFX11-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY19]](s32)
; GFX11-NEXT: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY20]](s32)
; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<14 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 4, [[CONCAT_VECTORS]](<14 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (s32), addrspace 7)
; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 4, [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (s32), addrspace 7)
; GFX11-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
main_body:
Expand Down Expand Up @@ -1611,8 +1611,8 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
; GFX11-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
; GFX11-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY19]](s32)
; GFX11-NEXT: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY20]](s32)
; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<14 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 6, [[CONCAT_VECTORS]](<14 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<2 x s32>), addrspace 7)
; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 6, [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<2 x s32>), addrspace 7)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,13 +1,19 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s

define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) {
; GFX10-LABEL: sample_d_1d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: sample_d_1d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
Expand All @@ -21,6 +27,14 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: sample_d_2d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
; GFX11-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
Expand All @@ -36,6 +50,14 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX10-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: sample_d_3d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX11-NEXT: v_perm_b32 v1, v4, v3, 0x5040100
; GFX11-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v1, v5, v[6:8]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
Expand All @@ -47,6 +69,12 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10-NEXT: image_sample_c_d_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: sample_c_d_1d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_sample_c_d_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
Expand All @@ -60,6 +88,14 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: sample_c_d_2d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
; GFX11-NEXT: v_perm_b32 v2, v4, v3, 0x5040100
; GFX11-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
Expand All @@ -71,6 +107,12 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: sample_d_cl_1d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
Expand All @@ -84,6 +126,14 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: sample_d_cl_2d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
; GFX11-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
Expand All @@ -95,6 +145,12 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: sample_c_d_cl_1d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_sample_c_d_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
Expand All @@ -111,6 +167,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: sample_c_d_cl_2d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
; GFX11-NEXT: v_perm_b32 v2, v4, v3, 0x5040100
; GFX11-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v2, v5, v[6:7]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
Expand All @@ -129,6 +193,14 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: sample_c_d_o_2darray_V1:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_perm_b32 v2, v3, v2, 0x5040100
; GFX11-NEXT: v_perm_b32 v3, v5, v4, 0x5040100
; GFX11-NEXT: image_sample_c_d_o_g16 v0, [v0, v1, v2, v3, v[6:8]], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
main_body:
%v = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f16.f32.f32(i32 4, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret float %v
Expand All @@ -147,6 +219,14 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: sample_c_d_o_2darray_V2:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_perm_b32 v2, v3, v2, 0x5040100
; GFX11-NEXT: v_perm_b32 v3, v5, v4, 0x5040100
; GFX11-NEXT: image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v3, v[6:8]], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
main_body:
%v = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32 6, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <2 x float> %v
Expand Down
26 changes: 12 additions & 14 deletions llvm/test/CodeGen/AMDGPU/cluster_stores.ll
Original file line number Diff line number Diff line change
Expand Up @@ -448,22 +448,20 @@ define amdgpu_ps void @cluster_image_sample(<8 x i32> inreg %src, <4 x i32> inre
;
; GFX11-LABEL: cluster_image_sample:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: v_cvt_f32_i32_e32 v9, v1
; GFX11-NEXT: v_cvt_f32_i32_e32 v8, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_add_f32 v3, 1.0, v9
; GFX11-NEXT: v_dual_mov_b32 v10, 1.0 :: v_dual_mov_b32 v7, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_dual_add_f32 v2, 1.0, v8 :: v_dual_mov_b32 v5, v4
; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_add_f32 v9, 2.0, v9
; GFX11-NEXT: v_dual_add_f32 v8, 2.0, v8 :: v_dual_mov_b32 v11, v10
; GFX11-NEXT: v_mov_b32_e32 v12, v10
; GFX11-NEXT: v_mov_b32_e32 v13, v10
; GFX11-NEXT: v_cvt_f32_i32_e32 v4, v0
; GFX11-NEXT: v_cvt_f32_i32_e32 v5, v1
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_dual_mov_b32 v6, 1.0 :: v_dual_add_f32 v11, 2.0, v5
; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v5 :: v_dual_add_f32 v8, 1.0, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_add_f32 v10, 2.0, v4
; GFX11-NEXT: v_mov_b32_e32 v7, v6
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: image_sample_d v[2:5], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample_d v[6:9], v[8:13], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample_d v[2:5], [v8, v9, v2, v2, v[2:3]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample_d v[6:9], [v10, v11, v6, v6, v[6:7]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_add_f32 v4, v4, v8 :: v_dual_add_f32 v5, v5, v9
; GFX11-NEXT: v_dual_add_f32 v5, v5, v9 :: v_dual_add_f32 v4, v4, v8
; GFX11-NEXT: v_dual_add_f32 v3, v3, v7 :: v_dual_add_f32 v2, v2, v6
; GFX11-NEXT: image_store v[2:5], v[0:1], s[12:19] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ main_body:
; GCN-LABEL: {{^}}sample_d_3d:
; GFX1010-NSA: image_sample_d v[0:3], v[7:15],
; GFX1030-NSA: image_sample_d v[0:3], [v3, v8, v7, v5, v4, v6, v0, v2, v1],
; GFX11-NSA: image_sample_d v[0:3], v[7:15],
; GFX11-NSA: image_sample_d v[0:3], [v3, v8, v7, v5, v[9:13]],
define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %r, float %t, float %dsdh, float %dtdv, float %dsdv, float %drdv, float %drdh, float %dtdh) {
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f32(i32 15, float %dsdh, float %dtdh, float %drdh, float %dsdv, float %dtdv, float %drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
Expand Down
341 changes: 340 additions & 1 deletion llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll

Large diffs are not rendered by default.

19 changes: 9 additions & 10 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1631,12 +1631,12 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1_tfe(<8 x i32> inreg %rsrc, <4 x
; GFX11-LABEL: sample_c_d_o_2darray_V1_tfe:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_mov_b32_e32 v11, 0
; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v10, v0
; GFX11-NEXT: v_mov_b32_e32 v12, v11
; GFX11-NEXT: v_dual_mov_b32 v9, v11 :: v_dual_mov_b32 v10, v12
; GFX11-NEXT: image_sample_c_d_o v[9:10], v[0:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY tfe
; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
; GFX11-NEXT: image_sample_c_d_o v[0:1], [v10, v9, v2, v3, v[4:8]], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, v9
; GFX11-NEXT: global_store_b32 v11, v10, s[12:13]
; GFX11-NEXT: global_store_b32 v11, v1, s[12:13]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: ; return to shader part epilog
main_body:
Expand Down Expand Up @@ -1709,13 +1709,12 @@ define amdgpu_ps <4 x float> @sample_c_d_o_2darray_V2_tfe(<8 x i32> inreg %rsrc,
;
; GFX11-LABEL: sample_c_d_o_2darray_V2_tfe:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_mov_b32_e32 v9, 0
; GFX11-NEXT: v_mov_b32_e32 v10, v9
; GFX11-NEXT: v_mov_b32_e32 v11, v9
; GFX11-NEXT: image_sample_c_d_o v[9:11], v[0:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY tfe
; GFX11-NEXT: v_dual_mov_b32 v11, v0 :: v_dual_mov_b32 v0, 0
; GFX11-NEXT: v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v10, v1
; GFX11-NEXT: v_mov_b32_e32 v1, v0
; GFX11-NEXT: v_mov_b32_e32 v2, v0
; GFX11-NEXT: image_sample_c_d_o v[0:2], [v11, v10, v9, v3, v[4:8]], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v2, v11
; GFX11-NEXT: v_dual_mov_b32 v0, v9 :: v_dual_mov_b32 v1, v10
; GFX11-NEXT: ; return to shader part epilog
main_body:
%v = call {<2 x float>, i32} @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32i32.f32.f32(i32 6, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
Expand Down
40 changes: 12 additions & 28 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,9 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
;
; GFX11-LABEL: sample_d_3d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_mov_b32_e32 v9, v3 ; encoding: [0x03,0x03,0x12,0x7e]
; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; encoding: [0x02,0x03,0x06,0x7e]
; GFX11-NEXT: v_perm_b32 v2, v1, v0, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; encoding: [0x03,0x00,0x87,0xbf]
; GFX11-NEXT: v_perm_b32 v4, v4, v9, 0x5040100 ; encoding: [0x04,0x00,0x44,0xd6,0x04,0x13,0xfe,0x03,0x00,0x01,0x04,0x05]
; GFX11-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x0f,0xe4,0xf0,0x02,0x00,0x00,0x08]
; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 ; encoding: [0x03,0x00,0x44,0xd6,0x04,0x07,0xfe,0x03,0x00,0x01,0x04,0x05]
; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05]
; GFX11-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v3, v5, v[6:8]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; encoding: [0x09,0x0f,0xe4,0xf0,0x00,0x00,0x00,0x08,0x02,0x03,0x05,0x06]
; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
; GFX11-NEXT: ; return to shader part epilog
main_body:
Expand Down Expand Up @@ -172,12 +169,9 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
;
; GFX11-LABEL: sample_c_d_cl_2d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_mov_b32_e32 v8, v2 ; encoding: [0x02,0x03,0x10,0x7e]
; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
; GFX11-NEXT: v_perm_b32 v4, v4, v3, 0x5040100 ; encoding: [0x04,0x00,0x44,0xd6,0x04,0x07,0xfe,0x03,0x00,0x01,0x04,0x05]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; encoding: [0x03,0x00,0x87,0xbf]
; GFX11-NEXT: v_perm_b32 v3, v8, v1, 0x5040100 ; encoding: [0x03,0x00,0x44,0xd6,0x08,0x03,0xfe,0x03,0x00,0x01,0x04,0x05]
; GFX11-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x04,0x0f,0x50,0xf1,0x02,0x00,0x00,0x08]
; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 ; encoding: [0x03,0x00,0x44,0xd6,0x04,0x07,0xfe,0x03,0x00,0x01,0x04,0x05]
; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; encoding: [0x01,0x00,0x44,0xd6,0x02,0x03,0xfe,0x03,0x00,0x01,0x04,0x05]
; GFX11-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v3, v5, v[6:7]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x05,0x0f,0x50,0xf1,0x00,0x00,0x00,0x08,0x01,0x03,0x05,0x06]
; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
; GFX11-NEXT: ; return to shader part epilog
main_body:
Expand All @@ -200,14 +194,9 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
;
; GFX11-LABEL: sample_c_d_o_2darray_V1:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_mov_b32_e32 v9, v3 ; encoding: [0x03,0x03,0x12,0x7e]
; GFX11-NEXT: v_mov_b32_e32 v10, v2 ; encoding: [0x02,0x03,0x14,0x7e]
; GFX11-NEXT: v_mov_b32_e32 v3, v1 ; encoding: [0x01,0x03,0x06,0x7e]
; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x5040100 ; encoding: [0x05,0x00,0x44,0xd6,0x05,0x09,0xfe,0x03,0x00,0x01,0x04,0x05]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; encoding: [0x04,0x00,0x87,0xbf]
; GFX11-NEXT: v_perm_b32 v4, v9, v10, 0x5040100 ; encoding: [0x04,0x00,0x44,0xd6,0x09,0x15,0xfe,0x03,0x00,0x01,0x04,0x05]
; GFX11-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x14,0x04,0xf0,0xf0,0x02,0x00,0x00,0x08]
; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x5040100 ; encoding: [0x04,0x00,0x44,0xd6,0x05,0x09,0xfe,0x03,0x00,0x01,0x04,0x05]
; GFX11-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd6,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05]
; GFX11-NEXT: image_sample_c_d_o_g16 v0, [v0, v1, v2, v4, v[6:8]], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x15,0x04,0xf0,0xf0,0x00,0x00,0x00,0x08,0x01,0x02,0x04,0x06]
; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
; GFX11-NEXT: ; return to shader part epilog
main_body:
Expand All @@ -230,14 +219,9 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
;
; GFX11-LABEL: sample_c_d_o_2darray_V2:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_mov_b32_e32 v9, v3 ; encoding: [0x03,0x03,0x12,0x7e]
; GFX11-NEXT: v_mov_b32_e32 v10, v2 ; encoding: [0x02,0x03,0x14,0x7e]
; GFX11-NEXT: v_mov_b32_e32 v3, v1 ; encoding: [0x01,0x03,0x06,0x7e]
; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x5040100 ; encoding: [0x05,0x00,0x44,0xd6,0x05,0x09,0xfe,0x03,0x00,0x01,0x04,0x05]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; encoding: [0x04,0x00,0x87,0xbf]
; GFX11-NEXT: v_perm_b32 v4, v9, v10, 0x5040100 ; encoding: [0x04,0x00,0x44,0xd6,0x09,0x15,0xfe,0x03,0x00,0x01,0x04,0x05]
; GFX11-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x14,0x06,0xf0,0xf0,0x02,0x00,0x00,0x08]
; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x5040100 ; encoding: [0x04,0x00,0x44,0xd6,0x05,0x09,0xfe,0x03,0x00,0x01,0x04,0x05]
; GFX11-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd6,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05]
; GFX11-NEXT: image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v4, v[6:8]], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x15,0x06,0xf0,0xf0,0x00,0x00,0x00,0x08,0x01,0x02,0x04,0x06]
; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
; GFX11-NEXT: ; return to shader part epilog
main_body:
Expand Down
82 changes: 81 additions & 1 deletion llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll
Original file line number Diff line number Diff line change
@@ -1,13 +1,19 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s

define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) {
; GFX10-LABEL: sample_d_1d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: sample_d_1d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
Expand All @@ -21,6 +27,14 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: sample_d_2d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_perm_b32 v2, v3, v2, 0x5040100
; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX11-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
Expand All @@ -36,6 +50,14 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX10-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: sample_d_3d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x5040100
; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX11-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v3, v5, v[6:8]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
Expand All @@ -47,6 +69,12 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10-NEXT: image_sample_c_d_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: sample_c_d_1d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_sample_c_d_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
Expand All @@ -60,6 +88,14 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: sample_c_d_2d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x5040100
; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
; GFX11-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
Expand All @@ -71,6 +107,12 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: sample_d_cl_1d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
Expand All @@ -84,6 +126,14 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: sample_d_cl_2d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_perm_b32 v2, v3, v2, 0x5040100
; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX11-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
Expand All @@ -95,6 +145,12 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: sample_c_d_cl_1d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_sample_c_d_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
Expand All @@ -110,6 +166,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: sample_c_d_cl_2d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x5040100
; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
; GFX11-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v3, v5, v[6:7]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
Expand All @@ -127,6 +191,14 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: sample_c_d_o_2darray_V1:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x5040100
; GFX11-NEXT: v_perm_b32 v2, v3, v2, 0x5040100
; GFX11-NEXT: image_sample_c_d_o_g16 v0, [v0, v1, v2, v4, v[6:8]], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
main_body:
%v = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f16.f32.f32(i32 4, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret float %v
Expand All @@ -144,6 +216,14 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: sample_c_d_o_2darray_V2:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x5040100
; GFX11-NEXT: v_perm_b32 v2, v3, v2, 0x5040100
; GFX11-NEXT: image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v4, v[6:8]], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
main_body:
%v = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32 6, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <2 x float> %v
Expand Down
28 changes: 28 additions & 0 deletions llvm/test/CodeGen/AMDGPU/verify-image-partial-nsa.mir
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# RUN: not --crash llc -march=amdgcn -mcpu=gfx11 -run-pass=machineverifier -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX11-ERR %s

---
name: image_sample_d_v1v9_nsa_partial
body: |
bb.0:
liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9
; GFX11-ERR: *** Bad machine code: Operand has incorrect register class. ***
; GFX11-ERR: - instruction: renamable $vgpr10 = IMAGE_SAMPLE_D_V1_V9_nsa_gfx11 renamable $vgpr1, renamable $vgpr0, renamable $vgpr2, renamable $vgpr3, renamable $vgpr4_vgpr5_vgpr6_vgpr7, renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 1, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
; GFX11-ERR: *** Bad machine code: Illegal physical register for instruction ***
; GFX11-ERR: - instruction: renamable $vgpr10 = IMAGE_SAMPLE_D_V1_V9_nsa_gfx11 renamable $vgpr1, renamable $vgpr0, renamable $vgpr2, renamable $vgpr3, renamable $vgpr4_vgpr5_vgpr6_vgpr7, renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 1, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
; GFX11-ERR: - operand 5: renamable $vgpr4_vgpr5_vgpr6_vgpr7
; GFX11-ERR: $vgpr4_vgpr5_vgpr6_vgpr7 is not a VReg_160 register.
renamable $vgpr10 = IMAGE_SAMPLE_D_V1_V9_nsa_gfx11 renamable $vgpr1, renamable $vgpr0, renamable $vgpr2, renamable $vgpr3, renamable $vgpr4_vgpr5_vgpr6_vgpr7, renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 1, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
; GFX11-ERR: *** Bad machine code: Operand has incorrect register class. ***
; GFX11-ERR: - instruction: renamable $vgpr11 = IMAGE_SAMPLE_D_V1_V9_nsa_gfx11 renamable $vgpr1, renamable $vgpr0, renamable $vgpr2, renamable $vgpr3, renamable $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9, renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 1, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
; GFX11-ERR: *** Bad machine code: Illegal physical register for instruction ***
; GFX11-ERR: - instruction: renamable $vgpr11 = IMAGE_SAMPLE_D_V1_V9_nsa_gfx11 renamable $vgpr1, renamable $vgpr0, renamable $vgpr2, renamable $vgpr3, renamable $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9, renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 1, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
; GFX11-ERR: - operand 5: renamable $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
; GFX11-ERR: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 is not a VReg_160 register.
renamable $vgpr11 = IMAGE_SAMPLE_D_V1_V9_nsa_gfx11 renamable $vgpr1, renamable $vgpr0, renamable $vgpr2, renamable $vgpr3, renamable $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9, renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 1, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
...

# GFX11-ERR-NOT: *** Bad machine code
36 changes: 36 additions & 0 deletions llvm/test/MC/AMDGPU/gfx11_asm_mimg_features.s
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,42 @@ image_sample_lz_o v[64:66], [v32, v16, v0, v2], s[4:11], s[100:103] dmask:0x7 di
image_sample_c_lz_o v[64:66], [v32, v0, v16], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_1D
// GFX11: image_sample_c_lz_o v[64:66], [v32, v0, v16], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x07,0xb8,0xf0,0x20,0x40,0x01,0x64,0x00,0x10,0x00,0x00]

image_sample_d v64, [v32, v16, v8, v0, v[36:37]], s[4:11], s[4:7] dmask:0x8 dim:SQ_RSRC_IMG_2D
// GFX11: image_sample_d v64, [v32, v16, v8, v0, v[36:37]], s[4:11], s[4:7] dmask:0x8 dim:SQ_RSRC_IMG_2D ; encoding: [0x05,0x08,0x70,0xf0,0x20,0x40,0x01,0x04,0x10,0x08,0x00,0x24]

image_sample_d v[64:65], [v32, v16, v8, v0, v[36:38]], s[4:11], s[4:7] dmask:0xc dim:SQ_RSRC_IMG_CUBE
// GFX11: image_sample_d v[64:65], [v32, v16, v8, v0, v[36:38]], s[4:11], s[4:7] dmask:0xc dim:SQ_RSRC_IMG_CUBE ; encoding: [0x0d,0x0c,0x70,0xf0,0x20,0x40,0x01,0x04,0x10,0x08,0x00,0x24]

image_sample_d_o v[64:65], [v32, v16, v8, v0, v[36:39]], s[4:11], s[4:7] dmask:0xc dim:SQ_RSRC_IMG_CUBE
// GFX11: image_sample_d_o v[64:65], [v32, v16, v8, v0, v[36:39]], s[4:11], s[4:7] dmask:0xc dim:SQ_RSRC_IMG_CUBE ; encoding: [0x0d,0x0c,0x98,0xf0,0x20,0x40,0x01,0x04,0x10,0x08,0x00,0x24]

image_sample_d v[64:65], [v32, v16, v8, v0, v[36:40]], s[4:11], s[4:7] dmask:0x3 dim:SQ_RSRC_IMG_3D
// GFX11: image_sample_d v[64:65], [v32, v16, v8, v0, v[36:40]], s[4:11], s[4:7] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x09,0x03,0x70,0xf0,0x20,0x40,0x01,0x04,0x10,0x08,0x00,0x24]

image_sample_d_o v[64:65], [v32, v16, v8, v0, v[36:41]], s[4:11], s[4:7] dmask:0x3 dim:SQ_RSRC_IMG_3D
// GFX11: image_sample_d_o v[64:65], [v32, v16, v8, v0, v[36:41]], s[4:11], s[4:7] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x09,0x03,0x98,0xf0,0x20,0x40,0x01,0x04,0x10,0x08,0x00,0x24]

image_sample_c_d_o v[64:65], [v32, v16, v8, v0, v[36:42]], s[4:11], s[4:7] dmask:0x3 dim:SQ_RSRC_IMG_3D
// GFX11: image_sample_c_d_o v[64:65], [v32, v16, v8, v0, v[36:42]], s[4:11], s[4:7] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x09,0x03,0xac,0xf0,0x20,0x40,0x01,0x04,0x10,0x08,0x00,0x24]

image_sample_c_d_cl_o v[64:65], [v32, v16, v8, v0, v[36:43]], s[4:11], s[4:7] dmask:0x3 dim:SQ_RSRC_IMG_3D
// GFX11: image_sample_c_d_cl_o v[64:65], [v32, v16, v8, v0, v[36:43]], s[4:11], s[4:7] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x09,0x03,0x28,0xf1,0x20,0x40,0x01,0x04,0x10,0x08,0x00,0x24]

image_sample_d_o_g16 v[64:66], [v32, v16, v8, v0, v36], s[4:11], s[4:7] dmask:0xb dim:SQ_RSRC_IMG_1D_ARRAY
// GFX11: image_sample_d_o_g16 v[64:66], [v32, v16, v8, v0, v36], s[4:11], s[4:7] dmask:0xb dim:SQ_RSRC_IMG_1D_ARRAY ; encoding: [0x11,0x0b,0xec,0xf0,0x20,0x40,0x01,0x04,0x10,0x08,0x00,0x24]

image_sample_d_o_g16 v[64:65], [v32, v16, v8, v0, v[36:37]], s[4:11], s[4:7] dmask:0xc dim:SQ_RSRC_IMG_CUBE
// GFX11: image_sample_d_o_g16 v[64:65], [v32, v16, v8, v0, v[36:37]], s[4:11], s[4:7] dmask:0xc dim:SQ_RSRC_IMG_CUBE ; encoding: [0x0d,0x0c,0xec,0xf0,0x20,0x40,0x01,0x04,0x10,0x08,0x00,0x24]

image_sample_c_d_o_g16 v[64:65], [v32, v16, v8, v0, v[36:38]], s[4:11], s[4:7] dmask:0xc dim:SQ_RSRC_IMG_CUBE
// GFX11: image_sample_c_d_o_g16 v[64:65], [v32, v16, v8, v0, v[36:38]], s[4:11], s[4:7] dmask:0xc dim:SQ_RSRC_IMG_CUBE ; encoding: [0x0d,0x0c,0xf0,0xf0,0x20,0x40,0x01,0x04,0x10,0x08,0x00,0x24]

image_sample_d_o_g16 v[64:65], [v32, v16, v8, v0, v[36:39]], s[4:11], s[4:7] dmask:0x3 dim:SQ_RSRC_IMG_3D
// GFX11: image_sample_d_o_g16 v[64:65], [v32, v16, v8, v0, v[36:39]], s[4:11], s[4:7] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x09,0x03,0xec,0xf0,0x20,0x40,0x01,0x04,0x10,0x08,0x00,0x24]

image_sample_c_d_o_g16 v[64:65], [v32, v16, v8, v0, v[36:40]], s[4:11], s[4:7] dmask:0x3 dim:SQ_RSRC_IMG_3D
// GFX11: image_sample_c_d_o_g16 v[64:65], [v32, v16, v8, v0, v[36:40]], s[4:11], s[4:7] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x09,0x03,0xf0,0xf0,0x20,0x40,0x01,0x04,0x10,0x08,0x00,0x24]

image_gather4 v[64:67], v32, s[4:11], s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D
// GFX11: image_gather4 v[64:67], v32, s[4:11], s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x01,0xbc,0xf0,0x20,0x40,0x01,0x64]

Expand Down
36 changes: 36 additions & 0 deletions llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_mimg_features.txt
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,42 @@
# GFX11: image_sample_c_lz_o v[64:66], [v32, v0, v16], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x07,0xb8,0xf0,0x20,0x40,0x01,0x64,0x00,0x10,0x00,0x00]
0x01,0x07,0xb8,0xf0,0x20,0x40,0x01,0x64,0x00,0x10,0x00,0x00

# GFX11: image_sample_d v64, [v32, v16, v8, v0, v[36:37]], s[4:11], s[4:7] dmask:0x8 dim:SQ_RSRC_IMG_2D ; encoding: [0x05,0x08,0x70,0xf0,0x20,0x40,0x01,0x04,0x10,0x08,0x00,0x24]
0x05,0x08,0x70,0xf0,0x20,0x40,0x01,0x04,0x10,0x08,0x00,0x24

# GFX11: image_sample_d v[64:65], [v32, v16, v8, v0, v[36:38]], s[4:11], s[4:7] dmask:0xc dim:SQ_RSRC_IMG_CUBE ; encoding: [0x0d,0x0c,0x70,0xf0,0x20,0x40,0x01,0x04,0x10,0x08,0x00,0x24]
0x0d,0x0c,0x70,0xf0,0x20,0x40,0x01,0x04,0x10,0x08,0x00,0x24

# GFX11: image_sample_d_o v[64:65], [v32, v16, v8, v0, v[36:39]], s[4:11], s[4:7] dmask:0xc dim:SQ_RSRC_IMG_CUBE ; encoding: [0x0d,0x0c,0x98,0xf0,0x20,0x40,0x01,0x04,0x10,0x08,0x00,0x24]
0x0d,0x0c,0x98,0xf0,0x20,0x40,0x01,0x04,0x10,0x08,0x00,0x24

# GFX11: image_sample_d v[64:65], [v32, v16, v8, v0, v[36:40]], s[4:11], s[4:7] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x09,0x03,0x70,0xf0,0x20,0x40,0x01,0x04,0x10,0x08,0x00,0x24]
0x09,0x03,0x70,0xf0,0x20,0x40,0x01,0x04,0x10,0x08,0x00,0x24

# GFX11: image_sample_d_o v[64:65], [v32, v16, v8, v0, v[36:41]], s[4:11], s[4:7] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x09,0x03,0x98,0xf0,0x20,0x40,0x01,0x04,0x10,0x08,0x00,0x24]
0x09,0x03,0x98,0xf0,0x20,0x40,0x01,0x04,0x10,0x08,0x00,0x24

# GFX11: image_sample_c_d_o v[64:65], [v32, v16, v8, v0, v[36:42]], s[4:11], s[4:7] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x09,0x03,0xac,0xf0,0x20,0x40,0x01,0x04,0x10,0x08,0x00,0x24]
0x09,0x03,0xac,0xf0,0x20,0x40,0x01,0x04,0x10,0x08,0x00,0x24

# GFX11: image_sample_c_d_cl_o v[64:65], [v32, v16, v8, v0, v[36:43]], s[4:11], s[4:7] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x09,0x03,0x28,0xf1,0x20,0x40,0x01,0x04,0x10,0x08,0x00,0x24]
0x09,0x03,0x28,0xf1,0x20,0x40,0x01,0x04,0x10,0x08,0x00,0x24

# GFX11: image_sample_d_o_g16 v[64:66], [v32, v16, v8, v0, v36], s[4:11], s[4:7] dmask:0xb dim:SQ_RSRC_IMG_1D_ARRAY ; encoding: [0x11,0x0b,0xec,0xf0,0x20,0x40,0x01,0x04,0x10,0x08,0x00,0x24]
0x11,0x0b,0xec,0xf0,0x20,0x40,0x01,0x04,0x10,0x08,0x00,0x24

# GFX11: image_sample_d_o_g16 v[64:65], [v32, v16, v8, v0, v[36:37]], s[4:11], s[4:7] dmask:0xc dim:SQ_RSRC_IMG_CUBE ; encoding: [0x0d,0x0c,0xec,0xf0,0x20,0x40,0x01,0x04,0x10,0x08,0x00,0x24]
0x0d,0x0c,0xec,0xf0,0x20,0x40,0x01,0x04,0x10,0x08,0x00,0x24

# GFX11: image_sample_c_d_o_g16 v[64:65], [v32, v16, v8, v0, v[36:38]], s[4:11], s[4:7] dmask:0xc dim:SQ_RSRC_IMG_CUBE ; encoding: [0x0d,0x0c,0xf0,0xf0,0x20,0x40,0x01,0x04,0x10,0x08,0x00,0x24]
0x0d,0x0c,0xf0,0xf0,0x20,0x40,0x01,0x04,0x10,0x08,0x00,0x24

# GFX11: image_sample_d_o_g16 v[64:65], [v32, v16, v8, v0, v[36:39]], s[4:11], s[4:7] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x09,0x03,0xec,0xf0,0x20,0x40,0x01,0x04,0x10,0x08,0x00,0x24]
0x09,0x03,0xec,0xf0,0x20,0x40,0x01,0x04,0x10,0x08,0x00,0x24

# GFX11: image_sample_c_d_o_g16 v[64:65], [v32, v16, v8, v0, v[36:40]], s[4:11], s[4:7] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x09,0x03,0xf0,0xf0,0x20,0x40,0x01,0x04,0x10,0x08,0x00,0x24]
0x09,0x03,0xf0,0xf0,0x20,0x40,0x01,0x04,0x10,0x08,0x00,0x24

# GFX11: image_gather4 v[64:67], v32, s[4:11], s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x01,0xbc,0xf0,0x20,0x40,0x01,0x64]
0x00,0x01,0xbc,0xf0,0x20,0x40,0x01,0x64

Expand Down