Skip to content

Commit

Permalink
[AMDGPU] Implemented dwordx3 variants of buffer/tbuffer load/store in…
Browse files Browse the repository at this point in the history
…trinsics

Now we have vec3 MVTs, this commit implements dwordx3 variants of the
buffer intrinsics.

On gfx6, a dwordx3 buffer load intrinsic is implemented as a dwordx4
instruction, and a dwordx3 buffer store intrinsic is not supported.
We need to support the dwordx3 load intrinsic because it is generated by
subtarget-unaware code in InstCombine.

Differential Revision: https://reviews.llvm.org/D58904

Change-Id: I016729d8557b98a52f529638ae97c340a5922a4e
llvm-svn: 356755
  • Loading branch information
Tim Renouf committed Mar 22, 2019
1 parent f95351b commit 677387d
Show file tree
Hide file tree
Showing 11 changed files with 259 additions and 27 deletions.
1 change: 0 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Expand Up @@ -4295,7 +4295,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(STORE_MSKOR)
NODE_NAME_CASE(LOAD_CONSTANT)
NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
NODE_NAME_CASE(TBUFFER_STORE_FORMAT_X3)
NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
Expand Down
1 change: 0 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
Expand Up @@ -494,7 +494,6 @@ enum NodeType : unsigned {
STORE_MSKOR,
LOAD_CONSTANT,
TBUFFER_STORE_FORMAT,
TBUFFER_STORE_FORMAT_X3,
TBUFFER_STORE_FORMAT_D16,
TBUFFER_LOAD_FORMAT,
TBUFFER_LOAD_FORMAT_D16,
Expand Down
18 changes: 14 additions & 4 deletions llvm/lib/Target/AMDGPU/BUFInstructions.td
Expand Up @@ -1011,11 +1011,11 @@ def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1",

defm TBUFFER_LOAD_FORMAT_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_x", VGPR_32>;
defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_xy", VReg_64>;
defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyz", VReg_128>;
defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyz", VReg_96>;
defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyzw", VReg_128>;
defm TBUFFER_STORE_FORMAT_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_x", VGPR_32>;
defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy", VReg_64>;
defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_128>;
defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_96>;
defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128>;

let SubtargetPredicate = HasUnpackedD16VMem, D16Buf = 1 in {
Expand Down Expand Up @@ -1104,6 +1104,8 @@ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, f32, "BUFFER_LOAD_FORMAT_X">
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, i32, "BUFFER_LOAD_FORMAT_X">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v2f32, "BUFFER_LOAD_FORMAT_XY">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v2i32, "BUFFER_LOAD_FORMAT_XY">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v3f32, "BUFFER_LOAD_FORMAT_XYZ">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v3i32, "BUFFER_LOAD_FORMAT_XYZ">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v4f32, "BUFFER_LOAD_FORMAT_XYZW">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v4i32, "BUFFER_LOAD_FORMAT_XYZW">;

Expand All @@ -1127,6 +1129,8 @@ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f32, "BUFFER_LOAD_DWORD">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, i32, "BUFFER_LOAD_DWORD">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f32, "BUFFER_LOAD_DWORDX2">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2i32, "BUFFER_LOAD_DWORDX2">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v3f32, "BUFFER_LOAD_DWORDX3">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v3i32, "BUFFER_LOAD_DWORDX3">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f32, "BUFFER_LOAD_DWORDX4">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4i32, "BUFFER_LOAD_DWORDX4">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_byte, i32, "BUFFER_LOAD_SBYTE">;
Expand Down Expand Up @@ -1172,6 +1176,8 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, f32, "BUFFER_STORE_FORMAT_
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, i32, "BUFFER_STORE_FORMAT_X">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2f32, "BUFFER_STORE_FORMAT_XY">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2i32, "BUFFER_STORE_FORMAT_XY">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v3f32, "BUFFER_STORE_FORMAT_XYZ">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v3i32, "BUFFER_STORE_FORMAT_XYZ">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v4i32, "BUFFER_STORE_FORMAT_XYZW">;

Expand All @@ -1195,6 +1201,8 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, f32, "BUFFER_STORE_DWORD">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, i32, "BUFFER_STORE_DWORD">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f32, "BUFFER_STORE_DWORDX2">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2i32, "BUFFER_STORE_DWORDX2">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v3f32, "BUFFER_STORE_DWORDX3">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v3i32, "BUFFER_STORE_DWORDX3">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f32, "BUFFER_STORE_DWORDX4">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4i32, "BUFFER_STORE_DWORDX4">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_byte, i32, "BUFFER_STORE_BYTE">;
Expand Down Expand Up @@ -1535,9 +1543,11 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,

defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, i32, "TBUFFER_LOAD_FORMAT_X">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v2i32, "TBUFFER_LOAD_FORMAT_XY">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v3i32, "TBUFFER_LOAD_FORMAT_XYZ">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4i32, "TBUFFER_LOAD_FORMAT_XYZW">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, f32, "TBUFFER_LOAD_FORMAT_X">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v2f32, "TBUFFER_LOAD_FORMAT_XY">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v3f32, "TBUFFER_LOAD_FORMAT_XYZ">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4f32, "TBUFFER_LOAD_FORMAT_XYZW">;

let SubtargetPredicate = HasUnpackedD16VMem in {
Expand Down Expand Up @@ -1591,11 +1601,11 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,

defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, i32, "TBUFFER_STORE_FORMAT_X">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v2i32, "TBUFFER_STORE_FORMAT_XY">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_x3, v4i32, "TBUFFER_STORE_FORMAT_XYZ">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v3i32, "TBUFFER_STORE_FORMAT_XYZ">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v4i32, "TBUFFER_STORE_FORMAT_XYZW">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, f32, "TBUFFER_STORE_FORMAT_X">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v2f32, "TBUFFER_STORE_FORMAT_XY">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_x3, v4f32, "TBUFFER_STORE_FORMAT_XYZ">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v3f32, "TBUFFER_STORE_FORMAT_XYZ">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v4f32, "TBUFFER_STORE_FORMAT_XYZW">;

let SubtargetPredicate = HasUnpackedD16VMem in {
Expand Down
63 changes: 48 additions & 15 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Expand Up @@ -5625,8 +5625,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
LoadVT.getScalarType() == MVT::i16)
return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);

return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
M->getMemOperand());
return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
M->getMemOperand(), DAG);
}
case Intrinsic::amdgcn_raw_buffer_load:
case Intrinsic::amdgcn_raw_buffer_load_format: {
Expand Down Expand Up @@ -5659,8 +5659,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
LoadVT.getScalarType() == MVT::i16)
return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);

return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
M->getMemOperand());
return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
M->getMemOperand(), DAG);
}
case Intrinsic::amdgcn_struct_buffer_load:
case Intrinsic::amdgcn_struct_buffer_load_format: {
Expand Down Expand Up @@ -5693,8 +5693,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
LoadVT.getScalarType() == MVT::i16)
return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);

return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
M->getMemOperand());
return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
M->getMemOperand(), DAG);
}
case Intrinsic::amdgcn_tbuffer_load: {
MemSDNode *M = cast<MemSDNode>(Op);
Expand Down Expand Up @@ -5722,9 +5722,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
if (LoadVT.getScalarType() == MVT::f16)
return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
M, DAG, Ops);
return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
Op->getVTList(), Ops, LoadVT,
M->getMemOperand());
return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
DAG);
}
case Intrinsic::amdgcn_raw_tbuffer_load: {
MemSDNode *M = cast<MemSDNode>(Op);
Expand All @@ -5746,9 +5746,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
if (LoadVT.getScalarType() == MVT::f16)
return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
M, DAG, Ops);
return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
Op->getVTList(), Ops, LoadVT,
M->getMemOperand());
return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
DAG);
}
case Intrinsic::amdgcn_struct_tbuffer_load: {
MemSDNode *M = cast<MemSDNode>(Op);
Expand All @@ -5770,9 +5770,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
if (LoadVT.getScalarType() == MVT::f16)
return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
M, DAG, Ops);
return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
Op->getVTList(), Ops, LoadVT,
M->getMemOperand());
return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
DAG);
}
case Intrinsic::amdgcn_buffer_atomic_swap:
case Intrinsic::amdgcn_buffer_atomic_add:
Expand Down Expand Up @@ -6047,6 +6047,39 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
}
}

// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
// dwordx4 if on SI.
SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
SDVTList VTList,
ArrayRef<SDValue> Ops, EVT MemVT,
MachineMemOperand *MMO,
SelectionDAG &DAG) const {
EVT VT = VTList.VTs[0];
EVT WidenedVT = VT;
EVT WidenedMemVT = MemVT;
if (!Subtarget->hasDwordx3LoadStores() &&
(WidenedVT == MVT::v3i32 || WidenedVT == MVT::v3f32)) {
WidenedVT = EVT::getVectorVT(*DAG.getContext(),
WidenedVT.getVectorElementType(), 4);
WidenedMemVT = EVT::getVectorVT(*DAG.getContext(),
WidenedMemVT.getVectorElementType(), 4);
MMO = DAG.getMachineFunction().getMachineMemOperand(MMO, 0, 16);
}

assert(VTList.NumVTs == 2);
SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);

auto NewOp = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
WidenedMemVT, MMO);
if (WidenedVT != VT) {
auto Extract = DAG.getNode(
ISD::EXTRACT_SUBVECTOR, DL, VT, NewOp,
DAG.getConstant(0, DL, getVectorIdxTy(DAG.getDataLayout())));
NewOp = DAG.getMergeValues({ Extract, SDValue(NewOp.getNode(), 1) }, DL);
}
return NewOp;
}

SDValue SITargetLowering::handleD16VData(SDValue VData,
SelectionDAG &DAG) const {
EVT StoreVT = VData.getValueType();
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.h
Expand Up @@ -94,6 +94,12 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SelectionDAG &DAG, ArrayRef<SDValue> Ops,
bool IsIntrinsic = false) const;

// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
// dwordx4 if on SI.
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
ArrayRef<SDValue> Ops, EVT MemVT,
MachineMemOperand *MMO, SelectionDAG &DAG) const;

SDValue handleD16VData(SDValue VData, SelectionDAG &DAG) const;

/// Converts \p Op, which must be of floating point type, to the
Expand Down
3 changes: 0 additions & 3 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.td
Expand Up @@ -108,9 +108,6 @@ def SDTtbuffer_store : SDTypeProfile<0, 9,

def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", SDTtbuffer_store,
[SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
def SItbuffer_store_x3 : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT_X3",
SDTtbuffer_store,
[SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
def SItbuffer_store_d16 : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT_D16",
SDTtbuffer_store,
[SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
Expand Down
60 changes: 60 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.dwordx3.ll
@@ -0,0 +1,60 @@
;RUN: llc < %s -march=amdgcn -mcpu=gfx600 -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,SI
;RUN: llc < %s -march=amdgcn -mcpu=gfx700 -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,GCNX3

;CHECK-LABEL: {{^}}buffer_load_format_immoffs_x3:
;SI: buffer_load_format_xyzw v[0:3], off, s[0:3], 0 offset:42
;GCNX3: buffer_load_format_xyz v[0:2], off, s[0:3], 0 offset:42
;CHECK: s_waitcnt
define amdgpu_ps <3 x float> @buffer_load_format_immoffs_x3(<4 x i32> inreg) {
main_body:
%data = call <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(<4 x i32> %0, i32 0, i32 42, i1 0, i1 0)
ret <3 x float> %data
}

;CHECK-LABEL: {{^}}buffer_load_immoffs_x3:
;SI: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:40
;GCNX3: buffer_load_dwordx3 v[0:2], off, s[0:3], 0 offset:40
;CHECK: s_waitcnt
define amdgpu_ps <3 x float> @buffer_load_immoffs_x3(<4 x i32> inreg) {
main_body:
%data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %0, i32 0, i32 40, i1 0, i1 0)
ret <3 x float> %data
}

;CHECK-LABEL: {{^}}buffer_raw_load_immoffs_x3:
;SI: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:40
;GCNX3: buffer_load_dwordx3 v[0:2], off, s[0:3], 0 offset:40
;CHECK: s_waitcnt
define amdgpu_ps <3 x float> @buffer_raw_load_immoffs_x3(<4 x i32> inreg) {
main_body:
%data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %0, i32 40, i32 0, i32 0)
ret <3 x float> %data
}

;CHECK-LABEL: {{^}}buffer_struct_load_format_immoffs_x3:
;SI: buffer_load_format_xyzw v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:42
;GCNX3: buffer_load_format_xyz v[0:2], {{v[0-9]+}}, s[0:3], 0 idxen offset:42
;CHECK: s_waitcnt
define amdgpu_ps <3 x float> @buffer_struct_load_format_immoffs_x3(<4 x i32> inreg) {
main_body:
%data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %0, i32 0, i32 42, i32 0, i32 0)
ret <3 x float> %data
}

;CHECK-LABEL: {{^}}struct_buffer_load_immoffs_x3:
;SI: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:40
;GCNX3: buffer_load_dwordx3 v[0:2], {{v[0-9]+}}, s[0:3], 0 idxen offset:40
;CHECK: s_waitcnt
define amdgpu_ps <3 x float> @struct_buffer_load_immoffs_x3(<4 x i32> inreg) {
main_body:
%data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %0, i32 0, i32 40, i32 0, i32 0)
ret <3 x float> %data
}

declare <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(<4 x i32>, i32, i32, i1, i1) #0
declare <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32>, i32, i32, i1, i1) #0
declare <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32>, i32, i32, i32) #0
declare <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32>, i32, i32, i32) #0
declare <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32>, i32, i32, i32, i32) #0
declare <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32>, i32, i32, i32, i32) #0

53 changes: 53 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.dwordx3.ll
@@ -0,0 +1,53 @@
;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK

;CHECK-LABEL: {{^}}buffer_store_format_immoffs_x3:
;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_format_xyz v[0:2], off, s[0:3], 0 offset:42
define amdgpu_ps void @buffer_store_format_immoffs_x3(<4 x i32> inreg, <3 x float>) {
main_body:
call void @llvm.amdgcn.buffer.store.format.v3f32(<3 x float> %1, <4 x i32> %0, i32 0, i32 42, i1 0, i1 0)
ret void
}

;CHECK-LABEL: {{^}}buffer_store_immoffs_x3:
;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 offset:42
define amdgpu_ps void @buffer_store_immoffs_x3(<4 x i32> inreg, <3 x float>) {
main_body:
call void @llvm.amdgcn.buffer.store.v3f32(<3 x float> %1, <4 x i32> %0, i32 0, i32 42, i1 0, i1 0)
ret void
}

;CHECK-LABEL: {{^}}raw_buffer_store_format_immoffs_x3:
;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_format_xyz v[0:2], off, s[0:3], 0 offset:42
define amdgpu_ps void @raw_buffer_store_format_immoffs_x3(<4 x i32> inreg, <3 x float>) {
main_body:
call void @llvm.amdgcn.raw.buffer.store.format.v3f32(<3 x float> %1, <4 x i32> %0, i32 42, i32 0, i32 0)
ret void
}

;CHECK-LABEL: {{^}}raw_buffer_store_immoffs_x3:
;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 offset:42
define amdgpu_ps void @raw_buffer_store_immoffs_x3(<4 x i32> inreg, <3 x float>) {
main_body:
call void @llvm.amdgcn.raw.buffer.store.v3f32(<3 x float> %1, <4 x i32> %0, i32 42, i32 0, i32 0)
ret void
}

;CHECK-LABEL: {{^}}struct_buffer_store_immoffs_x3:
;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_dwordx3 v[0:2], {{v[0-9]+}}, s[0:3], 0 idxen offset:42
define amdgpu_ps void @struct_buffer_store_immoffs_x3(<4 x i32> inreg, <3 x float>) {
main_body:
call void @llvm.amdgcn.struct.buffer.store.v3f32(<3 x float> %1, <4 x i32> %0, i32 0, i32 42, i32 0, i32 0)
ret void
}

declare void @llvm.amdgcn.buffer.store.v3f32(<3 x float>, <4 x i32>, i32, i32, i1, i1) #0
declare void @llvm.amdgcn.buffer.store.format.v3f32(<3 x float>, <4 x i32>, i32, i32, i1, i1) #0
declare void @llvm.amdgcn.raw.buffer.store.format.v3f32(<3 x float>, <4 x i32>, i32, i32, i32) #0
declare void @llvm.amdgcn.raw.buffer.store.v3f32(<3 x float>, <4 x i32>, i32, i32, i32) #0
declare void @llvm.amdgcn.struct.buffer.store.format.v3f32(<3 x float>, <4 x i32>, i32, i32, i32, i32) #0
declare void @llvm.amdgcn.struct.buffer.store.v3f32(<3 x float>, <4 x i32>, i32, i32, i32, i32) #0

0 comments on commit 677387d

Please sign in to comment.