From 23098bd4542e76b897696f1d542a26caa5c25020 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak
Date: Wed, 19 Apr 2023 15:19:14 +0000
Subject: [PATCH] [AMDGPU] Add intrinsic for converting global pointers to
 resources

Define the function @llvm.amdgcn.make.buffer.rsrc, which takes a 64-bit
pointer, the 16-bit stride/swizzling constant that replaces the high 16
bits of an address in a buffer resource, the 32-bit extent/number of
elements, and the 32-bit flags (the latter two being the 3rd and 4th
words of the resource), and combines them into a ptr addrspace(8). This
intrinsic is lowered during the early phases of the backend.

This intrinsic is needed so that alias analysis can correctly infer that
a certain buffer resource points to the same memory as some global
pointer. Previous methods of constructing buffer resources, which relied
on ptrtoint, would not allow for such an inference.

Depends on D148184

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D148957
---
 llvm/docs/AMDGPUUsage.rst                     |  21 +-
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      |  10 +
 llvm/lib/Analysis/ValueTracking.cpp           |  11 +
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |  47 ++++
 llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h  |   3 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  45 +++-
 llvm/lib/Target/AMDGPU/SIISelLowering.h       |   4 +
 .../llvm.amdgcn.make.buffer.rsrc.ll           | 232 ++++++++++++++++++
 .../AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll    | 163 ++++++++++++
 .../AMDGPU/make-buffer-rsrc-lds-fails.ll      |   8 +
 .../AMDGPU/ptr-buffer-alias-scheduling.ll     |  61 +++++
 .../LICM/AMDGPU/buffer-rsrc-ptrs.ll           |  43 +++-
 12 files changed, 643 insertions(+), 5 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.make.buffer.rsrc.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/make-buffer-rsrc-lds-fails.ll

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 4299a1a56c172..3e566e65cf2eb 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -787,6 +787,12 @@ supported for the ``amdgcn`` target.
   access is not supported except by flat and scratch instructions in
   GFX9-GFX11.

+  Code that manipulates the stack values in other lanes of a wavefront,
+  such as by `addrspacecast`ing stack pointers to generic ones and taking offsets
+  that reach other lanes, or by explicitly constructing the scratch buffer descriptor,
+  triggers undefined behavior when it modifies the scratch values of other lanes.
+  The compiler may assume that such modifications do not occur.
+
 **Constant 32-bit**
   *TODO*

@@ -806,9 +812,10 @@ supported for the ``amdgcn`` target.
   it or not).

 **Buffer Resource**
-  The buffer resource is an experimental address space that is currently unsupported
-  in the backend. It exposes a non-integral pointer that will represent a 128-bit
-  buffer descriptor resource.
+  The buffer resource pointer, in address space 8, is the newer form
+  for representing buffer descriptors in AMDGPU IR, replacing their
+  previous representation as `<4 x i32>`. It is a non-integral pointer
+  that represents a 128-bit buffer descriptor resource (`V#`).

   Since, in general, a buffer resource supports complex addressing modes that cannot
   be easily represented in LLVM (such as implicit swizzled access to structured
@@ -819,6 +826,14 @@ supported for the ``amdgcn`` target.
   Casting a buffer resource to a buffer fat pointer is permitted and adds an offset
   of 0.
+
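For illustration, a minimal IR sketch of such a cast, assuming `%rsrc` is an
existing `ptr addrspace(8)` value and that address space 7 is the buffer fat
pointer address space:

  %fat = addrspacecast ptr addrspace(8) %rsrc to ptr addrspace(7)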
+  Buffer resources can be created from 64-bit pointers (which should be either
+  generic or global) using the `llvm.amdgcn.make.buffer.rsrc` intrinsic, which
+  takes the pointer (which becomes the base of the resource),
+  the 16-bit stride (and swizzle control) field stored in bits `63:48` of a `V#`,
+  the 32-bit NumRecords/extent field (bits `95:64`), and the 32-bit flags field
+  (bits `127:96`). The specific interpretation of these fields varies by the
+  target architecture and is detailed in the ISA descriptions.
+
 **Streamout Registers**
   Dedicated registers used by the GS NGG Streamout Instructions. The register
   file is modelled as a memory in a distinct address space because it is indexed
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index b0aa26479424f..82a13d576e03b 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -997,6 +997,16 @@ class AMDGPUBufferRsrcTy

 let TargetPrefix = "amdgcn" in {

+def int_amdgcn_make_buffer_rsrc : DefaultAttrsIntrinsic <
+  [AMDGPUBufferRsrcTy],
+  [llvm_anyptr_ty, // base
+   llvm_i16_ty,    // stride (and swizzle control)
+   llvm_i32_ty,    // NumRecords / extent
+   llvm_i32_ty],   // flags
+  // Attributes lifted from ptrmask + some extra argument attributes.
+  [IntrNoMem, NoCapture<ArgIndex<0>>, ReadNone<ArgIndex<0>>,
+   IntrSpeculatable, IntrWillReturn]>;
+
 defset list<Intrinsic> AMDGPUBufferIntrinsics = {

 class AMDGPUBufferLoad : DefaultAttrsIntrinsic <
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 94b409527be02..c1c38c60d6897 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -54,6 +54,7 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicsAArch64.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsRISCV.h"
 #include "llvm/IR/IntrinsicsX86.h"
 #include "llvm/IR/LLVMContext.h"
@@ -5608,6 +5609,16 @@ bool llvm::isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(
   case Intrinsic::strip_invariant_group:
   case Intrinsic::aarch64_irg:
   case Intrinsic::aarch64_tagp:
+  // The amdgcn_make_buffer_rsrc function does not alter the address of the
+  // input pointer (and thus preserves null-ness for the purposes of escape
+  // analysis, which is where the MustPreserveNullness flag comes into play).
+  // However, it will not necessarily map ptr addrspace(N) null to ptr
+  // addrspace(8) null, aka the "null descriptor", which has "all loads return
+  // 0, all stores are dropped" semantics. Given the context of this intrinsic
+  // list, no one should be relying on such a strict interpretation of
+  // MustPreserveNullness (and, at the time of writing, they are not), but we
+  // document this fact out of an abundance of caution.
+  case Intrinsic::amdgcn_make_buffer_rsrc:
     return true;
   case Intrinsic::ptrmask:
     return !MustPreserveNullness;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 2f54f5cfdcd55..c8485ea4f9441 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -24,6 +24,7 @@
 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsR600.h"
@@ -4423,6 +4424,50 @@ bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
   return true;
 }

+/// To create a buffer resource from a 64-bit pointer, mask off the upper 16
+/// bits of the pointer (which the resource's stride field occupies) and
+/// replace them with the stride argument, then merge_values everything
+/// together. In the common case of a raw buffer (the stride component is 0),
+/// we can just AND the upper half with 0xffff.
+bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
+    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
+  Register Result = MI.getOperand(0).getReg();
+  Register Pointer = MI.getOperand(2).getReg();
+  Register Stride = MI.getOperand(3).getReg();
+  Register NumRecords = MI.getOperand(4).getReg();
+  Register Flags = MI.getOperand(5).getReg();
+
+  LLT S32 = LLT::scalar(32);
+
+  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
+  auto Unmerge = B.buildUnmerge(S32, Pointer);
+  Register LowHalf = Unmerge.getReg(0);
+  Register HighHalf = Unmerge.getReg(1);
+
+  auto AndMask = B.buildConstant(S32, 0x0000ffff);
+  auto Masked = B.buildAnd(S32, HighHalf, AndMask);
+
+  MachineInstrBuilder NewHighHalf = Masked;
+  std::optional<ValueAndVReg> StrideConst =
+      getIConstantVRegValWithLookThrough(Stride, MRI);
+  if (!StrideConst || !StrideConst->Value.isZero()) {
+    MachineInstrBuilder ShiftedStride;
+    if (StrideConst) {
+      uint32_t StrideVal = StrideConst->Value.getZExtValue();
+      uint32_t ShiftedStrideVal = StrideVal << 16;
+      ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
+    } else {
+      auto ExtStride = B.buildAnyExt(S32, Stride);
+      auto ShiftConst = B.buildConstant(S32, 16);
+      ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
+    }
+    NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
+  }
+  Register NewHighHalfReg = NewHighHalf.getReg(0);
+  B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
+  MI.eraseFromParent();
+  return true;
+}
+
 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                                  MachineRegisterInfo &MRI,
                                                  MachineIRBuilder &B) const {
@@ -5959,6 +6004,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
       return false;
     }

+  case Intrinsic::amdgcn_make_buffer_rsrc:
+    return legalizePointerAsRsrcIntrin(MI, MRI, B);
   case Intrinsic::amdgcn_kernarg_segment_ptr:
     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
       // This only makes sense to call in a kernel, so just lower to null.
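For intuition, the second word of the descriptor built by this legalization
(and by the equivalent SelectionDAG lowering below) amounts to the following
scalar arithmetic. This is an illustrative LLVM IR sketch with hypothetical
function and value names, not code from the patch:

  ; word1 = (pointer bits 47:32) | (stride << 16)
  define i32 @rsrc_word1(i64 %ptr, i16 %stride) {
    %hi64 = lshr i64 %ptr, 32
    %hi = trunc i64 %hi64 to i32
    %base.hi = and i32 %hi, 65535          ; keep pointer bits 47:32
    %stride.ext = zext i16 %stride to i32
    %stride.shl = shl i32 %stride.ext, 16  ; stride occupies bits 63:48 of the V#
    %word1 = or i32 %base.hi, %stride.shl
    ret i32 %word1
  }

Words 0, 2, and 3 of the resource are the pointer's low half, NumRecords, and
the flags, passed through unchanged.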
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index d1ab640a33af0..24c2ff68897c1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -102,6 +102,9 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
   bool loadInputValue(Register DstReg, MachineIRBuilder &B,
                       AMDGPUFunctionArgInfo::PreloadedValue ArgType) const;

+  bool legalizePointerAsRsrcIntrin(MachineInstr &MI, MachineRegisterInfo &MRI,
+                                   MachineIRBuilder &B) const;
+
   bool legalizePreloadedArgIntrin(
       MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
       AMDGPUFunctionArgInfo::PreloadedValue ArgType) const;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 3a27571681e88..7536d3f61519d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -15,8 +15,10 @@
 #include "AMDGPU.h"
 #include "AMDGPUInstrInfo.h"
 #include "AMDGPUTargetMachine.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIMachineFunctionInfo.h"
 #include "SIRegisterInfo.h"
+#include "llvm/ADT/APInt.h"
 #include "llvm/ADT/FloatingPointMode.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
@@ -5122,6 +5124,9 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
   case ISD::INTRINSIC_WO_CHAIN: {
     unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
     switch (IID) {
+    case Intrinsic::amdgcn_make_buffer_rsrc:
+      Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
+      return;
     case Intrinsic::amdgcn_cvt_pkrtz: {
       SDValue Src0 = N->getOperand(1);
       SDValue Src1 = N->getOperand(2);
@@ -8667,7 +8672,7 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
                                         Align Alignment) const {
   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
   SDLoc DL(CombinedOffset);
-  if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
+  if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
     uint32_t Imm = C->getZExtValue();
     uint32_t SOffset, ImmOffset;
     if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
@@ -8706,6 +8711,44 @@ SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
   return Rsrc;
 }

+// Wrap a global or flat pointer into a buffer resource using the flags
+// specified in the intrinsic.
+SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
+                                                   SelectionDAG &DAG) const {
+  SDLoc Loc(Op);
+
+  SDValue Pointer = Op->getOperand(1);
+  SDValue Stride = Op->getOperand(2);
+  SDValue NumRecords = Op->getOperand(3);
+  SDValue Flags = Op->getOperand(4);
+
+  auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
+  SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
+  SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
+  std::optional<uint32_t> ConstStride = std::nullopt;
+  if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
+    ConstStride = ConstNode->getZExtValue();
+
+  SDValue NewHighHalf = Masked;
+  if (!ConstStride || *ConstStride != 0) {
+    SDValue ShiftedStride;
+    if (ConstStride) {
+      ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
+    } else {
+      SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
+      ShiftedStride =
+          DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
+                      DAG.getShiftAmountConstant(16, MVT::i32, Loc));
+    }
+    NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
+  }
+
+  SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
+                             NewHighHalf, NumRecords, Flags);
+  SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
+  return RsrcPtr;
+}
+
 // Handle 8 bit and 16 bit buffer loads
 SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
                                                      EVT LoadVT, SDLoc DL,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index a4772478f4bac..75215a7a4f799 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -259,6 +259,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   // argument (as would be seen in older buffer intrinsics), does nothing.
   SDValue bufferRsrcPtrToVector(SDValue MaybePointer, SelectionDAG &DAG) const;

+  // Wrap a 64-bit pointer into a v4i32 (which is how all SelectionDAG code
+  // represents ptr addrspace(8)) using the flags specified in the intrinsic.
+  SDValue lowerPointerAsRsrcIntrin(SDNode *Op, SelectionDAG &DAG) const;
+
   // Handle 8 bit and 16 bit buffer loads
   SDValue handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT, SDLoc DL,
                                      ArrayRef<SDValue> Ops, MemSDNode *M) const;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.make.buffer.rsrc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.make.buffer.rsrc.ll
new file mode 100644
index 0000000000000..a1d36c0c7ea19
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.make.buffer.rsrc.ll
@@ -0,0 +1,232 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck %s
+
+define amdgpu_ps ptr addrspace(8) @basic_raw_buffer(ptr inreg %p) {
+  ; CHECK-LABEL: name: basic_raw_buffer
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT: liveins: $sgpr0, $sgpr1
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+  ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5678
+  ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1234
+  ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
+  ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[S_MOV_B32_2]], implicit-def $scc
+  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+  ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+  ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_AND_B32_]]
+  ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+  ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; CHECK-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
+  ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+  ; CHECK-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]]
+  ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3
+  %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 0, i32 1234, i32 5678)
+  ret ptr addrspace(8) %rsrc
+}
+
+define amdgpu_ps float @read_raw_buffer(ptr addrspace(1) inreg %p) {
+  ; CHECK-LABEL: name: read_raw_buffer
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT: liveins: $sgpr0, $sgpr1
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+  ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
+  ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[S_MOV_B32_1]], implicit-def $scc
+  ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[S_AND_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
+  ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], [[S_MOV_B32_]], 4, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8)
+  ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]]
+  ; CHECK-NEXT: SI_RETURN_TO_EPILOG 
implicit $vgpr0 + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %p, i16 0, i32 0, i32 0) + %loaded = call float @llvm.amdgcn.raw.ptr.buffer.load(ptr addrspace(8) %rsrc, i32 4, i32 0, i32 0) + ret float %loaded +} + +define amdgpu_ps ptr addrspace(8) @basic_struct_buffer(ptr inreg %p) { + ; CHECK-LABEL: name: basic_struct_buffer + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr0, $sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5678 + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1234 + ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[S_MOV_B32_2]], implicit-def $scc + ; CHECK-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 262144 + ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], [[S_MOV_B32_3]], implicit-def $scc + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; CHECK-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; CHECK-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] + ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3 + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 4, i32 1234, i32 5678) + ret ptr addrspace(8) %rsrc +} + +define amdgpu_ps ptr addrspace(8) @variable_top_half(ptr inreg %p, i32 inreg %numVals, i32 inreg %flags) { + ; CHECK-LABEL: name: variable_top_half + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 262144 + ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], [[S_MOV_B32_1]], implicit-def $scc + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; CHECK-NEXT: 
[[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; CHECK-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; CHECK-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] + ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3 + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 4, i32 %numVals, i32 %flags) + ret ptr addrspace(8) %rsrc +} + +define amdgpu_ps ptr addrspace(8) @general_case(ptr inreg %p, i16 inreg %stride, i32 inreg %numVals, i32 inreg %flags) { + ; CHECK-LABEL: name: general_case + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16 + ; CHECK-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], [[S_MOV_B32_1]], implicit-def $scc + ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; CHECK-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; CHECK-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] + ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3 + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 %stride, i32 %numVals, i32 %flags) + ret ptr addrspace(8) %rsrc +} + +define amdgpu_ps float @general_case_load(ptr inreg %p, i16 inreg %stride, i32 inreg %numVals, i32 inreg %flags) { + ; CHECK-LABEL: name: general_case_load + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16 + ; CHECK-NEXT: 
[[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], [[S_MOV_B32_1]], implicit-def $scc + ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[S_OR_B32_]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) + ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]] + ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 %stride, i32 %numVals, i32 %flags) + %value = call float @llvm.amdgcn.struct.ptr.buffer.load(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) + ret float %value +} + +; None of the components are uniform due to the lack of an inreg +define amdgpu_ps float @general_case_load_with_waterfall(ptr %p, i16 %stride, i32 %numVals, i32 %flags) { + ; CHECK-LABEL: name: general_case_load_with_waterfall + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; CHECK-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY6]], [[COPY2]], implicit $exec + ; CHECK-NEXT: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[COPY1]], [[COPY5]], [[V_LSHLREV_B32_e64_]], implicit $exec + ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[V_AND_OR_B32_e64_]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_AND_OR_B32_e64_]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; 
CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 + ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec + ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec + ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc + ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[COPY7]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) + ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5: + ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]] + ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 %stride, i32 %numVals, i32 %flags) + %value = call float @llvm.amdgcn.struct.ptr.buffer.load(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) + ret float %value +} + +declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr nocapture readnone, i16, i32, i32) +declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) nocapture readnone, i16, i32, i32) +declare float @llvm.amdgcn.raw.ptr.buffer.load(ptr addrspace(8) nocapture readonly, i32, i32, i32 immarg) +declare float @llvm.amdgcn.struct.ptr.buffer.load(ptr addrspace(8) nocapture readonly, i32, i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll new file mode 100644 index 0000000000000..8cca7388d7595 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll @@ -0,0 +1,163 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck %s + +define amdgpu_ps ptr addrspace(8) @basic_raw_buffer(ptr inreg %p) { + ; CHECK-LABEL: name: basic_raw_buffer + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr0, $sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], killed [[S_MOV_B32_]], implicit-def dead $scc + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1234 + ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 5678 + ; CHECK-NEXT: $sgpr0 = COPY [[COPY1]] + ; CHECK-NEXT: $sgpr1 = COPY [[S_AND_B32_]] + ; CHECK-NEXT: $sgpr2 = COPY [[S_MOV_B32_1]] + ; CHECK-NEXT: $sgpr3 = COPY [[S_MOV_B32_2]] + ; CHECK-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1, $sgpr2, $sgpr3 + %rsrc = call ptr 
addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 0, i32 1234, i32 5678) + ret ptr addrspace(8) %rsrc +} + +define amdgpu_ps float @read_raw_buffer(ptr addrspace(1) inreg %p) { + ; CHECK-LABEL: name: read_raw_buffer + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr0, $sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], killed [[S_MOV_B32_]], implicit-def dead $scc + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, killed [[S_AND_B32_]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_1]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET killed [[REG_SEQUENCE]], [[S_MOV_B32_1]], 4, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) + ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] + ; CHECK-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %p, i16 0, i32 0, i32 0) + %loaded = call float @llvm.amdgcn.raw.ptr.buffer.load(ptr addrspace(8) %rsrc, i32 4, i32 0, i32 0) + ret float %loaded +} + +define amdgpu_ps ptr addrspace(8) @basic_struct_buffer(ptr inreg %p) { + ; CHECK-LABEL: name: basic_struct_buffer + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr0, $sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], killed [[S_MOV_B32_]], implicit-def dead $scc + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 262144 + ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 killed [[S_AND_B32_]], killed [[S_MOV_B32_1]], implicit-def dead $scc + ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 1234 + ; CHECK-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 5678 + ; CHECK-NEXT: $sgpr0 = COPY [[COPY1]] + ; CHECK-NEXT: $sgpr1 = COPY [[S_OR_B32_]] + ; CHECK-NEXT: $sgpr2 = COPY [[S_MOV_B32_2]] + ; CHECK-NEXT: $sgpr3 = COPY [[S_MOV_B32_3]] + ; CHECK-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1, $sgpr2, $sgpr3 + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 4, i32 1234, i32 5678) + ret ptr addrspace(8) %rsrc +} + +define amdgpu_ps ptr addrspace(8) @variable_top_half(ptr inreg %p, i32 inreg %numVals, i32 inreg %flags) { + ; CHECK-LABEL: name: variable_top_half + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY2]], killed [[S_MOV_B32_]], implicit-def dead $scc + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 262144 + ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 killed [[S_AND_B32_]], killed [[S_MOV_B32_1]], implicit-def dead $scc + ; CHECK-NEXT: $sgpr0 = COPY [[COPY3]] + ; CHECK-NEXT: $sgpr1 = COPY 
[[S_OR_B32_]] + ; CHECK-NEXT: $sgpr2 = COPY [[COPY1]] + ; CHECK-NEXT: $sgpr3 = COPY [[COPY]] + ; CHECK-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1, $sgpr2, $sgpr3 + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 4, i32 %numVals, i32 %flags) + ret ptr addrspace(8) %rsrc +} + +define amdgpu_ps ptr addrspace(8) @general_case(ptr inreg %p, i16 inreg %stride, i32 inreg %numVals, i32 inreg %flags) { + ; CHECK-LABEL: name: general_case + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY3]], killed [[S_MOV_B32_]], implicit-def dead $scc + ; CHECK-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 16, implicit-def dead $scc + ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 killed [[S_AND_B32_]], killed [[S_LSHL_B32_]], implicit-def dead $scc + ; CHECK-NEXT: $sgpr0 = COPY [[COPY4]] + ; CHECK-NEXT: $sgpr1 = COPY [[S_OR_B32_]] + ; CHECK-NEXT: $sgpr2 = COPY [[COPY1]] + ; CHECK-NEXT: $sgpr3 = COPY [[COPY]] + ; CHECK-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1, $sgpr2, $sgpr3 + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 %stride, i32 %numVals, i32 %flags) + ret ptr addrspace(8) %rsrc +} + +define amdgpu_ps float @general_case_load(ptr inreg %p, i16 inreg %stride, i32 inreg %numVals, i32 inreg %flags) { + ; CHECK-LABEL: name: general_case_load + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY3]], killed [[S_MOV_B32_]], implicit-def dead $scc + ; CHECK-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 16, implicit-def dead $scc + ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 killed [[S_AND_B32_]], killed [[S_LSHL_B32_]], implicit-def dead $scc + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, killed [[S_OR_B32_]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[COPY]], %subreg.sub3 + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[COPY5]], killed [[REG_SEQUENCE]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) + ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]] + ; CHECK-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 %stride, i32 %numVals, i32 %flags) + %value = call float @llvm.amdgcn.struct.ptr.buffer.load(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) + ret float %value +} + +; None of the components are uniform due to the lack of an inreg +define 
amdgpu_ps float @general_case_load_with_waterfall(ptr %p, i16 %stride, i32 %numVals, i32 %flags) { + ; CHECK-LABEL: name: general_case_load_with_waterfall + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[COPY2]], implicit $exec + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; CHECK-NEXT: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[COPY3]], killed [[S_MOV_B32_]], killed [[V_LSHLREV_B32_e64_]], implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, killed [[V_AND_OR_B32_e64_]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[COPY]], %subreg.sub3 + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[COPY5]], killed [[REG_SEQUENCE]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) + ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]] + ; CHECK-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 %stride, i32 %numVals, i32 %flags) + %value = call float @llvm.amdgcn.struct.ptr.buffer.load(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) + ret float %value +} + +declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr nocapture readnone, i16, i32, i32) +declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) nocapture readnone, i16, i32, i32) +declare float @llvm.amdgcn.raw.ptr.buffer.load(ptr addrspace(8) nocapture readonly, i32, i32, i32 immarg) +declare float @llvm.amdgcn.struct.ptr.buffer.load(ptr addrspace(8) nocapture readonly, i32, i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/make-buffer-rsrc-lds-fails.ll b/llvm/test/CodeGen/AMDGPU/make-buffer-rsrc-lds-fails.ll new file mode 100644 index 0000000000000..8bdcfd6bcf295 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/make-buffer-rsrc-lds-fails.ll @@ -0,0 +1,8 @@ +; RUN: not --crash llc -march=amdgcn -mcpu=gfx900 < %s +; RUN: not --crash llc -global-isel -march=amdgcn -mcpu=gfx900 < %s + +define amdgpu_ps ptr addrspace(8) @basic_raw_buffer(ptr addrspace(3) inreg %p) { + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p3(ptr addrspace(3) %p, i16 0, i32 1234, i32 5678) + ret ptr addrspace(8) %rsrc +} +declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p3(ptr addrspace(3) nocapture readnone, i16, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll index fbf86f1f443d5..c71677258ea0e 100644 --- a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll +++ b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll @@ -47,6 +47,66 @@ define amdgpu_kernel void @buffers_dont_alias(ptr addrspace(8) noalias %a, ptr a ret void } +define amdgpu_kernel void @buffers_from_flat_dont_alias(ptr noalias %a.flat, ptr noalias %b.flat) { +; SDAG-LABEL: buffers_from_flat_dont_alias: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; 
SDAG-NEXT: s_mov_b32 s7, 0 +; SDAG-NEXT: s_mov_b32 s6, 16 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: s_and_b32 s5, s1, 0xffff +; SDAG-NEXT: s_mov_b32 s4, s0 +; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; SDAG-NEXT: s_and_b32 s5, s3, 0xffff +; SDAG-NEXT: s_mov_b32 s4, s2 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0 +; SDAG-NEXT: v_mul_f32_e32 v1, v1, v1 +; SDAG-NEXT: v_mul_f32_e32 v2, v2, v2 +; SDAG-NEXT: v_mul_f32_e32 v3, v3, v3 +; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: buffers_from_flat_dont_alias: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-NEXT: s_mov_b32 s7, 0 +; GISEL-NEXT: s_mov_b32 s6, 16 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: s_and_b32 s5, s1, 0xffff +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GISEL-NEXT: s_and_b32 s5, s3, 0xffff +; GISEL-NEXT: s_mov_b32 s4, s2 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0 +; GISEL-NEXT: v_mul_f32_e32 v1, v1, v1 +; GISEL-NEXT: v_mul_f32_e32 v2, v2, v2 +; GISEL-NEXT: v_mul_f32_e32 v3, v3, v3 +; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GISEL-NEXT: s_endpgm + %a = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %a.flat, i16 0, i32 16, i32 0) + %b = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %b.flat, i16 0, i32 16, i32 0) + + %l0 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %a, i32 0, i32 0, i32 0) + %s0 = fmul float %l0, %l0 + call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %s0, ptr addrspace(8) %b, i32 0, i32 0, i32 0) + + %l1 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %a, i32 4, i32 0, i32 0) + %s1 = fmul float %l1, %l1 + call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %s1, ptr addrspace(8) %b, i32 4, i32 0, i32 0) + + %l2 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %a, i32 8, i32 0, i32 0) + %s2 = fmul float %l2, %l2 + call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %s2, ptr addrspace(8) %b, i32 8, i32 0, i32 0) + + %l3 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %a, i32 12, i32 0, i32 0) + %s3 = fmul float %l3, %l3 + call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %s3, ptr addrspace(8) %b, i32 12, i32 0, i32 0) + + ret void +} + define amdgpu_kernel void @buffers_might_alias(ptr addrspace(8) %a, ptr addrspace(8) %b) { ; SDAG-LABEL: buffers_might_alias: ; SDAG: ; %bb.0: @@ -151,3 +211,4 @@ declare i32 @llvm.amdgcn.workitem.id.x() declare float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8), i32, i32, i32) declare void @llvm.amdgcn.raw.ptr.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32 immarg) +declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr readnone nocapture, i16, i32, i32) diff --git a/llvm/test/Transforms/LICM/AMDGPU/buffer-rsrc-ptrs.ll b/llvm/test/Transforms/LICM/AMDGPU/buffer-rsrc-ptrs.ll index 384eb5a2e4207..2e539d03afc1c 100644 --- a/llvm/test/Transforms/LICM/AMDGPU/buffer-rsrc-ptrs.ll +++ b/llvm/test/Transforms/LICM/AMDGPU/buffer-rsrc-ptrs.ll @@ -161,6 +161,45 @@ tail: ret void } +define void @hoistable_buffer_construction_intrinsic(ptr addrspace(1) noalias %p.global, ptr addrspace(1) noalias %q.global, i32 %bound) { +; CHECK-LABEL: define void @hoistable_buffer_construction_intrinsic +; CHECK-SAME: (ptr addrspace(1) noalias [[P_GLOBAL:%.*]], ptr addrspace(1) noalias [[Q_GLOBAL:%.*]], i32 [[BOUND:%.*]]) { +; CHECK-NEXT: entry: +; 
CHECK-NEXT: [[P:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) [[P_GLOBAL]], i16 0, i32 0, i32 0)
+; CHECK-NEXT: [[Q:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) [[Q_GLOBAL]], i16 0, i32 0, i32 0)
+; CHECK-NEXT: [[HOISTABLE:%.*]] = call i32 @llvm.amdgcn.struct.ptr.buffer.load.i32(ptr addrspace(8) [[Q]], i32 0, i32 0, i32 0, i32 0)
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[ORIG:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) [[P]], i32 [[I]], i32 0, i32 0)
+; CHECK-NEXT: [[INC:%.*]] = add i32 [[HOISTABLE]], [[ORIG]]
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[INC]], ptr addrspace(8) [[P]], i32 [[I]], i32 0, i32 0)
+; CHECK-NEXT: [[NEXT]] = add i32 [[I]], 1
+; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[NEXT]], [[BOUND]]
+; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[TAIL:%.*]]
+; CHECK: tail:
+; CHECK-NEXT: ret void
+;
+entry:
+  %p = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %p.global, i16 0, i32 0, i32 0)
+  %q = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %q.global, i16 0, i32 0, i32 0)
+  br label %loop
+loop:
+  %i = phi i32 [0, %entry], [%next, %loop]
+
+  %hoistable = call i32 @llvm.amdgcn.struct.ptr.buffer.load.i32(ptr addrspace(8) %q, i32 0, i32 0, i32 0, i32 0)
+  %orig = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) %p, i32 %i, i32 0, i32 0)
+  %inc = add i32 %hoistable, %orig
+  call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %inc, ptr addrspace(8) %p, i32 %i, i32 0, i32 0)
+
+  %next = add i32 %i, 1
+  %cond = icmp ult i32 %next, %bound
+  br i1 %cond, label %loop, label %tail
+tail:
+  ret void
+}
+
+
 define void @hoistable_buffer_construction_alias_scope(ptr addrspace(1) %p.global, ptr addrspace(1) %q.global, i32 %bound) {
 ; CHECK-LABEL: define void @hoistable_buffer_construction_alias_scope
 ; CHECK-SAME: (ptr addrspace(1) [[P_GLOBAL:%.*]], ptr addrspace(1) [[Q_GLOBAL:%.*]], i32 [[BOUND:%.*]]) {
@@ -217,6 +256,8 @@ declare i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) nocapture read
 declare i32 @llvm.amdgcn.struct.ptr.buffer.load.i32(ptr addrspace(8) nocapture readonly, i32, i32, i32, i32 immarg) #0
 ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write)
 declare void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32, ptr addrspace(8) nocapture writeonly, i32, i32, i32 immarg) #1
-
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) nocapture readnone, i16, i32, i32) #2
 attributes #0 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) }
 attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) }
+attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
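As an end-to-end reference, a minimal IR use of the new intrinsic looks like
the sketch below. The function name and constants are illustrative; the
declarations are the ones used by the tests above:

  ; Wrap 256 bytes at %p in a raw buffer descriptor (stride 0, default flags)
  ; and load the first dword through it. Because the base is %p itself, alias
  ; analysis can connect loads through %rsrc with accesses to %p.
  define float @load_through_rsrc(ptr addrspace(1) %p) {
    %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %p, i16 0, i32 256, i32 0)
    %v = call float @llvm.amdgcn.raw.ptr.buffer.load(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
    ret float %v
  }

  declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) nocapture readnone, i16, i32, i32)
  declare float @llvm.amdgcn.raw.ptr.buffer.load(ptr addrspace(8) nocapture readonly, i32, i32, i32 immarg)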