[AMDGPU] Add intrinsic for converting global pointers to resources
Define the function @llvm.amdgcn.make.buffer.rsrc, which takes a 64-bit
pointer, the 16-bit stride/swizzling constant that replaces the high 16
bits of an address in a buffer resource, the 32-bit extent/number of
elements, and the 32-bit flags (the latter two being the 3rd and 4th
words of the resource), and combines them into a ptr addrspace(8).

This intrinsic is lowered during the early phases of the backend.

This intrinsic is needed so that alias analysis can correctly infer
that a certain buffer resource points to the same memory as some
global pointer. Previous methods of constructing buffer resources,
which relied on ptrtoint, would not allow for such an inference.

Depends on D148184

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D148957
krzysz00 committed Jun 5, 2023
1 parent ab37937 commit 23098bd
Showing 12 changed files with 643 additions and 5 deletions.
21 changes: 18 additions & 3 deletions llvm/docs/AMDGPUUsage.rst
@@ -787,6 +787,12 @@ supported for the ``amdgcn`` target.
access is not supported except by flat and scratch instructions in
GFX9-GFX11.

Code that manipulates the stack values in other lanes of a wavefront,
such as by `addrspacecast`ing stack pointers to generic ones and taking offsets
that reach other lanes or by explicitly constructing the scratch buffer descriptor,
triggers undefined behavior when it modifies the scratch values of other lanes.
The compiler may assume that such modifications do not occur.

**Constant 32-bit**
*TODO*

@@ -806,9 +812,10 @@ supported for the ``amdgcn`` target.
it or not).

**Buffer Resource**
The buffer resource is an experimental address space that is currently unsupported
in the backend. It exposes a non-integral pointer that will represent a 128-bit
buffer descriptor resource.
The buffer resource pointer, in address space 8, is the newer form
for representing buffer descriptors in AMDGPU IR, replacing their
previous representation as `<4 x i32>`. It is a non-integral pointer
that represents a 128-bit buffer descriptor resource (`V#`).

Since, in general, a buffer resource supports complex addressing modes that cannot
be easily represented in LLVM (such as implicit swizzled access to structured
@@ -819,6 +826,14 @@ supported for the ``amdgcn`` target.
Casting a buffer resource to a buffer fat pointer is permitted and adds an offset
of 0.

Buffer resources can be created from 64-bit pointers (which should be either
generic or global) using the `llvm.amdgcn.make.buffer.rsrc` intrinsic. It takes
the pointer, which becomes the base of the resource;
the 16-bit stride (and swizzle control) field stored in bits `63:48` of a `V#`;
the 32-bit NumRecords/extent field (bits `95:64`); and the 32-bit flags field
(bits `127:96`). The specific interpretation of these fields varies by the
target architecture and is detailed in the ISA descriptions.
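The field layout described above can be modeled with a short sketch. This is illustrative Python, not LLVM code; the helper name `make_buffer_rsrc` and the sample values are hypothetical, and only the documented bit positions (base in `47:0`, stride in `63:48`, NumRecords in `95:64`, flags in `127:96`) are taken from the text.

```python
def make_buffer_rsrc(base: int, stride: int, num_records: int, flags: int) -> int:
    """Pack the four fields of a buffer resource (V#) into a 128-bit value.

    The stride replaces the high 16 bits of the 64-bit base address,
    so only bits 47:0 of the base survive.
    """
    base_lo48 = base & ((1 << 48) - 1)
    return (base_lo48
            | ((stride & 0xFFFF) << 48)        # bits 63:48
            | ((num_records & 0xFFFFFFFF) << 64)  # bits 95:64
            | ((flags & 0xFFFFFFFF) << 96))       # bits 127:96

# A raw buffer (stride 0) over 1024 records at a hypothetical base address.
rsrc = make_buffer_rsrc(0x1000, 0, 1024, 0)
```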

**Streamout Registers**
Dedicated registers used by the GS NGG Streamout Instructions. The register
file is modelled as a memory in a distinct address space because it is indexed
10 changes: 10 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -997,6 +997,16 @@ class AMDGPUBufferRsrcTy<LLVMType data_ty = llvm_any_ty>

let TargetPrefix = "amdgcn" in {

def int_amdgcn_make_buffer_rsrc : DefaultAttrsIntrinsic <
[AMDGPUBufferRsrcTy<llvm_i8_ty>],
[llvm_anyptr_ty, // base
llvm_i16_ty, // stride (and swizzle control)
llvm_i32_ty, // NumRecords / extent
llvm_i32_ty], // flags
// Attributes lifted from ptrmask + some extra argument attributes.
[IntrNoMem, NoCapture<ArgIndex<0>>, ReadNone<ArgIndex<0>>,
IntrSpeculatable, IntrWillReturn]>;

defset list<AMDGPURsrcIntrinsic> AMDGPUBufferIntrinsics = {

class AMDGPUBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
11 changes: 11 additions & 0 deletions llvm/lib/Analysis/ValueTracking.cpp
@@ -54,6 +54,7 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsRISCV.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/IR/LLVMContext.h"
@@ -5608,6 +5609,16 @@ bool llvm::isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(
case Intrinsic::strip_invariant_group:
case Intrinsic::aarch64_irg:
case Intrinsic::aarch64_tagp:
// The amdgcn_make_buffer_rsrc intrinsic does not alter the address of the
// input pointer (and thus preserves null-ness for the purposes of escape
// analysis, which is where the MustPreserveNullness flag comes into play).
// However, it will not necessarily map ptr addrspace(N) null to ptr
// addrspace(8) null, aka the "null descriptor", which has "all loads return
// 0, all stores are dropped" semantics. Given the context of this intrinsic
// list, no one should be relying on such a strict interpretation of
// MustPreserveNullness (and, at time of writing, they are not), but we
// document this fact out of an abundance of caution.
case Intrinsic::amdgcn_make_buffer_rsrc:
return true;
case Intrinsic::ptrmask:
return !MustPreserveNullness;
47 changes: 47 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -24,6 +24,7 @@
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
@@ -4423,6 +4424,50 @@ bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
return true;
}

/// To create a buffer resource from a 64-bit pointer, mask off the upper 32
/// bits of the pointer and replace them with the stride argument, then
/// merge_values everything together. In the common case of a raw buffer (the
/// stride component is 0), we can just AND off the upper half.
bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
Register Result = MI.getOperand(0).getReg();
Register Pointer = MI.getOperand(2).getReg();
Register Stride = MI.getOperand(3).getReg();
Register NumRecords = MI.getOperand(4).getReg();
Register Flags = MI.getOperand(5).getReg();

LLT S32 = LLT::scalar(32);

B.setInsertPt(B.getMBB(), ++B.getInsertPt());
auto Unmerge = B.buildUnmerge(S32, Pointer);
Register LowHalf = Unmerge.getReg(0);
Register HighHalf = Unmerge.getReg(1);

auto AndMask = B.buildConstant(S32, 0x0000ffff);
auto Masked = B.buildAnd(S32, HighHalf, AndMask);

MachineInstrBuilder NewHighHalf = Masked;
std::optional<ValueAndVReg> StrideConst =
getIConstantVRegValWithLookThrough(Stride, MRI);
if (!StrideConst || !StrideConst->Value.isZero()) {
MachineInstrBuilder ShiftedStride;
if (StrideConst) {
uint32_t StrideVal = StrideConst->Value.getZExtValue();
uint32_t ShiftedStrideVal = StrideVal << 16;
ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
} else {
auto ExtStride = B.buildAnyExt(S32, Stride);
auto ShiftConst = B.buildConstant(S32, 16);
ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
}
NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
}
Register NewHighHalfReg = NewHighHalf.getReg(0);
B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
MI.eraseFromParent();
return true;
}
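The control flow of the legalization above can be sketched in plain Python. This is a hedged model, not the GlobalISel code itself: the function name, the `stride_is_const_zero` flag (standing in for `getIConstantVRegValWithLookThrough` finding a zero constant), and the sample values are all hypothetical, but the masking and shift logic mirrors the C++ above.

```python
def legalize_make_buffer_rsrc(pointer: int, stride: int, num_records: int,
                              flags: int, stride_is_const_zero: bool = False):
    """Split the 64-bit pointer into 32-bit halves, clear the top 16 bits of
    the high half, and OR in the stride shifted into bits 31:16. When the
    stride is a known-zero constant (the common raw-buffer case), the OR is
    skipped entirely and only the AND remains."""
    low_half = pointer & 0xFFFFFFFF
    high_half = (pointer >> 32) & 0xFFFFFFFF
    masked = high_half & 0x0000FFFF
    if stride_is_const_zero:
        new_high_half = masked  # just the AND; no shift/OR emitted
    else:
        new_high_half = masked | ((stride & 0xFFFF) << 16)
    # merge_values: four 32-bit words forming the 128-bit resource.
    return [low_half, new_high_half, num_records, flags]

# Raw-buffer case: a known-zero stride folds to a plain AND of the high half.
words = legalize_make_buffer_rsrc(0x7FFF_0000_1000, 0, 256, 0,
                                  stride_is_const_zero=True)
```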

bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
@@ -5959,6 +6004,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,

return false;
}
case Intrinsic::amdgcn_make_buffer_rsrc:
return legalizePointerAsRsrcIntrin(MI, MRI, B);
case Intrinsic::amdgcn_kernarg_segment_ptr:
if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
// This only makes sense to call in a kernel, so just lower to null.
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -102,6 +102,9 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
bool loadInputValue(Register DstReg, MachineIRBuilder &B,
AMDGPUFunctionArgInfo::PreloadedValue ArgType) const;

bool legalizePointerAsRsrcIntrin(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;

bool legalizePreloadedArgIntrin(
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
AMDGPUFunctionArgInfo::PreloadedValue ArgType) const;
45 changes: 44 additions & 1 deletion llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -15,8 +15,10 @@
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
@@ -5122,6 +5124,9 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::INTRINSIC_WO_CHAIN: {
unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
switch (IID) {
case Intrinsic::amdgcn_make_buffer_rsrc:
Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
return;
case Intrinsic::amdgcn_cvt_pkrtz: {
SDValue Src0 = N->getOperand(1);
SDValue Src1 = N->getOperand(2);
@@ -8667,7 +8672,7 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
Align Alignment) const {
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
SDLoc DL(CombinedOffset);
if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
uint32_t Imm = C->getZExtValue();
uint32_t SOffset, ImmOffset;
if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
@@ -8706,6 +8711,44 @@ SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
return Rsrc;
}

// Wrap a global or flat pointer into a buffer resource (ptr addrspace(8))
// using the stride, NumRecords, and flags specified in the intrinsic call.
SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
SelectionDAG &DAG) const {
SDLoc Loc(Op);

SDValue Pointer = Op->getOperand(1);
SDValue Stride = Op->getOperand(2);
SDValue NumRecords = Op->getOperand(3);
SDValue Flags = Op->getOperand(4);

auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
std::optional<uint32_t> ConstStride = std::nullopt;
if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
ConstStride = ConstNode->getZExtValue();

SDValue NewHighHalf = Masked;
if (!ConstStride || *ConstStride != 0) {
SDValue ShiftedStride;
if (ConstStride) {
ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
} else {
SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
ShiftedStride =
DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
DAG.getShiftAmountConstant(16, MVT::i32, Loc));
}
NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
}

SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
NewHighHalf, NumRecords, Flags);
SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
return RsrcPtr;
}
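As a rough sanity check of the word layout this lowering produces, the four 32-bit words `{low_half, new_high_half, num_records, flags}` can be reassembled into the 128-bit descriptor, assuming word 0 is least significant (the little-endian interpretation of the v4i32-to-i128 bitcast). The helper name and sample values below are hypothetical.

```python
import struct

def words_to_i128_le(words):
    """Reassemble four 32-bit words into one 128-bit integer, with word 0
    as the least significant word."""
    return int.from_bytes(struct.pack("<4I", *words), "little")

# Zero stride, so the new high half is just the masked low 16 bits of the
# pointer's upper word; base address fits in 48 bits.
pointer, stride, num_records, flags = 0x0000_1234_5678_9ABC, 0, 64, 0
low = pointer & 0xFFFFFFFF
high = (pointer >> 32) & 0xFFFF
rsrc = words_to_i128_le([low, high, num_records, flags])
```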

// Handle 8 bit and 16 bit buffer loads
SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
EVT LoadVT, SDLoc DL,
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -259,6 +259,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
// argument (as would be seen in older buffer intrinsics), does nothing.
SDValue bufferRsrcPtrToVector(SDValue MaybePointer, SelectionDAG &DAG) const;

// Wrap a 64-bit pointer into a v4i32 (which is how all SelectionDAG code
// represents ptr addrspace(8)) using the flags specified in the intrinsic.
SDValue lowerPointerAsRsrcIntrin(SDNode *Op, SelectionDAG &DAG) const;

// Handle 8 bit and 16 bit buffer loads
SDValue handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT, SDLoc DL,
ArrayRef<SDValue> Ops, MemSDNode *M) const;