[AMDGPU] Add support for preloading implicit kernel arguments #83817

Open · wants to merge 1 commit into main

Conversation

@kerbowa (Member) commented Mar 4, 2024

Implicit arguments may be preloaded into User SGPRs via the same
mechanism as explicit arguments if their offsets within the kernarg
segment fall within the range of available registers. Lowering of these
implicit arguments may happen early, so the implementation here follows
the same concept and is mostly agnostic to which values are being
loaded; it only cares about offsets from the implicitarg pointer and
the size of the values being used. Unlike preloading of explicit
arguments, there are no restrictions on exactly which implicit
arguments are used or whether they form an unbroken sequence; instead,
this will attempt to preload anything that falls within the range of
available User SGPRs on the target HW.

A limitation of this patch is that it only supports i16/i32 arguments,
but like other details of preloading kernargs for both implicit and
explicit arguments, this is likely to be expanded and changed in the
near future.
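
As a minimal sketch of the intended transformation (the kernel below is illustrative and assumes the usual hidden-argument layout, with hidden_group_size_x/y/z at byte offsets 12/14/16 from the implicitarg pointer), a load of the hidden group size is rewritten to the new i16 workgroup-size intrinsic, which isel then lowers to a copy from the preloaded user SGPR:

; Before AMDGPULowerKernelArguments: the group size is read through the
; implicitarg pointer.
define amdgpu_kernel void @read_group_size_x(ptr addrspace(1) %out) {
  %implicitarg = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
  %gep = getelementptr i8, ptr addrspace(4) %implicitarg, i64 12 ; hidden_group_size_x
  %gsx = load i16, ptr addrspace(4) %gep, align 4
  store i16 %gsx, ptr addrspace(1) %out
  ret void
}

declare ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()

; After the pass, when a user SGPR is available for that kernarg offset, the
; load becomes an intrinsic call and the function gains the
; "amdgpu-preload-work-group-size-x" attribute:
;   %gsx = call i16 @llvm.amdgcn.workgroup.size.x()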

@llvmbot (Collaborator) commented Mar 4, 2024

@llvm/pr-subscribers-llvm-globalisel
@llvm/pr-subscribers-backend-amdgpu

@llvm/pr-subscribers-llvm-ir

Author: Austin Kerbow (kerbowa)

Changes

WIP


Patch is 42.30 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/83817.diff

8 Files Affected:

  • (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+13-9)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h (+1)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp (+122-2)
  • (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+124-3)
  • (modified) llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp (+22-1)
  • (modified) llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h (+21-2)
  • (modified) llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll (+2)
  • (added) llvm/test/CodeGen/AMDGPU/preload-implict-kernargs.ll (+528)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 0f29653f1f5bec..84b0cde1982558 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -12,8 +12,8 @@
 
 def global_ptr_ty : LLVMQualPointerType<1>;
 
-class AMDGPUReadPreloadRegisterIntrinsic
-  : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>;
+class AMDGPUReadPreloadRegisterIntrinsic<LLVMType type>
+  : DefaultAttrsIntrinsic<[type], [], [IntrNoMem, IntrSpeculatable]>;
 
 class AMDGPUReadPreloadRegisterIntrinsicNamed<string name>
   : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>, ClangBuiltin<name>;
@@ -27,10 +27,10 @@ class AMDGPURsrcIntrinsic<int rsrcarg, bit isimage = false> {
 
 let TargetPrefix = "r600" in {
 
-multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz {
-  def _x : AMDGPUReadPreloadRegisterIntrinsic;
-  def _y : AMDGPUReadPreloadRegisterIntrinsic;
-  def _z : AMDGPUReadPreloadRegisterIntrinsic;
+multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz<LLVMType type> {
+  def _x : AMDGPUReadPreloadRegisterIntrinsic<type>;
+  def _y : AMDGPUReadPreloadRegisterIntrinsic<type>;
+  def _z : AMDGPUReadPreloadRegisterIntrinsic<type>;
 }
 
 multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz_named<string prefix> {
@@ -46,8 +46,8 @@ defm int_r600_read_ngroups : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
 defm int_r600_read_tgid : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                           <"__builtin_r600_read_tgid">;
 
-defm int_r600_read_local_size : AMDGPUReadPreloadRegisterIntrinsic_xyz;
-defm int_r600_read_tidig : AMDGPUReadPreloadRegisterIntrinsic_xyz;
+defm int_r600_read_local_size : AMDGPUReadPreloadRegisterIntrinsic_xyz<llvm_i32_ty>;
+defm int_r600_read_tidig : AMDGPUReadPreloadRegisterIntrinsic_xyz<llvm_i32_ty>;
 
 def int_r600_group_barrier : ClangBuiltin<"__builtin_r600_group_barrier">,
   Intrinsic<[], [], [IntrConvergent, IntrWillReturn]>;
@@ -138,10 +138,14 @@ let TargetPrefix = "amdgcn" in {
 // ABI Special Intrinsics
 //===----------------------------------------------------------------------===//
 
-defm int_amdgcn_workitem_id : AMDGPUReadPreloadRegisterIntrinsic_xyz;
+defm int_amdgcn_workitem_id : AMDGPUReadPreloadRegisterIntrinsic_xyz<llvm_i32_ty>;
 defm int_amdgcn_workgroup_id : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                                <"__builtin_amdgcn_workgroup_id">;
 
+// Intended to be used when preloading implicit kernel arguments.
+defm int_amdgcn_workgroup_size :
+  AMDGPUReadPreloadRegisterIntrinsic_xyz<llvm_i16_ty>;
+
 def int_amdgcn_dispatch_ptr :
   DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [],
   [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
index 42b33c50d9f8c4..e6aed12673c941 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
@@ -95,6 +95,7 @@ inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) {
 struct KernArgPreloadDescriptor : public ArgDescriptor {
   KernArgPreloadDescriptor() {}
   SmallVector<MCRegister> Regs;
+  unsigned ByteOffset;
 };
 
 struct AMDGPUFunctionArgInfo {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index bc58407a73294c..03544279b49efe 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -13,8 +13,10 @@
 
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/Target/TargetMachine.h"
@@ -31,9 +33,13 @@ class PreloadKernelArgInfo {
   const GCNSubtarget &ST;
   unsigned NumFreeUserSGPRs;
 
-public:
-  SmallVector<llvm::Metadata *, 8> KernelArgMetadata;
+  enum ImplicitArgOffsets {
+    HIDDEN_GROUP_SIZE_X_OFFSET = 12,
+    HIDDEN_GROUP_SIZE_Y_OFFSET = 14,
+    HIDDEN_GROUP_SIZE_Z_OFFSET = 16,
+  };
 
+public:
   PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
     setInitialFreeUserSGPRsCount();
   }
@@ -64,6 +70,111 @@ class PreloadKernelArgInfo {
     NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs);
     return true;
   }
+
+  // Try to allocate SGPRs to preload implicit kernel arguments.
+  void tryAllocImplicitArgPreloadSGPRs(unsigned ImplicitArgsBaseOffset,
+                                       IRBuilder<> &Builder) {
+    unsigned LastExplicitArgOffset = ImplicitArgsBaseOffset;
+    IntrinsicInst *ImplicitArgPtr = nullptr;
+    for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) {
+      for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE; ++I) {
+        if (IntrinsicInst *CI = dyn_cast<IntrinsicInst>(I))
+          if (CI->getIntrinsicID() == Intrinsic::amdgcn_implicitarg_ptr) {
+            ImplicitArgPtr = CI;
+            break;
+          }
+      }
+    }
+    if (!ImplicitArgPtr)
+      return;
+    const DataLayout &DL = F.getParent()->getDataLayout();
+    Value *GroupSizes[3] = {nullptr, nullptr, nullptr};
+    for (auto *U : ImplicitArgPtr->users()) {
+      if (!U->hasOneUse())
+        continue;
+
+      // FIXME: The loop below is mostly copied from
+      // AMDGPULowerKernelAttributes.cpp, should combine the logic somewhere.
+      int64_t Offset = 0;
+      auto *Load =
+          dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr/DispatchPtr?
+      auto *BCI = dyn_cast<BitCastInst>(U);
+      if (!Load && !BCI) {
+        if (GetPointerBaseWithConstantOffset(U, Offset, DL) != ImplicitArgPtr)
+          continue;
+        Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
+        BCI = dyn_cast<BitCastInst>(*U->user_begin());
+      }
+
+      if (BCI) {
+        if (!BCI->hasOneUse())
+          continue;
+        Load = dyn_cast<LoadInst>(*BCI->user_begin()); // Load from BCI?
+      }
+
+      if (!Load || !Load->isSimple())
+        continue;
+
+      unsigned LoadSize = DL.getTypeStoreSize(Load->getType());
+      switch (Offset) {
+      case HIDDEN_GROUP_SIZE_X_OFFSET:
+        if (LoadSize == 2)
+          GroupSizes[0] = Load;
+        break;
+      case HIDDEN_GROUP_SIZE_Y_OFFSET:
+        if (LoadSize == 2)
+          GroupSizes[1] = Load;
+        break;
+      case HIDDEN_GROUP_SIZE_Z_OFFSET:
+        if (LoadSize == 2)
+          GroupSizes[2] = Load;
+        break;
+      default:
+        break;
+      }
+    }
+
+    // If we fail to preload any implicit argument we know we don't have SGPRs
+    // to preload any subsequent ones with larger offsets.
+    if (GroupSizes[0]) {
+      if (!tryAllocPreloadSGPRs(
+              2, ImplicitArgsBaseOffset + HIDDEN_GROUP_SIZE_X_OFFSET,
+              LastExplicitArgOffset))
+        return;
+      LastExplicitArgOffset =
+          ImplicitArgsBaseOffset + HIDDEN_GROUP_SIZE_X_OFFSET + 2;
+      CallInst *CI =
+          Builder.CreateIntrinsic(Intrinsic::amdgcn_workgroup_size_x, {}, {});
+      GroupSizes[0]->replaceAllUsesWith(CI);
+      F.addFnAttr("amdgpu-preload-work-group-size-x");
+    }
+
+    if (GroupSizes[1]) {
+      if (!tryAllocPreloadSGPRs(
+              2, ImplicitArgsBaseOffset + HIDDEN_GROUP_SIZE_Y_OFFSET,
+              LastExplicitArgOffset))
+        return;
+      LastExplicitArgOffset =
+          ImplicitArgsBaseOffset + HIDDEN_GROUP_SIZE_Y_OFFSET + 2;
+      CallInst *CI =
+          Builder.CreateIntrinsic(Intrinsic::amdgcn_workgroup_size_y, {}, {});
+      GroupSizes[1]->replaceAllUsesWith(CI);
+      F.addFnAttr("amdgpu-preload-work-group-size-y");
+    }
+
+    if (GroupSizes[2]) {
+      if (!tryAllocPreloadSGPRs(
+              2, ImplicitArgsBaseOffset + HIDDEN_GROUP_SIZE_Z_OFFSET,
+              LastExplicitArgOffset))
+        return;
+      LastExplicitArgOffset =
+          ImplicitArgsBaseOffset + HIDDEN_GROUP_SIZE_Z_OFFSET + 2;
+      CallInst *CI =
+          Builder.CreateIntrinsic(Intrinsic::amdgcn_workgroup_size_z, {}, {});
+      GroupSizes[2]->replaceAllUsesWith(CI);
+      F.addFnAttr("amdgpu-preload-work-group-size-z");
+    }
+  }
 };
 
 class AMDGPULowerKernelArguments : public FunctionPass {
@@ -282,6 +393,15 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
   KernArgSegment->addRetAttr(
       Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));
 
+  if (InPreloadSequence) {
+    // Alignment for first implicit arg is 4 from hidden_block_count_x.
+    const unsigned FirstImplicitArgAlignment = 4;
+    uint64_t ImplicitArgsBaseOffset =
+        alignTo(ExplicitArgOffset, Align(FirstImplicitArgAlignment)) +
+        BaseOffset;
+    PreloadInfo.tryAllocImplicitArgPreloadSGPRs(ImplicitArgsBaseOffset, Builder);
+  }
+
   return true;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 84ef9679ab9563..2765df6bc7fdae 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2444,8 +2444,8 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
   // these from the dispatch pointer.
 }
 
-// Allocate pre-loaded kernel arguemtns. Arguments to be preloading must be
-// sequential starting from the first argument.
+// Allocate pre-loaded kernel arguments. Preloaded arguments must be
+// sequential and preloading must also start from the first argument.
 void SITargetLowering::allocatePreloadKernArgSGPRs(
     CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
     const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF,
@@ -2456,6 +2456,7 @@ void SITargetLowering::allocatePreloadKernArgSGPRs(
   GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
   bool InPreloadSequence = true;
   unsigned InIdx = 0;
+  const Align KernelArgBaseAlign = Align(16);
   for (auto &Arg : F.args()) {
     if (!InPreloadSequence || !Arg.hasInRegAttr())
       break;
@@ -2472,7 +2473,6 @@ void SITargetLowering::allocatePreloadKernArgSGPRs(
          InIdx++) {
       assert(ArgLocs[ArgIdx].isMemLoc());
       auto &ArgLoc = ArgLocs[InIdx];
-      const Align KernelArgBaseAlign = Align(16);
       unsigned ArgOffset = ArgLoc.getLocMemOffset();
       Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
       unsigned NumAllocSGPRs =
@@ -2511,6 +2511,88 @@ void SITargetLowering::allocatePreloadKernArgSGPRs(
       LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
     }
   }
+
+  if (Info.hasWorkGroupSizeX() || Info.hasWorkGroupSizeY() ||
+      Info.hasWorkGroupSizeZ()) {
+    unsigned ImplicitArgsBaseOffset = 0;
+    unsigned ImplictArgsBaseIdx = MF.getFunction().arg_size();
+    for (auto &Arg : MF.getFunction().args()) {
+      Type *Ty;
+      MaybeAlign Align;
+      if (Arg.hasByRefAttr()) {
+        Ty = Arg.getParamByRefType();
+        Align = Arg.getParamAlign();
+      } else {
+        Ty = Arg.getType();
+        Align = MF.getDataLayout().getABITypeAlign(Ty);
+      }
+      auto Size = MF.getDataLayout().getTypeAllocSize(Ty);
+      ImplicitArgsBaseOffset = alignTo(ImplicitArgsBaseOffset, *Align);
+      ImplicitArgsBaseOffset += Size;
+    }
+    unsigned ImplicitArgBaseSGPROffset = alignTo(ImplicitArgsBaseOffset, 4) / 4;
+    assert(ImplicitArgBaseSGPROffset <
+           AMDGPU::getMaxNumUserSGPRs(MF.getSubtarget()));
+    Info.allocateUserSGPRs(ImplicitArgBaseSGPROffset);
+
+    unsigned AllocatedSGPRs = ImplicitArgBaseSGPROffset;
+    // FIXME: Create helper/refactor.
+    if (Info.hasWorkGroupSizeX()) {
+      unsigned Offset = ImplicitArgsBaseOffset + 12;
+      unsigned ImplictArgIdx = ImplictArgsBaseIdx + 3;
+      Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
+      unsigned Padding = alignTo(Offset, 4) / 4 - AllocatedSGPRs;
+      if (Alignment < 4)
+        Padding -= 1;
+      // Byte offset for data in preload SGPRs.
+      unsigned ByteOffset = Alignment < 4 ? 2 : 0;
+      SmallVectorImpl<MCRegister> *PreloadRegs = Info.addPreloadedKernArg(
+          TRI, &AMDGPU::SReg_32RegClass, 1, ImplictArgIdx, Padding, ByteOffset);
+
+      MCRegister Reg = PreloadRegs->front();
+      assert(Reg);
+      MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+      CCInfo.AllocateReg(Reg);
+    }
+
+    if (Info.hasWorkGroupSizeY()) {
+      unsigned Offset = ImplicitArgsBaseOffset + 14;
+      unsigned ImplictArgIdx = ImplictArgsBaseIdx + 4;
+      Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
+      unsigned Padding = alignTo(Offset, 4) / 4 - AllocatedSGPRs;
+      if (Alignment < 4)
+        Padding -= 1;
+
+      // Byte offset for data in preload SGPRs.
+      unsigned ByteOffset = Alignment < 4 ? 2 : 0;
+      SmallVectorImpl<MCRegister> *PreloadRegs = Info.addPreloadedKernArg(
+          TRI, &AMDGPU::SReg_32RegClass, 1, ImplictArgIdx, Padding, ByteOffset);
+
+      MCRegister Reg = PreloadRegs->front();
+      assert(Reg);
+      MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+      CCInfo.AllocateReg(Reg);
+    }
+
+    if (Info.hasWorkGroupSizeZ()) {
+      unsigned Offset = ImplicitArgsBaseOffset + 16;
+      unsigned ImplictArgIdx = ImplictArgsBaseIdx + 5;
+      Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
+      unsigned Padding = alignTo(Offset, 4) / 4 - AllocatedSGPRs;
+      if (Alignment < 4)
+        Padding -= 1;
+
+      // Byte offset for data in preload SGPRs.
+      unsigned ByteOffset = Alignment < 4 ? 2 : 0;
+      SmallVectorImpl<MCRegister> *PreloadRegs = Info.addPreloadedKernArg(
+          TRI, &AMDGPU::SReg_32RegClass, 1, ImplictArgIdx, Padding, ByteOffset);
+
+      MCRegister Reg = PreloadRegs->front();
+      assert(Reg);
+      MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+      CCInfo.AllocateReg(Reg);
+    }
+  }
 }
 
 void SITargetLowering::allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF,
@@ -8325,6 +8407,45 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                         Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
                         IndexKeyi32, Op.getOperand(7)});
   }
+  case Intrinsic::amdgcn_workgroup_size_x:
+  case Intrinsic::amdgcn_workgroup_size_y:
+  case Intrinsic::amdgcn_workgroup_size_z: {
+    const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+    MachineRegisterInfo &MRI = MF.getRegInfo();
+    assert(ST.hasKernargPreload());
+    SDLoc DL(Op);
+    unsigned ImplictArgsBaseIdx = MF.getFunction().arg_size();
+    unsigned ImplictArgIdx = ImplictArgsBaseIdx;
+    switch (IntrinsicID) {
+    case Intrinsic::amdgcn_workgroup_size_x:
+      ImplictArgIdx = ImplictArgsBaseIdx + 3;
+      break;
+    case Intrinsic::amdgcn_workgroup_size_y:
+      ImplictArgIdx = ImplictArgsBaseIdx + 4;
+      break;
+    case Intrinsic::amdgcn_workgroup_size_z:
+      ImplictArgIdx = ImplictArgsBaseIdx + 5;
+      break;
+    }
+
+    auto &ArgInfo = MFI->getArgInfo()
+                       .PreloadKernArgs.find(ImplictArgIdx)
+                       ->getSecond();
+    Register Reg = ArgInfo.Regs[0];
+    unsigned ByteOffset = ArgInfo.ByteOffset;
+    Register VReg = MRI.getLiveInVirtReg(Reg);
+    SDValue Preload =
+        DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i32);
+    if (ByteOffset == 0) {
+      Preload = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Preload);
+    } else {
+      SDValue ShiftAmt = DAG.getConstant(16, DL, MVT::i32);
+      SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Preload, ShiftAmt);
+
+      Preload = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Extract);
+    }
+    return Preload;
+  }
   default:
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
             AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 52d6fe6c7ba51c..0a85af0f5ac1c3 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -42,6 +42,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
       WorkGroupIDZ(false), WorkGroupInfo(false), LDSKernelId(false),
       PrivateSegmentWaveByteOffset(false), WorkItemIDX(false),
       WorkItemIDY(false), WorkItemIDZ(false), ImplicitArgPtr(false),
+      WorkGroupSizeX(false), WorkGroupSizeY(false), WorkGroupSizeZ(false),
       GITPtrHigh(0xffffffff), HighBitsOf32BitAddress(0) {
   const GCNSubtarget &ST = *static_cast<const GCNSubtarget *>(STI);
   FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
@@ -58,6 +59,15 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
   if (IsKernel) {
     WorkGroupIDX = true;
     WorkItemIDX = true;
+    if (F.hasFnAttribute("amdgpu-preload-work-group-size-x"))
+      WorkGroupSizeX = true;
+
+    if (F.hasFnAttribute("amdgpu-preload-work-group-size-y"))
+      WorkGroupSizeY = true;
+
+    if (F.hasFnAttribute("amdgpu-preload-work-group-size-z"))
+      WorkGroupSizeZ = true;
+
   } else if (CC == CallingConv::AMDGPU_PS) {
     PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
   }
@@ -245,7 +255,8 @@ Register SIMachineFunctionInfo::addLDSKernelId() {
 
 SmallVectorImpl<MCRegister> *SIMachineFunctionInfo::addPreloadedKernArg(
     const SIRegisterInfo &TRI, const TargetRegisterClass *RC,
-    unsigned AllocSizeDWord, int KernArgIdx, int PaddingSGPRs) {
+    unsigned AllocSizeDWord, int KernArgIdx, int PaddingSGPRs,
+    unsigned ByteOffset) {
   assert(!ArgInfo.PreloadKernArgs.count(KernArgIdx) &&
          "Preload kernel argument allocated twice.");
   NumUserSGPRs += PaddingSGPRs;
@@ -254,6 +265,7 @@ SmallVectorImpl<MCRegister> *SIMachineFunctionInfo::addPreloadedKernArg(
   // merge them.
   Register PreloadReg =
       TRI.getMatchingSuperReg(getNextUserSGPR(), AMDGPU::sub0, RC);
+  ArgInfo.PreloadKernArgs[KernArgIdx].ByteOffset = ByteOffset;
   if (PreloadReg &&
       (RC == &AMDGPU::SReg_32RegClass || RC == &AMDGPU::SReg_64RegClass)) {
     ArgInfo.PreloadKernArgs[KernArgIdx].Regs.push_back(PreloadReg);
@@ -270,6 +282,15 @@ SmallVectorImpl<MCRegister> *SIMachineFunctionInfo::addPreloadedKernArg(
   return &ArgInfo.PreloadKernArgs[KernArgIdx].Regs;
 }
 
+bool SIMachineFunctionInfo::allocateUserSGPRs(
+    unsigned Number) {
+  if (Number <= getNumUserSGPRs())
+    return false;
+
+  NumUserSGPRs = Number;
+  return true;
+}
+
 void SIMachineFunctionInfo::allocateWWMSpill(MachineFunction &MF, Register VGPR,
                                              uint64_t Size, Align Alignment) {
   // Skip if it is an entry function or the register is already added.
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 0336ec4985ea74..64dc7e78a94186 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -458,6 +458,10 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   // user arguments. This is an offset from the KernargSegmentPtr.
   bool ImplicitArgPtr : 1;
 
+  bool WorkGroupSizeX : 1;
+  bool WorkGroupSizeY : 1;
+  bool WorkGroupSizeZ : 1;
+
   bool MayNeedAGPRs : 1;
 
   // The hard-wired high half of the address of the global information table
@@ -740,8 +744,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   Register addLDSKernelId();
   SmallVectorImpl<MCRegister> *
   addPreloadedKernArg(const SIRegisterInfo &TRI, const TargetRegisterClass *RC,
-                      unsigned AllocSizeDWord, int KernArgIdx,
-                      int PaddingSGPRs);
+                      unsigned AllocSizeDWord, int KernArgIdx, int PaddingSGPRs,
+                      unsigned Offset = 0);
+
+  /// Reserve up to \p Number of user SGPRs.
+  bool allocateUserSGPRs(unsigned Number);
 
   /// Increment user SGPRs used for padding the argument list only.
   Register addReservedUserSGPR() {
@@ -837,6 +844,18 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
     return ImplicitArgPtr;
   }
 
+  bool hasWorkGroupSizeX() const {
+    return WorkGroupSizeX;
+  }
+
+  bool hasWorkGroupSizeY() const {
+    return WorkGroupSizeY;
+  }
+
+  bool hasWorkGroupSizeZ() const {
+    return WorkGroupSizeZ;
+  }
+
   AMDGPUFunctionArgI...
[truncated]

github-actions bot commented Mar 4, 2024

⚠️ C/C++ code formatter, clang-format found issues in your code. ⚠️

You can test this locally with the following command:
git-clang-format --diff 0d493ed2c6e664849a979b357a606dcd8273b03f d8081e767a911d0abf09c703ba98941011aecd58 -- llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp llvm/lib/Target/AMDGPU/SIISelLowering.cpp llvm/lib/Target/AMDGPU/SIISelLowering.h llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
View the diff from clang-format here.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
index 2434c7a1f0..ca45ecfee8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
@@ -76,9 +76,7 @@ public:
     return StackOffset;
   }
 
-  void setMask(unsigned Mask) {
-    this->Mask = Mask;
-  }
+  void setMask(unsigned Mask) { this->Mask = Mask; }
 
   unsigned getMask() const {
     return Mask;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 62afe7daaa..b6260ac871 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2465,8 +2465,7 @@ void SITargetLowering::allocatePreloadImplicitKernArgSGPRs(
     unsigned Padding = alignTo(ArgOffset, 4) / 4 - AllocatedSGPRs;
     if (Padding == 0 && Alignment < 4) {
       // Argument is preloaded into the previous SGPR.
-      auto &KernargPreloadInfo =
-          Info.getArgInfo().PreloadKernArgs[ArgOffset];
+      auto &KernargPreloadInfo = Info.getArgInfo().PreloadKernArgs[ArgOffset];
       KernargPreloadInfo.Regs.push_back(
           Info.getArgInfo().PreloadKernArgs[LastArgOffset].Regs[0]);
       KernargPreloadInfo.setMask(0xffff0000u);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index c218999c9e..1e82c51632 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -585,10 +585,9 @@ public:
     const SIRegisterInfo &TRI,
     SIMachineFunctionInfo &Info) const;
 
-  void allocatePreloadImplicitKernArgSGPRs(CCState &CCInfo,
-                                        MachineFunction &MF,
-                                        const SIRegisterInfo &TRI,
-                                        SIMachineFunctionInfo &Info) const;
+  void allocatePreloadImplicitKernArgSGPRs(CCState &CCInfo, MachineFunction &MF,
+                                           const SIRegisterInfo &TRI,
+                                           SIMachineFunctionInfo &Info) const;
 
   void allocateSpecialInputVGPRs(CCState &CCInfo,
                                  MachineFunction &MF,
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index d57b71bfc0..62cd8df718 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -252,8 +252,7 @@ Register SIMachineFunctionInfo::addLDSKernelId() {
 
 SmallVectorImpl<MCRegister> *SIMachineFunctionInfo::addPreloadedKernArg(
     const SIRegisterInfo &TRI, const TargetRegisterClass *RC,
-    unsigned AllocSizeDWord, int KernArgIdx, int PaddingSGPRs,
-    unsigned Mask) {
+    unsigned AllocSizeDWord, int KernArgIdx, int PaddingSGPRs, unsigned Mask) {
   assert(!ArgInfo.PreloadKernArgs.count(KernArgIdx) &&
          "Preload kernel argument allocated twice.");
   NumUserSGPRs += PaddingSGPRs;
@@ -279,8 +278,7 @@ SmallVectorImpl<MCRegister> *SIMachineFunctionInfo::addPreloadedKernArg(
   return &ArgInfo.PreloadKernArgs[KernArgIdx].Regs;
 }
 
-bool SIMachineFunctionInfo::allocateUserSGPRs(
-    unsigned Number) {
+bool SIMachineFunctionInfo::allocateUserSGPRs(unsigned Number) {
   if (Number <= getNumUserSGPRs())
     return false;
 
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index f9dca67a8b..947d4268bf 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -862,9 +862,7 @@ public:
     return ImplicitArgPtr;
   }
 
-  bool hasPreloadImplicitArgs() const {
-    return PreloadImplicitArgs;
-  }
+  bool hasPreloadImplicitArgs() const { return PreloadImplicitArgs; }
 
   AMDGPUFunctionArgInfo &getArgInfo() {
     return ArgInfo;

@arsenm (Contributor) left a comment

How is this ultimately getting encoded? Do we need the special casing, or can we just identify ranges of accessed implicit arg bytes?

llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp (outdated review thread, resolved)
llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp (outdated review thread, resolved)
llvm/include/llvm/IR/IntrinsicsAMDGPU.td (outdated review thread, resolved)
@kerbowa force-pushed the preload-implict-kernargs-groupsize branch from f2e3ef0 to 52595ae on March 18, 2024 08:01
@kerbowa changed the title from "[AMDGPU] Add support for preloading hidden groupsize args" to "[AMDGPU] Add support for preloading implicit kernel arguments" on Mar 18, 2024
@kerbowa force-pushed the preload-implict-kernargs-groupsize branch from 52595ae to eaf3951 on April 1, 2024 06:40
@kerbowa requested review from bcahoon and rampitec on April 8, 2024 15:06
@kerbowa requested a review from arsenm on April 22, 2024 15:02
llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp (outdated review thread, resolved)
llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp (outdated review thread, resolved)
Comment on lines +144 to +140
if (HasPreloadImplicitArgs)
F.addFnAttr("amdgpu-preload-implicitargs");
Contributor

Can you avoid adding this attribute?

Member Author (kerbowa)

This was added to avoid searching for these intrinsics every time when allocating user SGPRs, which happens right before isel. It could be avoided if we either add the intrinsics later (when allocating user SGPRs) or if I just get rid of the intrinsic entirely. I commented on the latter option below.

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (outdated review thread, resolved)
Comment on lines 5504 to 5510
// A unique identifier defined as the offset from start of implicit args added
// to the number of formal args.
unsigned ImplictArgIdx =
MI.getOperand(2).getImm() + MF.getFunction().arg_size();
auto &ArgDesc =
MFI->getArgInfo().PreloadKernArgs.find(ImplictArgIdx)->getSecond();
Contributor

I think representing this in terms of indexes of implicit arguments will be confusing. The arguments do not have uniform size. Can this be expressed as a raw byte offset instead?

Member Author (kerbowa)

Here, it is a byte offset from the start of the implicit arg ptr plus the index of the last explicit argument so that there are no overlaps. I've changed it to be the raw offset into the kernarg segment.

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (outdated review thread, resolved)
Comment on lines 89 to 82
if (!U->hasOneUse())
continue;
Contributor

This will usually have multiple uses, so I don't understand this. It's expected to appear with multiple GEP users

Member Author (kerbowa)

U is either a GEP with one user, which should be the load of the implicit argument, or U is a load that uses the implicitarg ptr directly. This is the same logic that is in AMDGPULowerKernelAttributes, but come to think of it, maybe we should avoid the check for a single user in both places, because in the case where we use the implicitarg ptr directly, that load may have multiple uses. I guess it's only relevant for block_count_x (offset 0), though. See the sketch below.
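
For illustration (the offsets and value names are assumptions following the hidden-argument layout used in this patch), the two user shapes in question look roughly like this; the one-use check is only too strict for the second:

define amdgpu_kernel void @user_shapes(ptr addrspace(1) %out) {
  %implicitarg = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()

  ; Shape 1: a GEP whose single user is the load of the implicit argument
  ; (hidden_group_size_y at offset 14).
  %gep = getelementptr i8, ptr addrspace(4) %implicitarg, i64 14
  %gsy = load i16, ptr addrspace(4) %gep, align 2

  ; Shape 2: a load directly from the implicitarg pointer
  ; (hidden_block_count_x at offset 0). Here the "user" of the pointer is the
  ; load itself, and its result may have several uses.
  %bcx = load i32, ptr addrspace(4) %implicitarg, align 8
  %gsy.ext = zext i16 %gsy to i32
  %t0 = add i32 %bcx, %gsy.ext
  %t1 = add i32 %t0, %bcx              ; second use of %bcx
  store i32 %t1, ptr addrspace(1) %out
  ret void
}

declare ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()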

llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp (outdated review thread, resolved)
llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp (outdated review thread, resolved)
Comment on lines +3214 to +3186
// This intrinsic is used to track the user SGPRs that hold implicit kernel
// arguments. The i32 argument for this intrinsic represents the offset in
// bytes from the value pointed to by the implicitarg pointer.
def int_amdgcn_preload_implicitarg :
@arsenm (Contributor) commented Apr 26, 2024

Fundamentally implicit arguments aren't different from a user argument. Why do these require a special intrinsic to track them if we already handle the user case? If we do need this intrinsic, I think in terms of patch splitting it would be better to introduce this separately with a dedicated test showing it works for the full range of legal types

Member Author (kerbowa)

I'd like to avoid needing an intrinsic. The issue is how to identify the loads that use the implicitarg pointer so that user SGPRs can be allocated properly before isel, and then also identify and replace these loads during isel. The best idea I've come up with is to check ahead of time and use an intrinsic.

We allocate user SGPRs, then system SGPRs directly after. If it is possible to delay the allocation of system SGPRs until the end of isel, then it's doable. What do you think?

@arsenm (Contributor) commented May 8, 2024

This flow sounds backwards. You shouldn't need to identify the loads, they shouldn't appear in CodeGen in the first place. I would expect the IR pass marking the inreg arguments to replace the load uses with the direct argument use. The argument lowering would then just emit the CopyFromReg from the correct position for the inreg cases (instead of the legacy emit load lowering we have now).

In an ideal future, all the kernel arguments are byref pointer arguments, and only inreg user SGPR eligible arguments are in the kernel signature (excluding the byref pointers)
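
A rough sketch of that end state (the struct layout, names, and attributes below are illustrative, not an existing ABI): only SGPR-eligible values remain as inreg scalar parameters, and everything else, including the hidden arguments, is reached through byref pointers into the kernarg segment, giving explicit and implicit arguments a uniform representation:

; Hidden block counts (3 x i32) followed by group sizes (3 x i16) -- illustrative layout.
%hidden.block = type { i32, i32, i32, i16, i16, i16 }

define amdgpu_kernel void @kern(i32 inreg %n,
                                ptr addrspace(1) inreg %out,
                                ptr addrspace(4) byref(%hidden.block) align 8 %hidden) {
  ; hidden_group_size_x lives at field 3 (byte offset 12) of the byref block.
  %gsx.ptr = getelementptr inbounds %hidden.block, ptr addrspace(4) %hidden, i32 0, i32 3
  %gsx = load i16, ptr addrspace(4) %gsx.ptr, align 2
  store i16 %gsx, ptr addrspace(1) %out
  ret void
}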

Member Author (kerbowa)

Am I understanding correctly that you think implicit hidden arguments should be added to the function signature in some way? Or what do you mean by "IR pass marking the inreg arguments to replace the load uses with the direct argument use"? Maybe that should just be done in the frontend instead of an IR pass, since that is where this lowering to loads currently happens. It makes sense to me that they should be actual arguments, but I'm not sure about the consequences of doing that.

Member Author (kerbowa)

Gentle ping. Could you expand on what you mean here?

Contributor

I forgot this was specifically about the implicit arguments, but essentially yes. You could rewrite the IR signature of the function, giving you uniformity of representation, although it's a bit cumbersome to do (you have to create a new function with the new type, then splice the body and steal the name).

I have a long-term desire to have clang emit byref for every kernel argument and avoid the triple handling in AMDGPULowerKernelArguments and the two selector implementations. We would then have a uniform representation where non-byref arguments are passed in registers. The blocker before was that you lose some optimizations from attributes that traditionally can only be applied to arguments. The main one was noalias, but we have metadata now that works in the load case, so it's probably a reasonable time to revisit that project.

Member Author (kerbowa)

I believe rewriting the function signature in this way would be an ABI breaking change that would require coordination beyond the compiler.

Do you think that the current approach in this PR could be feasible in the meantime, or do you think that changing the handling of implicit arguments in this way is something we should move towards right away?

@kerbowa force-pushed the preload-implict-kernargs-groupsize branch from eaf3951 to d8081e7 on May 6, 2024 05:53