AMDGPU: Fix allocating GDS globals to LDS offsets
GDS globals don't seem to be very well used or tested, but this tries to make
their allocation behave a bit more consistently with LDS globals.

I'm not sure exactly what the amdgpu-gds-size attribute is supposed to mean.
For now, assume it reserves a static size at the beginning of the GDS
allocation, and any known globals are allocated after it.
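
To make that assumption concrete, here is a minimal standalone sketch of the intended arithmetic (not the in-tree allocator; the attribute value and the global's size and alignment below are made up for illustration):

// Hypothetical model of the assumed "amdgpu-gds-size" semantics: the attribute
// reserves bytes starting at GDS offset 0, and each known GDS global is placed
// at the next suitably aligned offset after that reservation.
#include <cstdint>
#include <cstdio>

// Round Offset up to a multiple of Align (Align must be a power of two).
static uint32_t alignTo(uint32_t Offset, uint32_t Align) {
  return (Offset + Align - 1) & ~(Align - 1);
}

int main() {
  uint32_t StaticGDSSize = 1024; // hypothetical "amdgpu-gds-size"="1024"

  // A 16-byte GDS global with 8-byte alignment lands after the reservation.
  uint32_t Offset = StaticGDSSize = alignTo(StaticGDSSize, 8);
  StaticGDSSize += 16;

  printf("global offset = %u, total GDS size = %u\n",
         (unsigned)Offset, (unsigned)StaticGDSSize);
  // Prints: global offset = 1024, total GDS size = 1040
  return 0;
}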
arsenm committed Apr 20, 2022
1 parent 378bb80 commit b5ec131
Showing 6 changed files with 172 additions and 23 deletions.
36 changes: 27 additions & 9 deletions llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -32,6 +32,15 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF)
Attribute WaveLimitAttr = F.getFnAttribute("amdgpu-wave-limiter");
WaveLimiter = WaveLimitAttr.getValueAsBool();

+  // FIXME: How is this attribute supposed to interact with statically known
+  // global sizes?
+  StringRef S = F.getFnAttribute("amdgpu-gds-size").getValueAsString();
+  if (!S.empty())
+    S.consumeInteger(0, GDSSize);
+
+  // Assume the attribute allocates before any known GDS globals.
+  StaticGDSSize = GDSSize;
+
CallingConv::ID CC = F.getCallingConv();
if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL)
ExplicitKernArgSize = ST.getExplicitKernArgSize(F, MaxKernArgAlign);
@@ -46,18 +55,27 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
Align Alignment =
DL.getValueOrABITypeAlignment(GV.getAlign(), GV.getValueType());

-  /// TODO: We should sort these to minimize wasted space due to alignment
-  /// padding. Currently the padding is decided by the first encountered use
-  /// during lowering.
-  unsigned Offset = StaticLDSSize = alignTo(StaticLDSSize, Alignment);
+  unsigned Offset;
+  if (GV.getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+    /// TODO: We should sort these to minimize wasted space due to alignment
+    /// padding. Currently the padding is decided by the first encountered use
+    /// during lowering.
+    Offset = StaticLDSSize = alignTo(StaticLDSSize, Alignment);

-  Entry.first->second = Offset;
-  StaticLDSSize += DL.getTypeAllocSize(GV.getValueType());
+    StaticLDSSize += DL.getTypeAllocSize(GV.getValueType());

-  // Update the LDS size considering the padding to align the dynamic shared
-  // memory.
-  LDSSize = alignTo(StaticLDSSize, DynLDSAlign);
+    // Update the LDS size considering the padding to align the dynamic shared
+    // memory.
+    LDSSize = alignTo(StaticLDSSize, DynLDSAlign);
+  } else {
+    Offset = StaticGDSSize = alignTo(StaticGDSSize, Alignment);
+    StaticGDSSize += DL.getTypeAllocSize(GV.getValueType());
+
+    // FIXME: Apply alignment of dynamic GDS
+    GDSSize = StaticGDSSize;
+  }

+  Entry.first->second = Offset;
return Offset;
}
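
A rough standalone model of the control flow above: LDS and GDS offsets are tracked by independent counters, so allocating in one address space never moves the other. The sizes and alignments below mirror @gds0, @lds0 and @lds1 from the new gds-allocation.ll test; the AMDGPUAS::LOCAL_ADDRESS check and the DynLDSAlign padding are omitted.

// Simplified, self-contained model of allocateLDSGlobal above (not the
// in-tree code): one counter per address space, align then bump.
#include <cstdint>
#include <cstdio>

static uint32_t alignTo(uint32_t Offset, uint32_t Align) {
  return (Offset + Align - 1) & ~(Align - 1);
}

struct Allocator {
  uint32_t StaticLDSSize = 0;
  uint32_t StaticGDSSize = 0; // would be seeded from "amdgpu-gds-size"

  uint32_t allocate(bool IsLDS, uint32_t Size, uint32_t Align) {
    uint32_t &Counter = IsLDS ? StaticLDSSize : StaticGDSSize;
    uint32_t Offset = Counter = alignTo(Counter, Align);
    Counter += Size;
    return Offset;
  }
};

int main() {
  Allocator A;
  // Each global is [4 x i32] (16 bytes), as in gds-allocation.ll.
  printf("gds0 at %u\n", (unsigned)A.allocate(false, 16, 4));  // 0
  printf("lds0 at %u\n", (unsigned)A.allocate(true, 16, 128)); // 0
  printf("lds1 at %u\n", (unsigned)A.allocate(true, 16, 256)); // 256
  // The LDS allocations never touched StaticGDSSize, so a later GDS global
  // would still be placed at offset 16.
  return 0;
}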

11 changes: 9 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -12,6 +12,7 @@
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "AMDGPU.h"

namespace llvm {

@@ -25,11 +26,13 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
Align MaxKernArgAlign; // Cache for this.

/// Number of bytes in the LDS that are being used.
-  unsigned LDSSize = 0;
+  uint32_t LDSSize = 0;
+  uint32_t GDSSize = 0;

/// Number of bytes in the LDS allocated statically. This field is only used
/// in the instruction selector and not part of the machine function info.
-  unsigned StaticLDSSize = 0;
+  uint32_t StaticLDSSize = 0;
+  uint32_t StaticGDSSize = 0;

/// Align for dynamic shared memory if any. Dynamic shared memory is
/// allocated directly after the static one, i.e., LDSSize. Need to pad
@@ -69,6 +72,10 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
return LDSSize;
}

+  uint32_t getGDSSize() const {
+    return GDSSize;
+  }
+
AMDGPU::SIModeRegisterDefaults getMode() const {
return Mode;
}
7 changes: 1 addition & 6 deletions llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -48,8 +48,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
ImplicitBufferPtr(false),
ImplicitArgPtr(false),
GITPtrHigh(0xffffffff),
-    HighBitsOf32BitAddress(0),
-    GDSSize(0) {
+    HighBitsOf32BitAddress(0) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const Function &F = MF.getFunction();
FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
@@ -184,10 +183,6 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
if (!S.empty())
S.consumeInteger(0, HighBitsOf32BitAddress);

-  S = F.getFnAttribute("amdgpu-gds-size").getValueAsString();
-  if (!S.empty())
-    S.consumeInteger(0, GDSSize);
-
// On GFX908, in order to guarantee copying between AGPRs, we need a scratch
// VGPR available at all times.
if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
5 changes: 0 additions & 5 deletions llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -437,7 +437,6 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
unsigned GITPtrHigh;

unsigned HighBitsOf32BitAddress;
-  unsigned GDSSize;

// Current recorded maximum possible occupancy.
unsigned Occupancy;
@@ -748,10 +747,6 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
return HighBitsOf32BitAddress;
}

-  unsigned getGDSSize() const {
-    return GDSSize;
-  }
-
unsigned getNumUserSGPRs() const {
return NumUserSGPRs;
}
132 changes: 132 additions & 0 deletions llvm/test/CodeGen/AMDGPU/gds-allocation.ll
@@ -0,0 +1,132 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

@gds0 = internal addrspace(2) global [4 x i32] undef, align 4
@lds0 = internal addrspace(3) global [4 x i32] undef, align 128
@lds1 = internal addrspace(3) global [4 x i32] undef, align 256

; These two objects should be allocated at the same constant offsets
; from the base.
define amdgpu_kernel void @alloc_lds_gds(i32 addrspace(1)* %out) #1 {
; GCN-LABEL: alloc_lds_gds:
; GCN: ; %bb.0:
; GCN-NEXT: v_mov_b32_e32 v0, 5
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_mov_b32 m0, 16
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: ds_add_u32 v1, v0 offset:12 gds
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_wbinvl1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_add_u32 v1, v0 offset:12
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_endpgm
%gep.gds = getelementptr [4 x i32], [4 x i32] addrspace(2)* @gds0, i32 0, i32 3
%val0 = atomicrmw add i32 addrspace(2)* %gep.gds, i32 5 acq_rel
%gep.lds = getelementptr [4 x i32], [4 x i32] addrspace(3)* @lds0, i32 0, i32 3
%val1 = atomicrmw add i32 addrspace(3)* %gep.lds, i32 5 acq_rel
ret void
}

; The LDS alignment shouldn't change offset of GDS.
define amdgpu_kernel void @alloc_lds_gds_align(i32 addrspace(1)* %out) #1 {
; GCN-LABEL: alloc_lds_gds_align:
; GCN: ; %bb.0:
; GCN-NEXT: v_mov_b32_e32 v0, 5
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_mov_b32 m0, 16
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: ds_add_u32 v1, v0 offset:12 gds
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_wbinvl1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_add_u32 v1, v0 offset:140
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_add_u32 v1, v0 offset:12
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_endpgm
%gep.gds = getelementptr [4 x i32], [4 x i32] addrspace(2)* @gds0, i32 0, i32 3
%val0 = atomicrmw add i32 addrspace(2)* %gep.gds, i32 5 acq_rel

%gep.lds0 = getelementptr [4 x i32], [4 x i32] addrspace(3)* @lds0, i32 0, i32 3
%val1 = atomicrmw add i32 addrspace(3)* %gep.lds0, i32 5 acq_rel

%gep.lds1 = getelementptr [4 x i32], [4 x i32] addrspace(3)* @lds1, i32 0, i32 3
%val2 = atomicrmw add i32 addrspace(3)* %gep.lds1, i32 5 acq_rel
ret void
}

@gds_align8 = internal addrspace(2) global [4 x i32] undef, align 8
@gds_align32 = internal addrspace(2) global [4 x i32] undef, align 32

define amdgpu_kernel void @gds_global_align(i32 addrspace(1)* %out) {
; GCN-LABEL: gds_global_align:
; GCN: ; %bb.0:
; GCN-NEXT: v_mov_b32_e32 v0, 5
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_mov_b32 m0, 32
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: ds_add_u32 v1, v0 offset:28 gds
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_wbinvl1
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: ds_add_u32 v1, v0 offset:12 gds
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_wbinvl1
; GCN-NEXT: s_endpgm
%gep.gds0 = getelementptr [4 x i32], [4 x i32] addrspace(2)* @gds_align8, i32 0, i32 3
%val0 = atomicrmw add i32 addrspace(2)* %gep.gds0, i32 5 acq_rel
%gep.gds1 = getelementptr [4 x i32], [4 x i32] addrspace(2)* @gds_align32, i32 0, i32 3
%val1 = atomicrmw add i32 addrspace(2)* %gep.gds1, i32 5 acq_rel
ret void
}

define amdgpu_kernel void @gds_global_align_plus_attr(i32 addrspace(1)* %out) #0 {
; GCN-LABEL: gds_global_align_plus_attr:
; GCN: ; %bb.0:
; GCN-NEXT: v_mov_b32_e32 v0, 5
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_movk_i32 m0, 0x420
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: ds_add_u32 v1, v0 offset:1052 gds
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_wbinvl1
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: ds_add_u32 v1, v0 offset:1036 gds
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_wbinvl1
; GCN-NEXT: s_endpgm
%gep.gds0 = getelementptr [4 x i32], [4 x i32] addrspace(2)* @gds_align8, i32 0, i32 3
%val0 = atomicrmw add i32 addrspace(2)* %gep.gds0, i32 5 acq_rel
%gep.gds1 = getelementptr [4 x i32], [4 x i32] addrspace(2)* @gds_align32, i32 0, i32 3
%val1 = atomicrmw add i32 addrspace(2)* %gep.gds1, i32 5 acq_rel
ret void
}

@small.gds = internal addrspace(2) global i8 undef, align 1
@gds.external = external unnamed_addr addrspace(3) global [0 x i32], align 4

define amdgpu_kernel void @gds_extern_align(i32 addrspace(1)* %out, [4 x i32] addrspace(2)* %gds.arg) #0 {
; GCN-LABEL: gds_extern_align:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s0, s[0:1], 0x8
; GCN-NEXT: v_mov_b32_e32 v0, 5
; GCN-NEXT: s_movk_i32 m0, 0x401
; GCN-NEXT: s_movk_i32 s1, 0x400
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; use s1
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: ds_add_u32 v1, v0 offset:12 gds
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_wbinvl1
; GCN-NEXT: s_endpgm
call void asm sideeffect "; use $0","s"(i8 addrspace(2)* @small.gds)
%gep.gds0 = getelementptr [4 x i32], [4 x i32] addrspace(2)* %gds.arg, i32 0, i32 3
%val0 = atomicrmw add i32 addrspace(2)* %gep.gds0, i32 5 acq_rel
ret void
}

attributes #0 = { "amdgpu-gds-size"="1024" }
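
Assuming m0 carries the total GDS size in bytes and each ds offset is a global's base offset plus the byte offset of element 3 within [4 x i32], the constants checked above can be reproduced by hand:

// Back-of-the-envelope check of the constants in the tests above (assumed
// meanings: m0 = total GDS size in bytes; ds offset = global base offset +
// byte offset of element 3 within [4 x i32]).
#include <cstdio>

int main() {
  // alloc_lds_gds: @gds0 and @lds0 are each the first object in their address
  // space, so both start at base offset 0.
  unsigned Elem3 = 3 * 4;                        // 12
  printf("ds offset = %u\n", 0 + Elem3);         // matches offset:12
  printf("m0 = %u\n", 0 + 16);                   // 16 bytes of GDS, matches m0, 16

  // gds_global_align_plus_attr: the 1024-byte attribute reservation comes
  // first; here the two 16-byte globals happen to pack after it with no padding.
  printf("m0 = 0x%x\n", 1024 + 16 + 16);         // 0x420, matches s_movk_i32 m0, 0x420
  return 0;
}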
4 changes: 3 additions & 1 deletion llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
@@ -53,7 +53,7 @@ define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
; CHECK-NEXT: explicitKernArgSize: 0
; CHECK-NEXT: maxKernArgAlign: 4
; CHECK-NEXT: ldsSize: 0
-; CHECK-NEXT: gdsSize: 0
+; CHECK-NEXT: gdsSize: 512
; CHECK-NEXT: dynLDSAlign: 1
; CHECK-NEXT: isEntryFunction: true
; CHECK-NEXT: noSignedZerosFPMath: false
@@ -80,6 +80,8 @@ define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
; CHECK-NEXT: occupancy: 10
; CHECK-NEXT: body:
define amdgpu_ps void @ps_shader(i32 %arg0, i32 inreg %arg1) {
+  %gep = getelementptr inbounds [128 x i32], [128 x i32] addrspace(2)* @gds, i32 0, i32 %arg0
+  atomicrmw add i32 addrspace(2)* %gep, i32 8 seq_cst
ret void
}
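
The updated check value is consistent with the global the added IR touches: the GEP type shows @gds is a [128 x i32] array in the GDS address space, so (assuming @ps_shader carries no amdgpu-gds-size attribute) the reported size is just that one object:

// Quick check of the new gdsSize value: @gds is [128 x i32] in addrspace(2).
#include <cstdio>

int main() {
  unsigned NumElts = 128, EltBytes = 4;
  printf("gdsSize = %u\n", NumElts * EltBytes); // 512, matching the CHECK line
  return 0;
}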
