diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index 2d8f259007c66..93f9c7d7fb176 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -955,6 +955,7 @@ class AMDGPULowerModuleLDS { Module &M, LDSUsesInfoTy &LDSUsesInfo, VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) { bool Changed = false; + const DataLayout &DL = M.getDataLayout(); // The 1st round: give module-absolute assignments int NumAbsolutes = 0; std::vector<GlobalVariable *> OrderedGVs; @@ -976,8 +977,11 @@ class AMDGPULowerModuleLDS { } OrderedGVs = sortByName(std::move(OrderedGVs)); for (GlobalVariable *GV : OrderedGVs) { - int BarId = ++NumAbsolutes; unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; + unsigned BarId = NumAbsolutes + 1; + unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16; + NumAbsolutes += BarCnt; + // 4 bits for alignment, 5 bits for the barrier num, // 3 bits for the barrier scope unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4; @@ -1015,12 +1019,11 @@ class AMDGPULowerModuleLDS { // create a new GV used only by this kernel and its function. 
auto NewGV = uniquifyGVPerKernel(M, GV, F); Changed |= (NewGV != GV); - int BarId = (NumAbsolutes + 1); - if (Kernel2BarId.contains(F)) { - BarId = (Kernel2BarId[F] + 1); - } - Kernel2BarId[F] = BarId; unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; + unsigned BarId = Kernel2BarId[F]; + BarId += NumAbsolutes + 1; + unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16; + Kernel2BarId[F] += BarCnt; unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4; recordLDSAbsoluteAddress(&M, NewGV, Offset); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp index dfe7c53aaca06..5776d14a3020a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp @@ -31,28 +31,40 @@ Align getAlign(const DataLayout &DL, const GlobalVariable *GV) { GV->getValueType()); } -TargetExtType *isNamedBarrier(const GlobalVariable &GV) { - // TODO: Allow arrays and structs, if all members are barriers - // in the same scope. - // TODO: Disallow other uses of target("amdgcn.named.barrier") including: - // - Structs containing barriers in different scope. - // - Structs containing a mixture of barriers and other data. - // - Globals in other address spaces. - // - Allocas. +// Returns the target extension type of a global variable, +// which can only be a TargetExtType, an array or single-element struct of it, +// or their nesting combination. +// TODO: allow struct of multiple TargetExtType elements of the same type. +// TODO: Disallow other uses of target("amdgcn.named.barrier") including: +// - Structs containing barriers in different scope/rank +// - Structs containing a mixture of barriers and other data. +// - Globals in other address spaces. +// - Allocas. +static TargetExtType *getTargetExtType(const GlobalVariable &GV) { Type *Ty = GV.getValueType(); while (true) { if (auto *TTy = dyn_cast<TargetExtType>(Ty)) - return TTy->getName() == "amdgcn.named.barrier" ? 
TTy : nullptr; + return TTy; if (auto *STy = dyn_cast<StructType>(Ty)) { - if (STy->getNumElements() == 0) + if (STy->getNumElements() != 1) return nullptr; Ty = STy->getElementType(0); continue; } + if (auto *ATy = dyn_cast<ArrayType>(Ty)) { + Ty = ATy->getElementType(); + continue; + } return nullptr; } } +TargetExtType *isNamedBarrier(const GlobalVariable &GV) { + if (TargetExtType *Ty = getTargetExtType(GV)) + return Ty->getName() == "amdgcn.named.barrier" ? Ty : nullptr; + return nullptr; +} + bool isDynamicLDS(const GlobalVariable &GV) { // external zero size addrspace(3) without initializer is dynlds. const Module *M = GV.getParent(); diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll index 0804a52ba536d..03a666fbe3aea 100644 --- a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll +++ b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll @@ -1,16 +1,18 @@ ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=SOUT %s -@bar2 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison +%class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) } + +@bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison -@bar1 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison +@bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison -; CHECK: @bar2 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol !0 +; CHECK: @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol !0 ; CHECK-NEXT: @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol !1 -; CHECK-NEXT: @bar1 = internal addrspace(3) global 
target("amdgcn.named.barrier", 0) poison, !absolute_symbol !2 -; CHECK-NEXT: @bar1.kernel1 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol !2 +; CHECK-NEXT: @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol !2 +; CHECK-NEXT: @bar1.kernel1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol !2 -; SOUT: .set func1.num_named_barrier, 3 +; SOUT: .set func1.num_named_barrier, 7 define void @func1() { call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7) call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3) @@ -18,7 +20,7 @@ define void @func1() { ret void } -; SOUT: .set func2.num_named_barrier, 1 +; SOUT: .set func2.num_named_barrier, 2 define void @func2() { call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7) call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2) @@ -26,8 +28,8 @@ define void @func2() { ret void } -; SOUT: .amdhsa_named_barrier_count 1 -; SOUT: .set kernel1.num_named_barrier, max(2, func1.num_named_barrier, func2.num_named_barrier) +; SOUT: .amdhsa_named_barrier_count 2 +; SOUT: .set kernel1.num_named_barrier, max(6, func1.num_named_barrier, func2.num_named_barrier) define amdgpu_kernel void @kernel1() #0 { ; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1.kernel1, i32 11) call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11) @@ -40,8 +42,8 @@ define amdgpu_kernel void @kernel1() #0 { ret void } -; SOUT: .amdhsa_named_barrier_count 1 -; SOUT: .set kernel2.num_named_barrier, max(2, func2.num_named_barrier) +; SOUT: .amdhsa_named_barrier_count 2 +; SOUT: .set kernel2.num_named_barrier, max(6, func2.num_named_barrier) define amdgpu_kernel void @kernel2() #0 { ; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9) call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) 
@bar1, i32 9) @@ -68,5 +70,5 @@ attributes #1 = { convergent nounwind } attributes #2 = { nounwind readnone } ; CHECK: !0 = !{i32 8396816, i32 8396817} -; CHECK-NEXT: !1 = !{i32 8396848, i32 8396849} -; CHECK-NEXT: !2 = !{i32 8396832, i32 8396833} +; CHECK-NEXT: !1 = !{i32 8396912, i32 8396913} +; CHECK-NEXT: !2 = !{i32 8396848, i32 8396849}