
Commit 49d5bb0
[AMDGPU] Add amdgpu-lower-exec-sync pass to lower named-barrier globals (#165692)
This PR introduces the `amdgpu-lower-exec-sync` pass, which lowers the named-barrier LDS globals introduced by #114550. Changes include:

- Moving the logic for lowering named-barrier LDS globals out of the `amdgpu-lower-module-lds` pass and into the new pass.
- Adding the new pass to the pipeline and removing the existing named-barrier LDS lowering logic from `amdgpu-lower-module-lds`.

See #161827 for discussion on this topic.
1 parent d5cdfd4 commit 49d5bb0

13 files changed (+580, -130 lines)
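The new pass is exposed both as a legacy ModulePass and as a new-pass-manager pass. As a minimal editorial sketch, using only the declarations this commit adds to AMDGPU.h below (the commit's actual pipeline wiring lives in AMDGPUTargetMachine and is not shown in this excerpt; the pass is presumably also reachable via opt under a name matching its DEBUG_TYPE, "amdgpu-lower-exec-sync"), a client could schedule it like so:

    #include "AMDGPU.h"
    #include "llvm/IR/PassManager.h"

    using namespace llvm;

    // Add the exec-sync lowering to a module pipeline (new pass manager).
    static void addExecSyncLowering(ModulePassManager &MPM) {
      MPM.addPass(AMDGPULowerExecSyncPass());
    }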

llvm/lib/Target/AMDGPU/AMDGPU.h

Lines changed: 9 additions & 0 deletions

@@ -298,6 +298,15 @@ struct AMDGPUAlwaysInlinePass : PassInfoMixin<AMDGPUAlwaysInlinePass> {
   bool GlobalOpt;
 };
 
+void initializeAMDGPULowerExecSyncLegacyPass(PassRegistry &);
+extern char &AMDGPULowerExecSyncLegacyPassID;
+ModulePass *createAMDGPULowerExecSyncLegacyPass();
+
+struct AMDGPULowerExecSyncPass : PassInfoMixin<AMDGPULowerExecSyncPass> {
+  AMDGPULowerExecSyncPass() {}
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
 void initializeAMDGPUSwLowerLDSLegacyPass(PassRegistry &);
 extern char &AMDGPUSwLowerLDSLegacyPassID;
 ModulePass *
llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp (new file)

Lines changed: 240 additions & 0 deletions

@@ -0,0 +1,240 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Lower LDS global variables with target extension type "amdgpu.named.barrier"
// that require specialized address assignment. The pass assigns a unique
// barrier identifier to each named-barrier LDS variable and encodes
// this identifier within the !absolute_symbol metadata of that global.
// This encoding ensures that subsequent LDS lowering passes can process these
// barriers correctly without conflicts.
//
//===----------------------------------------------------------------------===//
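
// Editorial sketch, not part of this commit: given a named-barrier LDS
// global declared roughly as
//   @bar = internal addrspace(3) global target("amdgpu.named.barrier", 0) poison
// the pass records an !absolute_symbol half-open range on the global, e.g.
//   !0 = !{i32 8396816, i32 8396817} ; i.e. [0x802010, 0x802011)
// for the first assigned barrier id, assuming the workgroup scope value
// encodes as 0 (the real value comes from AMDGPU::Barrier, used below).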

#include "AMDGPU.h"
#include "AMDGPUMemoryUtils.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/ReplaceConstant.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"

#include <algorithm>

#define DEBUG_TYPE "amdgpu-lower-exec-sync"

using namespace llvm;
using namespace AMDGPU;

namespace {

// If GV is also used directly by other kernels, create a new GV
// used only by this kernel and its function.
static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
                                           Function *KF) {
  bool NeedsReplacement = false;
  for (Use &U : GV->uses()) {
    if (auto *I = dyn_cast<Instruction>(U.getUser())) {
      Function *F = I->getFunction();
      if (isKernelLDS(F) && F != KF) {
        NeedsReplacement = true;
        break;
      }
    }
  }
  if (!NeedsReplacement)
    return GV;
  // Create a new GV used only by this kernel and its function
  GlobalVariable *NewGV = new GlobalVariable(
      M, GV->getValueType(), GV->isConstant(), GV->getLinkage(),
      GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr,
      GV->getThreadLocalMode(), GV->getType()->getAddressSpace());
  NewGV->copyAttributesFrom(GV);
  for (Use &U : make_early_inc_range(GV->uses())) {
    if (auto *I = dyn_cast<Instruction>(U.getUser())) {
      Function *F = I->getFunction();
      if (!isKernelLDS(F) || F == KF) {
        U.getUser()->replaceUsesOfWith(GV, NewGV);
      }
    }
  }
  return NewGV;
}

// Write the specified address into metadata where it can be retrieved by
// the assembler. Format is a half-open range, [Address, Address + 1).
static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV,
                                     uint32_t Address) {
  LLVMContext &Ctx = M->getContext();
  auto *IntTy = M->getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
  auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address));
  auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address + 1));
  GV->setMetadata(LLVMContext::MD_absolute_symbol,
                  MDNode::get(Ctx, {MinC, MaxC}));
}

template <typename T> SmallVector<T> sortByName(SmallVector<T> &&V) {
  sort(V, [](const auto *L, const auto *R) {
    return L->getName() < R->getName();
  });
  return {std::move(V)};
}

// Main utility function for special LDS variables lowering.
static bool lowerExecSyncGlobalVariables(
    Module &M, LDSUsesInfoTy &LDSUsesInfo,
    VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) {
  bool Changed = false;
  const DataLayout &DL = M.getDataLayout();
  // The 1st round: give module-absolute assignments.
  int NumAbsolutes = 0;
  SmallVector<GlobalVariable *> OrderedGVs;
  for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
    GlobalVariable *GV = K.first;
    if (!isNamedBarrier(*GV))
      continue;
    // Give a module-absolute assignment if it is indirectly accessed by
    // multiple kernels. This is not precise, but we don't want to duplicate
    // a function when it is called by multiple kernels.
    if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) {
      OrderedGVs.push_back(GV);
    } else {
      // Leave it to the 2nd round, which will give a kernel-relative
      // assignment if it is only indirectly accessed by one kernel.
      LDSUsesInfo.direct_access[*K.second.begin()].insert(GV);
    }
    LDSToKernelsThatNeedToAccessItIndirectly.erase(GV);
  }
  OrderedGVs = sortByName(std::move(OrderedGVs));
  for (GlobalVariable *GV : OrderedGVs) {
    unsigned BarrierScope = AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
    unsigned BarId = NumAbsolutes + 1;
    unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
    NumAbsolutes += BarCnt;

    // 4 bits for alignment, 5 bits for the barrier num,
    // 3 bits for the barrier scope.
    unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
    recordLDSAbsoluteAddress(&M, GV, Offset);
  }
  OrderedGVs.clear();
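
  // Editorial worked example, not part of this commit: the first
  // module-absolute barrier above gets BarId == 1. Assuming the workgroup
  // scope encodes as 0, the packed value is
  //   0x802000 | (0 << 9) | (1 << 4) == 0x802010.
  // A 32-byte named-barrier type yields BarCnt == 32 / 16 == 2, so
  // NumAbsolutes becomes 2 and the next barrier receives BarId == 3.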

  // The 2nd round: give a kernel-relative assignment to each GV that is
  // either only indirectly accessed by a single kernel or only directly
  // accessed by multiple kernels.
  SmallVector<Function *> OrderedKernels;
  for (auto &K : LDSUsesInfo.direct_access) {
    Function *F = K.first;
    assert(isKernelLDS(F));
    OrderedKernels.push_back(F);
  }
  OrderedKernels = sortByName(std::move(OrderedKernels));

  DenseMap<Function *, uint32_t> Kernel2BarId;
  for (Function *F : OrderedKernels) {
    for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) {
      if (!isNamedBarrier(*GV))
        continue;

      LDSUsesInfo.direct_access[F].erase(GV);
      if (GV->isAbsoluteSymbolRef()) {
        // Already assigned.
        continue;
      }
      OrderedGVs.push_back(GV);
    }
    OrderedGVs = sortByName(std::move(OrderedGVs));
    for (GlobalVariable *GV : OrderedGVs) {
      // GV could also be used directly by other kernels. If so, we need to
      // create a new GV used only by this kernel and its function.
      auto NewGV = uniquifyGVPerKernel(M, GV, F);
      Changed |= (NewGV != GV);
      unsigned BarrierScope = AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
      unsigned BarId = Kernel2BarId[F];
      BarId += NumAbsolutes + 1;
      unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
      Kernel2BarId[F] += BarCnt;
      unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
      recordLDSAbsoluteAddress(&M, NewGV, Offset);
    }
    OrderedGVs.clear();
  }
  // Also erase those special LDS variables from indirect_access.
  for (auto &K : LDSUsesInfo.indirect_access) {
    assert(isKernelLDS(K.first));
    for (GlobalVariable *GV : K.second) {
      if (isNamedBarrier(*GV))
        K.second.erase(GV);
    }
  }
  return Changed;
}

static bool runLowerExecSyncGlobals(Module &M) {
  CallGraph CG = CallGraph(M);
  bool Changed = false;
  Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M);

  // For each kernel, what variables does it access directly or through
  // callees?
  LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M);

  // For each variable accessed through callees, which kernels access it?
  VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly;
  for (auto &K : LDSUsesInfo.indirect_access) {
    Function *F = K.first;
    assert(isKernelLDS(F));
    for (GlobalVariable *GV : K.second) {
      LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F);
    }
  }

  if (LDSUsesInfo.HasSpecialGVs) {
    // Special LDS variables need special address assignment.
    Changed |= lowerExecSyncGlobalVariables(
        M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly);
  }
  return Changed;
}

class AMDGPULowerExecSyncLegacy : public ModulePass {
public:
  static char ID;
  AMDGPULowerExecSyncLegacy() : ModulePass(ID) {}
  bool runOnModule(Module &M) override;
};

} // namespace

char AMDGPULowerExecSyncLegacy::ID = 0;
char &llvm::AMDGPULowerExecSyncLegacyPassID = AMDGPULowerExecSyncLegacy::ID;

INITIALIZE_PASS_BEGIN(AMDGPULowerExecSyncLegacy, DEBUG_TYPE,
                      "AMDGPU lowering of execution synchronization", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPULowerExecSyncLegacy, DEBUG_TYPE,
                    "AMDGPU lowering of execution synchronization", false,
                    false)

bool AMDGPULowerExecSyncLegacy::runOnModule(Module &M) {
  return runLowerExecSyncGlobals(M);
}

ModulePass *llvm::createAMDGPULowerExecSyncLegacyPass() {
  return new AMDGPULowerExecSyncLegacy();
}

PreservedAnalyses AMDGPULowerExecSyncPass::run(Module &M,
                                               ModuleAnalysisManager &AM) {
  return runLowerExecSyncGlobals(M) ? PreservedAnalyses::none()
                                    : PreservedAnalyses::all();
}
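
For completeness, a hedged editorial sketch of driving the same lowering standalone through the legacy pass manager, using only the createAMDGPULowerExecSyncLegacyPass() factory declared in AMDGPU.h above:

    #include "AMDGPU.h"
    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/IR/Module.h"

    using namespace llvm;

    // Run just the exec-sync lowering over a module; returns true if the
    // module was modified.
    static bool lowerExecSyncOnly(Module &M) {
      legacy::PassManager PM;
      PM.add(createAMDGPULowerExecSyncLegacyPass());
      return PM.run(M);
    }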

llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp

Lines changed: 0 additions & 126 deletions

@@ -922,126 +922,6 @@ class AMDGPULowerModuleLDS {
     return KernelToCreatedDynamicLDS;
   }
 
-  static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
-                                             Function *KF) {
-    bool NeedsReplacement = false;
-    for (Use &U : GV->uses()) {
-      if (auto *I = dyn_cast<Instruction>(U.getUser())) {
-        Function *F = I->getFunction();
-        if (isKernelLDS(F) && F != KF) {
-          NeedsReplacement = true;
-          break;
-        }
-      }
-    }
-    if (!NeedsReplacement)
-      return GV;
-    // Create a new GV used only by this kernel and its function
-    GlobalVariable *NewGV = new GlobalVariable(
-        M, GV->getValueType(), GV->isConstant(), GV->getLinkage(),
-        GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr,
-        GV->getThreadLocalMode(), GV->getType()->getAddressSpace());
-    NewGV->copyAttributesFrom(GV);
-    for (Use &U : make_early_inc_range(GV->uses())) {
-      if (auto *I = dyn_cast<Instruction>(U.getUser())) {
-        Function *F = I->getFunction();
-        if (!isKernelLDS(F) || F == KF) {
-          U.getUser()->replaceUsesOfWith(GV, NewGV);
-        }
-      }
-    }
-    return NewGV;
-  }
-
-  bool lowerSpecialLDSVariables(
-      Module &M, LDSUsesInfoTy &LDSUsesInfo,
-      VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) {
-    bool Changed = false;
-    const DataLayout &DL = M.getDataLayout();
-    // The 1st round: give module-absolute assignments
-    int NumAbsolutes = 0;
-    std::vector<GlobalVariable *> OrderedGVs;
-    for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
-      GlobalVariable *GV = K.first;
-      if (!isNamedBarrier(*GV))
-        continue;
-      // give a module-absolute assignment if it is indirectly accessed by
-      // multiple kernels. This is not precise, but we don't want to duplicate
-      // a function when it is called by multiple kernels.
-      if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) {
-        OrderedGVs.push_back(GV);
-      } else {
-        // leave it to the 2nd round, which will give a kernel-relative
-        // assignment if it is only indirectly accessed by one kernel
-        LDSUsesInfo.direct_access[*K.second.begin()].insert(GV);
-      }
-      LDSToKernelsThatNeedToAccessItIndirectly.erase(GV);
-    }
-    OrderedGVs = sortByName(std::move(OrderedGVs));
-    for (GlobalVariable *GV : OrderedGVs) {
-      unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
-      unsigned BarId = NumAbsolutes + 1;
-      unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
-      NumAbsolutes += BarCnt;
-
-      // 4 bits for alignment, 5 bits for the barrier num,
-      // 3 bits for the barrier scope
-      unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
-      recordLDSAbsoluteAddress(&M, GV, Offset);
-    }
-    OrderedGVs.clear();
-
-    // The 2nd round: give a kernel-relative assignment for GV that
-    // either only indirectly accessed by single kernel or only directly
-    // accessed by multiple kernels.
-    std::vector<Function *> OrderedKernels;
-    for (auto &K : LDSUsesInfo.direct_access) {
-      Function *F = K.first;
-      assert(isKernelLDS(F));
-      OrderedKernels.push_back(F);
-    }
-    OrderedKernels = sortByName(std::move(OrderedKernels));
-
-    llvm::DenseMap<Function *, uint32_t> Kernel2BarId;
-    for (Function *F : OrderedKernels) {
-      for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) {
-        if (!isNamedBarrier(*GV))
-          continue;
-
-        LDSUsesInfo.direct_access[F].erase(GV);
-        if (GV->isAbsoluteSymbolRef()) {
-          // already assigned
-          continue;
-        }
-        OrderedGVs.push_back(GV);
-      }
-      OrderedGVs = sortByName(std::move(OrderedGVs));
-      for (GlobalVariable *GV : OrderedGVs) {
-        // GV could also be used directly by other kernels. If so, we need to
-        // create a new GV used only by this kernel and its function.
-        auto NewGV = uniquifyGVPerKernel(M, GV, F);
-        Changed |= (NewGV != GV);
-        unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
-        unsigned BarId = Kernel2BarId[F];
-        BarId += NumAbsolutes + 1;
-        unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
-        Kernel2BarId[F] += BarCnt;
-        unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
-        recordLDSAbsoluteAddress(&M, NewGV, Offset);
-      }
-      OrderedGVs.clear();
-    }
-    // Also erase those special LDS variables from indirect_access.
-    for (auto &K : LDSUsesInfo.indirect_access) {
-      assert(isKernelLDS(K.first));
-      for (GlobalVariable *GV : K.second) {
-        if (isNamedBarrier(*GV))
-          K.second.erase(GV);
-      }
-    }
-    return Changed;
-  }
-
   bool runOnModule(Module &M) {
     CallGraph CG = CallGraph(M);
     bool Changed = superAlignLDSGlobals(M);
@@ -1064,12 +944,6 @@ class AMDGPULowerModuleLDS {
       }
     }
 
-    if (LDSUsesInfo.HasSpecialGVs) {
-      // Special LDS variables need special address assignment
-      Changed |= lowerSpecialLDSVariables(
-          M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly);
-    }
-
     // Partition variables accessed indirectly into the different strategies
     DenseSet<GlobalVariable *> ModuleScopeVariables;
     DenseSet<GlobalVariable *> TableLookupVariables;
