Skip to content

Conversation

macurtis-amd
Copy link
Contributor

@macurtis-amd macurtis-amd commented Oct 9, 2025

From Sam Liu:

CUDA supports thread block clusters https://docs.nvidia.com/cuda/cuda-c-programming-guide/#thread-block-clusters

In their atomic intrinsics, cluster scope is supported https://docs.nvidia.com/cuda/cuda-c-programming-guide/#nv-atomic-fetch-add-and-nv-atomic-add

For compatibility, clang and hip needs to support cluster scope.

@llvmbot llvmbot added clang Clang issues not falling into any other category backend:AMDGPU clang:frontend Language frontend issues, e.g. anything involving "Sema" clang:codegen IR generation bugs: mangling, exceptions, etc. labels Oct 9, 2025
@llvmbot
Copy link
Member

llvmbot commented Oct 9, 2025

@llvm/pr-subscribers-clang-codegen

@llvm/pr-subscribers-backend-amdgpu

Author: None (macurtis-amd)

Changes

Patch is 400.54 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/162575.diff

15 Files Affected:

  • (modified) clang/docs/HIPSupport.rst (+2)
  • (modified) clang/docs/LanguageExtensions.rst (+1)
  • (modified) clang/include/clang/Basic/SyncScope.h (+32-10)
  • (modified) clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp (+15-14)
  • (modified) clang/lib/CodeGen/Targets/AMDGPU.cpp (+4)
  • (modified) clang/lib/CodeGen/Targets/SPIR.cpp (+2)
  • (modified) clang/lib/Frontend/InitPreprocessor.cpp (+2)
  • (modified) clang/test/CodeGen/scoped-atomic-ops.c (+4293-190)
  • (modified) clang/test/CodeGen/scoped-fence-ops.c (+28-13)
  • (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl (+41-18)
  • (modified) clang/test/Preprocessor/init-aarch64.c (+2)
  • (modified) clang/test/Preprocessor/init-loongarch.c (+12-10)
  • (modified) clang/test/Preprocessor/init.c (+11-6)
  • (modified) clang/test/SemaCUDA/atomic-ops.cu (+7-2)
  • (modified) clang/test/SemaCUDA/spirv-amdgcn-atomic-ops.cu (+7-2)
diff --git a/clang/docs/HIPSupport.rst b/clang/docs/HIPSupport.rst
index b4a671e3cfa3c..ec2af2a6f569d 100644
--- a/clang/docs/HIPSupport.rst
+++ b/clang/docs/HIPSupport.rst
@@ -164,6 +164,8 @@ Predefined Macros
      - Represents wavefront memory scope in HIP (value is 2).
    * - ``__HIP_MEMORY_SCOPE_WORKGROUP``
      - Represents workgroup memory scope in HIP (value is 3).
+   * - ``__HIP_MEMORY_SCOPE_CLUSTER``
+     - Represents cluster memory scope in HIP (value is 6).
    * - ``__HIP_MEMORY_SCOPE_AGENT``
      - Represents agent memory scope in HIP (value is 4).
    * - ``__HIP_MEMORY_SCOPE_SYSTEM``
diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index 6bb99c757cd19..bef6e9c14b182 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -4846,6 +4846,7 @@ currently supported:
 * ``__MEMORY_SCOPE_SYSTEM``
 * ``__MEMORY_SCOPE_DEVICE``
 * ``__MEMORY_SCOPE_WRKGRP``
+* ``__MEMORY_SCOPE_CLUSTR``
 * ``__MEMORY_SCOPE_WVFRNT``
 * ``__MEMORY_SCOPE_SINGLE``
 
diff --git a/clang/include/clang/Basic/SyncScope.h b/clang/include/clang/Basic/SyncScope.h
index 5a8d2a7dd02e5..614e5faa78696 100644
--- a/clang/include/clang/Basic/SyncScope.h
+++ b/clang/include/clang/Basic/SyncScope.h
@@ -43,11 +43,13 @@ enum class SyncScope {
   SystemScope,
   DeviceScope,
   WorkgroupScope,
+  ClusterScope,
   WavefrontScope,
   SingleScope,
   HIPSingleThread,
   HIPWavefront,
   HIPWorkgroup,
+  HIPCluster,
   HIPAgent,
   HIPSystem,
   OpenCLWorkGroup,
@@ -65,6 +67,8 @@ inline llvm::StringRef getAsString(SyncScope S) {
     return "device_scope";
   case SyncScope::WorkgroupScope:
     return "workgroup_scope";
+  case SyncScope::ClusterScope:
+    return "cluster_scope";
   case SyncScope::WavefrontScope:
     return "wavefront_scope";
   case SyncScope::SingleScope:
@@ -75,6 +79,8 @@ inline llvm::StringRef getAsString(SyncScope S) {
     return "hip_wavefront";
   case SyncScope::HIPWorkgroup:
     return "hip_workgroup";
+  case SyncScope::HIPCluster:
+    return "hip_cluster";
   case SyncScope::HIPAgent:
     return "hip_agent";
   case SyncScope::HIPSystem:
@@ -180,7 +186,10 @@ class AtomicScopeHIPModel : public AtomicScopeModel {
     Workgroup = 3,
     Agent = 4,
     System = 5,
-    Last = System
+    Cluster = 6,
+    End,
+    Last = End - 1,
+    Count = Last
   };
 
   AtomicScopeHIPModel() {}
@@ -193,10 +202,14 @@ class AtomicScopeHIPModel : public AtomicScopeModel {
       return SyncScope::HIPWavefront;
     case Workgroup:
       return SyncScope::HIPWorkgroup;
+    case Cluster:
+      return SyncScope::HIPCluster;
     case Agent:
       return SyncScope::HIPAgent;
     case System:
       return SyncScope::HIPSystem;
+    case End:
+      break;
     }
     llvm_unreachable("Invalid language sync scope value");
   }
@@ -207,11 +220,12 @@ class AtomicScopeHIPModel : public AtomicScopeModel {
   }
 
   ArrayRef<unsigned> getRuntimeValues() const override {
-    static_assert(Last == System, "Does not include all sync scopes");
     static const unsigned Scopes[] = {
         static_cast<unsigned>(SingleThread), static_cast<unsigned>(Wavefront),
-        static_cast<unsigned>(Workgroup), static_cast<unsigned>(Agent),
-        static_cast<unsigned>(System)};
+        static_cast<unsigned>(Workgroup),    static_cast<unsigned>(Cluster),
+        static_cast<unsigned>(System),       static_cast<unsigned>(Agent)};
+    static_assert(sizeof(Scopes) / sizeof(Scopes[0]) == Count,
+                  "Does not include all sync scopes");
     return llvm::ArrayRef(Scopes);
   }
 
@@ -223,14 +237,17 @@ class AtomicScopeHIPModel : public AtomicScopeModel {
 /// Defines the generic atomic scope model.
 class AtomicScopeGenericModel : public AtomicScopeModel {
 public:
-  /// The enum values match predefined built-in macros __ATOMIC_SCOPE_*.
+  /// The enum values match predefined built-in macros __MEMORY_SCOPE_*.
   enum ID {
     System = 0,
     Device = 1,
     Workgroup = 2,
     Wavefront = 3,
     Single = 4,
-    Last = Single
+    Cluster = 5,
+    End,
+    Last = End - 1,
+    Count = End
   };
 
   AtomicScopeGenericModel() = default;
@@ -243,10 +260,14 @@ class AtomicScopeGenericModel : public AtomicScopeModel {
       return SyncScope::SystemScope;
     case Workgroup:
       return SyncScope::WorkgroupScope;
+    case Cluster:
+      return SyncScope::ClusterScope;
     case Wavefront:
       return SyncScope::WavefrontScope;
     case Single:
       return SyncScope::SingleScope;
+    case End:
+      break;
     }
     llvm_unreachable("Invalid language sync scope value");
   }
@@ -256,11 +277,12 @@ class AtomicScopeGenericModel : public AtomicScopeModel {
   }
 
   ArrayRef<unsigned> getRuntimeValues() const override {
-    static_assert(Last == Single, "Does not include all sync scopes");
     static const unsigned Scopes[] = {
-        static_cast<unsigned>(Device), static_cast<unsigned>(System),
-        static_cast<unsigned>(Workgroup), static_cast<unsigned>(Wavefront),
-        static_cast<unsigned>(Single)};
+        static_cast<unsigned>(System),    static_cast<unsigned>(Device),
+        static_cast<unsigned>(Workgroup), static_cast<unsigned>(Cluster),
+        static_cast<unsigned>(Wavefront), static_cast<unsigned>(Single)};
+    static_assert(sizeof(Scopes) / sizeof(Scopes[0]) == Count,
+                  "Does not include all sync scopes");
     return llvm::ArrayRef(Scopes);
   }
 
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index 6596ec06199dc..97b5828011cd4 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "CGBuiltin.h"
+#include "clang/Basic/SyncScope.h"
 #include "clang/Basic/TargetBuiltins.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
@@ -242,33 +243,33 @@ void CodeGenFunction::ProcessOrderScopeAMDGCN(Value *Order, Value *Scope,
   }
 
   // Older builtins had an enum argument for the memory scope.
+  const char *ssn = nullptr;
   int scope = cast<llvm::ConstantInt>(Scope)->getZExtValue();
   switch (scope) {
-  case 0: // __MEMORY_SCOPE_SYSTEM
+  case AtomicScopeGenericModel::System: // __MEMORY_SCOPE_SYSTEM
     SSID = llvm::SyncScope::System;
     break;
-  case 1: // __MEMORY_SCOPE_DEVICE
-    if (getTarget().getTriple().isSPIRV())
-      SSID = getLLVMContext().getOrInsertSyncScopeID("device");
-    else
-      SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
+  case AtomicScopeGenericModel::Device: // __MEMORY_SCOPE_DEVICE
+    ssn = getTarget().getTriple().isSPIRV() ? "device" : "agent";
     break;
-  case 2: // __MEMORY_SCOPE_WRKGRP
-    SSID = getLLVMContext().getOrInsertSyncScopeID("workgroup");
+  case AtomicScopeGenericModel::Workgroup: // __MEMORY_SCOPE_WRKGRP
+    ssn = "workgroup";
     break;
-  case 3: // __MEMORY_SCOPE_WVFRNT
-    if (getTarget().getTriple().isSPIRV())
-      SSID = getLLVMContext().getOrInsertSyncScopeID("subgroup");
-    else
-      SSID = getLLVMContext().getOrInsertSyncScopeID("wavefront");
+  case AtomicScopeGenericModel::Cluster: // __MEMORY_SCOPE_CLUSTR
+    ssn = getTarget().getTriple().isSPIRV() ? "workgroup" : "cluster";
+    break;
+  case AtomicScopeGenericModel::Wavefront: // __MEMORY_SCOPE_WVFRNT
+    ssn = getTarget().getTriple().isSPIRV() ? "subgroup" : "wavefront";
     break;
-  case 4: // __MEMORY_SCOPE_SINGLE
+  case AtomicScopeGenericModel::Single: // __MEMORY_SCOPE_SINGLE
     SSID = llvm::SyncScope::SingleThread;
     break;
   default:
     SSID = llvm::SyncScope::System;
     break;
   }
+  if (ssn)
+    SSID = getLLVMContext().getOrInsertSyncScopeID(ssn);
 }
 
 llvm::Value *CodeGenFunction::EmitScalarOrConstFoldImmArg(unsigned ICEArguments,
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 0fcbf7e458a34..c74a1b6098922 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -488,6 +488,10 @@ AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts,
   case SyncScope::WavefrontScope:
     Name = "wavefront";
     break;
+  case SyncScope::HIPCluster:
+  case SyncScope::ClusterScope:
+    Name = "cluster";
+    break;
   case SyncScope::HIPWorkgroup:
   case SyncScope::OpenCLWorkGroup:
   case SyncScope::WorkgroupScope:
diff --git a/clang/lib/CodeGen/Targets/SPIR.cpp b/clang/lib/CodeGen/Targets/SPIR.cpp
index 4aa63143a66cd..fbf29186faf24 100644
--- a/clang/lib/CodeGen/Targets/SPIR.cpp
+++ b/clang/lib/CodeGen/Targets/SPIR.cpp
@@ -90,6 +90,8 @@ inline StringRef mapClangSyncScopeToLLVM(SyncScope Scope) {
   case SyncScope::OpenCLSubGroup:
   case SyncScope::WavefrontScope:
     return "subgroup";
+  case SyncScope::HIPCluster:
+  case SyncScope::ClusterScope:
   case SyncScope::HIPWorkgroup:
   case SyncScope::OpenCLWorkGroup:
   case SyncScope::WorkgroupScope:
diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index b899fb9c6494a..21ab9dca8f0bd 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -616,6 +616,7 @@ static void InitializeStandardPredefinedMacros(const TargetInfo &TI,
     Builder.defineMacro("__HIP_MEMORY_SCOPE_WORKGROUP", "3");
     Builder.defineMacro("__HIP_MEMORY_SCOPE_AGENT", "4");
     Builder.defineMacro("__HIP_MEMORY_SCOPE_SYSTEM", "5");
+    Builder.defineMacro("__HIP_MEMORY_SCOPE_CLUSTER", "6");
     if (LangOpts.HIPStdPar) {
       Builder.defineMacro("__HIPSTDPAR__");
       if (LangOpts.HIPStdParInterposeAlloc) {
@@ -904,6 +905,7 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
   Builder.defineMacro("__MEMORY_SCOPE_WRKGRP", "2");
   Builder.defineMacro("__MEMORY_SCOPE_WVFRNT", "3");
   Builder.defineMacro("__MEMORY_SCOPE_SINGLE", "4");
+  Builder.defineMacro("__MEMORY_SCOPE_CLUSTR", "5");
 
   // Define macros for the OpenCL memory scope.
   // The values should match AtomicScopeOpenCLModel::ID enum.
diff --git a/clang/test/CodeGen/scoped-atomic-ops.c b/clang/test/CodeGen/scoped-atomic-ops.c
index 545a6c90892c2..c39048120a457 100644
--- a/clang/test/CodeGen/scoped-atomic-ops.c
+++ b/clang/test/CodeGen/scoped-atomic-ops.c
@@ -1,113 +1,772 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
 // RUN: %clang_cc1 %s -emit-llvm -o - -triple=amdgcn-amd-amdhsa -ffreestanding \
-// RUN:   -fvisibility=hidden | FileCheck --check-prefix=AMDGCN %s
+// RUN:   -fvisibility=hidden | FileCheck --check-prefixes=AMDGCN,AMDGCN_CL_DEF %s
 // RUN: %clang_cc1 %s -emit-llvm -o - -triple=amdgcn-amd-amdhsa -ffreestanding \
-// RUN:   -cl-std=CL2.0 -fvisibility=hidden | FileCheck --check-prefix=AMDGCN %s
+// RUN:   -cl-std=CL2.0 -fvisibility=hidden | FileCheck --check-prefixes=AMDGCN,AMDGCN_CL_20 %s
 // RUN: %clang_cc1 %s -emit-llvm -o - -triple=spirv64-unknown-unknown -ffreestanding \
 // RUN:   -fvisibility=hidden | FileCheck --check-prefix=SPIRV %s
 
-// AMDGCN-LABEL: define hidden i32 @fi1a(
-// AMDGCN:    [[TMP0:%.*]] = load atomic i32, ptr [[PTR0:.+]] monotonic, align 4
-// AMDGCN:    [[TMP1:%.*]] = load atomic i32, ptr [[PTR1:.+]] syncscope("agent") monotonic, align 4
-// AMDGCN:    [[TMP2:%.*]] = load atomic i32, ptr [[PTR2:.+]] syncscope("workgroup") monotonic, align 4
-// AMDGCN:    [[TMP3:%.*]] = load atomic i32, ptr [[PTR3:.+]] syncscope("wavefront") monotonic, align 4
-// AMDGCN:    [[TMP4:%.*]] = load atomic i32, ptr [[PTR4:.+]] syncscope("singlethread") monotonic, align 4
-// SPIRV: define hidden spir_func i32 @fi1a(
-// SPIRV:    [[TMP0:%.*]] = load atomic i32, ptr [[PTR0:.+]] monotonic, align 4
-// SPIRV:    [[TMP1:%.*]] = load atomic i32, ptr [[PTR1:.+]] syncscope("device") monotonic, align 4
-// SPIRV:    [[TMP2:%.*]] = load atomic i32, ptr [[PTR2:.+]] syncscope("workgroup") monotonic, align 4
-// SPIRV:    [[TMP3:%.*]] = load atomic i32, ptr [[PTR3:.+]] syncscope("subgroup") monotonic, align 4
-// SPIRV:    [[TMP4:%.*]] = load atomic i32, ptr [[PTR4:.+]] syncscope("singlethread") monotonic, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden i32 @fi1a(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0:[0-9]+]] {
+// AMDGCN_CL_DEF-NEXT:  [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT:    [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT:    [[V:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT:    [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT:    [[V_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V]] to ptr
+// AMDGCN_CL_DEF-NEXT:    store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT:    [[TMP1:%.*]] = load atomic i32, ptr [[TMP0]] monotonic, align 4
+// AMDGCN_CL_DEF-NEXT:    store i32 [[TMP1]], ptr [[V_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT:    [[TMP3:%.*]] = load atomic i32, ptr [[TMP2]] syncscope("agent") monotonic, align 4
+// AMDGCN_CL_DEF-NEXT:    store i32 [[TMP3]], ptr [[V_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT:    [[TMP5:%.*]] = load atomic i32, ptr [[TMP4]] syncscope("workgroup") monotonic, align 4
+// AMDGCN_CL_DEF-NEXT:    store i32 [[TMP5]], ptr [[V_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT:    [[TMP7:%.*]] = load atomic i32, ptr [[TMP6]] syncscope("cluster") monotonic, align 4
+// AMDGCN_CL_DEF-NEXT:    store i32 [[TMP7]], ptr [[V_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT:    [[TMP9:%.*]] = load atomic i32, ptr [[TMP8]] syncscope("wavefront") monotonic, align 4
+// AMDGCN_CL_DEF-NEXT:    store i32 [[TMP9]], ptr [[V_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT:    [[TMP11:%.*]] = load atomic i32, ptr [[TMP10]] syncscope("singlethread") monotonic, align 4
+// AMDGCN_CL_DEF-NEXT:    store i32 [[TMP11]], ptr [[V_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT:    [[TMP12:%.*]] = load i32, ptr [[V_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT:    ret i32 [[TMP12]]
+//
+// AMDGCN_CL_20-LABEL: define hidden i32 @fi1a(
+// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0:[0-9]+]] {
+// AMDGCN_CL_20-NEXT:  [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT:    [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT:    [[V:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT:    [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT:    store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP1:%.*]] = load atomic i32, ptr [[TMP0]] monotonic, align 4
+// AMDGCN_CL_20-NEXT:    store i32 [[TMP1]], ptr addrspace(5) [[V]], align 4
+// AMDGCN_CL_20-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP3:%.*]] = load atomic i32, ptr [[TMP2]] syncscope("agent") monotonic, align 4
+// AMDGCN_CL_20-NEXT:    store i32 [[TMP3]], ptr addrspace(5) [[V]], align 4
+// AMDGCN_CL_20-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP5:%.*]] = load atomic i32, ptr [[TMP4]] syncscope("workgroup") monotonic, align 4
+// AMDGCN_CL_20-NEXT:    store i32 [[TMP5]], ptr addrspace(5) [[V]], align 4
+// AMDGCN_CL_20-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP7:%.*]] = load atomic i32, ptr [[TMP6]] syncscope("cluster") monotonic, align 4
+// AMDGCN_CL_20-NEXT:    store i32 [[TMP7]], ptr addrspace(5) [[V]], align 4
+// AMDGCN_CL_20-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP9:%.*]] = load atomic i32, ptr [[TMP8]] syncscope("wavefront") monotonic, align 4
+// AMDGCN_CL_20-NEXT:    store i32 [[TMP9]], ptr addrspace(5) [[V]], align 4
+// AMDGCN_CL_20-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP11:%.*]] = load atomic i32, ptr [[TMP10]] syncscope("singlethread") monotonic, align 4
+// AMDGCN_CL_20-NEXT:    store i32 [[TMP11]], ptr addrspace(5) [[V]], align 4
+// AMDGCN_CL_20-NEXT:    [[TMP12:%.*]] = load i32, ptr addrspace(5) [[V]], align 4
+// AMDGCN_CL_20-NEXT:    ret i32 [[TMP12]]
+//
+// SPIRV-LABEL: define hidden spir_func i32 @fi1a(
+// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0:[0-9]+]] {
+// SPIRV-NEXT:  [[ENTRY:.*:]]
+// SPIRV-NEXT:    [[I_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT:    [[V:%.*]] = alloca i32, align 4
+// SPIRV-NEXT:    store ptr [[I]], ptr [[I_ADDR]], align 8
+// SPIRV-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT:    [[TMP1:%.*]] = load atomic i32, ptr [[TMP0]] monotonic, align 4
+// SPIRV-NEXT:    store i32 [[TMP1]], ptr [[V]], align 4
+// SPIRV-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT:    [[TMP3:%.*]] = load atomic i32, ptr [[TMP2]] syncscope("device") monotonic, align 4
+// SPIRV-NEXT:    store i32 [[TMP3]], ptr [[V]], align 4
+// SPIRV-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT:    [[TMP5:%.*]] = load atomic i32, ptr [[TMP4]] syncscope("workgroup") monotonic, align 4
+// SPIRV-NEXT:    store i32 [[TMP5]], ptr [[V]], align 4
+// SPIRV-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT:    [[TMP7:%.*]] = load atomic i32, ptr [[TMP6]] syncscope("workgroup") monotonic, align 4
+// SPIRV-NEXT:    store i32 [[TMP7]], ptr [[V]], align 4
+// SPIRV-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT:    [[TMP9:%.*]] = load atomic i32, ptr [[TMP8]] syncscope("subgroup") monotonic, align 4
+// SPIRV-NEXT:    store i32 [[TMP9]], ptr [[V]], align 4
+// SPIRV-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT:    [[TMP11:%.*]] = load atomic i32, ptr [[TMP10]] syncscope("singlethread") monotonic, align 4
+// SPIRV-NEXT:    store i32 [[TMP11]], ptr [[V]], align 4
+// SPIRV-NEXT:    [[TMP12:%.*]] = load i32, ptr [[V]], align 4
+// SPIRV-NEXT:    ret i32 [[TMP12]]
+//
 int fi1a(int *i) {
   int v;
   __scoped_atomic_load(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM);
   __scoped_atomic_load(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE);
   __scoped_atomic_load(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP);
+  __scoped_atomic_load(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR);
   __scoped_atomic_load(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT);
   __scoped_atomic_load(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE);
   return v;
 }
 
 // AMDGCN-LABEL: define hidden i32 @fi1b(
-// AMDGCN:    [[TMP0:%.*]] = load atomic i32, ptr [[PTR0:%.+]] monotonic, align 4
-// AMDGCN:    [[TMP1:%.*]] = load atomic i32, ptr [[PTR1:%.+]] syncscope("agent") monotonic, align 4
-// AMDGCN:    [[TMP2:%.*]] = load atomic i32, ptr [[PTR2:%.+]] syncscope("workgroup") monotonic, align 4
-// AMDGCN:    [[TMP3:%.*]] = load atomic i32, ptr [[PTR3:%.+]] syncscope("wavefront") monotonic, align 4
-// AMDGCN:    [[TMP4:%.*]] = load atomic i32, ptr [[PTR4:%.+]] syncscope("singlethread") monotonic, align 4
+// AMDGCN-SAME: ptr noundef [[I:%.*]]) #[[ATTR0:[0-9]+]] {
+// AMDGCN-NEXT:  [[ENTRY:.*:]]
+// AMDGCN-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-NEXT:    [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-NEXT:    [[ATOMIC_TEMP1:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-NEXT:    [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-NEXT:    [[ATOMIC_TEMP3:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-NEXT:    [[ATOMIC_...
[truncated]

@macurtis-amd macurtis-amd requested a review from yxsamliu October 9, 2025 00:09
@macurtis-amd
Copy link
Contributor Author

@shiltian @yxsamliu ping

@macurtis-amd macurtis-amd force-pushed the cluster-sync-scope branch 2 times, most recently from fe2e520 to 4e87322 Compare October 20, 2025 16:24
@macurtis-amd macurtis-amd merged commit 5440cfc into llvm:main Oct 21, 2025
11 checks passed
@llvm-ci
Copy link
Collaborator

llvm-ci commented Oct 21, 2025

LLVM Buildbot has detected a new failure on builder clang-m68k-linux-cross running on suse-gary-m68k-cross while building clang at step 5 "ninja check 1".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/27/builds/17824

Here is the relevant piece of the build log for the reference
Step 5 (ninja check 1) failure: stage 1 checked (failure)
...
[120/388] Building CXX object tools/clang/unittests/CMakeFiles/AllClangUnitTests.dir/Analysis/CloneDetectionTest.cpp.o
[121/388] Building CXX object tools/clang/tools/extra/clangd/unittests/CMakeFiles/ClangdTests.dir/tweaks/TweakTests.cpp.o
[122/388] Building CXX object tools/clang/tools/extra/clangd/unittests/CMakeFiles/ClangdTests.dir/tweaks/TweakTesting.cpp.o
[123/388] Building CXX object tools/clang/unittests/CMakeFiles/AllClangUnitTests.dir/Analysis/FlowSensitive/ArenaTest.cpp.o
[124/388] Building CXX object tools/clang/unittests/CMakeFiles/AllClangUnitTests.dir/Analysis/FlowSensitive/FormulaTest.cpp.o
[125/388] Building CXX object tools/clang/unittests/CMakeFiles/AllClangUnitTests.dir/Analysis/FlowSensitive/DataflowAnalysisContextTest.cpp.o
[126/388] Building CXX object tools/clang/unittests/CMakeFiles/AllClangUnitTests.dir/Analysis/MacroExpansionContextTest.cpp.o
[127/388] Building CXX object tools/clang/unittests/CMakeFiles/AllClangUnitTests.dir/Analysis/FlowSensitive/ValueTest.cpp.o
[128/388] Building CXX object tools/clang/unittests/CMakeFiles/AllClangUnitTests.dir/StaticAnalyzer/AnalyzerOptionsTest.cpp.o
[129/388] Building CXX object tools/clang/unittests/CMakeFiles/AllClangUnitTests.dir/Analysis/FlowSensitive/MultiVarConstantPropagationTest.cpp.o
FAILED: tools/clang/unittests/CMakeFiles/AllClangUnitTests.dir/Analysis/FlowSensitive/MultiVarConstantPropagationTest.cpp.o 
/usr/bin/c++ -DGTEST_HAS_RTTI=0 -DLLVM_BUILD_STATIC -D_DEBUG -D_GLIBCXX_ASSERTIONS -D_GLIBCXX_USE_CXX11_ABI=1 -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/var/lib/buildbot/workers/suse-gary-m68k-cross/clang-m68k-linux-cross/stage1/tools/clang/unittests -I/var/lib/buildbot/workers/suse-gary-m68k-cross/clang-m68k-linux-cross/llvm/clang/unittests -I/var/lib/buildbot/workers/suse-gary-m68k-cross/clang-m68k-linux-cross/llvm/clang/include -I/var/lib/buildbot/workers/suse-gary-m68k-cross/clang-m68k-linux-cross/stage1/tools/clang/include -I/var/lib/buildbot/workers/suse-gary-m68k-cross/clang-m68k-linux-cross/stage1/include -I/var/lib/buildbot/workers/suse-gary-m68k-cross/clang-m68k-linux-cross/llvm/llvm/include -I/var/lib/buildbot/workers/suse-gary-m68k-cross/clang-m68k-linux-cross/llvm/clang/unittests/Tooling -I/var/lib/buildbot/workers/suse-gary-m68k-cross/clang-m68k-linux-cross/llvm/third-party/unittest/googletest/include -I/var/lib/buildbot/workers/suse-gary-m68k-cross/clang-m68k-linux-cross/llvm/third-party/unittest/googlemock/include -fPIC -fno-semantic-interposition -fvisibility-inlines-hidden -Werror=date-time -Wall -Wextra -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wimplicit-fallthrough -Wno-uninitialized -Wno-nonnull -Wno-class-memaccess -Wno-dangling-reference -Wno-redundant-move -Wno-pessimizing-move -Wno-array-bounds -Wno-stringop-overread -Wno-noexcept-type -Wdelete-non-virtual-dtor -Wsuggest-override -Wno-comment -Wno-misleading-indentation -Wctad-maybe-unsupported -fdiagnostics-color -ffunction-sections -fdata-sections -fno-common -Woverloaded-virtual -O3 -DNDEBUG -std=c++17  -Wno-variadic-macros -fno-exceptions -funwind-tables -fno-rtti -UNDEBUG -Wno-suggest-override -MD -MT tools/clang/unittests/CMakeFiles/AllClangUnitTests.dir/Analysis/FlowSensitive/MultiVarConstantPropagationTest.cpp.o -MF tools/clang/unittests/CMakeFiles/AllClangUnitTests.dir/Analysis/FlowSensitive/MultiVarConstantPropagationTest.cpp.o.d -o tools/clang/unittests/CMakeFiles/AllClangUnitTests.dir/Analysis/FlowSensitive/MultiVarConstantPropagationTest.cpp.o -c /var/lib/buildbot/workers/suse-gary-m68k-cross/clang-m68k-linux-cross/llvm/clang/unittests/Analysis/FlowSensitive/MultiVarConstantPropagationTest.cpp
c++: fatal error: Killed signal terminated program cc1plus
compilation terminated.
[130/388] Building CXX object tools/clang/unittests/CMakeFiles/AllClangUnitTests.dir/Analysis/FlowSensitive/DebugSupportTest.cpp.o
[131/388] Building CXX object tools/clang/unittests/CMakeFiles/AllClangUnitTests.dir/Analysis/FlowSensitive/CFGMatchSwitchTest.cpp.o
[132/388] Building CXX object tools/clang/unittests/CMakeFiles/AllClangUnitTests.dir/Analysis/FlowSensitive/LoggerTest.cpp.o
[133/388] Building CXX object tools/clang/unittests/CMakeFiles/AllClangUnitTests.dir/StaticAnalyzer/APSIntTypeTest.cpp.o
[134/388] Building CXX object tools/clang/unittests/CMakeFiles/AllClangUnitTests.dir/StaticAnalyzer/BlockEntranceCallbackTest.cpp.o
[135/388] Building CXX object tools/clang/unittests/CMakeFiles/AllClangUnitTests.dir/Analysis/FlowSensitive/MapLatticeTest.cpp.o
[136/388] Building CXX object tools/clang/unittests/CMakeFiles/AllClangUnitTests.dir/Analysis/FlowSensitive/SimplifyConstraintsTest.cpp.o
[137/388] Building CXX object tools/clang/unittests/CMakeFiles/AllClangUnitTests.dir/Analysis/FlowSensitive/DeterminismTest.cpp.o
[138/388] Building CXX object tools/clang/unittests/CMakeFiles/AllClangUnitTests.dir/Analysis/FlowSensitive/MatchSwitchTest.cpp.o
[139/388] Building CXX object tools/clang/unittests/CMakeFiles/AllClangUnitTests.dir/Analysis/FlowSensitive/SmartPointerAccessorCachingTest.cpp.o
[140/388] Building CXX object tools/clang/unittests/CMakeFiles/AllClangUnitTests.dir/Analysis/FlowSensitive/CachedConstAccessorsLatticeTest.cpp.o
[141/388] Building CXX object tools/clang/unittests/CMakeFiles/AllClangUnitTests.dir/Analysis/FlowSensitive/TestingSupport.cpp.o
[142/388] Building CXX object tools/clang/unittests/CMakeFiles/AllClangUnitTests.dir/Analysis/FlowSensitive/SignAnalysisTest.cpp.o
[143/388] Building CXX object tools/clang/unittests/CMakeFiles/AllClangUnitTests.dir/Analysis/FlowSensitive/ChromiumCheckModelTest.cpp.o
[144/388] Building CXX object tools/clang/unittests/CMakeFiles/AllClangUnitTests.dir/Analysis/FlowSensitive/RecordOpsTest.cpp.o
[145/388] Building CXX object tools/clang/unittests/CMakeFiles/AllClangUnitTests.dir/Analysis/FlowSensitive/ASTOpsTest.cpp.o
[146/388] Building CXX object tools/clang/unittests/CMakeFiles/AllClangUnitTests.dir/StaticAnalyzer/BugReportInterestingnessTest.cpp.o
[147/388] Building CXX object tools/clang/unittests/CMakeFiles/AllClangUnitTests.dir/StaticAnalyzer/CallEventTest.cpp.o
[148/388] Building CXX object tools/clang/unittests/CMakeFiles/AllClangUnitTests.dir/Analysis/IntervalPartitionTest.cpp.o
[149/388] Building CXX object tools/clang/unittests/CMakeFiles/AllClangUnitTests.dir/Analysis/LifetimeSafetyTest.cpp.o
/var/lib/buildbot/workers/suse-gary-m68k-cross/clang-m68k-linux-cross/llvm/clang/unittests/Analysis/LifetimeSafetyTest.cpp: In instantiation of ‘bool clang::lifetimes::internal::{anonymous}::AreLiveAtImplMatcherP2<Annotation_type, ConfFilter_type>::gmock_Impl<arg_type>::MatchAndExplain(const arg_type&, testing::MatchResultListener*) const [with arg_type = const clang::lifetimes::internal::{anonymous}::OriginsInfo&; Annotation_type = const char*; ConfFilter_type = clang::lifetimes::internal::{anonymous}::LivenessKindFilter]’:
/var/lib/buildbot/workers/suse-gary-m68k-cross/clang-m68k-linux-cross/llvm/clang/unittests/Analysis/LifetimeSafetyTest.cpp:259:1:   required from here
  259 | MATCHER_P2(AreLiveAtImpl, Annotation, ConfFilter, "") {
/var/lib/buildbot/workers/suse-gary-m68k-cross/clang-m68k-linux-cross/llvm/clang/unittests/Analysis/LifetimeSafetyTest.cpp:269:19: warning: loop variable ‘<structured bindings>’ creates a copy from type ‘const std::pair<clang::lifetimes::internal::utils::ID<clang::lifetimes::internal::OriginTag>, clang::lifetimes::internal::LivenessKind>’ [-Wrange-loop-construct]
  269 |   for (const auto [OID, ActualConfidence] : ActualLiveSetOpt.value()) {
      |                   ^~~~~~~~~~~~~~~~~~~~~~~
/var/lib/buildbot/workers/suse-gary-m68k-cross/clang-m68k-linux-cross/llvm/clang/unittests/Analysis/LifetimeSafetyTest.cpp:269:19: note: use reference type to prevent copying
  269 |   for (const auto [OID, ActualConfidence] : ActualLiveSetOpt.value()) {
      |                   ^~~~~~~~~~~~~~~~~~~~~~~
      |                   &
[150/388] Building CXX object tools/clang/unittests/CMakeFiles/AllClangUnitTests.dir/Analysis/FlowSensitive/UncheckedStatusOrAccessModelTestFixture.cpp.o
[151/388] Building CXX object tools/clang/unittests/CMakeFiles/AllClangUnitTests.dir/Analysis/FlowSensitive/TransferBranchTest.cpp.o
[152/388] Building CXX object tools/clang/unittests/CMakeFiles/AllClangUnitTests.dir/Analysis/FlowSensitive/SingleVarConstantPropagationTest.cpp.o
[153/388] Building CXX object tools/clang/unittests/CMakeFiles/AllClangUnitTests.dir/StaticAnalyzer/ConflictingEvalCallsTest.cpp.o
[154/388] Building CXX object tools/clang/unittests/CMakeFiles/AllClangUnitTests.dir/Analysis/ExprMutationAnalyzerTest.cpp.o

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

backend:AMDGPU clang:codegen IR generation bugs: mangling, exceptions, etc. clang:frontend Language frontend issues, e.g. anything involving "Sema" clang Clang issues not falling into any other category

Projects

None yet

Development

Successfully merging this pull request may close these issues.

5 participants